DIY PC Speaker to AI Voice Assistant Under $12

67 Views, 0 Favorites, 0 Comments

DIY PC Speaker to AI Voice Assistant Under $12

I Built an Alexa for Under ₹1000! (DIY Smart Assistant)

This project combines embedded systems and AI inference to create an end-to-end conversational assistant. The ESP32 handles real-time audio recording and playback, while a Python backend performs:

Speech-to-Text (STT) via Faster-Whisper
Language Understanding via Google Gemini
Text-to-Speech (TTS) using Piper TTS

The full source code is available in this repo; the code shown below is taken from it.

Supplies

For this project, we'll need:

Components:

ESP32-WROOM-32 Development Board
INMP441 I2S MEMS Microphone
LM386 Audio Amplifier Module
2-inch 8Ω 12W speaker
Tactile Pushbutton
TP4056 Li-ion charging module
3.7V 1000mAh Li-Po battery
Built-in ESP32 LED

Tools Required:

Soldering iron
Jumper wires
Breadboard or perfboard
USB cable for flashing

Full BOM sheet here: Link

Wiring the Circuit

Make sure all components are on hand. Please refer to the above schematic for making the connections.

Please keep in mind:

The INMP441 must be powered from 3.3 V (3V3) only
DAC output (GPIO 25) → LM386 → Speaker
TP4056 powers ESP32 via 5V out

Flash Code to ESP32

First, make sure the library that provides WebSocketsClient.h is installed. In the Arduino IDE, you can simply download the library's zip file and add it to the IDE.

ESP32 Sketch

//esp-code.ino

#include <WiFi.h>

#include <driver/i2s.h>

#include <driver/dac.h>

#include <WebSocketsClient.h>

#include "secrets.h"

// I2S word-select pin (passed as ws_io_num in setupI2SMicrophone()).
#define I2S_WS 15

// I2S data-in pin from the microphone (passed as data_in_num).
#define I2S_SD 32

// I2S bit-clock pin (passed as bck_io_num).
#define I2S_SCK 14

// I2S peripheral instance used for microphone capture.
#define I2S_PORT I2S_NUM_0

// Push-to-talk button input; configured as INPUT_PULLDOWN in setup().
#define RECORD_BUTTON 26

// Status LED; loop() drives it HIGH while the button is held.
#define LED_BUILTIN 2

// DAC channel that feeds the speaker amplifier.
#define DAC_CHANNEL DAC_CHANNEL_1

// Audio sample rate in Hz; used to derive the timer alarm period in
// setupDACOutput(). NOTE(review): the I2S config hard-codes 16000 separately.
#define SAMPLE_RATE 16000

// NOTE(review): not referenced anywhere in the visible sketch.
#define BUFFER_SIZE 4096

// WebSocket connection to the backend server (host/port/path from secrets.h).
WebSocketsClient webSocket;

// True while the push-to-talk button is held; set and cleared in loop().
volatile bool isRecording = false;

// True once playback of incoming binary audio has started; set in
// webSocketEvent(), cleared on disconnect and when a new recording starts.
volatile bool isReceivingAudio = false;

// Hardware timer handle created in setupDACOutput().
hw_timer_t* timer = NULL;

// NOTE(review): not referenced anywhere else in the visible sketch.
portMUX_TYPE timerMux = portMUX_INITIALIZER_UNLOCKED;

void IRAM_ATTR onTimer() {} // Intentionally empty timer ISR (attached in setupDACOutput()); kept for future use if needed

// Joins the Wi-Fi network named by WIFI_SSID / WIFI_PASS and blocks until the
// station reports WL_CONNECTED, printing a progress dot every 500 ms.
void setupWifi() {
  Serial.print("[ESP] WiFi");
  WiFi.begin(WIFI_SSID, WIFI_PASS);

  // There is no timeout: this keeps polling for as long as it takes.
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    delay(500);
    Serial.print(".");
  }

  Serial.println(" ✓");
}

void setupI2SMicrophone() {

Serial.print("[ESP] Microphone...");

const i2s_config_t i2s_config = {

.mode = i2s_mode_t(I2S_MODE_MASTER | I2S_MODE_RX),

.sample_rate = 16000,

.bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT,

.channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,

.communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_STAND_I2S),

.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,

.dma_buf_count = 8,

.dma_buf_len = 512,

.use_apll = false,

.tx_desc_auto_clear = false,

.fixed_mclk = 0

};

esp_err_t err = i2s_driver_install(I2S_PORT, &i2s_config, 0, NULL);

if (err != ESP_OK) {

Serial.printf("[ESP] Failed to install I2S driver: %d\n", err);

return;

}

const i2s_pin_config_t pin_config = {

.bck_io_num = I2S_SCK,

.ws_io_num = I2S_WS,

.data_out_num = I2S_PIN_NO_CHANGE,

.data_in_num = I2S_SD

};

err = i2s_set_pin(I2S_PORT, &pin_config);

if (err != ESP_OK) {

Serial.printf("[ESP] Failed to set I2S pins: %d\n", err);

return;

}

i2s_zero_dma_buffer(I2S_PORT);

Serial.println("✓");

}

// Enables the speaker DAC channel, parks it at mid-scale (128) and starts
// hardware timer 0 with an auto-reloading alarm every 1000000 / SAMPLE_RATE
// ticks. The attached ISR (onTimer) is currently empty.
void setupDACOutput() {
  Serial.print("[ESP] Speaker (DAC)...");

  const uint8_t midScale = 128;
  dac_output_enable(DAC_CHANNEL);
  dac_output_voltage(DAC_CHANNEL, midScale);

  // Divider 80: presumably 1 µs ticks on the default 80 MHz APB clock — TODO confirm.
  const uint8_t timerIndex = 0;
  const uint16_t clockDivider = 80;
  const uint64_t alarmTicks = 1000000 / SAMPLE_RATE;

  timer = timerBegin(timerIndex, clockDivider, true);
  timerAttachInterrupt(timer, &onTimer, true);
  timerAlarmWrite(timer, alarmTicks, true);
  timerAlarmEnable(timer);

  Serial.println(" ✓");
}

// Reads one chunk of up to 1024 32-bit samples from the I2S microphone,
// reduces each to the top 16 bits and sends the result to the server as a
// single binary WebSocket frame.
//
// Blocks (portMAX_DELAY) until the I2S driver delivers data. On a read error
// the error code is logged and nothing is sent.
//
// Not reentrant: the working buffers are static.
void send_audio_chunk() {
  const int samples = 1024;

  // Static storage keeps ~6 KB (4 KB + 2 KB) off the task stack, which this
  // function shares with the WebSocket send path it calls below.
  static int32_t buffer32[samples];
  static int16_t pcm16[samples];

  size_t bytes_read = 0;
  esp_err_t result = i2s_read(I2S_PORT, buffer32, sizeof(buffer32),
                              &bytes_read, portMAX_DELAY);
  if (result != ESP_OK) {
    Serial.printf("[ESP] I2S read error: %d\n", result);
    return;
  }

  // Convert and send only what was actually read, so a short read never
  // forwards stale samples left over from a previous chunk.
  const size_t samples_read = bytes_read / sizeof(int32_t);
  if (samples_read == 0) {
    return;
  }

  for (size_t i = 0; i < samples_read; i++) {
    // Keep the 16 most significant bits of each 32-bit slot.
    pcm16[i] = (int16_t)(buffer32[i] >> 16);
  }

  webSocket.sendBIN((uint8_t*)pcm16, samples_read * sizeof(int16_t));
}

// Plays a 1 kHz sine beep on the speaker DAC.
//
// durationMs: length of each beep in milliseconds.
// times:      number of beeps; each one is followed by a 50 ms pause.
//
// The waveform is synthesised at 8 kHz (8 samples per cycle) and each sample
// is held for 1/8000 s, so the pitch and duration match the request. Timing is
// approximate: the time spent inside dac_output_voltage() is not compensated.
// The DAC is returned to mid-scale (128) when done.
void playTestTone(int durationMs, int times) {
  Serial.println("[ESP] Playing 1kHz test tone...");

  const int toneSampleRate = 8000;
  // delayMicroseconds() takes an integer, so derive a whole-microsecond
  // period from the same rate the samples are generated at (125 µs).
  const unsigned int samplePeriodUs = 1000000 / toneSampleRate;
  const int samples = (toneSampleRate * durationMs) / 1000;

  for (int beep = 0; beep < times; beep++) {
    for (int n = 0; n < samples; n++) {
      float t = (float)n / 8000.0;
      float sine = sin(2.0 * PI * 1000.0 * t);
      // Amplitude 100 around mid-scale keeps the value within 28..228.
      uint8_t value = (uint8_t)((sine * 100) + 128);
      dac_output_voltage(DAC_CHANNEL, value);
      delayMicroseconds(samplePeriodUs);
    }
    delay(50);
  }

  dac_output_voltage(DAC_CHANNEL, 128);
}

// WebSocket event callback registered in setup().
//
// type:    event kind reported by the WebSockets client.
// payload: event data (text or binary); only read for TEXT and BIN events.
// length:  payload size in bytes.
//
// - CONNECTED:    sends "ping"; the server's "pong" reply is logged below.
// - DISCONNECTED: stops playback and parks the DAC at mid-scale.
// - TEXT:         logs the message ("pong" is reported as a successful connect).
// - BIN:          writes each payload byte straight to the DAC. Playback
//                 starts on the first frame longer than 100 bytes; after that
//                 every binary frame is played regardless of size.
//                 Presumably the server sends 8-bit unsigned PCM — TODO confirm.
//
// Playback is synchronous: this callback blocks for roughly length * 62 µs.
void webSocketEvent(WStype_t type, uint8_t* payload, size_t length) {
  switch (type) {
    case WStype_CONNECTED:
      webSocket.sendTXT("ping");
      break;

    case WStype_DISCONNECTED:
      Serial.println("[WS] Disconnected");
      isReceivingAudio = false;
      dac_output_voltage(DAC_CHANNEL, 128);
      break;

    case WStype_TEXT:
      if (strcmp((const char*)payload, "pong") == 0) {
        Serial.println("[WS] Connected to server");
      } else {
        Serial.print("[WS] ");
        Serial.println((const char*)payload);
      }
      break;

    case WStype_BIN:
      if (isReceivingAudio || length > 100) {
        if (!isReceivingAudio) {
          Serial.println("[ESP] Starting playback");
          isReceivingAudio = true;
        }

        // delayMicroseconds() takes an integer, so the 62.5 µs (16 kHz)
        // sample period is truncated to 62 µs; spelled out here explicitly.
        const uint32_t samplePeriodUs = 62;
        for (size_t i = 0; i < length; i++) {
          dac_output_voltage(DAC_CHANNEL, payload[i]);
          delayMicroseconds(samplePeriodUs);
        }

        // Progress indicator: at most one dot every 500 ms.
        static unsigned long lastDot = 0;
        if (millis() - lastDot > 500) {
          Serial.print(".");
          lastDot = millis();
        }
      }
      break;

    case WStype_ERROR:
      Serial.println("[WS] Error occurred");
      break;

    default:
      // Other event kinds are ignored.
      break;
  }
}

// Arduino entry point: brings up serial logging, GPIO, Wi-Fi, the microphone
// and the DAC, then starts the WebSocket client and plays two short beeps to
// signal that the device is ready.
void setup() {
  Serial.begin(115200);

  pinMode(LED_BUILTIN, OUTPUT);
  pinMode(RECORD_BUTTON, INPUT_PULLDOWN);

  // Start-up banner, one println per row.
  static const char* const bannerRows[] = {
    "\n\n╔═══════════════════════════════════╗",
    "║ ESP32 Voice Assistant v1.0 ║",
    "╚═══════════════════════════════════╝\n",
  };
  for (const char* row : bannerRows) {
    Serial.println(row);
  }

  // Wi-Fi must be up before the address report and WebSocket start below.
  setupWifi();
  setupI2SMicrophone();
  setupDACOutput();

  Serial.print("[ESP] WiFi addr: ");
  Serial.println(WiFi.localIP());

  Serial.print("[ESP] Server: ");
  Serial.print(WS_HOST);
  Serial.print(":");
  Serial.println(WS_PORT);

  // Connect to the backend; retry every 5 s if the link drops.
  webSocket.begin(WS_HOST, WS_PORT, WS_PATH);
  webSocket.onEvent(webSocketEvent);
  webSocket.setReconnectInterval(5000);

  // Disabled LED start-up blink:
  // digitalWrite(LED_BUILTIN, HIGH);
  // delay(3000);
  // digitalWrite(LED_BUILTIN, LOW);

  playTestTone(250, 2);

  Serial.println("[ESP] Setup complete! Press button to talk.");
}

// Arduino main loop: services the WebSocket, turns button edges into
// "pause" / "stop" control messages, and streams microphone audio while the
// button is held and the socket is connected.
void loop() {
  webSocket.loop();

  static bool wasPressed = LOW;
  const bool isPressed = digitalRead(RECORD_BUTTON);

  // Act only on a change of button state.
  if (isPressed != wasPressed) {
    if (isPressed) {
      // Press: light the LED, silence any playback and start recording.
      digitalWrite(LED_BUILTIN, HIGH);
      Serial.println("[ESP] 🎤 Recording...");
      isRecording = true;
      isReceivingAudio = false;
      dac_output_voltage(DAC_CHANNEL, 128);
      webSocket.sendTXT("pause");
    } else {
      // Release: stop recording and tell the server to process the audio.
      digitalWrite(LED_BUILTIN, LOW);
      Serial.println("[ESP] ⏹ Stopped. Processing...");
      isRecording = false;
      webSocket.sendTXT("stop");
    }
    wasPressed = isPressed;
  }

  const bool shouldStream = isRecording && webSocket.isConnected();
  if (shouldStream) {
    send_audio_chunk();
  }

  delay(10);
}

You'll also require a secrets.h:

// secrets.h — local credentials and server address included by the sketch.
// Replace the placeholder values below with your own.

// Wi-Fi network name passed to WiFi.begin().
const char* WIFI_SSID = "YOUR WIFI SSID";

// Wi-Fi password passed to WiFi.begin().
const char* WIFI_PASS = "YOUR WIFI PASSWORD";

// Address of the backend WebSocket server.
const char* WS_HOST = "YOUR WS HOST"; // IP address

// Port of the backend WebSocket server.
const uint16_t WS_PORT = 7860;

// URL path of the WebSocket endpoint on the server.
const char* WS_PATH = "/ws";

AI Backend Setup (Python)

For this, I recommend hosting the backend on an AWS EC2 instance with a static IP assigned. Alternatively, you can run the server locally on your laptop and connect your ESP32 to it through a hotspot. Think of this backend as the brain of the system, because this is where all the processing happens. The source code is in the /server directory. Here's how to set it up:

1) Install the dependencies and start the server:

cd server

pip install uv

uv sync

uv run main.py

or using Docker:

docker build -t esp32-ws-server .

docker run -p 7860:7860 esp32-ws-server

2) Environment variables and models needed

GEMINI_API_KEY (environment variable holding your Google Gemini API key)
You'll also need a piper voice model. Download it using the following command:

python -m piper.download_voices en_US-libritts_r-medium --data-dir tts_models

You can choose any other voice model as well; find them here.

And that's all! Your voice assistant should now respond to your queries once you press the push button!