A1:
该系统采用“语音采集 → 唤醒词检测 → ASR(语音识别)→ NLP(自然语言处理)→ TTS(语音合成)→ 语音播放”的流水线架构:
A2:
使用 I2S 接口连接麦克风,配置为录音模式:
#include "driver/i2s.h"
// I2S port used by the microphone. The legacy driver functions (i2s_driver_install,
// i2s_set_pin, i2s_read, ...) take an i2s_port_t, whose values are I2S_NUM_0 / I2S_NUM_1;
// the macro name is kept so existing users of I2S_MIC_CHANNEL still compile.
#define I2S_MIC_CHANNEL I2S_NUM_0
// NOTE(review): the surrounding text targets ESP32-S3, while GPIO 25/32/33 look like
// classic-ESP32 pin choices — confirm these against the actual board wiring.
#define I2S_MIC_SERIAL_CLOCK GPIO_NUM_32 // BCLK
#define I2S_MIC_LEFT_RIGHT_CLOCK GPIO_NUM_33 // LRCLK
#define I2S_MIC_SERIAL_DATA GPIO_NUM_25 // DATA_IN
void i2s_mic_init() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = 16000, // 采样率16kHz(常见语音识别采样率)
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT, // 单声道
.communication_format = I2S_COMM_FORMAT_STAND_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 64,
.use_apll = false,
.tx_desc_auto_clear = false,
.fixed_mclk = 0
};
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_MIC_SERIAL_CLOCK,
.ws_io_num = I2S_MIC_LEFT_RIGHT_CLOCK,
.data_out_num = I2S_PIN_NO_CHANGE, // 仅接收,不发送
.data_in_num = I2S_MIC_SERIAL_DATA
};
i2s_driver_install(I2S_MIC_CHANNEL, &i2s_config, 0, NULL);
i2s_set_pin(I2S_MIC_CHANNEL, &pin_config);
i2s_start(I2S_MIC_CHANNEL);
}
// 读取音频数据
size_t read_audio_data(uint8_t* buffer, size_t buffer_size) {
size_t bytes_read = 0;
i2s_read(I2S_MIC_CHANNEL, buffer, buffer_size, &bytes_read, portMAX_DELAY);
return bytes_read;
}
A3:
使用 TensorFlow Lite Micro 部署轻量级神经网络模型(如 KWS [Keyword Spotting]):
#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/micro_interpreter.h"
#include "model_data.h" // 唤醒词模型(如"Hi, 小智")
// 模型输入输出缓冲区
constexpr int kTensorArenaSize = 10 * 1024; // 10KB
uint8_t tensor_arena[kTensorArenaSize];
void wake_word_init() {
// 初始化错误报告器
const tflite::ErrorReporter* error_reporter = tflite::MicroErrorReporter();
// 注册操作符
tflite::AllOpsResolver resolver;
// 加载模型
const tflite::FlatBufferModel* model = tflite::FlatBufferModel::BuildFromBuffer(
g_model_data, g_model_data_len);
// 创建解释器
tflite::SimpleTensorAllocator tensor_allocator(tensor_arena, kTensorArenaSize);
tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator, error_reporter);
// 分配张量
TfLiteStatus allocate_status = interpreter.AllocateTensors();
if (allocate_status != kTfLiteOk) {
Serial.println("Failed to allocate tensors!");
}
}
bool detect_wake_word(const int16_t* audio_data, size_t length) {
// 预处理音频(如MFCC特征提取)
float* input = interpreter.input(0)->data.f;
preprocess_audio(audio_data, length, input);
// 运行模型推理
if (interpreter.Invoke() != kTfLiteOk) {
Serial.println("Error invoking interpreter!");
return false;
}
// 获取输出结果
const float* output = interpreter.output(0)->data.f;
return output[1] > 0.95; // 假设索引1是唤醒词类别,阈值0.95
}
A4:
使用 ESP32-S3 的 WiFi 功能,通过 HTTPS 请求调用云端 ASR API(如百度、阿里云;以下示例以百度语音识别接口为例):
#include <HTTPClient.h>
#include <ArduinoJson.h>
/**
 * Sends an audio clip to the Baidu speech-recognition endpoint and returns the text.
 *
 * @param audio_data  Audio bytes; they are base64-encoded into the JSON request.
 * @param length      Number of bytes in `audio_data`.
 * @return The first recognition result on success; otherwise a message describing the
 *         failure (token error, HTTP error code, or the raw response payload).
 */
String recognize_speech(const uint8_t* audio_data, size_t length) {
    HTTPClient http;
    String result = "";

    // Endpoint and credentials.
    String url = "https://vop.baidu.com/server_api";
    String api_key = "YOUR_API_KEY";
    String secret_key = "YOUR_SECRET_KEY";

    // An access token is required before the recognition call can be made.
    String token = get_access_token(api_key, secret_key);
    if (token.isEmpty()) return "Token获取失败";

    // Build the JSON body by appending into one pre-sized String; base64 output is
    // 4 bytes per 3 input bytes, so reserving up front avoids repeated reallocation
    // of what is by far the largest buffer in this function.
    String request_body;
    request_body.reserve(((length + 2) / 3) * 4 + token.length() + 128);
    request_body += "{\"format\":\"wav\",\"rate\":16000,\"channel\":1,\"token\":\"";
    request_body += token;
    request_body += "\",\"cuid\":\"esp32_s3\",\"len\":";
    request_body += String(length);
    request_body += ",\"speech\":\"";
    request_body += base64_encode(audio_data, length);
    request_body += "\"}";

    http.begin(url);
    http.addHeader("Content-Type", "application/json");
    int httpResponseCode = http.POST(request_body);

    if (httpResponseCode > 0) {
        String payload = http.getString();
        // Parse the JSON response; err_no == 0 marks a successful recognition.
        DynamicJsonDocument doc(4096);
        DeserializationError error = deserializeJson(doc, payload);
        if (!error && doc["err_no"] == 0) {
            result = doc["result"][0].as<String>();
        } else {
            result = "识别失败: " + payload;
        }
    } else {
        // Non-positive codes are client-side errors (no HTTP status was received).
        result = "HTTP请求错误: " + String(httpResponseCode);
    }
    http.end();
    return result;
}
A5:
采用意图分类 + 槽位填充的简单实现,也可调用云端 NLP 服务:
// Intents the assistant can recognize from a user utterance.
enum Intent {
    INTENT_UNKNOWN = 0,    // no keyword matched
    INTENT_WEATHER = 1,    // weather query
    INTENT_LIGHT_ON = 2,   // switch the light on
    INTENT_LIGHT_OFF = 3,  // switch the light off
    INTENT_TIME = 4        // current-time query
};
/**
 * Maps recognized text to an Intent by simple keyword matching.
 *
 * The rules are tried in order and the first keyword found anywhere in `text` wins,
 * so the generic verbs "打开" / "关闭" map to the light intents whatever object follows.
 *
 * @param text  Recognized utterance.
 * @return The matching intent, or INTENT_UNKNOWN when no keyword occurs in `text`.
 */
Intent recognize_intent(const String& text) {
    struct KeywordRule {
        const char* keyword;
        Intent intent;
    };
    static const KeywordRule kRules[] = {
        {"天气", INTENT_WEATHER},
        {"开灯", INTENT_LIGHT_ON},
        {"打开", INTENT_LIGHT_ON},
        {"关灯", INTENT_LIGHT_OFF},
        {"关闭", INTENT_LIGHT_OFF},
        {"时间", INTENT_TIME},
        {"几点", INTENT_TIME},
    };

    for (const KeywordRule& rule : kRules) {
        if (text.indexOf(rule.keyword) >= 0) {
            return rule.intent;
        }
    }
    return INTENT_UNKNOWN;
}
/**
 * Carries out the action for a recognized intent and returns the spoken reply.
 *
 * @param intent  Intent to act on.
 * @param text    Original utterance (currently unused).
 * @return Reply text; a fallback apology for intents that are not handled.
 */
String handle_intent(Intent intent, const String& text) {
    if (intent == INTENT_WEATHER) {
        return query_weather();  // weather lookup
    }
    if (intent == INTENT_LIGHT_ON || intent == INTENT_LIGHT_OFF) {
        const bool turn_on = (intent == INTENT_LIGHT_ON);
        digitalWrite(LED_PIN, turn_on ? HIGH : LOW);
        return turn_on ? "已为您打开灯光" : "已为您关闭灯光";
    }
    if (intent == INTENT_TIME) {
        return get_current_time();  // current time lookup
    }
    return "抱歉,我不理解您的问题";
}
A6:
调用云端 TTS 服务生成语音文件,通过 I2S 接口输出到扬声器:
// 调用百度TTS服务
bool text_to_speech(const String& text, uint8_t*& audio_data, size_t& length) {
HTTPClient http;
// 构建请求URL
String url = "https://tsn.baidu.com/text2audio";
url += "?tex=" + urlEncode(text);
url += "&tok=" + get_access_token(API_KEY, SECRET_KEY);
url += "&cuid=esp32_s3";
url += "&ctp=1";
url += "&lan=zh"; // 中文
url += "&per=0"; // 发音人选择
http.begin(url);
int httpResponseCode = http.GET();
if (httpResponseCode == 200) {
// 获取响应内容长度
length = http.getSize();
// 分配内存
audio_data = (uint8_t*)malloc(length);
if (!audio_data) return false;
// 读取响应数据
WiFiClient* stream = http.getStreamPtr();
stream->readBytes(audio_data, length);
return true;
}
return false;
}
// 通过I2S播放音频
void play_audio(const uint8_t* audio_data, size_t length) {
// 配置I2S为播放模式
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
.sample_rate = 16000,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_STAND_I2S,
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 64,
.use_apll = false,
.tx_desc_auto_clear = true,
.fixed_mclk = 0
};
// 初始化I2S驱动
i2s_driver_install(I2S_CHANNEL_0, &i2s_config, 0, NULL);
i2s_set_pin(I2S_CHANNEL_0, &pin_config);
i2s_start(I2S_CHANNEL_0);
// 播放音频
size_t bytes_written = 0;
i2s_write(I2S_CHANNEL_0, audio_data, length, &bytes_written, portMAX_DELAY);
// 停止并清理
i2s_stop(I2S_CHANNEL_0);
i2s_driver_uninstall(I2S_CHANNEL_0);
}
A7:
硬件优化:
软件优化:
算法优化:
A8:
休眠策略:
/**
 * Shuts down the radios, flags the wake-word detector as active, and puts the chip
 * into light sleep with a 10-second timer as the wake-up source.
 *
 * NOTE(review): confirm that wake-word detection can actually run while the chip is in
 * light sleep; as written, the only wake-up source armed here is the timer.
 */
void enter_low_power_mode() {
    constexpr uint64_t kSleepSeconds = 10;
    constexpr uint64_t kMicrosPerSecond = 1000000;

    // Radios off: WiFi first, then Bluetooth.
    WiFi.disconnect(true);
    WiFi.mode(WIFI_OFF);
    btStop();

    // Leave only the wake-word detector enabled.
    set_wake_word_detector_active(true);

    // Arm the timer wake-up and enter light sleep.
    esp_sleep_enable_timer_wakeup(kSleepSeconds * kMicrosPerSecond);
    esp_light_sleep_start();
}
硬件节能:
算法节能:
A9:
环境噪声干扰:
网络延迟:
唤醒词误触发:
内存不足:
A10:
ESP32-S3 小智语音对话系统通过合理的硬件选型和软件架构设计,实现了低成本、低功耗的智能语音交互。关键技术包括:唤醒词检测、云端 ASR/TTS 服务集成、意图识别和 I2S 音频处理。在实际部署中,需重点关注环境适应性、网络稳定性和功耗优化。