diff --git a/README.md b/README.md
index 7a150e911b..f022c007ff 100644
--- a/README.md
+++ b/README.md
@@ -299,7 +299,7 @@ Websocket接口地址: wss://2662r3426b.vicp.fun/xiaozhi/v1/
 | 使用方式 | 支持平台 | 免费平台 |
 |:---:|:---:|:---:|
 | 本地使用 | FunASR、SherpaASR | FunASR、SherpaASR |
-| 接口调用 | DoubaoASR、FunASRServer、TencentASR、AliyunASR | FunASRServer |
+| 接口调用 | DoubaoASR、FunASRServer、TencentASR、AliyunASR、GroqASR | FunASRServer |
 
 ---
 
diff --git a/README_en.md b/README_en.md
index d66cbe5d36..dd19892d52 100644
--- a/README_en.md
+++ b/README_en.md
@@ -268,7 +268,7 @@ In fact, any LLM that supports openai interface calls can be integrated and used
 | Usage Method | Supported Platforms | Free Platforms |
 |:---:|:---:|:---:|
 | Local Usage | FunASR, SherpaASR | FunASR, SherpaASR |
-| API Call | DoubaoASR, FunASRServer, TencentASR, AliyunASR | FunASRServer |
+| API Call | DoubaoASR, FunASRServer, TencentASR, AliyunASR, GroqASR | FunASRServer |
 
 ---
 
diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml
index 6dc0b25d4e..1ef3ee9c05 100644
--- a/main/xiaozhi-server/config.yaml
+++ b/main/xiaozhi-server/config.yaml
@@ -317,6 +317,10 @@ ASR:
     # 语言参数,1537为普通话,具体参考:https://ai.baidu.com/ai-doc/SPEECH/0lbxfnc9b
     dev_pid: 1537
     output_dir: tmp/
+  Groq:
+    type: groq
+    model: whisper-large-v3 # https://groq.com/largest-most-capable-asr-model-now-faster-on-groqcloud/
+    api_key: 你的groq api key
 
 VAD:
   SileroVAD:
diff --git a/main/xiaozhi-server/core/providers/asr/groq.py b/main/xiaozhi-server/core/providers/asr/groq.py
new file mode 100644
index 0000000000..4f1e18b3f1
--- /dev/null
+++ b/main/xiaozhi-server/core/providers/asr/groq.py
@@ -0,0 +1,146 @@
+"""Groq-hosted Whisper speech-to-text provider (non-streaming)."""
+import asyncio
+import os
+import tempfile
+import subprocess
+import wave
+from pathlib import Path
+from groq import Groq, RateLimitError
+from config.logger import setup_logging
+from core.providers.asr.base import ASRProviderBase
+from core.providers.asr.dto.dto import InterfaceType
+
+TAG = __name__
+logger = setup_logging()
+
+
+class ASRProvider(ASRProviderBase):
+    """Speech-to-text provider backed by Groq's audio transcription API.
+
+    PCM audio is wrapped in a temporary WAV file, converted to FLAC by the
+    ``ffmpeg`` executable (which must be on PATH) and uploaded in a single
+    non-streaming request.
+    """
+
+    def __init__(self, config, delete_audio_file):
+        """Read the provider settings.
+
+        Args:
+            config: Mapping with optional ``api_key`` and ``model`` entries.
+                ``GROQ_API_KEY`` from the environment is used when
+                ``api_key`` is missing or empty.
+            delete_audio_file: Stored on the instance. The temporary files
+                created by this provider are always removed.
+        """
+        super().__init__()
+        self.interface_type = InterfaceType.NON_STREAM
+        self.api_key = config.get("api_key") or os.getenv("GROQ_API_KEY")
+        # NOTE(review): this fallback differs from the model in the
+        # config.yaml example (whisper-large-v3); confirm it is a model id
+        # that Groq accepts.
+        self.model = config.get("model") or "distil-whisper-large-v3"
+        self.delete_audio_file = delete_audio_file
+        if not self.api_key:
+            logger.bind(tag=TAG).warning(
+                "Groq ASR: no api_key configured and GROQ_API_KEY is not set"
+            )
+
+    def preprocess_audio(self, pcm_data: bytes) -> Path:
+        """Convert raw PCM audio into a temporary FLAC file.
+
+        Args:
+            pcm_data: Audio bytes, stored as 16 kHz mono 16-bit PCM.
+
+        Returns:
+            Path of the FLAC file; the caller is responsible for deleting it.
+
+        Raises:
+            subprocess.CalledProcessError: If ffmpeg exits with an error.
+            OSError: If a temporary file or the ffmpeg process cannot be
+                created.
+        """
+        wav_path = None
+        flac_path = None
+        try:
+            # Write PCM data to a valid WAV file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+                wav_path = Path(temp_wav.name)
+                with wave.open(temp_wav, "wb") as wf:
+                    wf.setnchannels(1)
+                    wf.setsampwidth(2)  # 16-bit PCM
+                    wf.setframerate(16000)
+                    wf.writeframes(pcm_data)
+            # Reserve an output name; ffmpeg overwrites the empty file (-y).
+            with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as temp_flac:
+                flac_path = Path(temp_flac.name)
+            # Convert to 16kHz mono FLAC
+            subprocess.run(
+                [
+                    "ffmpeg", "-hide_banner", "-loglevel", "error",
+                    "-i", str(wav_path),
+                    "-ar", "16000", "-ac", "1", "-c:a", "flac", "-y", str(flac_path),
+                ],
+                check=True,
+            )
+        except Exception:
+            # Do not leave a partial output file behind when conversion fails.
+            if flac_path is not None:
+                flac_path.unlink(missing_ok=True)
+            raise
+        finally:
+            if wav_path is not None:
+                wav_path.unlink(missing_ok=True)
+        return flac_path
+
+    def _transcribe_pcm(self, pcm_data: bytes) -> str:
+        """Blocking helper: convert *pcm_data* to FLAC and send it to Groq."""
+        flac_path = self.preprocess_audio(pcm_data)
+        try:
+            # The context manager closes the client's HTTP connections.
+            with Groq(api_key=self.api_key) as client, open(flac_path, "rb") as f:
+                result = client.audio.transcriptions.create(
+                    file=("audio.flac", f, "audio/flac"),
+                    model=self.model,
+                    response_format="verbose_json",
+                )
+        finally:
+            flac_path.unlink(missing_ok=True)
+        # Handle both dict and object result
+        if isinstance(result, dict):
+            return result.get("text") or ""
+        return getattr(result, "text", None) or ""
+
+    async def speech_to_text(self, opus_data, session_id, audio_format="opus"):
+        """Transcribe one utterance.
+
+        Args:
+            opus_data: Audio chunks. They are joined as raw PCM bytes when
+                ``audio_format`` is ``"pcm"``; otherwise they are first passed
+                through the base class ``decode_opus``.
+            session_id: Not used by this provider.
+            audio_format: ``"pcm"``, or any other value for Opus input.
+
+        Returns:
+            ``(text, None)``. ``text`` is ``""`` when there is no audio or
+            when conversion or the request fails (the error is logged).
+        """
+        # Decode opus to PCM using base class method
+        if audio_format == "pcm":
+            pcm_data = b"".join(opus_data)
+        else:
+            pcm_data = b"".join(self.decode_opus(opus_data))
+        if not pcm_data:
+            # Nothing to transcribe; skip ffmpeg and the API call.
+            return "", None
+        try:
+            # ffmpeg and the HTTP request both block, so run them in a worker
+            # thread instead of stalling the event loop.
+            loop = asyncio.get_running_loop()
+            text = await loop.run_in_executor(None, self._transcribe_pcm, pcm_data)
+            return text, None
+        except RateLimitError as e:
+            logger.bind(tag=TAG).error(f"Groq ASR rate limit exceeded: {e}")
+            return "", None
+        except Exception as e:
+            logger.bind(tag=TAG).error(f"Groq ASR error: {e}")
+            return "", None
diff --git a/main/xiaozhi-server/requirements.txt b/main/xiaozhi-server/requirements.txt
index 3ffbcfa463..e302e19f14 100755
--- a/main/xiaozhi-server/requirements.txt
+++ b/main/xiaozhi-server/requirements.txt
@@ -33,4 +33,5 @@ markitdown==0.1.1
 mcp-proxy==0.6.0
 PyJWT==2.8.0
 psutil==7.0.0
-portalocker==2.10.1
\ No newline at end of file
+portalocker==2.10.1
+groq==0.28.0
\ No newline at end of file