From 1d4a3adaad06920e35c8d63f07117a2904487e96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=95=E6=9F=8F=E9=9D=92?= Date: Thu, 4 Jun 2026 20:19:40 +0800 Subject: [PATCH] =?UTF-8?q?feat(asr):=20=E6=94=AF=E6=8C=81=20OpenRouter=20?= =?UTF-8?q?=E4=BD=9C=E4=B8=BA=20ASR=20=E6=8F=90=E4=BE=9B=E5=95=86=20(#582)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenRouter 的 /audio/transcriptions 与 OpenAI 路径相同、也走 Bearer,但请求体是 application/json:{model, input_audio:{data:<base64>, format:"wav"}},而非 OpenAI 官方的 multipart/form-data。直接当 whisper 兼容 provider 会被 OpenRouter 拒绝。 - whisper.rs: 新增 AsrRequestFormat{Multipart,OpenRouterJson},transcribe_chunk 按格式 分发;OpenRouterJson 走 JSON+标准 base64(带 padding),不带 multipart 专属的 prompt/response_format 字段。用 with_request_format builder 避免改 new() 签名(不动 既有 4 处构造点)。 - coordinator.rs: is_whisper_compatible_provider 注册 openrouter;新增 whisper_request_format;verbose_json 对 openrouter 保持关闭;base64 膨胀 ~33%,长 录音按 30s 切分(同 zhipu)。 - 前端: ASR_PRESETS / AsrPresetId / Overview 名称映射 + 5 份 i18n 加 openrouter。 - 测试: OpenRouterJson 发 JSON body 单测 + 注册/格式断言;既有 transcribe 测试不变。 默认 model 用报告人已验证的 openai/whisper-large-v3-turbo。 新增直接依赖 base64=0.22(此前已是传递依赖,未引入新 crate)。 --- openless-all/app/src-tauri/Cargo.lock | 1 + openless-all/app/src-tauri/Cargo.toml | 2 + openless-all/app/src-tauri/src/asr/whisper.rs | 163 ++++++++++++++---- openless-all/app/src-tauri/src/coordinator.rs | 55 ++++-- .../src-tauri/src/coordinator/dictation.rs | 20 ++- openless-all/app/src/i18n/en.ts | 1 + openless-all/app/src/i18n/ja.ts | 1 + openless-all/app/src/i18n/ko.ts | 1 + openless-all/app/src/i18n/zh-CN.ts | 1 + openless-all/app/src/i18n/zh-TW.ts | 1 + openless-all/app/src/pages/Overview.tsx | 1 + .../src/pages/settings/ProvidersSection.tsx | 3 + .../app/src/pages/settings/shared.tsx | 1 + 13 files changed, 203 insertions(+), 48 deletions(-) diff --git a/openless-all/app/src-tauri/Cargo.lock b/openless-all/app/src-tauri/Cargo.lock index b1f8c9ce..b04c5855 100644 
--- a/openless-all/app/src-tauri/Cargo.lock +++ b/openless-all/app/src-tauri/Cargo.lock @@ -3680,6 +3680,7 @@ version = "1.3.6-2" dependencies = [ "anyhow", "arboard", + "base64 0.22.1", "block2 0.5.1", "bytes", "bzip2 0.4.4", diff --git a/openless-all/app/src-tauri/Cargo.toml b/openless-all/app/src-tauri/Cargo.toml index b0a3e014..1779b069 100644 --- a/openless-all/app/src-tauri/Cargo.toml +++ b/openless-all/app/src-tauri/Cargo.toml @@ -26,6 +26,8 @@ tauri-plugin-autostart = "2" tauri-plugin-dialog = "2" serde = { version = "1", features = ["derive"] } serde_json = "1" +# OpenRouter ASR 把音频以标准 base64(带 padding)放进 JSON body(issue #582)。 +base64 = "0.22" sha2 = "0.10" bzip2 = "0.4" tar = "0.4" diff --git a/openless-all/app/src-tauri/src/asr/whisper.rs b/openless-all/app/src-tauri/src/asr/whisper.rs index eabeec10..cc3120d2 100644 --- a/openless-all/app/src-tauri/src/asr/whisper.rs +++ b/openless-all/app/src-tauri/src/asr/whisper.rs @@ -2,6 +2,7 @@ //! to any OpenAI-compatible `/audio/transcriptions` endpoint on session end. 
use anyhow::{Context, Result}; +use base64::Engine; use parking_lot::Mutex; use crate::asr::wav::encode_wav_16k_mono; @@ -21,6 +22,19 @@ pub const PROMPT_CHAR_BUDGET: usize = 240; /// 区切り文字(ASCII)。Whisper のトークナイザはどの言語でも安定して扱える。 const PROMPT_SEPARATOR: &str = ", "; +/// `/audio/transcriptions` 请求体编码方式。 +/// +/// OpenAI 官方及多数兼容厂商用 `multipart/form-data`(file + model)。 +/// OpenRouter 虽路径相同、也走 Bearer,但请求体是 `application/json`: +/// `{model, input_audio:{data:, format:"wav"}}`(issue #582)。 +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum AsrRequestFormat { + /// `multipart/form-data`(既有行为,默认)。 + Multipart, + /// OpenRouter `application/json` + base64 音频。 + OpenRouterJson, +} + pub struct WhisperBatchASR { api_key: String, base_url: String, @@ -36,6 +50,8 @@ pub struct WhisperBatchASR { /// (SiliconFlow)は response_format 自体が無いので false にして従来の /// `json` のまま送る(壊さない)。 verbose_json: bool, + /// 请求体编码方式。默认 `Multipart`,OpenRouter 走 `OpenRouterJson`。 + request_format: AsrRequestFormat, buffer: Mutex>, } @@ -55,10 +71,18 @@ impl WhisperBatchASR { prompt, max_chunk_duration_ms, verbose_json, + request_format: AsrRequestFormat::Multipart, buffer: Mutex::new(Vec::new()), } } + /// 设置请求体编码方式(默认 `Multipart`)。OpenRouter 需 `OpenRouterJson`。 + /// 用 builder 而非给 `new()` 加参数,避免改动既有 4 处构造点的签名。 + pub fn with_request_format(mut self, request_format: AsrRequestFormat) -> Self { + self.request_format = request_format; + self + } + /// Stop collecting audio, encode the buffer as WAV, and POST to the /// Whisper transcriptions endpoint. 
/// @@ -110,40 +134,62 @@ impl WhisperBatchASR { .collect(); let wav = encode_wav_16k_mono(&samples); let url = transcription_url(&self.base_url)?; + let client = reqwest::Client::new(); - let wav_part = reqwest::multipart::Part::bytes(wav) - .file_name("audio.wav") - .mime_str("audio/wav") - .context("set MIME type")?; - let mut form = reqwest::multipart::Form::new() - .part("file", wav_part) - .text("model", self.model.clone()); - - // verbose_json 対応プロバイダ(OpenAI / Groq)のときだけ、セグメント - // メタデータ付きの応答を要求し、temperature も 0 に固定する。非対応 - // プロバイダ(SiliconFlow の SenseVoice / TeleSpeech 等)には送らず - // 従来どおりの応答にして、未知パラメータでの 4xx を避ける。 - if self.verbose_json { - form = form - .text("response_format", "verbose_json") - .text("temperature", "0"); - } + let request = match self.request_format { + AsrRequestFormat::Multipart => { + let wav_part = reqwest::multipart::Part::bytes(wav) + .file_name("audio.wav") + .mime_str("audio/wav") + .context("set MIME type")?; + let mut form = reqwest::multipart::Form::new() + .part("file", wav_part) + .text("model", self.model.clone()); + + // verbose_json 対応プロバイダ(OpenAI / Groq)のときだけ、セグメント + // メタデータ付きの応答を要求し、temperature も 0 に固定する。非対応 + // プロバイダ(SiliconFlow の SenseVoice / TeleSpeech 等)には送らず + // 従来どおりの応答にして、未知パラメータでの 4xx を避ける。 + if self.verbose_json { + form = form + .text("response_format", "verbose_json") + .text("temperature", "0"); + } + + // `prompt` は空文字を送らない:OpenAI 互換実装によっては空文字でエラーに + // なるリスクがある(Groq は許容するが防御的にスキップ)。`trim()` で + // 空白のみのケースも除外。 + if let Some(prompt) = self.prompt.as_ref() { + let trimmed = prompt.trim(); + if !trimmed.is_empty() { + form = form.text("prompt", trimmed.to_string()); + } + } - // `prompt` は空文字を送らない:OpenAI 互換実装によっては空文字でエラーに - // なるリスクがある(Groq は許容するが防御的にスキップ)。`trim()` で - // 空白のみのケースも除外。 - if let Some(prompt) = self.prompt.as_ref() { - let trimmed = prompt.trim(); - if !trimmed.is_empty() { - form = form.text("prompt", trimmed.to_string()); + client + .post(&url) + .header("Authorization", format!("Bearer {}", 
self.api_key)) + .multipart(form) } - } + AsrRequestFormat::OpenRouterJson => { + // OpenRouter /audio/transcriptions:application/json,音频走标准 + // base64(带 padding)。不带 multipart 专属的 prompt/response_format + // 字段,避免未知字段导致 4xx;verbose_json 对该协议保持关闭。 + let body = serde_json::json!({ + "model": self.model, + "input_audio": { + "data": base64::engine::general_purpose::STANDARD.encode(&wav), + "format": "wav", + }, + }); + client + .post(&url) + .header("Authorization", format!("Bearer {}", self.api_key)) + .json(&body) + } + }; - let client = reqwest::Client::new(); - let resp = client - .post(&url) - .header("Authorization", format!("Bearer {}", self.api_key)) - .multipart(form) + let resp = request .send() .await .context("Whisper HTTP request failed")?; @@ -725,6 +771,63 @@ mod tests { server.join().unwrap(); } + #[tokio::test] + async fn openrouter_format_posts_json_with_base64_audio() { + // issue #582:OpenRouterJson 走 application/json + input_audio.data(base64), + // 而非 multipart;响应仍按 {text} 解析。 + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + listener.set_nonblocking(true).unwrap(); + let addr = listener.local_addr().unwrap(); + let server = thread::spawn(move || { + let deadline = Instant::now() + Duration::from_secs(5); + let mut stream = loop { + match listener.accept() { + Ok((stream, _)) => break stream, + Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => { + assert!( + Instant::now() < deadline, + "timed out waiting for ASR test request" + ); + thread::sleep(Duration::from_millis(10)); + } + Err(err) => panic!("accept ASR test request failed: {err}"), + } + }; + stream.set_nonblocking(false).unwrap(); + stream + .set_read_timeout(Some(Duration::from_secs(5))) + .unwrap(); + let request = read_http_request(&mut stream); + let request_text = String::from_utf8_lossy(&request); + let lower = request_text.to_ascii_lowercase(); + assert!(request_text.starts_with("POST /audio/transcriptions HTTP/1.1")); + 
assert!(lower.contains("content-type: application/json")); + assert!(lower.contains("authorization: bearer key")); + // body 是 JSON:含 input_audio.data + format:"wav",且不是 multipart。 + assert!(request_text.contains("input_audio")); + assert!(request_text.contains(r#""format":"wav""#)); + assert!(!lower.contains("multipart/form-data")); + write_json_response(&mut stream, r#"{"text":"openrouter ok"}"#); + }); + let base_url = format!("http://{}", addr); + + let asr = WhisperBatchASR::new( + "key".to_string(), + base_url, + "openai/whisper-large-v3-turbo".to_string(), + None, + None, + false, + ) + .with_request_format(AsrRequestFormat::OpenRouterJson); + let pcm = vec![0u8; 32_000 * 2]; + asr.consume_pcm_chunk(&pcm); + + let transcript = asr.transcribe().await.unwrap(); + assert_eq!(transcript.text, "openrouter ok"); + server.join().unwrap(); + } + fn start_whisper_test_server(texts: Vec<&'static str>) -> (String, thread::JoinHandle<()>) { let listener = TcpListener::bind("127.0.0.1:0").unwrap(); listener.set_nonblocking(true).unwrap(); diff --git a/openless-all/app/src-tauri/src/coordinator.rs b/openless-all/app/src-tauri/src/coordinator.rs index 72ce6dce..4ab7e5d2 100644 --- a/openless-all/app/src-tauri/src/coordinator.rs +++ b/openless-all/app/src-tauri/src/coordinator.rs @@ -184,7 +184,9 @@ fn active_asr_provider_kind(id: &str) -> ActiveAsrProviderKind { fn batch_asr_chunk_limit_ms(provider_id: &str) -> Option { match provider_id { - "zhipu" => Some(30_000), + // OpenRouter 把音频 base64 进 JSON body,体积比二进制大 ~33%,长录音易撞 + // body/时长上限,保守按 30s 切分(与 zhipu 同)。 + "zhipu" | "openrouter" => Some(30_000), _ => None, } } @@ -2489,7 +2491,16 @@ async fn build_local_qwen3( /// (messages=[{content:[{audio:...}]}]) 协议,不是 Whisper multipart,需要 /// 单独 ASR 客户端,留给 V2。 fn is_whisper_compatible_provider(id: &str) -> bool { - matches!(id, "whisper" | "siliconflow" | "zhipu" | "groq") + matches!(id, "whisper" | "siliconflow" | "zhipu" | "groq" | "openrouter") +} + +/// 该 provider 
的请求体编码方式。OpenRouter 的 `/audio/transcriptions` 是 +/// `application/json` + base64 音频(issue #582),其余兼容厂商沿用 multipart。 +fn whisper_request_format(provider_id: &str) -> crate::asr::whisper::AsrRequestFormat { + match provider_id { + "openrouter" => crate::asr::whisper::AsrRequestFormat::OpenRouterJson, + _ => crate::asr::whisper::AsrRequestFormat::Multipart, + } } /// 该 provider 的 `/audio/transcriptions` 是否支持 `response_format=verbose_json` @@ -2665,14 +2676,17 @@ async fn build_qa_asr_start(inner: &Arc, active_asr: &str) -> Result = whisper; Ok(QaAsrStart::Ready { active, consumer }) @@ -4057,6 +4071,27 @@ mod tests { assert!(!whisper_supports_verbose_json("zhipu")); } + #[test] + fn openrouter_is_whisper_compatible_json_provider() { + use crate::asr::whisper::AsrRequestFormat; + // issue #582:OpenRouter 走 whisper 兼容路由,但请求体是 JSON+base64。 + assert!(is_whisper_compatible_provider("openrouter")); + assert_eq!( + whisper_request_format("openrouter"), + AsrRequestFormat::OpenRouterJson + ); + // 其余兼容厂商保持 multipart。 + assert_eq!( + whisper_request_format("whisper"), + AsrRequestFormat::Multipart + ); + assert_eq!(whisper_request_format("groq"), AsrRequestFormat::Multipart); + // OpenRouter 的 JSON 协议不吃 response_format,verbose_json 保持关闭。 + assert!(!whisper_supports_verbose_json("openrouter")); + // base64 膨胀,长录音保守按 30s 切分。 + assert_eq!(batch_asr_chunk_limit_ms("openrouter"), Some(30_000)); + } + #[test] fn qa_asr_provider_kind_tracks_active_provider() { assert_eq!( diff --git a/openless-all/app/src-tauri/src/coordinator/dictation.rs b/openless-all/app/src-tauri/src/coordinator/dictation.rs index e06d0172..dfbbe3a5 100644 --- a/openless-all/app/src-tauri/src/coordinator/dictation.rs +++ b/openless-all/app/src-tauri/src/coordinator/dictation.rs @@ -811,14 +811,17 @@ pub(super) async fn begin_session(inner: &Arc) -> Result<(), String> { // 互換プロバイダにも揃えるのが筋。 let whisper_prompt = crate::asr::whisper::build_prompt_from_phrases(&enabled_phrases(inner)); - let whisper = 
Arc::new(WhisperBatchASR::new( - api_key, - base_url, - model, - whisper_prompt, - batch_asr_chunk_limit_ms(&active_asr), - whisper_supports_verbose_json(&active_asr), - )); + let whisper = Arc::new( + WhisperBatchASR::new( + api_key, + base_url, + model, + whisper_prompt, + batch_asr_chunk_limit_ms(&active_asr), + whisper_supports_verbose_json(&active_asr), + ) + .with_request_format(whisper_request_format(&active_asr)), + ); store_asr_for_session( inner, current_session_id, @@ -2166,6 +2169,7 @@ mod tests { #[test] fn batch_asr_chunk_limit_applies_only_to_zhipu() { assert_eq!(batch_asr_chunk_limit_ms("zhipu"), Some(30_000)); + assert_eq!(batch_asr_chunk_limit_ms("openrouter"), Some(30_000)); assert_eq!(batch_asr_chunk_limit_ms("whisper"), None); assert_eq!(batch_asr_chunk_limit_ms("siliconflow"), None); assert_eq!(batch_asr_chunk_limit_ms("groq"), None); diff --git a/openless-all/app/src/i18n/en.ts b/openless-all/app/src/i18n/en.ts index 2957b4db..74c2890d 100644 --- a/openless-all/app/src/i18n/en.ts +++ b/openless-all/app/src/i18n/en.ts @@ -618,6 +618,7 @@ export const en: typeof zhCN = { asrZhipu: 'Zhipu GLM-ASR', asrGroq: 'Groq Whisper-large-v3', asrWhisper: 'OpenAI Whisper (compatible)', + asrOpenrouter: 'OpenRouter Whisper', asrSherpaOnnxLocal: 'Local sherpa-onnx (experimental)', asrFoundryLocalWhisper: 'Local Whisper (Foundry Local)', asrLocalQwen3: 'Local Qwen3-ASR', diff --git a/openless-all/app/src/i18n/ja.ts b/openless-all/app/src/i18n/ja.ts index 4ca0a390..f43eae92 100644 --- a/openless-all/app/src/i18n/ja.ts +++ b/openless-all/app/src/i18n/ja.ts @@ -620,6 +620,7 @@ export const ja: typeof zhCN = { asrZhipu: 'Zhipu GLM-ASR', asrGroq: 'Groq Whisper-large-v3', asrWhisper: 'OpenAI Whisper(互換)', + asrOpenrouter: 'OpenRouter Whisper', asrSherpaOnnxLocal: 'ローカル sherpa-onnx(実験的)', asrFoundryLocalWhisper: 'ローカル Whisper(Foundry Local)', asrLocalQwen3: 'ローカル Qwen3-ASR', diff --git a/openless-all/app/src/i18n/ko.ts b/openless-all/app/src/i18n/ko.ts index 
0b849849..a0657a27 100644 --- a/openless-all/app/src/i18n/ko.ts +++ b/openless-all/app/src/i18n/ko.ts @@ -620,6 +620,7 @@ export const ko: typeof zhCN = { asrZhipu: 'Zhipu GLM-ASR', asrGroq: 'Groq Whisper-large-v3', asrWhisper: 'OpenAI Whisper(호환)', + asrOpenrouter: 'OpenRouter Whisper', asrSherpaOnnxLocal: '로컬 sherpa-onnx(실험적)', asrFoundryLocalWhisper: '로컬 Whisper(Foundry Local)', asrLocalQwen3: '로컬 Qwen3-ASR', diff --git a/openless-all/app/src/i18n/zh-CN.ts b/openless-all/app/src/i18n/zh-CN.ts index ca387876..6e20aa23 100644 --- a/openless-all/app/src/i18n/zh-CN.ts +++ b/openless-all/app/src/i18n/zh-CN.ts @@ -616,6 +616,7 @@ export const zhCN = { asrZhipu: '智谱 GLM-ASR', asrGroq: 'Groq Whisper-large-v3', asrWhisper: 'OpenAI Whisper(兼容)', + asrOpenrouter: 'OpenRouter Whisper', asrSherpaOnnxLocal: '本地 sherpa-onnx(实验性)', asrFoundryLocalWhisper: '本地 Whisper(Foundry Local)', asrLocalQwen3: '本地 Qwen3-ASR', diff --git a/openless-all/app/src/i18n/zh-TW.ts b/openless-all/app/src/i18n/zh-TW.ts index 043e71b9..bbe51030 100644 --- a/openless-all/app/src/i18n/zh-TW.ts +++ b/openless-all/app/src/i18n/zh-TW.ts @@ -618,6 +618,7 @@ export const zhTW: typeof zhCN = { asrZhipu: '智譜 GLM-ASR', asrGroq: 'Groq Whisper-large-v3', asrWhisper: 'OpenAI Whisper(兼容)', + asrOpenrouter: 'OpenRouter Whisper', asrSherpaOnnxLocal: '本地 sherpa-onnx(實驗性)', asrFoundryLocalWhisper: '本地 Whisper(Foundry Local)', asrLocalQwen3: '本地 Qwen3-ASR', diff --git a/openless-all/app/src/pages/Overview.tsx b/openless-all/app/src/pages/Overview.tsx index 609fbe47..acb830d7 100644 --- a/openless-all/app/src/pages/Overview.tsx +++ b/openless-all/app/src/pages/Overview.tsx @@ -30,6 +30,7 @@ const ASR_NAME_KEY_BY_ID: Record = { zhipu: 'asrZhipu', groq: 'asrGroq', whisper: 'asrWhisper', + openrouter: 'asrOpenrouter', 'foundry-local-whisper': 'asrFoundryLocalWhisper', 'sherpa-onnx-local': 'asrSherpaOnnxLocal', 'local-qwen3': 'asrLocalQwen3', diff --git a/openless-all/app/src/pages/settings/ProvidersSection.tsx 
b/openless-all/app/src/pages/settings/ProvidersSection.tsx index 8dd052ec..dae82737 100644 --- a/openless-all/app/src/pages/settings/ProvidersSection.tsx +++ b/openless-all/app/src/pages/settings/ProvidersSection.tsx @@ -142,6 +142,9 @@ const ASR_PRESETS: ReadonlyArray<{ id: AsrPresetId; nameKey: string; baseUrl: st { id: 'zhipu', nameKey: 'asrZhipu', baseUrl: 'https://open.bigmodel.cn/api/paas/v4', model: 'glm-asr-2512' }, { id: 'groq', nameKey: 'asrGroq', baseUrl: 'https://api.groq.com/openai/v1', model: 'whisper-large-v3-turbo' }, { id: 'whisper', nameKey: 'asrWhisper', baseUrl: 'https://api.openai.com/v1', model: 'whisper-1' }, + // OpenRouter 的 /audio/transcriptions 走 application/json + base64(issue #582), + // 后端 coordinator.rs::whisper_request_format 对该 id 切换到 OpenRouterJson 编码。 + { id: 'openrouter', nameKey: 'asrOpenrouter', baseUrl: 'https://openrouter.ai/api/v1', model: 'openai/whisper-large-v3-turbo' }, { id: 'foundry-local-whisper', nameKey: 'asrFoundryLocalWhisper', baseUrl: '', model: '' }, // 本地引擎(Foundry / sherpa-onnx / Qwen3):无 baseUrl/model 配置, // 模型在「高级 → 本地模型」里下载与切换。 diff --git a/openless-all/app/src/pages/settings/shared.tsx b/openless-all/app/src/pages/settings/shared.tsx index 214f9284..18eb644d 100644 --- a/openless-all/app/src/pages/settings/shared.tsx +++ b/openless-all/app/src/pages/settings/shared.tsx @@ -179,6 +179,7 @@ export type AsrPresetId = | "zhipu" | "groq" | "whisper" + | "openrouter" | "foundry-local-whisper" | "sherpa-onnx-local" | "local-qwen3"