Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openless-all/app/src-tauri/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions openless-all/app/src-tauri/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ tauri-plugin-autostart = "2"
tauri-plugin-dialog = "2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
# OpenRouter ASR 把音频以标准 base64(带 padding)放进 JSON body(issue #582)。
base64 = "0.22"
sha2 = "0.10"
bzip2 = "0.4"
tar = "0.4"
Expand Down
163 changes: 133 additions & 30 deletions openless-all/app/src-tauri/src/asr/whisper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
//! to any OpenAI-compatible `/audio/transcriptions` endpoint on session end.

use anyhow::{Context, Result};
use base64::Engine;
use parking_lot::Mutex;

use crate::asr::wav::encode_wav_16k_mono;
Expand All @@ -21,6 +22,19 @@ pub const PROMPT_CHAR_BUDGET: usize = 240;
/// 区切り文字(ASCII)。Whisper のトークナイザはどの言語でも安定して扱える。
const PROMPT_SEPARATOR: &str = ", ";

/// Body encoding used for `/audio/transcriptions` requests.
///
/// OpenAI itself and most compatible vendors expect `multipart/form-data`
/// (`file` + `model`). OpenRouter shares the same path and Bearer auth, but its
/// body is `application/json`:
/// `{model, input_audio:{data:<base64 wav>, format:"wav"}}` (issue #582).
///
/// `Default` yields [`AsrRequestFormat::Multipart`], so the documented default is
/// encoded in the type instead of living only in comments.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)]
pub enum AsrRequestFormat {
    /// `multipart/form-data` (the pre-existing behaviour, and the default).
    #[default]
    Multipart,
    /// OpenRouter-style `application/json` body carrying base64 audio.
    OpenRouterJson,
}

pub struct WhisperBatchASR {
api_key: String,
base_url: String,
Expand All @@ -36,6 +50,8 @@ pub struct WhisperBatchASR {
/// (SiliconFlow)は response_format 自体が無いので false にして従来の
/// `json` のまま送る(壊さない)。
verbose_json: bool,
/// 请求体编码方式。默认 `Multipart`,OpenRouter 走 `OpenRouterJson`。
request_format: AsrRequestFormat,
buffer: Mutex<Vec<u8>>,
}

Expand All @@ -55,10 +71,18 @@ impl WhisperBatchASR {
prompt,
max_chunk_duration_ms,
verbose_json,
request_format: AsrRequestFormat::Multipart,
buffer: Mutex::new(Vec::new()),
}
}

/// Sets the request-body encoding and returns the updated client.
///
/// The encoding starts out as `Multipart`; OpenRouter needs `OpenRouterJson`.
/// This is a consuming builder method rather than an extra `new()` parameter so
/// that the four existing construction sites keep their signature.
///
/// The return value must be used: `self` is moved into the call, so discarding
/// the result throws the configured client away.
#[must_use]
pub fn with_request_format(mut self, request_format: AsrRequestFormat) -> Self {
    self.request_format = request_format;
    self
}

/// Stop collecting audio, encode the buffer as WAV, and POST to the
/// Whisper transcriptions endpoint.
///
Expand Down Expand Up @@ -110,40 +134,62 @@ impl WhisperBatchASR {
.collect();
let wav = encode_wav_16k_mono(&samples);
let url = transcription_url(&self.base_url)?;
let client = reqwest::Client::new();

let wav_part = reqwest::multipart::Part::bytes(wav)
.file_name("audio.wav")
.mime_str("audio/wav")
.context("set MIME type")?;
let mut form = reqwest::multipart::Form::new()
.part("file", wav_part)
.text("model", self.model.clone());

// verbose_json 対応プロバイダ(OpenAI / Groq)のときだけ、セグメント
// メタデータ付きの応答を要求し、temperature も 0 に固定する。非対応
// プロバイダ(SiliconFlow の SenseVoice / TeleSpeech 等)には送らず
// 従来どおりの応答にして、未知パラメータでの 4xx を避ける。
if self.verbose_json {
form = form
.text("response_format", "verbose_json")
.text("temperature", "0");
}
let request = match self.request_format {
AsrRequestFormat::Multipart => {
let wav_part = reqwest::multipart::Part::bytes(wav)
.file_name("audio.wav")
.mime_str("audio/wav")
.context("set MIME type")?;
let mut form = reqwest::multipart::Form::new()
.part("file", wav_part)
.text("model", self.model.clone());

// verbose_json 対応プロバイダ(OpenAI / Groq)のときだけ、セグメント
// メタデータ付きの応答を要求し、temperature も 0 に固定する。非対応
// プロバイダ(SiliconFlow の SenseVoice / TeleSpeech 等)には送らず
// 従来どおりの応答にして、未知パラメータでの 4xx を避ける。
if self.verbose_json {
form = form
.text("response_format", "verbose_json")
.text("temperature", "0");
}

// `prompt` は空文字を送らない:OpenAI 互換実装によっては空文字でエラーに
// なるリスクがある(Groq は許容するが防御的にスキップ)。`trim()` で
// 空白のみのケースも除外。
if let Some(prompt) = self.prompt.as_ref() {
let trimmed = prompt.trim();
if !trimmed.is_empty() {
form = form.text("prompt", trimmed.to_string());
}
}

// `prompt` は空文字を送らない:OpenAI 互換実装によっては空文字でエラーに
// なるリスクがある(Groq は許容するが防御的にスキップ)。`trim()` で
// 空白のみのケースも除外。
if let Some(prompt) = self.prompt.as_ref() {
let trimmed = prompt.trim();
if !trimmed.is_empty() {
form = form.text("prompt", trimmed.to_string());
client
.post(&url)
.header("Authorization", format!("Bearer {}", self.api_key))
.multipart(form)
}
}
AsrRequestFormat::OpenRouterJson => {
// OpenRouter /audio/transcriptions:application/json,音频走标准
// base64(带 padding)。不带 multipart 专属的 prompt/response_format
// 字段,避免未知字段导致 4xx;verbose_json 对该协议保持关闭。
let body = serde_json::json!({
"model": self.model,
"input_audio": {
"data": base64::engine::general_purpose::STANDARD.encode(&wav),
"format": "wav",
},
});
client
.post(&url)
.header("Authorization", format!("Bearer {}", self.api_key))
.json(&body)
}
};

let client = reqwest::Client::new();
let resp = client
.post(&url)
.header("Authorization", format!("Bearer {}", self.api_key))
.multipart(form)
let resp = request
.send()
.await
.context("Whisper HTTP request failed")?;
Expand Down Expand Up @@ -725,6 +771,63 @@ mod tests {
server.join().unwrap();
}

/// Checks that a client configured with `AsrRequestFormat::OpenRouterJson` sends an
/// `application/json` request (not multipart) and still parses a `{"text": ...}`
/// response. A one-shot TCP server on localhost plays the provider.
#[tokio::test]
async fn openrouter_format_posts_json_with_base64_audio() {
// Issue #582: OpenRouterJson must send application/json with
// input_audio.data (base64) instead of multipart; the response is still
// parsed as {text}.
// Port 0 asks the OS for a free port; the real address is read back below.
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
listener.set_nonblocking(true).unwrap();
let addr = listener.local_addr().unwrap();
let server = thread::spawn(move || {
// Poll the non-blocking listener for up to 5 seconds so a missing
// request fails the test instead of hanging it.
let deadline = Instant::now() + Duration::from_secs(5);
let mut stream = loop {
match listener.accept() {
Ok((stream, _)) => break stream,
Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
assert!(
Instant::now() < deadline,
"timed out waiting for ASR test request"
);
thread::sleep(Duration::from_millis(10));
}
Err(err) => panic!("accept ASR test request failed: {err}"),
}
};
// Switch the accepted connection to blocking reads bounded by a timeout.
stream.set_nonblocking(false).unwrap();
stream
.set_read_timeout(Some(Duration::from_secs(5)))
.unwrap();
// NOTE(review): `read_http_request` is defined elsewhere in this test
// module; presumably it returns the raw request bytes (headers + body).
let request = read_http_request(&mut stream);
let request_text = String::from_utf8_lossy(&request);
// Lower-cased copy so header checks do not depend on header-name casing.
let lower = request_text.to_ascii_lowercase();
assert!(request_text.starts_with("POST /audio/transcriptions HTTP/1.1"));
assert!(lower.contains("content-type: application/json"));
assert!(lower.contains("authorization: bearer key"));
// The body is JSON: it contains input_audio.data plus format:"wav", and
// it is not multipart.
assert!(request_text.contains("input_audio"));
assert!(request_text.contains(r#""format":"wav""#));
assert!(!lower.contains("multipart/form-data"));
write_json_response(&mut stream, r#"{"text":"openrouter ok"}"#);
});
let base_url = format!("http://{}", addr);

// No prompt, no chunk-duration limit, verbose_json disabled.
let asr = WhisperBatchASR::new(
"key".to_string(),
base_url,
"openai/whisper-large-v3-turbo".to_string(),
None,
None,
false,
)
.with_request_format(AsrRequestFormat::OpenRouterJson);
// 64,000 zero bytes of PCM input (presumably silent 16-bit audio — confirm
// against the recorder's sample format).
let pcm = vec![0u8; 32_000 * 2];
asr.consume_pcm_chunk(&pcm);

let transcript = asr.transcribe().await.unwrap();
assert_eq!(transcript.text, "openrouter ok");
// Joining surfaces any assertion failure raised on the server thread.
server.join().unwrap();
}

fn start_whisper_test_server(texts: Vec<&'static str>) -> (String, thread::JoinHandle<()>) {
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
listener.set_nonblocking(true).unwrap();
Expand Down
55 changes: 45 additions & 10 deletions openless-all/app/src-tauri/src/coordinator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,9 @@ fn active_asr_provider_kind(id: &str) -> ActiveAsrProviderKind {

/// Returns the maximum batch-ASR chunk length in milliseconds for `provider_id`,
/// or `None` when no chunk limit is configured for that provider.
fn batch_asr_chunk_limit_ms(provider_id: &str) -> Option<u64> {
    match provider_id {
        // OpenRouter base64-encodes the audio into a JSON body, which is ~33%
        // larger than the raw bytes, so long recordings can run into body or
        // duration limits. Split conservatively at 30 s, the same as zhipu.
        "zhipu" | "openrouter" => Some(30_000),
        _ => None,
    }
}
Expand Down Expand Up @@ -2489,7 +2491,16 @@ async fn build_local_qwen3(
/// (messages=[{content:[{audio:...}]}]) 协议,不是 Whisper multipart,需要
/// 单独 ASR 客户端,留给 V2。
fn is_whisper_compatible_provider(id: &str) -> bool {
    // Provider ids served by the OpenAI-compatible `/audio/transcriptions`
    // client. `openrouter` belongs here even though its request body is JSON
    // rather than multipart.
    matches!(id, "whisper" | "siliconflow" | "zhipu" | "groq" | "openrouter")
}

/// Picks the request-body encoding for `provider_id`.
///
/// OpenRouter's `/audio/transcriptions` takes `application/json` with base64
/// audio (issue #582); every other compatible vendor stays on multipart.
fn whisper_request_format(provider_id: &str) -> crate::asr::whisper::AsrRequestFormat {
    if provider_id == "openrouter" {
        return crate::asr::whisper::AsrRequestFormat::OpenRouterJson;
    }
    crate::asr::whisper::AsrRequestFormat::Multipart
}

/// 该 provider 的 `/audio/transcriptions` 是否支持 `response_format=verbose_json`
Expand Down Expand Up @@ -2665,14 +2676,17 @@ async fn build_qa_asr_start(inner: &Arc<Inner>, active_asr: &str) -> Result<QaAs
let (api_key, base_url, model) = read_whisper_credentials();
let whisper_prompt =
crate::asr::whisper::build_prompt_from_phrases(&enabled_phrases(inner));
let whisper = Arc::new(WhisperBatchASR::new(
api_key,
base_url,
model,
whisper_prompt,
batch_asr_chunk_limit_ms(active_asr),
whisper_supports_verbose_json(active_asr),
));
let whisper = Arc::new(
WhisperBatchASR::new(
api_key,
base_url,
model,
whisper_prompt,
batch_asr_chunk_limit_ms(active_asr),
whisper_supports_verbose_json(active_asr),
)
.with_request_format(whisper_request_format(active_asr)),
);
let active = ActiveAsr::Whisper(Arc::clone(&whisper));
let consumer: Arc<dyn crate::recorder::AudioConsumer> = whisper;
Ok(QaAsrStart::Ready { active, consumer })
Expand Down Expand Up @@ -4057,6 +4071,27 @@ mod tests {
assert!(!whisper_supports_verbose_json("zhipu"));
}

#[test]
fn openrouter_is_whisper_compatible_json_provider() {
    use crate::asr::whisper::AsrRequestFormat;

    let provider = "openrouter";

    // Issue #582: OpenRouter uses the whisper-compatible route, but its request
    // body is JSON + base64.
    assert!(is_whisper_compatible_provider(provider));
    assert_eq!(
        whisper_request_format(provider),
        AsrRequestFormat::OpenRouterJson
    );

    // The remaining compatible vendors stay on multipart.
    for multipart_provider in ["whisper", "groq"] {
        assert_eq!(
            whisper_request_format(multipart_provider),
            AsrRequestFormat::Multipart
        );
    }

    // OpenRouter's JSON protocol takes no response_format, so verbose_json
    // stays off.
    assert!(!whisper_supports_verbose_json(provider));

    // Base64 inflates the payload, so long recordings are split at 30 s.
    assert_eq!(batch_asr_chunk_limit_ms(provider), Some(30_000));
}

#[test]
fn qa_asr_provider_kind_tracks_active_provider() {
assert_eq!(
Expand Down
20 changes: 12 additions & 8 deletions openless-all/app/src-tauri/src/coordinator/dictation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -811,14 +811,17 @@ pub(super) async fn begin_session(inner: &Arc<Inner>) -> Result<(), String> {
// 互換プロバイダにも揃えるのが筋。
let whisper_prompt =
crate::asr::whisper::build_prompt_from_phrases(&enabled_phrases(inner));
let whisper = Arc::new(WhisperBatchASR::new(
api_key,
base_url,
model,
whisper_prompt,
batch_asr_chunk_limit_ms(&active_asr),
whisper_supports_verbose_json(&active_asr),
));
let whisper = Arc::new(
WhisperBatchASR::new(
api_key,
base_url,
model,
whisper_prompt,
batch_asr_chunk_limit_ms(&active_asr),
whisper_supports_verbose_json(&active_asr),
)
.with_request_format(whisper_request_format(&active_asr)),
);
store_asr_for_session(
inner,
current_session_id,
Expand Down Expand Up @@ -2166,6 +2169,7 @@ mod tests {
#[test]
fn batch_asr_chunk_limit_applies_only_to_zhipu() {
assert_eq!(batch_asr_chunk_limit_ms("zhipu"), Some(30_000));
assert_eq!(batch_asr_chunk_limit_ms("openrouter"), Some(30_000));
assert_eq!(batch_asr_chunk_limit_ms("whisper"), None);
assert_eq!(batch_asr_chunk_limit_ms("siliconflow"), None);
assert_eq!(batch_asr_chunk_limit_ms("groq"), None);
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src/i18n/en.ts
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,7 @@ export const en: typeof zhCN = {
asrZhipu: 'Zhipu GLM-ASR',
asrGroq: 'Groq Whisper-large-v3',
asrWhisper: 'OpenAI Whisper (compatible)',
asrOpenrouter: 'OpenRouter Whisper',
asrSherpaOnnxLocal: 'Local sherpa-onnx (experimental)',
asrFoundryLocalWhisper: 'Local Whisper (Foundry Local)',
asrLocalQwen3: 'Local Qwen3-ASR',
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src/i18n/ja.ts
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,7 @@ export const ja: typeof zhCN = {
asrZhipu: 'Zhipu GLM-ASR',
asrGroq: 'Groq Whisper-large-v3',
asrWhisper: 'OpenAI Whisper(互換)',
asrOpenrouter: 'OpenRouter Whisper',
asrSherpaOnnxLocal: 'ローカル sherpa-onnx(実験的)',
asrFoundryLocalWhisper: 'ローカル Whisper(Foundry Local)',
asrLocalQwen3: 'ローカル Qwen3-ASR',
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src/i18n/ko.ts
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,7 @@ export const ko: typeof zhCN = {
asrZhipu: 'Zhipu GLM-ASR',
asrGroq: 'Groq Whisper-large-v3',
asrWhisper: 'OpenAI Whisper(호환)',
asrOpenrouter: 'OpenRouter Whisper',
asrSherpaOnnxLocal: '로컬 sherpa-onnx(실험적)',
asrFoundryLocalWhisper: '로컬 Whisper(Foundry Local)',
asrLocalQwen3: '로컬 Qwen3-ASR',
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src/i18n/zh-CN.ts
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,7 @@ export const zhCN = {
asrZhipu: '智谱 GLM-ASR',
asrGroq: 'Groq Whisper-large-v3',
asrWhisper: 'OpenAI Whisper(兼容)',
asrOpenrouter: 'OpenRouter Whisper',
asrSherpaOnnxLocal: '本地 sherpa-onnx(实验性)',
asrFoundryLocalWhisper: '本地 Whisper(Foundry Local)',
asrLocalQwen3: '本地 Qwen3-ASR',
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src/i18n/zh-TW.ts
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,7 @@ export const zhTW: typeof zhCN = {
asrZhipu: '智譜 GLM-ASR',
asrGroq: 'Groq Whisper-large-v3',
asrWhisper: 'OpenAI Whisper(兼容)',
asrOpenrouter: 'OpenRouter Whisper',
asrSherpaOnnxLocal: '本地 sherpa-onnx(實驗性)',
asrFoundryLocalWhisper: '本地 Whisper(Foundry Local)',
asrLocalQwen3: '本地 Qwen3-ASR',
Expand Down
1 change: 1 addition & 0 deletions openless-all/app/src/pages/Overview.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ const ASR_NAME_KEY_BY_ID: Record<string, string> = {
zhipu: 'asrZhipu',
groq: 'asrGroq',
whisper: 'asrWhisper',
openrouter: 'asrOpenrouter',
'foundry-local-whisper': 'asrFoundryLocalWhisper',
'sherpa-onnx-local': 'asrSherpaOnnxLocal',
'local-qwen3': 'asrLocalQwen3',
Expand Down
Loading
Loading