diff --git a/.gitignore b/.gitignore index b9c16bf..75db1bb 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ samples /output TODO.md +BENCHMARKS.local.md vendor/ diff --git a/Cargo.lock b/Cargo.lock index c6c0012..c294e89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3083,7 +3083,7 @@ dependencies = [ [[package]] name = "transcribeit" -version = "1.3.0" +version = "1.4.0" dependencies = [ "anyhow", "async-trait", @@ -3107,6 +3107,7 @@ dependencies = [ "tar", "tempfile", "tokio", + "urlencoding", "uuid", "whisper-rs", ] diff --git a/Cargo.toml b/Cargo.toml index bfa8754..9ddf605 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "transcribeit" -version = "1.3.0" +version = "1.4.0" edition = "2024" rust-version = "1.96" license-file = "LICENSE" @@ -14,7 +14,7 @@ strip = "symbols" incremental = false [features] -default = ["sherpa-onnx"] +default = [] [build-dependencies] dotenvy = "0.15" @@ -33,8 +33,9 @@ serde = { version = "1", features = ["derive"] } serde_json = "1" tempfile = "3" regex = "1" +urlencoding = "2" tokio = { version = "1", features = ["full"] } -sherpa-onnx = { version = "1.13", optional = true } +sherpa-onnx = { version = "1.13", default-features = false, features = ["shared"], optional = true } tar = "0.4" bzip2 = "0.6" libc = "0.2" diff --git a/README.md b/README.md index 3fa2376..8942db8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # transcribeit -A Rust CLI for speech-to-text transcription. Supports local inference via [whisper.cpp](https://github.com/ggerganov/whisper.cpp), local inference via [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx), remote transcription via OpenAI-compatible APIs, Azure OpenAI, and Qwen ASR file transcription. +A Rust CLI for speech-to-text transcription. 
Supports local inference via [whisper.cpp](https://github.com/ggerganov/whisper.cpp), local inference via [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx), remote transcription via OpenAI-compatible APIs, Azure OpenAI, Qwen ASR file transcription, and Gemini multimodal transcription. Accepts any audio or video format — FFmpeg handles conversion automatically. @@ -15,11 +15,11 @@ Accepts any audio or video format — FFmpeg handles conversion automatically. ## Quick start ```bash -# Build (reads SHERPA_ONNX_LIB_DIR from .env automatically via build.rs) +# Build the default binary cargo build --release -# Build without sherpa-onnx (no shared library dependency needed) -cargo build --release --no-default-features +# Build with sherpa-onnx (reads SHERPA_ONNX_LIB_DIR from .env automatically via build.rs) +cargo build --release --features sherpa-onnx # Download a GGML model (default format, for --provider local) transcribeit download-model -s base @@ -57,6 +57,10 @@ transcribeit run -i meeting.mp4 -m base -f srt -o ./output # Transcribe via OpenAI API transcribeit run -p openai -i recording.mp3 +# Transcribe via OpenAI hosted diarization +transcribeit run -p openai --remote-model gpt-4o-transcribe-diarize \ + -i meeting.mp3 -f srt -o ./output + # Transcribe via Azure OpenAI transcribeit run -p azure -i recording.mp3 \ --azure-deployment my-whisper -b https://myresource.openai.azure.com @@ -64,6 +68,10 @@ transcribeit run -p azure -i recording.mp3 \ # Transcribe whole files with Qwen ASR via S3/R2 pre-signed URLs transcribeit run -p qwen-filetrans -i recording.mp3 -f vtt -o ./output +# Transcribe whole files with Gemini Files API + generateContent +transcribeit run -p gemini --remote-model gemini-3.5-flash \ + -i recording.mp3 -f vtt -o ./output + # Force language and normalize before transcription transcribeit run -i recording.wav -m base --language en --normalize @@ -79,10 +87,12 @@ transcribeit run -i interview.mp3 -m base --speakers 2 \ ## Features - **Any input 
format** — MP3, MP4, WAV, FLAC, OGG, etc. FFmpeg converts to mono 16kHz WAV automatically. -- **5 providers** — Local whisper.cpp, sherpa-onnx, OpenAI API, Azure OpenAI, and Qwen file transcription. Extensible via the `Transcriber` trait. +- **6 providers** — Local whisper.cpp, sherpa-onnx, OpenAI API, Azure OpenAI, Qwen file transcription, and Gemini. Extensible via the `Transcriber` trait. - **Qwen ASR whole-file transcription** — `qwen-filetrans` stages audio in S3-compatible storage, passes a pre-signed URL to DashScope, polls the async task, and maps Qwen timestamps into the transcript model. +- **Stable manifest schema** — Manifests use `transcribeit.manifest.v2` with canonical millisecond timestamps, provider-neutral capabilities/quality fields, and provider-specific metadata under `provider_metadata.data`. - **Qwen provider metadata** — Manifests include Qwen task timing/usage, audio info, per-segment language/emotion, and word-level timestamps. Temporary pre-signed URLs are not persisted. - **Qwen model guardrails** — Accidental short-audio `qwen3-asr-flash` model selection is rejected before conversion and S3 upload; use `qwen3-asr-flash-filetrans` for this provider. +- **Gemini whole-file transcription** — `gemini` uploads prepared audio through Gemini Files API, calls `generateContent` with structured JSON output, and maps segment timestamps, speaker labels, language, and emotion when returned. - **3 model architectures via sherpa-onnx** — Whisper, Moonshine, and SenseVoice are auto-detected from the model directory contents. Just point `--model` at any supported model directory. - **Model aliases** — `-m base`, `-m tiny`, etc. resolve from `MODEL_CACHE_DIR` for both `local` and `sherpa-onnx` providers. The sherpa-onnx resolver also supports glob matching (e.g., `-m moonshine-base`, `-m sense-voice`). - **Language hinting** — Pass `--language` to force local and API transcription language. 
@@ -90,7 +100,7 @@ transcribeit run -i interview.mp3 -m base --speakers 2 \ - **VAD-based segmentation** — Speech-aware segmentation via Silero VAD (sherpa-onnx). Detects speech boundaries with padding and gap merging to avoid mid-word cuts. Use `--vad-model .cache/silero_vad.onnx`. - **Silence-based segmentation** — Fallback segmentation via FFmpeg `silencedetect` for API providers or when VAD model is not available. - **sherpa-onnx auto-segmentation** — Whisper ONNX models only support ≤30s per call; segmentation is enabled automatically. -- **sherpa-onnx is optional** — Enabled by default as a Cargo feature. Build without it: `cargo build --no-default-features`. +- **sherpa-onnx is optional** — Enable it explicitly with `cargo build --features sherpa-onnx` when you need ONNX providers or Sherpa-backed diarization. - **Auto-split for API limits** — Files exceeding 25MB are automatically segmented when using remote providers. - **Progress spinner** — Shows live terminal feedback during transcription (single file and segmented mode). - **Parallel API segment transcription** — Multiple segment requests can be processed concurrently with `--segment-concurrency`. 
@@ -110,6 +120,8 @@ HF_TOKEN=hf_your_token_here MODEL_CACHE_DIR=.cache SHERPA_ONNX_LIB_DIR=/path/to/sherpa-onnx/lib OPENAI_API_KEY=sk-your_key_here +GEMINI_API_KEY=your_gemini_key_here +GEMINI_API_BASE_URL=https://generativelanguage.googleapis.com/v1beta AZURE_API_KEY=your_azure_key_here AZURE_OPENAI_ENDPOINT=https://myresource.openai.azure.com AZURE_DEPLOYMENT_NAME=whisper @@ -149,7 +161,7 @@ On first run, use `transcribeit setup` to download models and additional compone To build a distributable binary: ```bash -cargo build --release +cargo build --release --features sherpa-onnx # Copy binary + libs cp target/release/transcribeit dist/ cp vendor/sherpa-onnx-*/lib/lib*.dylib dist/lib/ @@ -158,7 +170,7 @@ cp vendor/sherpa-onnx-*/lib/lib*.dylib dist/lib/ To build without sherpa-onnx (no shared library dependency): ```bash -cargo build --release --no-default-features +cargo build --release ``` ## License diff --git a/Taskfile.yaml b/Taskfile.yaml index b7785b0..9f3f921 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -6,6 +6,10 @@ dotenv: - .env tasks: + test-hf-token: + cmds: + - | + test -n "$HF_TOKEN" || (echo "HF_TOKEN is not set" >&2; exit 1) print-openai-base-url: cmds: - | diff --git a/docs/architecture.md b/docs/architecture.md index 232c22e..d1ea5e0 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -26,6 +26,7 @@ src/ ├── sherpa_onnx.rs # Local sherpa-onnx engine (auto-detects Whisper, Moonshine, SenseVoice) ├── openai_api.rs # OpenAI-compatible REST API ├── azure_openai.rs # Azure OpenAI REST API + ├── gemini.rs # Gemini Files API + generateContent ├── qwen_filetrans.rs # Qwen async file transcription provider ├── qwen_filetrans/ # Qwen request/response types and model limits ├── rate_limit.rs # Retry logic and 429 handling @@ -55,6 +56,7 @@ pub trait Transcriber: Send + Sync { - **Sherpa-ONNX engine** (`sherpa_onnx`) uses `transcribe()` — it needs decoded samples for the ONNX runtime. 
- **OpenAI/Azure API engines** override `transcribe_path()` to upload files directly via multipart, and `transcribe_wav()` to upload in-memory bytes — avoiding the decode→re-encode round-trip. - **Qwen file transcription** overrides `transcribe_path()` to upload prepared audio to S3-compatible storage, generate a pre-signed URL, and submit that URL to DashScope. +- **Gemini** overrides `transcribe_path()` to upload prepared audio through Gemini Files API and call `generateContent` with structured JSON output. ## Processing pipeline @@ -64,7 +66,7 @@ The `pipeline.rs` module orchestrates the full flow: Input file (any format) │ ├─ needs_conversion()? ──→ extract_to_wav(normalize) for local provider - ├─ upload_as_mp3(normalize) for API providers and Qwen filetrans (16kHz mono MP3) + ├─ upload_as_mp3(normalize) for OpenAI/Azure, Qwen filetrans, and Gemini (16kHz mono MP3) │ ├─ get_duration() via ffprobe │ @@ -99,11 +101,23 @@ Input file (any format) ├─ Text to stdout or `.txt` ├─ VTT to file or stdout (with `<v Speaker N>` tags when diarized) ├─ SRT to file or stdout (with `[Speaker N]` labels when diarized) - └─ JSON manifest to output directory (includes speaker field per segment) + └─ JSON manifest to output directory (`transcribeit.manifest.v2`) ``` Temporary files use the `tempfile` crate and are cleaned up automatically on drop. +## Manifest contract + +When `--output-dir` is set, the JSON manifest is the stable machine-readable contract for downstream applications. The current schema is `transcribeit.manifest.v2`. + +- `transcript.text` and `transcript.segments` are the preferred consumer-facing transcript fields. +- Segment and word timestamps include canonical integer millisecond fields (`start_ms`, `end_ms`) plus second fields for readability. +- `capabilities` describes which optional fields are present, such as word timestamps, speaker labels, segment language, and emotion. 
+- `quality` describes how reliable timing/speaker metadata is, including `timing_source`, `timing_reliable`, and `timestamps_clamped`. +- `provider_metadata` is a stable envelope: `{ "provider": "...", "schema_version": "...", "data": { ... } }`. +- Provider-specific payloads live only under `provider_metadata.data`; temporary URLs and secrets must not be persisted. +- The top-level `segments` array remains as a compatibility mirror for older consumers. + ## Engines ### Local (`whisper_local.rs`) @@ -147,6 +161,20 @@ The S3 staging implementation lives in `storage::s3` and works with AWS S3-compa Short synchronous Qwen models such as `qwen3-asr-flash` use a different API path and have strict 10 MB / 300 second limits. If one is selected with `-p qwen-filetrans`, the CLI fails before conversion or S3 upload. +### Gemini (`gemini.rs`) + +Uses Gemini Files API and `generateContent` for whole-file multimodal transcription. The provider: + +- converts input audio/video to 16 kHz mono MP3 +- uploads the prepared file with a resumable Files API upload +- waits for the file to become `ACTIVE` +- requests structured JSON with `text`, segment timestamps, speaker, language, and emotion fields +- maps valid segments into the normalized transcript/manifest model +- falls back to generated transcript text when structured JSON is missing or invalid +- deletes the temporary Gemini file after the transcription request + +Gemini is not a dedicated ASR endpoint. Timestamp, speaker, language, and emotion values come from the model's structured output, so benchmark quality before relying on them for subtitle workflows. + ### Sherpa-ONNX (`sherpa_onnx.rs`) Local inference using [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) with automatic model architecture detection. Uses a **dedicated worker thread pattern**: the `OfflineRecognizer` is created on a plain `std::thread` (not on the Tokio runtime) and stays there for its entire lifetime. 
Transcription requests are sent to the thread via an `mpsc` channel and results come back through `tokio::sync::oneshot` channels. This design avoids: @@ -190,7 +218,7 @@ All settings (timeout, retries, wait times) are configurable via CLI flags and e ### Shared WAV encoding -OpenAI/Azure engines can send file uploads directly and choose the correct container format for compatibility (WAV for local transcribe path, MP3 for API provider uploads). Qwen file transcription stages MP3 in S3-compatible storage and sends DashScope a pre-signed URL. The `audio::wav::encode_wav()` helper is still used by local engines and non-file upload paths. +OpenAI/Azure engines can send file uploads directly and choose the correct container format for compatibility (WAV for local transcribe path, MP3 for API provider uploads). Qwen file transcription stages MP3 in S3-compatible storage and sends DashScope a pre-signed URL. Gemini uploads MP3 through Gemini Files API. The `audio::wav::encode_wav()` helper is still used by local engines and non-file upload paths. ## Model cache (`model_cache.rs`) @@ -202,7 +230,7 @@ OpenAI/Azure engines can send file uploads directly and choose the correct conta ## Build requirements -The `sherpa-onnx` Cargo feature is **enabled by default**. It requires the sherpa-onnx shared libraries at both compile time and runtime. The `build.rs` script loads a `.env` file and reads `SHERPA_ONNX_LIB_DIR` to configure the linker search path and embed an `rpath` so the binary can find the dylibs at runtime. +The `sherpa-onnx` Cargo feature is opt-in. It requires the sherpa-onnx shared libraries at both compile time and runtime. The `build.rs` script loads a `.env` file and reads `SHERPA_ONNX_LIB_DIR` to configure the linker search path and embed an `rpath` so the binary can find the dylibs at runtime. 
Set `SHERPA_ONNX_LIB_DIR` in your `.env` file or environment before building: @@ -211,13 +239,13 @@ Set `SHERPA_ONNX_LIB_DIR` in your `.env` file or environment before building: SHERPA_ONNX_LIB_DIR=/path/to/sherpa-onnx/lib ``` -To build without the sherpa-onnx dependency entirely: +To build with sherpa-onnx enabled: ```bash -cargo build --release --no-default-features +cargo build --release --features sherpa-onnx ``` -This removes the sherpa-onnx provider and eliminates the need for `SHERPA_ONNX_LIB_DIR`. +The default build omits the sherpa-onnx provider and eliminates the need for `SHERPA_ONNX_LIB_DIR`. ## VAD-based segmentation (`audio/vad.rs`) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 401c298..cc4e64e 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -46,7 +46,7 @@ transcribeit run [OPTIONS] --input | Option | Description | Default | |--------|-------------|---------| | `-i, --input` | Input path, directory, or glob pattern for audio/video files | required | -| `-p, --provider` | `local`, `sherpa-onnx`, `openai`, `azure`, or `qwen-filetrans` | `local` | +| `-p, --provider` | `local`, `sherpa-onnx`, `openai`, `azure`, `qwen-filetrans`, or `gemini` | `local` | #### Local provider options (`-p local`) @@ -80,6 +80,10 @@ Sherpa-ONNX automatically enables segmentation and caps segment length at 30 sec | `-a, --api-key` | API key | `OPENAI_API_KEY` env var | | `--remote-model` | Model name | `whisper-1` | +Supported hosted OpenAI transcription models include `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. + +`whisper-1` returns timestamped segments through the default `verbose_json` request path. `gpt-4o-mini-transcribe` and `gpt-4o-transcribe` return plain transcript text through the current CLI. When `--remote-model gpt-4o-transcribe-diarize` is selected, the provider requests `diarized_json` with `chunking_strategy=auto` and maps speaker labels into VTT/SRT/manifest output. 
+ #### Azure provider options | Option | Description | Default | @@ -109,10 +113,24 @@ Sherpa-ONNX automatically enables segmentation and caps segment length at 30 sec Qwen file transcription uploads the prepared audio to S3-compatible storage and passes a pre-signed GET URL to DashScope. The provider is intended for whole-file transcription; avoid `--segment` unless you explicitly want multiple independent remote jobs. -When available, Qwen manifests include `provider_metadata.qwen` with task timing/usage, audio info, transcript counts, and word-level timestamps on each segment. Temporary pre-signed URLs are not persisted. +When available, Qwen manifests include `provider_metadata.provider = "qwen-filetrans"` and Qwen task timing/usage, audio info, and transcript counts under `provider_metadata.data`. Word-level timestamps remain on each normalized segment. Temporary pre-signed URLs are not persisted. If a short-audio `qwen3-asr-flash` model is selected with `-p qwen-filetrans`, the CLI validates the file size and duration before upload and fails without staging the file to S3. Use `qwen3-asr-flash-filetrans` for this provider. +#### Gemini provider options (`-p gemini`) + +| Option | Description | Default | +|--------|-------------|---------| +| `--gemini-api-key` | Gemini API key | `GEMINI_API_KEY` env var | +| `--gemini-api-base-url` | Gemini API base URL | `GEMINI_API_BASE_URL` env var, or `https://generativelanguage.googleapis.com/v1beta` | +| `--remote-model` | Gemini model name | `gemini-3.5-flash` | + +The Gemini provider uses the Gemini Files API plus `generateContent` with structured JSON output. It converts input audio/video to 16 kHz mono MP3 before upload, then asks Gemini for a transcript object with `text`, `segments`, timestamps, speaker, language, and emotion fields. 
+ +Current model candidates verified through the Gemini models API include `gemini-3.5-flash`, `gemini-3.1-pro-preview`, `gemini-3-flash-preview`, `gemini-3-pro-preview`, and `gemini-2.5-flash`. Prefer stable `gemini-3.5-flash` for the default path and benchmark preview models before adopting them in production workflows. + +Gemini timestamps and speaker labels are generated structured output rather than a dedicated ASR response schema. The parser is defensive: invalid JSON, missing fields, empty segments, and unknown future response fields fall back to transcript text instead of failing the run. + #### Output options | Option | Description | Default | @@ -124,7 +142,7 @@ If a short-audio `qwen3-asr-flash` model is selected with `-p qwen-filetrans`, t #### API resilience options -These options apply to OpenAI, Azure, and Qwen file transcription providers: +These options apply to OpenAI, Azure, Qwen file transcription, and Gemini providers: | Option | Description | Default | |--------|-------------|---------| @@ -183,10 +201,12 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | Variable | Description | Default | |----------|-------------|---------| -| `SHERPA_ONNX_LIB_DIR` | Path to sherpa-onnx shared libraries (required for build) | none | +| `SHERPA_ONNX_LIB_DIR` | Path to sherpa-onnx shared libraries (required when building with `--features sherpa-onnx`) | none | | `MODEL_CACHE_DIR` | Directory for downloaded models | `.cache` | | `HF_TOKEN` | Hugging Face API token (optional) | none | | `OPENAI_API_KEY` | OpenAI API key | none | +| `GEMINI_API_KEY` | Gemini API key | none | +| `GEMINI_API_BASE_URL` | Gemini API base URL | `https://generativelanguage.googleapis.com/v1beta` | | `AZURE_API_KEY` | Azure API key fallback for Azure provider if `--azure-api-key` is unset | none | | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | none | | `AZURE_DEPLOYMENT_NAME` | Azure deployment name | `whisper` | @@ -310,9 +330,11 @@ 
transcribeit run -p qwen-filetrans -i recording.mp3 \ - **Local** (`-p local`) runs whisper.cpp in-process using GGML models. - **Sherpa-ONNX** (`-p sherpa-onnx`) runs sherpa-onnx in-process. Auto-detects Whisper, Moonshine, and SenseVoice models from directory contents. Always auto-segments at 30s. - **OpenAI-compatible** (`-p openai`) uses `--remote-model` and calls `POST {base-url}/v1/audio/transcriptions`. + `gpt-4o-transcribe-diarize` is handled specially: the request includes `response_format=diarized_json` and `chunking_strategy=auto`, and response segments are parsed defensively so unknown or missing fields do not fail the run. - **Azure** (`-p azure`) uses `--azure-deployment` and calls: `POST {base-url}/openai/deployments/{deployment}/audio/transcriptions?api-version={version}`. - **Qwen file transcription** (`-p qwen-filetrans`) uploads audio to S3-compatible storage, passes a pre-signed URL to DashScope, and polls the async transcription task. +- **Gemini** (`-p gemini`) uploads audio through Gemini Files API, calls `generateContent`, and parses structured transcript JSON defensively. For the full matrix and upload/auth notes, see: [Provider behavior](provider-behavior.md). For benchmark guidance and result templates, see: [Performance benchmarks](performance-benchmarks.md). @@ -328,11 +350,15 @@ When `--output-dir` is specified, the following files are created: ### Manifest format +Manifests use `schema_version: "transcribeit.manifest.v2"`. New consumers should prefer `transcript.text`, `transcript.segments`, `capabilities`, `quality`, and the `provider_metadata` envelope. The top-level `segments` array remains for compatibility with earlier consumers. 
+ ```json { + "schema_version": "transcribeit.manifest.v2", "input": { "file": "meeting.mp4", - "duration_secs": 3600.0 + "duration_secs": 3600.0, + "duration_ms": 3600000 }, "config": { "provider": "local", @@ -344,33 +370,77 @@ When `--output-dir` is specified, the following files are created: "language": "en", "normalized_audio": true }, + "capabilities": { + "segments": true, + "word_timestamps": true, + "speaker_labels": true, + "language_per_segment": true, + "emotion_per_segment": true, + "native_timestamps": true + }, + "quality": { + "timing_source": "provider_native", + "timing_reliable": true, + "timestamps_clamped": false, + "speaker_source": "provider_native", + "warnings": [] + }, + "transcript": { + "text": "Hello, welcome to the meeting.", + "segments": [ + { + "id": "seg_000001", + "index": 0, + "start_secs": 0.0, + "end_secs": 5.25, + "start_ms": 0, + "end_ms": 5250, + "text": "Hello, welcome to the meeting.", + "speaker": "Speaker 0", + "language": "en", + "emotion": "neutral", + "words": [ + { + "id": "seg_000001_word_000001", + "index": 0, + "start_secs": 0.0, + "end_secs": 0.4, + "start_ms": 0, + "end_ms": 400, + "text": "Hello", + "punctuation": "," + } + ] + } + ] + }, "segments": [ { + "id": "seg_000001", "index": 0, "start_secs": 0.0, "end_secs": 5.25, + "start_ms": 0, + "end_ms": 5250, "text": "Hello, welcome to the meeting.", "speaker": "Speaker 0", "language": "en", "emotion": "neutral", - "words": [ - { - "start_secs": 0.0, - "end_secs": 0.4, - "text": "Hello", - "punctuation": "," - } - ] + "words": [] } ], "stats": { "total_duration_secs": 3600.0, + "total_duration_ms": 3600000, "total_segments": 42, "total_characters": 15000, - "processing_time_secs": 120.5 + "processing_time_secs": 120.5, + "processing_time_ms": 120500 }, "provider_metadata": { - "qwen": { + "provider": "qwen-filetrans", + "schema_version": "qwen-filetrans.metadata.v1", + "data": { "model": "qwen3-asr-flash-filetrans", "task": { "task_status": "SUCCEEDED", diff 
--git a/docs/performance-benchmarks.md b/docs/performance-benchmarks.md index 2c46d1b..17c3e78 100644 --- a/docs/performance-benchmarks.md +++ b/docs/performance-benchmarks.md @@ -51,6 +51,7 @@ time transcribeit run -p sherpa-onnx -i <input> -m base -f text -o ./output time transcribeit run -p openai -i <input> -f text -o ./output time transcribeit run -p azure -i <input> -f text -o ./output time transcribeit run -p qwen-filetrans -i <input> -f text -o ./output +time transcribeit run -p gemini -i <input> -f text -o ./output ``` Record: @@ -104,11 +105,28 @@ Record: - input size and duration - S3-compatible storage provider and region - DashScope ASR base URL -- task `usage.seconds` from `provider_metadata.qwen.task` +- task `usage.seconds` from `provider_metadata.data.task` - local wall-clock time -- manifest `provider_metadata.qwen.result.word_count` +- manifest `provider_metadata.data.result.word_count` - whether word-level timestamps were present +### 6. Gemini hosted transcription + +Gemini is a whole-file multimodal provider with model-generated structured output, so benchmark transcript quality and timestamp reliability separately from dedicated ASR providers: + +```bash +time transcribeit run -p gemini --remote-model gemini-3.5-flash -i <input> -f vtt -o ./output +time transcribeit run -p gemini --remote-model gemini-3.1-pro-preview -i <input> -f vtt -o ./output +``` + +Record: +- model name +- wall-clock time +- manifest `quality.timing_reliable` +- manifest `quality.timestamps_clamped` +- manifest `provider_metadata.data.response.usage_metadata` +- whether speaker/language/emotion fields were useful or only generic + ## Suggested result format ```text diff --git a/docs/provider-behavior.md b/docs/provider-behavior.md index 26444b5..4ac38a0 100644 --- a/docs/provider-behavior.md +++ b/docs/provider-behavior.md @@ -1,6 +1,6 @@ # Provider behavior -This project supports five providers. They share the same input/output surface, but engine type, API shape, and credentials differ. 
+This project supports six providers. They share the same input/output surface, but engine type, API shape, and credentials differ. ## Local (`-p local`) @@ -31,8 +31,8 @@ This project supports five providers. They share the same input/output surface, - **SenseVoice limitation:** emotion and audio event detection tags are stripped by the sherpa-onnx C API and are not available in the output. - Segment concurrency is always 1 (sequential processing). - No external API key is required. -- The `sherpa-onnx` feature is enabled by default. Build without it using `cargo build --no-default-features`. -- Requires `SHERPA_ONNX_LIB_DIR` to be set at build time (see [Architecture](architecture.md#build-requirements)). +- The `sherpa-onnx` feature is opt-in. Build with it using `cargo build --features sherpa-onnx`. +- Requires `SHERPA_ONNX_LIB_DIR` to be set at build time when the feature is enabled (see [Architecture](architecture.md#build-requirements)). ## OpenAI-compatible (`-p openai`) @@ -41,6 +41,11 @@ This project supports five providers. They share the same input/output surface, - Model/engine: `--remote-model` (default `whisper-1`). - Endpoint used: `POST {base-url}/v1/audio/transcriptions`. - Files are uploaded as 16 kHz mono MP3 by default for compatibility. +- Response handling: + - `whisper-1` is requested as `verbose_json` first, then retried without `response_format` if the endpoint rejects it. + - `gpt-4o-mini-transcribe` and `gpt-4o-transcribe` usually return top-level `text`, which becomes one untimed segment in the current CLI. + - `gpt-4o-transcribe-diarize` is requested as `diarized_json` with `chunking_strategy=auto`; speaker labels and segment timestamps are mapped into the transcript model. + - JSON responses are parsed defensively. Unknown fields are ignored, missing segment timestamps default to `0`, and invalid/empty segments fall back to top-level `text` when available. 
- Supports API resilience options: - `--max-retries` - `--request-timeout-secs` @@ -86,13 +91,45 @@ This project supports five providers. They share the same input/output surface, - Input audio/video is converted with FFmpeg to 16 kHz mono MP3 before upload. - The engine uploads the prepared file, generates a pre-signed GET URL, submits the Qwen async task, polls until completion, downloads the transcription JSON, and maps Qwen sentence timestamps into the project transcript model. - Manifests include Qwen provider metadata when available: - - `provider_metadata.qwen.task` with task ID, request ID, timing, status, and usage - - `provider_metadata.qwen.result` with audio info and transcript/sentence/word counts + - `provider_metadata.provider = "qwen-filetrans"` + - `provider_metadata.schema_version = "qwen-filetrans.metadata.v1"` + - `provider_metadata.data.task` with task ID, request ID, timing, status, and usage + - `provider_metadata.data.result` with audio info and transcript/sentence/word counts - per-segment `language`, `emotion`, and `words` with word-level timestamps - Temporary pre-signed URLs are not persisted in the manifest; only `file_url_present` is recorded. - Qwen file transcription is intended for whole-file processing. Do not enable segmentation unless you explicitly want multiple independent remote tasks. - If a short-audio `qwen3-asr-flash` model is accidentally selected with `-p qwen-filetrans`, the CLI validates the local file before upload and fails without staging it to S3. Short flash models have a 10 MB and 300 second limit and use a different API path. +## Gemini (`-p gemini`) + +- Uses Gemini Files API plus `generateContent`. +- Authentication: `--gemini-api-key` or `GEMINI_API_KEY`. + - `--api-key`/`OPENAI_API_KEY` is accepted as a fallback for scripting consistency. +- Base URL defaults to `https://generativelanguage.googleapis.com/v1beta` and can be overridden with `--gemini-api-base-url` or `GEMINI_API_BASE_URL`. 
+- Default model: `gemini-3.5-flash`. +- Useful benchmark candidates include `gemini-3.1-pro-preview`, `gemini-3-flash-preview`, `gemini-3-pro-preview`, and `gemini-2.5-flash`. +- Endpoint flow: + - `POST {upload-base-url}/files` to start a resumable file upload. + - Upload bytes to the returned `x-goog-upload-url`. + - Poll `GET {base-url}/files/{id}` until the file is `ACTIVE`. + - `POST {base-url}/models/{model}:generateContent`. + - `DELETE {base-url}/files/{id}` after transcription. +- Input audio/video is converted with FFmpeg to 16 kHz mono MP3 before upload. +- The request uses Gemini structured JSON output and asks for: + - full transcript text + - chronological segments + - optional segment timestamps + - optional speaker, language, and emotion fields +- Gemini timestamps, speakers, and emotions are generated model output, not a dedicated ASR response schema. The parser accepts future response fields, skips empty segments, and falls back to top-level generated text if structured JSON is missing or invalid. +- Manifests include Gemini provider metadata when available: + - `provider_metadata.provider = "gemini"` + - `provider_metadata.schema_version = "gemini.metadata.v1"` + - `provider_metadata.data.model` + - `provider_metadata.data.upload_method` + - `provider_metadata.data.response.usage_metadata` + - `provider_metadata.data.response.finish_reasons` + - `provider_metadata.data.file.deleted` + ## Why providers differ ### Local vs Sherpa-ONNX @@ -100,7 +137,7 @@ This project supports five providers. They share the same input/output surface, Both are local engines that run without network access. They differ in the model format and inference backend: - **Local** uses GGML models via `whisper.cpp` (`whisper-rs` binding). Supports all Whisper model sizes. Uses FFmpeg `silencedetect` for segmentation. -- **Sherpa-ONNX** uses ONNX models via the `sherpa-onnx` C library. 
Supports three model architectures (Whisper, Moonshine, SenseVoice) with automatic detection. Whisper ONNX supports all sizes except `large-v3`. Requires auto-segmentation at 30s due to Whisper ONNX limitations. Supports VAD-based segmentation via `--vad-model` for cleaner speech boundaries (recommended). Also supports speaker diarization via `--speakers`. The `sherpa-onnx` feature is optional (enabled by default); build without it using `cargo build --no-default-features`. +- **Sherpa-ONNX** uses ONNX models via the `sherpa-onnx` C library. Supports three model architectures (Whisper, Moonshine, SenseVoice) with automatic detection. Whisper ONNX supports all sizes except `large-v3`. Requires auto-segmentation at 30s due to Whisper ONNX limitations. Supports VAD-based segmentation via `--vad-model` for cleaner speech boundaries (recommended). Also supports speaker diarization via `--speakers`. The `sherpa-onnx` feature is optional; enable it with `cargo build --features sherpa-onnx`. ### Segmentation: VAD vs FFmpeg silencedetect diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index d892136..7b607e9 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -101,13 +101,17 @@ DIARIZE_EMBEDDING_MODEL=/path/to/embedding.onnx ### Building without sherpa-onnx -If you do not need the sherpa-onnx provider and want to avoid installing the shared libraries: +If you do not need the sherpa-onnx provider, use the default build. It does not require the shared libraries: ```bash -cargo build --release --no-default-features +cargo build --release ``` -This disables the `sherpa-onnx` Cargo feature (which is enabled by default) and removes the dependency on `SHERPA_ONNX_LIB_DIR`. 
+To enable the sherpa-onnx provider, install the shared libraries and build with: + +```bash +cargo build --release --features sherpa-onnx +``` ### Model download fails @@ -277,7 +281,7 @@ Copy the dylibs from `vendor/sherpa-onnx-*/lib/` or download them with `transcri If you see a hardcoded path from another machine (e.g., `/Users/someone/...`), the binary was built with an old `build.rs`. Rebuild with the latest code — the portable `@executable_path/lib` rpath is now used. -To avoid this dependency entirely, build without sherpa-onnx: +To avoid this dependency entirely, use the default build: ```bash -cargo build --release --no-default-features +cargo build --release ``` diff --git a/src/cli.rs b/src/cli.rs index 518a7fd..ca395a3 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -79,6 +79,8 @@ pub(crate) enum Provider { /// Qwen3-ASR-Flash-Filetrans via DashScope and S3 pre-signed URLs #[value(name = "qwen-filetrans")] QwenFiletrans, + /// Gemini multimodal transcription through generateContent and Files API + Gemini, } #[derive(Debug, Clone, ValueEnum)] @@ -189,11 +191,15 @@ pub(crate) enum Command { #[arg(long, env = "DASHSCOPE_API_KEY")] dashscope_api_key: Option, + /// Gemini API key (or set GEMINI_API_KEY) + #[arg(long, env = "GEMINI_API_KEY")] + gemini_api_key: Option, + /// Azure API key (or set AZURE_API_KEY env var) #[arg(long, env = "AZURE_API_KEY")] azure_api_key: Option, - /// Remote model name (for --provider openai or qwen-filetrans) + /// Remote model name (for --provider openai, qwen-filetrans, or gemini) #[arg(long)] remote_model: Option, @@ -205,6 +211,14 @@ pub(crate) enum Command { )] qwen_api_base_url: String, + /// Gemini API base URL + #[arg( + long, + env = "GEMINI_API_BASE_URL", + default_value = "https://generativelanguage.googleapis.com/v1beta" + )] + gemini_api_base_url: String, + /// Language code (e.g. en, fr, auto). If not set, auto-detection is used. 
#[arg(long)] language: Option, diff --git a/src/engines/gemini.rs b/src/engines/gemini.rs new file mode 100644 index 0000000..95cc013 --- /dev/null +++ b/src/engines/gemini.rs @@ -0,0 +1,318 @@ +use std::path::Path; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use reqwest::Client; +use serde::Deserialize; +use serde_json::{Value, json}; + +use crate::audio::segment::get_duration; +use crate::audio::wav::encode_wav; +use crate::engines::rate_limit::{self, send_with_retry}; +use crate::transcriber::{Transcriber, Transcript}; + +mod response; +mod schema; + +use response::parse_generate_response; +use schema::{audio_mime, generate_payload, prompt_text, upload_base_url}; + +pub struct GeminiApi { + api_base_url: String, + upload_base_url: String, + api_key: String, + model: String, + language: Option, + settings: rate_limit::ApiRequestSettings, + client: Client, +} + +impl GeminiApi { + pub fn new( + api_base_url: String, + api_key: String, + model: String, + language: Option, + settings: rate_limit::ApiRequestSettings, + ) -> Result { + let client = Client::builder() + .timeout(settings.request_timeout) + .build() + .context("Failed to build HTTP client")?; + let api_base_url = api_base_url.trim_end_matches('/').to_string(); + let upload_base_url = upload_base_url(&api_base_url); + + Ok(Self { + api_base_url, + upload_base_url, + api_key, + model, + language, + settings, + client, + }) + } + + async fn transcribe_file(&self, audio_path: &Path) -> Result { + let bytes = tokio::fs::read(audio_path) + .await + .with_context(|| format!("Failed to read audio file: {}", audio_path.display()))?; + let mime_type = audio_mime(audio_path); + let duration_secs = get_duration(audio_path).await.ok(); + let file = self.upload_file(audio_path, &bytes, mime_type).await?; + let active_file = self.wait_for_active_file(file).await?; + let response = self + .generate_transcript( + &active_file.uri, + mime_type, + bytes.len() as u64, + duration_secs, + ) + .await; + 
let delete_result = self.delete_file(&active_file.name).await; + + let mut transcript = response?; + transcript.provider_metadata = Some(with_file_cleanup_metadata( + transcript.provider_metadata.take(), + active_file.name, + mime_type, + bytes.len() as u64, + delete_result, + )); + Ok(transcript) + } + + async fn upload_file( + &self, + audio_path: &Path, + bytes: &[u8], + mime_type: &str, + ) -> Result { + let display_name = audio_path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or("audio"); + let start_url = format!("{}/files", self.upload_base_url); + let start_response = self + .client + .post(start_url) + .header("x-goog-api-key", &self.api_key) + .header("X-Goog-Upload-Protocol", "resumable") + .header("X-Goog-Upload-Command", "start") + .header( + "X-Goog-Upload-Header-Content-Length", + bytes.len().to_string(), + ) + .header("X-Goog-Upload-Header-Content-Type", mime_type) + .json(&json!({ "file": { "display_name": display_name } })) + .send() + .await + .context("Failed to start Gemini file upload")?; + + let status = start_response.status(); + if !status.is_success() { + let body = start_response.text().await.unwrap_or_default(); + anyhow::bail!("Gemini file upload start returned {status}: {body}"); + } + + let upload_url = start_response + .headers() + .get("x-goog-upload-url") + .and_then(|value| value.to_str().ok()) + .map(ToOwned::to_owned) + .context("Gemini file upload start response did not include x-goog-upload-url")?; + + let upload_response = self + .client + .post(upload_url) + .header("Content-Length", bytes.len().to_string()) + .header("X-Goog-Upload-Offset", "0") + .header("X-Goog-Upload-Command", "upload, finalize") + .body(bytes.to_vec()) + .send() + .await + .context("Failed to upload audio bytes to Gemini Files API")?; + + let status = upload_response.status(); + let body = upload_response + .bytes() + .await + .context("Failed to read Gemini file upload response")?; + if !status.is_success() { + anyhow::bail!( + "Gemini 
file upload returned {status}: {}", + String::from_utf8_lossy(&body) + ); + } + + let response: FileResponse = + serde_json::from_slice(&body).context("Failed to parse Gemini file upload response")?; + response + .file + .context("Gemini file upload response did not include file metadata") + } + + async fn wait_for_active_file(&self, mut file: FileRef) -> Result { + for _ in 0..60 { + match file.state.as_deref() { + None | Some("ACTIVE") => return Ok(file), + Some("FAILED") => anyhow::bail!("Gemini file processing failed for {}", file.name), + _ => { + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + file = self.get_file(&file.name).await?; + } + } + } + + anyhow::bail!("Gemini file did not become ACTIVE within 60 seconds"); + } + + async fn get_file(&self, name: &str) -> Result { + let url = format!("{}/{}", self.api_base_url, name); + let response = self + .client + .get(url) + .header("x-goog-api-key", &self.api_key) + .send() + .await + .context("Failed to get Gemini file metadata")?; + let status = response.status(); + let body = response + .bytes() + .await + .context("Failed to read Gemini file metadata response")?; + if !status.is_success() { + anyhow::bail!( + "Gemini file metadata returned {status}: {}", + String::from_utf8_lossy(&body) + ); + } + let response: FileResponse = + serde_json::from_slice(&body).context("Failed to parse Gemini file metadata")?; + response + .file + .context("Gemini file metadata response did not include file") + } + + async fn generate_transcript( + &self, + file_uri: &str, + mime_type: &str, + input_bytes: u64, + duration_secs: Option, + ) -> Result { + let url = format!( + "{}/models/{}:generateContent", + self.api_base_url, + urlencoding::encode(&self.model) + ); + let prompt = prompt_text(self.language.as_deref(), duration_secs); + let payload = generate_payload(file_uri, mime_type, &prompt); + let body = send_with_retry(&self.settings, "Gemini generateContent", || { + let client = self.client.clone(); + let 
api_key = self.api_key.clone(); + let url = url.clone(); + let payload = payload.clone(); + Box::pin(async move { + client + .post(url) + .header("x-goog-api-key", api_key) + .json(&payload) + .send() + .await + .context("Failed to send request to Gemini generateContent") + }) + }) + .await + .map_err(|(status, body)| { + anyhow::anyhow!("Gemini generateContent returned {status}: {body}") + })?; + + Ok(parse_generate_response( + &body, + &self.model, + &self.api_base_url, + mime_type, + input_bytes, + duration_secs, + )) + } + + async fn delete_file(&self, name: &str) -> Result<()> { + let url = format!("{}/{}", self.api_base_url, name); + let response = self + .client + .delete(url) + .header("x-goog-api-key", &self.api_key) + .send() + .await + .context("Failed to delete Gemini file")?; + if response.status().is_success() { + Ok(()) + } else { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + anyhow::bail!("Gemini file delete returned {status}: {body}"); + } + } +} + +#[async_trait] +impl Transcriber for GeminiApi { + async fn transcribe(&self, audio_samples: Vec) -> Result { + let wav_bytes = encode_wav(&audio_samples)?; + self.transcribe_wav(wav_bytes).await + } + + async fn transcribe_path(&self, audio_path: &Path) -> Result { + self.transcribe_file(audio_path).await + } + + async fn transcribe_wav(&self, wav_bytes: Vec) -> Result { + let tmp = tempfile::Builder::new() + .prefix("transcribeit-gemini-") + .suffix(".wav") + .tempfile() + .context("Failed to create temporary WAV file")?; + tokio::fs::write(tmp.path(), wav_bytes) + .await + .context("Failed to write temporary WAV file")?; + self.transcribe_path(tmp.path()).await + } +} + +#[derive(Clone, Deserialize)] +struct FileResponse { + file: Option, +} + +#[derive(Clone, Deserialize)] +struct FileRef { + name: String, + uri: String, + #[serde(default)] + state: Option, +} + +fn with_file_cleanup_metadata( + metadata: Option, + file_name: String, + mime_type: &str, + 
input_bytes: u64, + delete_result: Result<()>, +) -> Value { + let mut metadata = metadata.unwrap_or_else(|| json!({ "gemini": {} })); + if let Some(gemini) = metadata.get_mut("gemini").and_then(Value::as_object_mut) { + gemini.insert( + "file".to_string(), + json!({ + "name": file_name, + "mime_type": mime_type, + "bytes": input_bytes, + "deleted": delete_result.is_ok(), + "delete_error": delete_result.err().map(|err| err.to_string()), + }), + ); + } + metadata +} diff --git a/src/engines/gemini/response.rs b/src/engines/gemini/response.rs new file mode 100644 index 0000000..d358526 --- /dev/null +++ b/src/engines/gemini/response.rs @@ -0,0 +1,331 @@ +use serde_json::{Value, json}; + +use crate::transcriber::{Segment, Transcript}; + +pub fn parse_generate_response( + body: &[u8], + model: &str, + api_base_url: &str, + mime_type: &str, + input_bytes: u64, + duration_secs: Option, +) -> Transcript { + let response_value = serde_json::from_slice::(body).unwrap_or_else(|_| { + json!({ + "raw_text": String::from_utf8_lossy(body).to_string() + }) + }); + let generated_text = generated_text(&response_value).unwrap_or_default(); + let generated_json = parse_generated_json(&generated_text); + let segments = generated_json + .as_ref() + .and_then(|value| parse_transcript_segments(value, duration_secs)) + .filter(|segments| !segments.is_empty()) + .or_else(|| { + generated_json + .as_ref() + .and_then(|value| value.get("text").and_then(Value::as_str)) + .map(single_text_segment) + }) + .unwrap_or_else(|| single_text_segment(&generated_text)); + + Transcript { + segments, + provider_metadata: Some(json!({ + "gemini": { + "model": model, + "api_base_url": api_base_url, + "upload_method": "files_api", + "input": { + "mime_type": mime_type, + "bytes": input_bytes, + "duration_secs": duration_secs, + }, + "response": { + "generated_json_valid": generated_json.is_some(), + "timestamps_clamped": generated_json + .as_ref() + .is_some_and(|value| timestamps_need_clamp(value, 
duration_secs)), + "candidate_count": response_value + .get("candidates") + .and_then(Value::as_array) + .map_or(0, Vec::len), + "finish_reasons": finish_reasons(&response_value), + "usage_metadata": response_value.get("usageMetadata").cloned(), + "prompt_feedback": response_value.get("promptFeedback").cloned(), + } + } + })), + } +} + +fn generated_text(response: &Value) -> Option { + let parts = response + .get("candidates")? + .as_array()? + .first()? + .get("content")? + .get("parts")? + .as_array()?; + + let text = parts + .iter() + .filter_map(|part| part.get("text").and_then(Value::as_str)) + .collect::>() + .join(""); + (!text.trim().is_empty()).then_some(text) +} + +fn parse_generated_json(text: &str) -> Option { + let trimmed = strip_json_fence(text.trim()); + serde_json::from_str(trimmed).ok() +} + +fn strip_json_fence(text: &str) -> &str { + let Some(without_prefix) = text.strip_prefix("```") else { + return text; + }; + let without_lang = without_prefix + .strip_prefix("json") + .unwrap_or(without_prefix) + .trim_start(); + without_lang + .strip_suffix("```") + .unwrap_or(without_lang) + .trim() +} + +fn parse_transcript_segments( + value: &Value, + max_duration_secs: Option, +) -> Option> { + let segments = value.get("segments")?.as_array()?; + Some( + segments + .iter() + .filter_map(|segment| parse_transcript_segment(segment, max_duration_secs)) + .collect(), + ) +} + +fn parse_transcript_segment(value: &Value, max_duration_secs: Option) -> Option { + let text = value + .get("text") + .and_then(Value::as_str) + .map(str::trim) + .filter(|text| !text.is_empty())?; + let start_ms = timestamp_ms( + value + .get("start_secs") + .or_else(|| value.get("start")) + .or_else(|| value.get("begin_secs")), + ) + .unwrap_or(0); + let mut end_ms = timestamp_ms( + value + .get("end_secs") + .or_else(|| value.get("end")) + .or_else(|| value.get("end_time")), + ) + .unwrap_or(start_ms); + if let Some(max_ms) = max_duration_secs.map(|seconds| (seconds * 
1000.0).round() as i64) { + end_ms = end_ms.min(max_ms); + } + let start_ms = if end_ms > 0 { + start_ms.min(end_ms) + } else { + start_ms + }; + + Some(Segment { + start_ms, + end_ms, + text: text.to_string(), + speaker: string_field(value, &["speaker", "speaker_id"]), + language: string_field(value, &["language", "lang"]), + emotion: string_field(value, &["emotion"]), + words: Vec::new(), + }) +} + +fn timestamps_need_clamp(value: &Value, duration_secs: Option) -> bool { + let Some(max_ms) = duration_secs.map(|seconds| (seconds * 1000.0).round() as i64) else { + return false; + }; + value + .get("segments") + .and_then(Value::as_array) + .is_some_and(|segments| { + segments.iter().any(|segment| { + timestamp_ms(segment.get("start_secs").or_else(|| segment.get("start"))) + .is_some_and(|start_ms| start_ms > max_ms) + || timestamp_ms(segment.get("end_secs").or_else(|| segment.get("end"))) + .is_some_and(|end_ms| end_ms > max_ms) + }) + }) +} + +fn timestamp_ms(value: Option<&Value>) -> Option { + let value = value?; + if value.is_null() { + return None; + } + let seconds = match value { + Value::Number(n) => n.as_f64()?, + Value::String(s) => s.parse().ok()?, + _ => return None, + }; + Some((seconds * 1000.0).round() as i64) +} + +fn string_field(value: &Value, names: &[&str]) -> Option { + names + .iter() + .filter_map(|name| value.get(name)) + .find_map(Value::as_str) + .map(str::trim) + .filter(|text| !text.is_empty()) + .map(ToOwned::to_owned) +} + +fn single_text_segment(text: &str) -> Vec { + let text = text.trim(); + if text.is_empty() { + return Vec::new(); + } + vec![Segment { + start_ms: 0, + end_ms: 0, + text: text.to_string(), + speaker: None, + ..Default::default() + }] +} + +fn finish_reasons(response: &Value) -> Vec { + response + .get("candidates") + .and_then(Value::as_array) + .map(|candidates| { + candidates + .iter() + .filter_map(|candidate| candidate.get("finishReason").cloned()) + .collect() + }) + .unwrap_or_default() +} + +#[cfg(test)] +mod 
tests { + use super::parse_generate_response; + + #[test] + fn parses_structured_transcript_segments() { + let body = br#"{ + "candidates": [{ + "finishReason": "STOP", + "content": { + "parts": [{ + "text": "{\"text\":\"hello world\",\"segments\":[{\"start_secs\":1.2,\"end_secs\":2.4,\"speaker\":\"A\",\"language\":\"en\",\"emotion\":\"Neutral\",\"text\":\"hello\"}]}" + }] + } + }], + "usageMetadata": {"totalTokenCount": 42} + }"#; + + let transcript = parse_generate_response( + body, + "gemini-test", + "https://example.com", + "audio/mp3", + 12, + None, + ); + assert_eq!(transcript.segments.len(), 1); + assert_eq!(transcript.segments[0].text, "hello"); + assert_eq!(transcript.segments[0].speaker.as_deref(), Some("A")); + assert_eq!(transcript.segments[0].language.as_deref(), Some("en")); + assert_eq!(transcript.segments[0].emotion.as_deref(), Some("Neutral")); + assert_eq!(transcript.segments[0].start_ms, 1200); + assert_eq!(transcript.segments[0].end_ms, 2400); + } + + #[test] + fn falls_back_to_top_level_text_when_segments_are_invalid() { + let body = br#"{ + "candidates": [{ + "content": { + "parts": [{ + "text": "```json\n{\"text\":\"fallback text\",\"segments\":[{\"start_secs\":null,\"end_secs\":null,\"speaker\":null,\"language\":null,\"emotion\":null,\"text\":\" \"}]}\n```" + }] + } + }] + }"#; + + let transcript = parse_generate_response( + body, + "gemini-test", + "https://example.com", + "audio/mp3", + 12, + None, + ); + assert_eq!(transcript.segments.len(), 1); + assert_eq!(transcript.segments[0].text, "fallback text"); + assert_eq!(transcript.segments[0].start_ms, 0); + } + + #[test] + fn falls_back_to_raw_generated_text_when_json_is_invalid() { + let body = br#"{ + "candidates": [{ + "content": { + "parts": [{"text": "plain transcript"}] + } + }] + }"#; + + let transcript = parse_generate_response( + body, + "gemini-test", + "https://example.com", + "audio/mp3", + 12, + None, + ); + assert_eq!(transcript.segments.len(), 1); + 
assert_eq!(transcript.segments[0].text, "plain transcript"); + } + + #[test] + fn clamps_timestamps_to_known_audio_duration() { + let body = br#"{ + "candidates": [{ + "content": { + "parts": [{ + "text": "{\"text\":\"hello\",\"segments\":[{\"start_secs\":299,\"end_secs\":500,\"text\":\"hello\"}]}" + }] + } + }] + }"#; + + let transcript = parse_generate_response( + body, + "gemini-test", + "https://example.com", + "audio/mp3", + 12, + Some(300.0), + ); + assert_eq!(transcript.segments[0].start_ms, 299000); + assert_eq!(transcript.segments[0].end_ms, 300000); + assert_eq!( + transcript + .provider_metadata + .as_ref() + .and_then(|value| value.pointer("/gemini/response/timestamps_clamped")) + .and_then(serde_json::Value::as_bool), + Some(true) + ); + } +} diff --git a/src/engines/gemini/schema.rs b/src/engines/gemini/schema.rs new file mode 100644 index 0000000..73da649 --- /dev/null +++ b/src/engines/gemini/schema.rs @@ -0,0 +1,145 @@ +use std::path::Path; + +use serde_json::{Value, json}; + +const DEFAULT_PROMPT: &str = r#"Transcribe this audio as accurately and verbatim as possible. + +Return only JSON matching the provided schema. +Use domain-specific spelling when clear from the audio. +Create short, readable segments in chronological order. +Use null for timestamps, speaker, language, or emotion when uncertain. +Do not summarize, paraphrase, omit disfluencies that are clearly spoken, or invent content. 
+"#; + +pub(super) fn upload_base_url(api_base_url: &str) -> String { + if let Some(root) = api_base_url.strip_suffix("/v1beta") { + format!("{root}/upload/v1beta") + } else if let Some(root) = api_base_url.strip_suffix("/v1") { + format!("{root}/upload/v1") + } else { + format!("{}/upload/v1beta", api_base_url.trim_end_matches('/')) + } +} + +pub(super) fn audio_mime(path: &Path) -> &'static str { + match path.extension().and_then(|ext| ext.to_str()) { + Some(ext) if ext.eq_ignore_ascii_case("mp3") => "audio/mp3", + Some(ext) if ext.eq_ignore_ascii_case("wav") => "audio/wav", + Some(ext) if ext.eq_ignore_ascii_case("aiff") || ext.eq_ignore_ascii_case("aif") => { + "audio/aiff" + } + Some(ext) if ext.eq_ignore_ascii_case("aac") => "audio/aac", + Some(ext) if ext.eq_ignore_ascii_case("ogg") || ext.eq_ignore_ascii_case("oga") => { + "audio/ogg" + } + Some(ext) if ext.eq_ignore_ascii_case("flac") => "audio/flac", + _ => "audio/wav", + } +} + +pub(super) fn prompt_text(language: Option<&str>, duration_secs: Option) -> String { + let mut prompt = DEFAULT_PROMPT.to_string(); + if let Some(duration_secs) = duration_secs { + prompt.push_str(&format!( + "\nThe audio duration is {duration_secs:.2} seconds. Do not return any timestamp greater than this duration, and do not infer content beyond the end of the audio." 
+ )); + } + match language { + Some(lang) if !lang.eq_ignore_ascii_case("auto") => { + prompt.push_str(&format!("\nThe expected spoken language is `{lang}`.")); + prompt + } + _ => prompt, + } +} + +pub(super) fn generate_payload(file_uri: &str, mime_type: &str, prompt: &str) -> Value { + json!({ + "contents": [{ + "parts": [ + { + "file_data": { + "mime_type": mime_type, + "file_uri": file_uri + } + }, + { "text": prompt } + ] + }], + "generationConfig": { + "responseMimeType": "application/json", + "responseSchema": transcript_schema() + } + }) +} + +fn transcript_schema() -> Value { + json!({ + "type": "OBJECT", + "properties": { + "text": { + "type": "STRING", + "description": "The complete transcript text." + }, + "segments": { + "type": "ARRAY", + "description": "Chronological transcript segments.", + "items": { + "type": "OBJECT", + "properties": { + "start_secs": { + "type": "NUMBER", + "nullable": true, + "description": "Segment start time in seconds, or null when uncertain." + }, + "end_secs": { + "type": "NUMBER", + "nullable": true, + "description": "Segment end time in seconds, or null when uncertain." + }, + "speaker": { + "type": "STRING", + "nullable": true, + "description": "Speaker label when confidently distinguishable." + }, + "language": { + "type": "STRING", + "nullable": true, + "description": "BCP-47 language code when confidently detected." + }, + "emotion": { + "type": "STRING", + "nullable": true, + "description": "Dominant speaker emotion when confidently detected." + }, + "text": { + "type": "STRING", + "description": "Verbatim segment transcript text." 
+ } + }, + "required": ["text"], + "propertyOrdering": ["start_secs", "end_secs", "speaker", "language", "emotion", "text"] + } + } + }, + "required": ["text", "segments"], + "propertyOrdering": ["text", "segments"] + }) +} + +#[cfg(test)] +mod tests { + use super::upload_base_url; + + #[test] + fn derives_upload_base_url_from_api_base_url() { + assert_eq!( + upload_base_url("https://generativelanguage.googleapis.com/v1beta"), + "https://generativelanguage.googleapis.com/upload/v1beta" + ); + assert_eq!( + upload_base_url("https://example.com/v1"), + "https://example.com/upload/v1" + ); + } +} diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 871767d..8a3d152 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -1,4 +1,5 @@ pub mod azure_openai; +pub mod gemini; pub mod model_cache; pub mod openai_api; pub mod qwen_filetrans; diff --git a/src/engines/openai_api.rs b/src/engines/openai_api.rs index dda21aa..2528bc5 100644 --- a/src/engines/openai_api.rs +++ b/src/engines/openai_api.rs @@ -3,6 +3,7 @@ use async_trait::async_trait; use reqwest::Client; use reqwest::multipart; use serde::Deserialize; +use serde_json::Value; use std::path::Path; use crate::audio::wav::encode_wav; @@ -49,6 +50,32 @@ impl OpenAiApi { } } + fn is_diarize_model(&self) -> bool { + self.model.eq_ignore_ascii_case("gpt-4o-transcribe-diarize") + } + + fn response_formats(&self) -> Vec> { + if self.is_diarize_model() { + vec![Some("diarized_json"), None] + } else { + vec![Some("verbose_json"), None] + } + } + + fn base_form(&self, response_format: Option<&'static str>) -> multipart::Form { + let mut form = multipart::Form::new().text("model", self.model.clone()); + if let Some(fmt) = response_format { + form = form.text("response_format", fmt); + } + if self.is_diarize_model() { + form = form.text("chunking_strategy", "auto"); + } + if let Some(lang) = self.language.as_deref() { + form = form.text("language", lang.to_string()); + } + form + } + /// Run the format-fallback + retry 
loop for a given form builder closure. async fn transcribe_with_fallback(&self, build_form: F) -> Result where @@ -57,7 +84,7 @@ impl OpenAiApi { let url = format!("{}/v1/audio/transcriptions", self.base_url); let mut last_error: Option<(reqwest::StatusCode, String)> = None; - for response_format in [Some("verbose_json"), None] { + for response_format in self.response_formats() { let result = { let url = &url; let client = &self.client; @@ -111,13 +138,7 @@ impl Transcriber for OpenAiApi { async fn transcribe_path(&self, wav_path: &Path) -> Result { let path = wav_path.to_path_buf(); self.transcribe_with_fallback(|response_format| { - let mut form = multipart::Form::new().text("model", self.model.clone()); - if let Some(fmt) = response_format { - form = form.text("response_format", fmt); - } - if let Some(lang) = self.language.as_deref() { - form = form.text("language", lang.to_string()); - } + let mut form = self.base_form(response_format); // Note: Part::file is async but we need a sync closure here. // Use blocking read since form building happens before the async send. let bytes = std::fs::read(&path) @@ -141,13 +162,7 @@ impl Transcriber for OpenAiApi { async fn transcribe_wav(&self, wav_bytes: Vec) -> Result { self.transcribe_with_fallback(|response_format| { - let mut form = multipart::Form::new().text("model", self.model.clone()); - if let Some(fmt) = response_format { - form = form.text("response_format", fmt); - } - if let Some(lang) = self.language.as_deref() { - form = form.text("language", lang.to_string()); - } + let mut form = self.base_form(response_format); form = form.part( "file", multipart::Part::bytes(wav_bytes.clone()) @@ -160,60 +175,33 @@ impl Transcriber for OpenAiApi { } } -/// Response with segments (verbose_json format). -#[derive(Deserialize)] -struct VerboseResponse { - segments: Option>, -} - -/// Minimal response (json format, or verbose_json without segments). 
-#[derive(Deserialize)] -struct PlainResponse { - text: String, -} - -#[derive(Deserialize)] -struct ApiSegment { - start: f64, - end: f64, - text: String, -} - /// Parse response bytes, trying verbose_json first then falling back to plain json. /// This ensures compatibility with endpoints that don't support verbose_json. pub fn parse_response_bytes(body: &[u8]) -> Transcript { - // Try verbose format with segments first. - if let Ok(resp) = serde_json::from_slice::(body) - && let Some(segs) = resp.segments - && !segs.is_empty() - { - return Transcript { - segments: segs - .into_iter() - .map(|s| Segment { - start_ms: (s.start * 1000.0) as i64, - end_ms: (s.end * 1000.0) as i64, - text: s.text, + if let Ok(value) = serde_json::from_slice::(body) { + if let Some(segments) = parse_json_segments(&value) + && !segments.is_empty() + { + return Transcript { + segments, + provider_metadata: None, + }; + } + + if let Some(text) = value.get("text").and_then(Value::as_str) + && !text.trim().is_empty() + { + return Transcript { + segments: vec![Segment { + start_ms: 0, + end_ms: 0, + text: text.to_string(), speaker: None, ..Default::default() - }) - .collect(), - provider_metadata: None, - }; - } - - // Fall back to plain text response. - if let Ok(resp) = serde_json::from_slice::(body) { - return Transcript { - segments: vec![Segment { - start_ms: 0, - end_ms: 0, - text: resp.text, - speaker: None, - ..Default::default() - }], - provider_metadata: None, - }; + }], + provider_metadata: None, + }; + } } // Last resort: treat entire body as text. 
@@ -229,6 +217,45 @@ pub fn parse_response_bytes(body: &[u8]) -> Transcript { } } +fn parse_json_segments(value: &Value) -> Option> { + let segments = value.get("segments")?.as_array()?; + let parsed = segments.iter().filter_map(parse_json_segment).collect(); + Some(parsed) +} + +fn parse_json_segment(value: &Value) -> Option { + let text = value + .get("text") + .and_then(Value::as_str) + .map(str::trim) + .filter(|text| !text.is_empty())?; + + let start_ms = timestamp_ms(value.get("start")).unwrap_or(0); + let end_ms = timestamp_ms(value.get("end")).unwrap_or(start_ms); + let speaker = value + .get("speaker") + .or_else(|| value.get("speaker_id")) + .and_then(Value::as_str) + .map(ToOwned::to_owned); + + Some(Segment { + start_ms, + end_ms, + text: text.to_string(), + speaker, + ..Default::default() + }) +} + +fn timestamp_ms(value: Option<&Value>) -> Option { + let seconds = match value? { + Value::Number(n) => n.as_f64()?, + Value::String(s) => s.parse().ok()?, + _ => return None, + }; + Some((seconds * 1000.0).round() as i64) +} + pub(crate) fn is_response_format_not_supported(body: &str) -> bool { #[derive(Deserialize)] struct ErrorPayload { @@ -277,37 +304,4 @@ pub(crate) fn is_response_format_not_supported(body: &str) -> bool { } #[cfg(test)] -mod tests { - use super::is_response_format_not_supported; - - #[test] - fn detects_structured_openai_error_param() { - let body = r#"{"error":{"message":"Unsupported value: 'response_format'","param":"response_format","type":"invalid_request_error"}}"#; - assert!(is_response_format_not_supported(body)); - } - - #[test] - fn detects_structured_azure_error_message() { - let body = r#"{"error":{"code":"BadRequest","message":"The request is invalid. 
Parameter 'response_format' is invalid."}}"#; - assert!(is_response_format_not_supported(body)); - } - - #[test] - fn falls_back_to_structured_error_code() { - let body = r#"{"error":{"code":"response_format","message":"Unsupported value"}}"#; - assert!(is_response_format_not_supported(body)); - } - - #[test] - fn rejects_unrelated_errors() { - let body = - r#"{"error":{"message":"Model not found","param":"model","code":"model_not_found"}}"#; - assert!(!is_response_format_not_supported(body)); - } - - #[test] - fn detects_plaintext_error_when_param_missing() { - let body = "unsupported value 'response_format' for audio transcription"; - assert!(is_response_format_not_supported(body)); - } -} +mod tests; diff --git a/src/engines/openai_api/tests.rs b/src/engines/openai_api/tests.rs new file mode 100644 index 0000000..edd3d09 --- /dev/null +++ b/src/engines/openai_api/tests.rs @@ -0,0 +1,131 @@ +use std::time::Duration; + +use super::{OpenAiApi, is_response_format_not_supported, parse_response_bytes}; +use crate::engines::rate_limit::ApiRequestSettings; + +fn api_for_model(model: &str) -> OpenAiApi { + OpenAiApi::new( + "https://api.openai.com".into(), + "test-key".into(), + model.into(), + Some("en".into()), + ApiRequestSettings::new( + Duration::from_secs(30), + 0, + Duration::from_secs(1), + Duration::from_secs(1), + ), + ) + .unwrap() +} + +#[test] +fn diarize_model_prefers_diarized_json_with_plain_fallback() { + let api = api_for_model("gpt-4o-transcribe-diarize"); + assert_eq!(api.response_formats(), vec![Some("diarized_json"), None]); +} + +#[test] +fn non_diarize_model_prefers_verbose_json_with_plain_fallback() { + let api = api_for_model("gpt-4o-mini-transcribe"); + assert_eq!(api.response_formats(), vec![Some("verbose_json"), None]); +} + +#[test] +fn parses_diarized_segments_with_speakers() { + let body = br#"{ + "text": "hello world", + "segments": [ + { + "type": "transcript.text.segment", + "text": "hello", + "speaker": "A", + "start": 1.25, + "end": 
2.5, + "id": "seg_0" + }, + { + "type": "transcript.text.segment", + "text": "world", + "speaker": "B", + "start": "2.50", + "end": "3.75" + } + ], + "future_field": {"kept_by_openai": true} + }"#; + + let transcript = parse_response_bytes(body); + assert_eq!(transcript.segments.len(), 2); + assert_eq!(transcript.segments[0].text, "hello"); + assert_eq!(transcript.segments[0].speaker.as_deref(), Some("A")); + assert_eq!(transcript.segments[0].start_ms, 1250); + assert_eq!(transcript.segments[0].end_ms, 2500); + assert_eq!(transcript.segments[1].speaker.as_deref(), Some("B")); + assert_eq!(transcript.segments[1].start_ms, 2500); + assert_eq!(transcript.segments[1].end_ms, 3750); +} + +#[test] +fn skips_invalid_segments_and_falls_back_to_text_when_needed() { + let body = br#"{ + "text": "fallback transcript", + "segments": [ + {"speaker": "A", "start": 0, "end": 1}, + {"text": " "} + ] + }"#; + + let transcript = parse_response_bytes(body); + assert_eq!(transcript.segments.len(), 1); + assert_eq!(transcript.segments[0].text, "fallback transcript"); + assert_eq!(transcript.segments[0].start_ms, 0); + assert_eq!(transcript.segments[0].end_ms, 0); +} + +#[test] +fn parses_segments_without_timestamps_without_crashing() { + let body = br#"{ + "segments": [ + {"text": "untimed", "speaker_id": "speaker-1"} + ] + }"#; + + let transcript = parse_response_bytes(body); + assert_eq!(transcript.segments.len(), 1); + assert_eq!(transcript.segments[0].text, "untimed"); + assert_eq!(transcript.segments[0].speaker.as_deref(), Some("speaker-1")); + assert_eq!(transcript.segments[0].start_ms, 0); + assert_eq!(transcript.segments[0].end_ms, 0); +} + +#[test] +fn detects_structured_openai_error_param() { + let body = r#"{"error":{"message":"Unsupported value: 'response_format'","param":"response_format","type":"invalid_request_error"}}"#; + assert!(is_response_format_not_supported(body)); +} + +#[test] +fn detects_structured_azure_error_message() { + let body = 
r#"{"error":{"code":"BadRequest","message":"The request is invalid. Parameter 'response_format' is invalid."}}"#; + assert!(is_response_format_not_supported(body)); +} + +#[test] +fn falls_back_to_structured_error_code() { + let body = r#"{"error":{"code":"response_format","message":"Unsupported value"}}"#; + assert!(is_response_format_not_supported(body)); +} + +#[test] +fn rejects_unrelated_errors() { + let body = + r#"{"error":{"message":"Model not found","param":"model","code":"model_not_found"}}"#; + assert!(!is_response_format_not_supported(body)); +} + +#[test] +fn detects_plaintext_error_when_param_missing() { + let body = "unsupported value 'response_format' for audio transcription"; + assert!(is_response_format_not_supported(body)); +} diff --git a/src/main.rs b/src/main.rs index 0eb5a1e..9ad0070 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,6 +21,7 @@ use clap::Parser; use crate::audio::extract::check_ffmpeg; use crate::cli::{Cli, Command, ModelFormat, OutputFormatArg, Provider, SetupComponent}; use crate::engines::azure_openai::AzureOpenAi; +use crate::engines::gemini::GeminiApi; use crate::engines::model_cache::ModelCache; use crate::engines::openai_api::OpenAiApi; use crate::engines::qwen_filetrans::QwenFileTrans; @@ -131,9 +132,11 @@ async fn main() -> Result<()> { base_url, api_key, dashscope_api_key, + gemini_api_key, azure_api_key, remote_model, qwen_api_base_url, + gemini_api_base_url, language, azure_deployment, azure_api_version, @@ -185,13 +188,15 @@ async fn main() -> Result<()> { .unwrap_or("qwen3-asr-flash-filetrans"); let qwen_needs_mp3_staging = matches!(provider, Provider::QwenFiletrans) && is_qwen_filetrans_model(qwen_filetrans_model); + let openai_style_upload = matches!(provider, Provider::Openai | Provider::Azure); + let gemini_needs_mp3_upload = matches!(provider, Provider::Gemini); let upload_as_mp3 = - matches!(provider, Provider::Openai | Provider::Azure) || qwen_needs_mp3_staging; + openai_style_upload || 
qwen_needs_mp3_staging || gemini_needs_mp3_upload; #[cfg(feature = "sherpa-onnx")] let is_sherpa = matches!(provider, Provider::SherpaOnnx); #[cfg(not(feature = "sherpa-onnx"))] let is_sherpa = false; - let auto_split = upload_as_mp3 || is_sherpa; + let auto_split = openai_style_upload || qwen_needs_mp3_staging || is_sherpa; let max_segment_secs = if is_sherpa { // sherpa-onnx Whisper only supports ≤30s per call max_segment_secs.min(30.0) @@ -316,6 +321,25 @@ async fn main() -> Result<()> { model_name, ) } + Provider::Gemini => { + let key = gemini_api_key + .or(api_key) + .context( + "--gemini-api-key, GEMINI_API_KEY, --api-key, or OPENAI_API_KEY is required for --provider gemini", + )?; + let model_name = remote_model.unwrap_or_else(|| "gemini-3.5-flash".into()); + ( + Box::new(GeminiApi::new( + gemini_api_base_url, + key, + model_name.clone(), + language.clone(), + api_settings, + )?), + "gemini".into(), + model_name, + ) + } }; for (index, input_path) in input_paths.iter().enumerate() { diff --git a/src/models.rs b/src/models.rs index a8dda5c..13f89c6 100644 --- a/src/models.rs +++ b/src/models.rs @@ -295,6 +295,7 @@ async fn write_response_to_path( Ok(()) } +#[cfg(feature = "sherpa-onnx")] async fn extract_archive(archive_path: &Path, extract_to: &Path) -> Result<()> { let archive_path = archive_path.to_path_buf(); let extract_to = extract_to.to_path_buf(); diff --git a/src/output/manifest.rs b/src/output/manifest.rs index 2fb081f..77a0d4b 100644 --- a/src/output/manifest.rs +++ b/src/output/manifest.rs @@ -4,18 +4,23 @@ use std::io::Write; #[derive(Serialize)] pub struct Manifest { + pub schema_version: &'static str, pub input: InputInfo, pub config: ProcessingConfig, + pub capabilities: Capabilities, + pub quality: QualityInfo, + pub transcript: TranscriptInfo, pub segments: Vec, pub stats: Stats, #[serde(skip_serializing_if = "Option::is_none")] - pub provider_metadata: Option, + pub provider_metadata: Option, } #[derive(Serialize)] pub struct InputInfo { 
pub file: String, pub duration_secs: f64, + pub duration_ms: i64, } #[derive(Serialize)] @@ -30,11 +35,14 @@ pub struct ProcessingConfig { pub normalized_audio: bool, } -#[derive(Serialize)] +#[derive(Clone, Serialize)] pub struct SegmentInfo { + pub id: String, pub index: usize, pub start_secs: f64, pub end_secs: f64, + pub start_ms: i64, + pub end_ms: i64, pub text: String, #[serde(skip_serializing_if = "Option::is_none")] pub speaker: Option, @@ -46,10 +54,14 @@ pub struct SegmentInfo { pub words: Vec, } -#[derive(Serialize)] +#[derive(Clone, Serialize)] pub struct WordInfo { + pub id: String, + pub index: usize, pub start_secs: f64, pub end_secs: f64, + pub start_ms: i64, + pub end_ms: i64, pub text: String, #[serde(skip_serializing_if = "Option::is_none")] pub punctuation: Option, @@ -58,9 +70,43 @@ pub struct WordInfo { #[derive(Serialize)] pub struct Stats { pub total_duration_secs: f64, + pub total_duration_ms: i64, pub total_segments: usize, pub total_characters: usize, pub processing_time_secs: f64, + pub processing_time_ms: i64, +} + +#[derive(Serialize)] +pub struct TranscriptInfo { + pub text: String, + pub segments: Vec, +} + +#[derive(Serialize)] +pub struct Capabilities { + pub segments: bool, + pub word_timestamps: bool, + pub speaker_labels: bool, + pub language_per_segment: bool, + pub emotion_per_segment: bool, + pub native_timestamps: bool, +} + +#[derive(Serialize)] +pub struct QualityInfo { + pub timing_source: String, + pub timing_reliable: bool, + pub timestamps_clamped: bool, + pub speaker_source: Option, + pub warnings: Vec, +} + +#[derive(Serialize)] +pub struct ProviderMetadata { + pub provider: String, + pub schema_version: String, + pub data: serde_json::Value, } pub fn write_manifest(manifest: &Manifest, writer: &mut impl Write) -> Result<()> { diff --git a/src/pipeline.rs b/src/pipeline.rs index 7ce7562..7af13b7 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -3,7 +3,9 @@ use std::path::{Path, PathBuf}; use 
std::time::Duration; use std::time::Instant; -use anyhow::{Context, Result}; +#[cfg(feature = "sherpa-onnx")] +use anyhow::Context; +use anyhow::Result; use futures_util::future::join_all; use indicatif::{ProgressBar, ProgressStyle}; diff --git a/src/pipeline/tests.rs b/src/pipeline/tests.rs index 25e4bbf..84c177b 100644 --- a/src/pipeline/tests.rs +++ b/src/pipeline/tests.rs @@ -3,9 +3,9 @@ use crate::transcriber::{Segment, Transcriber, Transcript}; use anyhow::Result; use async_trait::async_trait; use hound::WavSpec; -use serde_json::Value; +use serde_json::{Value, json}; use std::f32::consts::PI; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::process::Command; use tempfile::tempdir; @@ -25,24 +25,9 @@ async fn pipeline_end_to_end_writes_vtt_and_manifest() -> Result<()> { run_pipeline( &FakeTranscriber, PipelineConfig { - input: input_path.clone(), - output_dir: Some(output_dir.clone()), - output_format: OutputFormat::Vtt, - language: None, - segment: false, - silence_threshold: -40.0, - min_silence_duration: 0.8, - max_segment_secs: 600.0, provider_name: "fake".into(), model_name: "fake-model".into(), - auto_split_for_api: false, - upload_as_mp3: false, - segment_concurrency: 1, - normalize_audio: false, - speakers: None, - diarize_segmentation_model: None, - diarize_embedding_model: None, - vad_model: None, + ..test_config(input_path.clone(), output_dir.clone(), OutputFormat::Vtt) }, ) .await?; @@ -66,9 +51,79 @@ async fn pipeline_end_to_end_writes_vtt_and_manifest() -> Result<()> { ); assert_eq!(manifest["config"]["provider"], "fake"); assert_eq!(manifest["config"]["model"], "fake-model"); + assert_eq!(manifest["schema_version"], "transcribeit.manifest.v2"); + assert_eq!(manifest["input"]["duration_ms"], 1000); assert_eq!(manifest["segments"][0]["start_secs"], 0.0); assert_eq!(manifest["segments"][0]["end_secs"], 1.0); + assert_eq!(manifest["segments"][0]["id"], "seg_000001"); + assert_eq!(manifest["segments"][0]["start_ms"], 0); + 
assert_eq!(manifest["segments"][0]["end_ms"], 1000); assert_eq!(manifest["segments"][0]["text"], "integration"); + assert_eq!(manifest["transcript"]["text"], "integration"); + assert_eq!(manifest["transcript"]["segments"][0]["id"], "seg_000001"); + assert_eq!(manifest["capabilities"]["segments"], true); + assert_eq!(manifest["capabilities"]["word_timestamps"], false); + assert_eq!(manifest["quality"]["timing_source"], "unknown"); + assert_eq!(manifest["quality"]["timing_reliable"], false); + assert!(manifest.get("provider_metadata").is_none()); + + Ok(()) +} + +#[tokio::test] +async fn pipeline_manifest_wraps_provider_metadata_in_stable_envelope() -> Result<()> { + if !command_exists("ffprobe") { + eprintln!("Skipping integration test: ffprobe not available"); + return Ok(()); + } + + let workdir = tempdir()?; + let input_path = workdir.path().join("sample.wav"); + write_test_wav(&input_path, 1_000)?; + + let output_dir = workdir.path().join("out"); + + run_pipeline( + &FakeMetadataTranscriber, + PipelineConfig { + language: Some("en".to_string()), + provider_name: "gemini".into(), + model_name: "gemini-test".into(), + ..test_config(input_path.clone(), output_dir.clone(), OutputFormat::Text) + }, + ) + .await?; + + let manifest_path = output_dir.join("sample.manifest.json"); + let manifest_data = std::fs::read_to_string(manifest_path)?; + let manifest: Value = serde_json::from_str(&manifest_data)?; + + assert_eq!(manifest["provider_metadata"]["provider"], "gemini"); + assert_eq!( + manifest["provider_metadata"]["schema_version"], + "gemini.metadata.v1" + ); + assert_eq!( + manifest["provider_metadata"]["data"]["response"]["timestamps_clamped"], + true + ); + assert_eq!( + manifest["provider_metadata"]["data"]["file"]["deleted"], + true + ); + assert_eq!(manifest["quality"]["timing_source"], "model_generated"); + assert_eq!(manifest["quality"]["timing_reliable"], false); + assert_eq!(manifest["quality"]["timestamps_clamped"], true); + 
assert_eq!(manifest["quality"]["speaker_source"], "model_generated"); + assert!( + manifest["quality"]["warnings"] + .as_array() + .expect("warnings should be an array") + .iter() + .any(|warning| warning + .as_str() + .is_some_and(|text| text.contains("model-generated"))) + ); Ok(()) } @@ -89,24 +144,9 @@ async fn pipeline_end_to_end_writes_text_file_and_manifest() -> Result<()> { run_pipeline( &FakeTranscriber, PipelineConfig { - input: input_path.clone(), - output_dir: Some(output_dir.clone()), - output_format: OutputFormat::Text, - language: None, - segment: false, - silence_threshold: -40.0, - min_silence_duration: 0.8, - max_segment_secs: 600.0, provider_name: "fake".into(), model_name: "fake-model".into(), - auto_split_for_api: false, - upload_as_mp3: false, - segment_concurrency: 1, - normalize_audio: false, - speakers: None, - diarize_segmentation_model: None, - diarize_embedding_model: None, - vad_model: None, + ..test_config(input_path.clone(), output_dir.clone(), OutputFormat::Text) }, ) .await?; @@ -139,24 +179,10 @@ async fn pipeline_end_to_end_writes_srt_file_and_manifest() -> Result<()> { run_pipeline( &FakeTranscriber, PipelineConfig { - input: input_path.clone(), - output_dir: Some(output_dir.clone()), - output_format: OutputFormat::Srt, language: Some("en".to_string()), - segment: false, - silence_threshold: -40.0, - min_silence_duration: 0.8, - max_segment_secs: 600.0, provider_name: "fake".into(), model_name: "fake-model".into(), - auto_split_for_api: false, - upload_as_mp3: false, - segment_concurrency: 1, - normalize_audio: false, - speakers: None, - diarize_segmentation_model: None, - diarize_embedding_model: None, - vad_model: None, + ..test_config(input_path.clone(), output_dir.clone(), OutputFormat::Srt) }, ) .await?; @@ -191,24 +217,13 @@ async fn pipeline_segmented_api_uploads_are_processed_concurrently() -> Result<( run_pipeline( &FakeApiTranscriber, PipelineConfig { - input: input_path.clone(), - output_dir: Some(output_dir.clone()), 
- output_format: OutputFormat::Text, - language: None, segment: true, - silence_threshold: -40.0, - min_silence_duration: 0.8, max_segment_secs: 5.0, provider_name: "fake-api".into(), model_name: "fake-model".into(), - auto_split_for_api: false, upload_as_mp3: true, segment_concurrency: 2, - normalize_audio: false, - speakers: None, - diarize_segmentation_model: None, - diarize_embedding_model: None, - vad_model: None, + ..test_config(input_path.clone(), output_dir.clone(), OutputFormat::Text) }, ) .await?; @@ -233,6 +248,29 @@ fn command_exists(command: &str) -> bool { .unwrap_or(false) } +fn test_config(input: PathBuf, output_dir: PathBuf, output_format: OutputFormat) -> PipelineConfig { + PipelineConfig { + input, + output_dir: Some(output_dir), + output_format, + language: None, + segment: false, + silence_threshold: -40.0, + min_silence_duration: 0.8, + max_segment_secs: 600.0, + provider_name: "fake".into(), + model_name: "fake-model".into(), + auto_split_for_api: false, + upload_as_mp3: false, + segment_concurrency: 1, + normalize_audio: false, + speakers: None, + diarize_segmentation_model: None, + diarize_embedding_model: None, + vad_model: None, + } +} + fn write_test_wav(path: &Path, duration_ms: u64) -> Result<()> { let sample_rate = 16_000u32; let sample_count = (sample_rate as u64 * duration_ms / 1000) as usize; @@ -294,3 +332,32 @@ impl Transcriber for FakeApiTranscriber { self.transcribe(Vec::new()).await } } + +struct FakeMetadataTranscriber; + +#[async_trait] +impl Transcriber for FakeMetadataTranscriber { + async fn transcribe(&self, _audio_samples: Vec) -> Result { + Ok(Transcript { + segments: vec![Segment { + start_ms: 0, + end_ms: 1000, + text: "metadata".to_string(), + speaker: Some("Speaker 1".to_string()), + language: Some("en".to_string()), + emotion: Some("neutral".to_string()), + ..Default::default() + }], + provider_metadata: Some(json!({ + "gemini": { + "response": { + "timestamps_clamped": true + }, + "file": { + "deleted": true + } 
+ } + })), + }) + } +} diff --git a/src/pipeline_output.rs b/src/pipeline_output.rs index 5d799cf..577e911 100644 --- a/src/pipeline_output.rs +++ b/src/pipeline_output.rs @@ -1,7 +1,9 @@ use anyhow::{Context, Result}; +use serde_json::Value; use crate::output::manifest::{ - InputInfo, Manifest, ProcessingConfig, SegmentInfo, Stats, WordInfo, write_manifest, + Capabilities, InputInfo, Manifest, ProcessingConfig, ProviderMetadata, QualityInfo, + SegmentInfo, Stats, TranscriptInfo, WordInfo, write_manifest, }; use crate::output::{srt::write_srt, vtt::write_vtt}; use crate::pipeline::{OutputFormat, PipelineConfig}; @@ -112,9 +114,11 @@ fn build_manifest( processing_time: f64, ) -> Manifest { Manifest { + schema_version: "transcribeit.manifest.v2", input: InputInfo { file: config.input.display().to_string(), duration_secs: total_duration, + duration_ms: secs_to_ms(total_duration), }, config: ProcessingConfig { provider: config.provider_name.clone(), @@ -126,36 +130,212 @@ fn build_manifest( language: config.language.clone(), normalized_audio: config.normalize_audio, }, - segments: transcript - .segments - .iter() - .enumerate() - .map(|(i, s)| SegmentInfo { - index: i, - start_secs: s.start_ms as f64 / 1000.0, - end_secs: s.end_ms as f64 / 1000.0, - text: s.text.trim().to_string(), - speaker: s.speaker.clone(), - language: s.language.clone(), - emotion: s.emotion.clone(), - words: s - .words - .iter() - .map(|w| WordInfo { - start_secs: w.start_ms as f64 / 1000.0, - end_secs: w.end_ms as f64 / 1000.0, - text: w.text.clone(), - punctuation: w.punctuation.clone(), - }) - .collect(), - }) - .collect(), + capabilities: build_capabilities(config, transcript), + quality: build_quality(config, transcript), + transcript: TranscriptInfo { + text: transcript.text(), + segments: build_segment_infos(transcript), + }, + segments: build_segment_infos(transcript), stats: Stats { total_duration_secs: total_duration, + total_duration_ms: secs_to_ms(total_duration), total_segments: 
transcript.segments.len(), total_characters: transcript.segments.iter().map(|s| s.text.len()).sum(), processing_time_secs: processing_time, + processing_time_ms: secs_to_ms(processing_time), }, - provider_metadata: transcript.provider_metadata.clone(), + provider_metadata: build_provider_metadata( + &config.provider_name, + transcript.provider_metadata.clone(), + ), + } +} + +fn build_segment_infos(transcript: &Transcript) -> Vec { + transcript + .segments + .iter() + .enumerate() + .map(|(i, s)| SegmentInfo { + id: format!("seg_{:06}", i + 1), + index: i, + start_secs: s.start_ms as f64 / 1000.0, + end_secs: s.end_ms as f64 / 1000.0, + start_ms: s.start_ms, + end_ms: s.end_ms, + text: s.text.trim().to_string(), + speaker: s.speaker.clone(), + language: s.language.clone(), + emotion: s.emotion.clone(), + words: s + .words + .iter() + .enumerate() + .map(|(word_index, w)| WordInfo { + id: format!("seg_{:06}_word_{:06}", i + 1, word_index + 1), + index: word_index, + start_secs: w.start_ms as f64 / 1000.0, + end_secs: w.end_ms as f64 / 1000.0, + start_ms: w.start_ms, + end_ms: w.end_ms, + text: w.text.clone(), + punctuation: w.punctuation.clone(), + }) + .collect(), + }) + .collect() +} + +fn build_capabilities(config: &PipelineConfig, transcript: &Transcript) -> Capabilities { + Capabilities { + segments: !transcript.segments.is_empty(), + word_timestamps: transcript + .segments + .iter() + .any(|segment| !segment.words.is_empty()), + speaker_labels: transcript + .segments + .iter() + .any(|segment| segment.speaker.is_some()), + language_per_segment: transcript + .segments + .iter() + .any(|segment| segment.language.is_some()), + emotion_per_segment: transcript + .segments + .iter() + .any(|segment| segment.emotion.is_some()), + native_timestamps: native_timestamps(&config.provider_name), + } +} + +fn build_quality(config: &PipelineConfig, transcript: &Transcript) -> QualityInfo { + let timing_source = timing_source(&config.provider_name); + let timestamps_clamped = 
metadata_bool( + transcript.provider_metadata.as_ref(), + &[ + "/data/response/timestamps_clamped", + "/gemini/response/timestamps_clamped", + "/response/timestamps_clamped", + ], + ); + let mut warnings = Vec::new(); + let has_durations = transcript + .segments + .iter() + .any(|segment| segment.end_ms > segment.start_ms); + + if config.provider_name == "gemini" { + warnings.push( + "Gemini timestamps, speakers, language, and emotion are model-generated structured output, not a dedicated ASR schema." + .to_string(), + ); + } + if timestamps_clamped { + warnings.push( + "One or more provider timestamps exceeded the source duration and were clamped." + .to_string(), + ); + } + if transcript + .segments + .iter() + .any(|segment| segment.end_ms < segment.start_ms) + { + warnings.push("One or more segments has end_ms earlier than start_ms.".to_string()); + } + if !transcript.segments.is_empty() && !has_durations { + warnings.push("No positive-duration segment timestamps were returned.".to_string()); } + + QualityInfo { + timing_source: timing_source.to_string(), + timing_reliable: matches!(timing_source, "provider_native" | "model_native") + && !timestamps_clamped + && has_durations, + timestamps_clamped, + speaker_source: transcript + .segments + .iter() + .any(|segment| segment.speaker.is_some()) + .then(|| speaker_source(&config.provider_name).to_string()), + warnings, + } +} + +fn build_provider_metadata(provider: &str, metadata: Option) -> Option { + let metadata = metadata?; + if metadata.get("provider").and_then(Value::as_str).is_some() && metadata.get("data").is_some() + { + return Some(ProviderMetadata { + provider: metadata + .get("provider") + .and_then(Value::as_str) + .unwrap_or(provider) + .to_string(), + schema_version: metadata + .get("schema_version") + .and_then(Value::as_str) + .unwrap_or("provider.metadata.v1") + .to_string(), + data: metadata.get("data").cloned().unwrap_or(Value::Null), + }); + } + + let data = metadata + .get(provider) + 
.cloned() + .or_else(|| provider_key(provider).and_then(|key| metadata.get(key).cloned())) + .unwrap_or(metadata); + + Some(ProviderMetadata { + provider: provider.to_string(), + schema_version: format!("{provider}.metadata.v1"), + data, + }) +} + +fn metadata_bool(metadata: Option<&Value>, pointers: &[&str]) -> bool { + metadata.is_some_and(|metadata| { + pointers.iter().any(|pointer| { + metadata + .pointer(pointer) + .and_then(Value::as_bool) + .unwrap_or(false) + }) + }) +} + +fn native_timestamps(provider: &str) -> bool { + matches!(provider, "local" | "openai" | "azure" | "qwen-filetrans") +} + +fn timing_source(provider: &str) -> &'static str { + match provider { + "gemini" => "model_generated", + "qwen-filetrans" | "openai" | "azure" => "provider_native", + "local" | "sherpa-onnx" => "model_native", + _ => "unknown", + } +} + +fn speaker_source(provider: &str) -> &'static str { + match provider { + "gemini" => "model_generated", + "openai" => "provider_native", + "local" | "sherpa-onnx" => "local_diarization", + _ => "provider_native", + } +} + +fn provider_key(provider: &str) -> Option<&'static str> { + match provider { + "qwen-filetrans" => Some("qwen"), + _ => None, + } +} + +fn secs_to_ms(seconds: f64) -> i64 { + (seconds * 1000.0).round() as i64 } diff --git a/src/setup.rs b/src/setup.rs index fe24265..b9b31f2 100644 --- a/src/setup.rs +++ b/src/setup.rs @@ -8,7 +8,7 @@ use tokio::io::AsyncWriteExt; use crate::cli::ModelSize; use crate::models::{download_model, models_dir}; -const SHERPA_ONNX_VERSION: &str = "v1.12.29"; +const SHERPA_ONNX_VERSION: &str = "v1.13.2"; pub(crate) async fn setup_models( output_dir: Option, @@ -66,9 +66,10 @@ pub(crate) async fn setup_sherpa_libs() -> Result { let arch = std::env::consts::ARCH; let archive_suffix = match (os, arch) { - ("macos", _) => "osx-universal2-shared", - ("linux", "x86_64") => "linux-x86_64-shared", - ("linux", "aarch64") => "linux-aarch64-shared", + ("macos", "x86_64") => "osx-x64-shared-lib", + 
("macos", "aarch64") => "osx-arm64-shared-lib", + ("linux", "x86_64") => "linux-x64-shared-lib", + ("linux", "aarch64") => "linux-aarch64-shared-cpu-lib", _ => anyhow::bail!( "Unsupported platform: {os}-{arch}. Download sherpa-onnx shared libraries manually." ), @@ -90,14 +91,6 @@ pub(crate) async fn setup_sherpa_libs() -> Result { ) .await?; - if status == "installed" { - let lib_dir = check_dir.join("lib"); - eprintln!( - "\nAdd to .env:\n SHERPA_ONNX_LIB_DIR={}\n", - lib_dir.display() - ); - } - Ok(format!("{status} ({archive_suffix})")) } @@ -126,19 +119,35 @@ pub(crate) fn print_setup_summary(summary: &[(&str, String)]) { println!(" DIARIZE_EMBEDDING_MODEL={}", emb_path.display()); } - if let Ok(entries) = std::fs::read_dir("vendor") { - for entry in entries.flatten() { - let path = entry.path(); - if path.is_dir() && path.join("lib").exists() { - println!(" SHERPA_ONNX_LIB_DIR={}", path.join("lib").display()); - break; - } - } + if let Some(lib_dir) = sherpa_lib_dir_hint() { + println!(" SHERPA_ONNX_LIB_DIR={}", lib_dir.display()); } println!(); } +fn sherpa_lib_dir_hint() -> Option { + let vendor_dir = PathBuf::from("vendor"); + let expected_prefix = format!("sherpa-onnx-{SHERPA_ONNX_VERSION}-"); + + let entries: Vec<_> = std::fs::read_dir(&vendor_dir) + .ok()? + .flatten() + .map(|entry| entry.path()) + .filter(|path| path.is_dir() && path.join("lib").exists()) + .collect(); + + entries + .iter() + .find(|path| { + path.file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name.starts_with(&expected_prefix)) + }) + .or_else(|| entries.first()) + .map(|path| path.join("lib")) +} + async fn download_file_with_progress(url: &str, dest: &Path, label: &str) -> Result { if dest.exists() { println!("{label}: already present at {}", dest.display());