From 67e59d742d6d2adf7dd9a675fec53d483a670fb2 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Wed, 17 Jun 2026 15:00:17 +0300 Subject: [PATCH 1/4] Add Deepgram transcription provider docs --- README.md | 29 +- Taskfile.yaml | 13 + docs/architecture.md | 22 +- docs/cli-reference.md | 77 ++++- docs/performance-benchmarks.md | 40 +++ docs/provider-behavior.md | 60 +++- docs/troubleshooting.md | 4 +- src/cli.rs | 72 ++++- src/engines/deepgram.rs | 533 +++++++++++++++++++++++++++++++++ src/engines/deepgram/tests.rs | 217 ++++++++++++++ src/engines/mod.rs | 1 + src/main.rs | 178 +++++++++-- src/pipeline_output.rs | 6 +- 13 files changed, 1209 insertions(+), 43 deletions(-) create mode 100644 src/engines/deepgram.rs create mode 100644 src/engines/deepgram/tests.rs diff --git a/README.md b/README.md index 7bb42e0..681ab0c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # transcribeit -A Rust CLI for speech-to-text transcription. Supports local inference via [whisper.cpp](https://github.com/ggerganov/whisper.cpp), local inference via [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx), remote transcription via OpenAI-compatible APIs, Azure OpenAI, Qwen ASR file transcription, Gemini multimodal transcription, and NVIDIA hosted Riva ASR. +A Rust CLI for speech-to-text transcription. Supports local inference via [whisper.cpp](https://github.com/ggerganov/whisper.cpp), local inference via [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx), remote transcription via OpenAI-compatible APIs, Azure OpenAI, Qwen ASR file transcription, Gemini multimodal transcription, NVIDIA hosted Riva ASR, and Deepgram. Accepts any audio or video format — FFmpeg handles conversion automatically. @@ -10,8 +10,9 @@ Accepts any audio or video format — FFmpeg handles conversion automatically. 
- [FFmpeg](https://ffmpeg.org/) installed and on PATH - C/C++ toolchain and CMake (for building whisper.cpp) - sherpa-onnx shared libraries (if using the `sherpa-onnx` provider) — set `SHERPA_ONNX_LIB_DIR` in `.env` to the directory containing them -- S3-compatible storage credentials when using `qwen-filetrans`; Cloudflare R2 is supported through `S3_ENDPOINT_URL` +- S3-compatible storage credentials when using `qwen-filetrans` or Deepgram pre-signed URL mode; Cloudflare R2 is supported through `S3_ENDPOINT_URL` - NVIDIA API key and hosted Riva function id when using `nvidia-riva` +- Deepgram API key when using `deepgram` ## Quick start @@ -90,6 +91,20 @@ transcribeit run -p nvidia-riva -i recording.wav \ --nvidia-riva-function-id "$NVIDIA_RIVA_FUNCTION_ID" \ -f vtt -o ./output +# Transcribe with Deepgram Nova-3 batch ASR and provider-native diarization +transcribeit run -p deepgram --remote-model nova-3 --diarize \ + -i recording.wav -f vtt -o ./output + +# Transcribe with Deepgram by staging the prepared audio in S3/R2 first +transcribeit run -p deepgram --remote-model nova-3 --deepgram-use-presigned-url \ + -i recording.wav -f vtt -o ./output + +# Transcribe with Deepgram Nova-3 Medical, intelligence metadata, and domain keyterms +transcribeit run -p deepgram --remote-model nova-3-medical \ + --diarize --deepgram-intelligence \ + --deepgram-keyterm Ofev --deepgram-keyterm Esbriet --deepgram-keyterm IPF \ + -i interview.wav -f vtt -o ./output + # Force language and normalize before transcription transcribeit run -i recording.wav -m base --language en --normalize @@ -105,7 +120,7 @@ transcribeit run -i interview.mp3 -m base --diarize --speakers 2 \ ## Features - **Any input format** — MP3, MP4, WAV, FLAC, OGG, etc. FFmpeg converts to mono 16kHz WAV automatically. -- **7 providers** — Local whisper.cpp, sherpa-onnx, OpenAI API, Azure OpenAI, Qwen file transcription, Gemini, and NVIDIA Riva. Extensible via the `Transcriber` trait. 
+- **8 providers** — Local whisper.cpp, sherpa-onnx, OpenAI API, Azure OpenAI, Qwen file transcription, Gemini, NVIDIA Riva, and Deepgram. Extensible via the `Transcriber` trait. - **Qwen ASR whole-file transcription** — `qwen-filetrans` stages audio in S3-compatible storage, passes a pre-signed URL to DashScope, polls the async task, and maps Qwen timestamps into the transcript model. - **Stable manifest schema** — Manifests use `transcribeit.manifest.v2` with canonical millisecond timestamps, provider-neutral capabilities/quality fields, and provider-specific metadata under `provider_metadata.data`. - **Cache telemetry** — Manifests normalize provider token-cache signals under `cache`, including Gemini `cachedContentTokenCount` and OpenAI/Azure-style `cached_tokens` when returned. @@ -116,6 +131,8 @@ transcribeit run -i interview.mp3 -m base --diarize --speakers 2 \ - **Gemini explicit cache** — `--gemini-explicit-cache` creates and reuses Gemini `cachedContent` objects with a configurable TTL, producing deterministic `cachedContentTokenCount` telemetry when Gemini accepts the cache. - **Gemini summary analysis** — `--analysis summary` runs a second Gemini JSON pass over the transcript and stores a provider-neutral summary, key points, topics, questions, and follow-ups in the manifest. - **NVIDIA hosted Riva ASR** — `nvidia-riva` calls hosted NVIDIA Riva gRPC endpoints with provider-native word timestamps, optional server-side diarization, and manifest metadata. +- **Deepgram Nova batch ASR** — `deepgram` calls Deepgram's `/listen` API, defaults to `nova-3`, requests utterances and smart formatting, supports provider-native diarization through `--diarize`, and can submit either direct audio bytes or an S3/R2 pre-signed URL with `--deepgram-use-presigned-url`. 
+- **Deepgram audio intelligence** — `--deepgram-intelligence` captures Deepgram summary, topics, intents, entity detection, and sentiment in `provider_metadata.data.intelligence`; `--deepgram-keyterm` passes Nova-3 keyterm prompts for domain terminology. - **3 model architectures via sherpa-onnx** — Whisper, Moonshine, and SenseVoice are auto-detected from the model directory contents. Just point `--model` at any supported model directory. - **Model aliases** — `-m base`, `-m tiny`, etc. resolve from `MODEL_CACHE_DIR` for both `local` and `sherpa-onnx` providers. The sherpa-onnx resolver also supports glob matching (e.g., `-m moonshine-base`, `-m sense-voice`). - **Language hinting** — Pass `--language` to force local and API transcription language. @@ -148,6 +165,11 @@ GEMINI_API_BASE_URL=https://generativelanguage.googleapis.com/v1beta NVIDIA_API_KEY=your_nvidia_key_here NVIDIA_RIVA_FUNCTION_ID=your_hosted_riva_function_id NVIDIA_RIVA_SERVER=grpc.nvcf.nvidia.com:443 +DEEPGRAM_API_KEY=your_deepgram_key_here +DEEPGRAM_API_BASE_URL=https://api.deepgram.com/v1 +DEEPGRAM_INTELLIGENCE=false +DEEPGRAM_KEYTERM=Ofev,Esbriet,IPF +DEEPGRAM_USE_PRESIGNED_URL=false AZURE_API_KEY=your_azure_key_here AZURE_OPENAI_ENDPOINT=https://myresource.openai.azure.com AZURE_DEPLOYMENT_NAME=whisper @@ -159,6 +181,7 @@ S3_REGION=auto S3_ENDPOINT_URL=https://.r2.cloudflarestorage.com S3_ACCESS_KEY_ID=your_s3_access_key S3_SECRET_ACCESS_KEY=your_s3_secret_key +# Optional; when unset, URL-staging providers choose their own prefix. 
S3_PREFIX=transcribeit/qwen-filetrans S3_PRESIGN_EXPIRES_SECS=3600 S3_FORCE_PATH_STYLE=false diff --git a/Taskfile.yaml b/Taskfile.yaml index 9f3f921..daa3962 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -68,3 +68,16 @@ tasks: "enable_itn": false } }' + test-deepgram: + cmds: + - | + test -n "$DEEPGRAM_API_KEY" || (echo "DEEPGRAM_API_KEY is not set" >&2; exit 1) + DEEPGRAM_API_BASE_URL="${DEEPGRAM_API_BASE_URL:-https://api.deepgram.com/v1}" + + curl --silent --show-error --location \ + --request POST \ + --write-out "\nHTTP_STATUS:%{http_code}\n" \ + --header "Authorization: Token ${DEEPGRAM_API_KEY}" \ + --header "Content-Type: application/json" \ + --data '{"url":"https://dpgr.am/spacewalk.wav"}' \ + "${DEEPGRAM_API_BASE_URL%/}/listen?model=nova-3&smart_format=true" diff --git a/docs/architecture.md b/docs/architecture.md index 5275c4d..5649862 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -28,6 +28,7 @@ src/ ├── azure_openai.rs # Azure OpenAI REST API ├── gemini.rs # Gemini Files API + streamed generateContent ├── nvidia_riva.rs # NVIDIA hosted Riva gRPC ASR + ├── deepgram.rs # Deepgram Nova batch ASR + audio intelligence ├── qwen_filetrans.rs # Qwen async file transcription provider ├── qwen_filetrans/ # Qwen request/response types and model limits ├── rate_limit.rs # Retry logic and 429 handling @@ -59,6 +60,7 @@ pub trait Transcriber: Send + Sync { - **Qwen file transcription** overrides `transcribe_path()` to upload prepared audio to S3-compatible storage, generate a pre-signed URL, and submit that URL to DashScope. - **Gemini** overrides `transcribe_path()` to upload prepared audio through Gemini Files API and call streamed `streamGenerateContent` with structured JSON output. - **NVIDIA Riva** overrides `transcribe_path()` and `transcribe_wav()` to send WAV bytes to a hosted Riva gRPC endpoint with provider-native timestamps. 
+- **Deepgram** overrides `transcribe_path()` and `transcribe_wav()` to post WAV bytes to Deepgram's `/listen` endpoint with utterances, word timestamps, optional diarization, and optional audio intelligence flags. In URL mode, it stages the prepared WAV in S3-compatible storage and sends Deepgram a pre-signed URL JSON request instead. ## Processing pipeline @@ -69,7 +71,7 @@ Input file (any format) │ ├─ needs_conversion()? ──→ extract_to_wav(normalize) for local provider ├─ upload_as_mp3(normalize) for OpenAI/Azure, Qwen filetrans, and Gemini (16kHz mono MP3) - ├─ hosted Riva path keeps WAV audio for gRPC recognition + ├─ hosted Riva and Deepgram paths keep WAV audio for recognition │ ├─ get_duration() via ffprobe │ @@ -206,6 +208,22 @@ Uses hosted NVIDIA Riva ASR over gRPC through generated protobuf bindings in `pr The provider is implemented entirely in Rust with `tonic`/`prost`. It does not download local NVIDIA NIM containers or require Python clients. +### Deepgram (`deepgram.rs`) + +Uses Deepgram's pre-recorded `/listen` REST API for batch transcription. 
The provider: + +- defaults to `nova-3`, with `nova-3-medical` available through `--remote-model` when enabled for the account +- requests `smart_format=true` and `utterances=true` +- enables provider-native diarization with `diarize_model=latest` when `--diarize` or `--speakers` is set +- can send either direct audio bytes or a staged pre-signed S3/R2 URL with `--deepgram-use-presigned-url` +- accepts Nova-3 keyterm prompts through `--deepgram-keyterm` +- can enable Deepgram audio intelligence through `--deepgram-intelligence` or individual flags for summary, topics, intents, entities, and sentiment +- maps Deepgram utterances and word timestamps into normalized segments and words +- preserves returned intelligence blocks under `provider_metadata.data.intelligence` +- clamps provider timestamps to `metadata.duration` when necessary and records that under `provider_metadata.data.response.timestamps_clamped` + +Deepgram's intelligence JSON is intentionally kept as provider metadata because it is richer than the normalized transcript schema and because downstream Transcript Intelligence consumers may want to inspect provider-native topics, intents, sentiments, entities, and token usage. URL-mode metadata records only that a file URL was used; temporary pre-signed URLs are not persisted. + ## Analysis (`analysis.rs`) Post-transcription analysis is separate from transcription. The first supported analysis is `--analysis summary`, which currently uses Gemini to run a second structured JSON call over the transcript text. Results are written to the manifest only when `--output-dir` is set: @@ -263,7 +281,7 @@ All settings (timeout, retries, wait times) are configurable via CLI flags and e ### Shared WAV encoding -OpenAI/Azure engines can send file uploads directly and choose the correct container format for compatibility (WAV for local transcribe path, MP3 for API provider uploads). 
Qwen file transcription stages MP3 in S3-compatible storage and sends DashScope a pre-signed URL. Gemini uploads MP3 through Gemini Files API. NVIDIA Riva sends WAV bytes through gRPC. The `audio::wav::encode_wav()` helper is still used by local engines and non-file upload paths. +OpenAI/Azure engines can send file uploads directly and choose the correct container format for compatibility (WAV for local transcribe path, MP3 for API provider uploads). Qwen file transcription stages MP3 in S3-compatible storage and sends DashScope a pre-signed URL. Gemini uploads MP3 through Gemini Files API. NVIDIA Riva sends WAV bytes through gRPC. Deepgram posts WAV bytes to `/listen` by default, or stages WAV in S3-compatible storage and sends a pre-signed URL when URL mode is enabled. The `audio::wav::encode_wav()` helper is still used by local engines and non-file upload paths. ## Model cache (`model_cache.rs`) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index cffe90e..cbbe2f3 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -46,7 +46,7 @@ transcribeit run [OPTIONS] --input | Option | Description | Default | |--------|-------------|---------| | `-i, --input` | Input path, directory, or glob pattern for audio/video files | required | -| `-p, --provider` | `local`, `sherpa-onnx`, `openai`, `azure`, `qwen-filetrans`, `gemini`, or `nvidia-riva` | `local` | +| `-p, --provider` | `local`, `sherpa-onnx`, `openai`, `azure`, `qwen-filetrans`, `gemini`, `nvidia-riva`, or `deepgram` | `local` | #### Local provider options (`-p local`) @@ -153,6 +153,39 @@ The NVIDIA Riva provider uses gRPC and sends `function-id` plus Bearer authoriza Manifests include `provider_metadata.provider = "nvidia-riva"`, audio details, request ids, feature flags, detected languages, response counts, elapsed time, and mean confidence when returned. 
+#### Deepgram provider options (`-p deepgram`) + +| Option | Description | Default | +|--------|-------------|---------| +| `--deepgram-api-key` | Deepgram API key | `DEEPGRAM_API_KEY` env var | +| `--deepgram-api-base-url` | Deepgram API base URL | `DEEPGRAM_API_BASE_URL` env var, or `https://api.deepgram.com/v1` | +| `--remote-model` | Deepgram model name | `nova-3` | +| `--deepgram-intelligence` | Enable summary, topics, intents, entity detection, and sentiment | disabled | +| `--deepgram-summarize` | Enable `summarize=v2` | disabled | +| `--deepgram-topics` | Enable topic detection | disabled | +| `--deepgram-intents` | Enable intent recognition | disabled | +| `--deepgram-detect-entities` | Enable entity detection | disabled | +| `--deepgram-sentiment` | Enable sentiment analysis | disabled | +| `--deepgram-keyterm` | Nova-3 keyterm prompt; repeat or comma-separate terms | none | +| `--deepgram-search` | Search term or phrase; repeat or comma-separate terms | none | +| `--deepgram-redact` | Redaction target such as `pii`, `phi`, `pci`, `numbers`, or entity class | none | +| `--deepgram-replace` | Find/replace rule in `FIND:REPLACE` format | none | +| `--deepgram-filler-words` | Enable filler word transcription | disabled | +| `--deepgram-numerals` | Enable numerals formatting | disabled | +| `--deepgram-use-presigned-url` | Stage prepared audio in S3-compatible storage and send Deepgram a JSON URL request | disabled | + +The Deepgram provider calls `POST {deepgram-api-base-url}/listen` with `smart_format=true` and `utterances=true`. Input audio/video is converted to 16 kHz mono WAV when needed. The default model is `nova-3`; `nova-3-general` and `nova-3-medical` can be passed through `--remote-model` when available for the account/region. + +By default, Deepgram receives direct audio bytes. With `--deepgram-use-presigned-url`, the CLI uploads the prepared audio to S3-compatible storage, generates a pre-signed GET URL, and sends Deepgram `{"url":"..."}`. 
This uses the same `S3_*` settings as Qwen file transcription; if `S3_PREFIX` is unset, the Deepgram default prefix is `transcribeit/deepgram`. The pre-signed URL itself is not written to manifests. + +If `--diarize` is set, the request uses `diarize_model=latest`, which enables Deepgram's current provider-native batch diarizer. `--speakers N` is accepted as a request to enable diarization, but Deepgram does not accept an exact speaker-count hint through this path. + +`--deepgram-intelligence` is the convenience switch for Transcript Intelligence workflows. It enables Deepgram summarization, topic detection, intent recognition, entity detection, and sentiment analysis in the same transcription request. Returned intelligence blocks are stored under `provider_metadata.data.intelligence`; they are useful downstream metadata, but should still be treated as model output rather than validated facts. + +For Nova-3 and Nova-3 Medical, use `--deepgram-keyterm` for important domain terms and brands. In local benchmarking, keyterms such as `Ofev`, `Esbriet`, `IPF`, and `Producta` materially improved medical brand recognition and speaker consistency. + +Manifests include `provider_metadata.provider = "deepgram"`, Deepgram request metadata, channel/utterance/alternative counts, model info, intelligence token usage metadata, mean confidence, extracted intelligence blocks, and `timestamps_clamped` when provider timestamps exceed the reported media duration and are clamped for output safety. 
+ #### Output options | Option | Description | Default | @@ -167,7 +200,7 @@ Manifests include `provider_metadata.provider = "nvidia-riva"`, audio details, r #### API resilience options -These options apply to OpenAI, Azure, Qwen file transcription, Gemini, and NVIDIA Riva providers where supported: +These options apply to OpenAI, Azure, Qwen file transcription, Gemini, NVIDIA Riva, and Deepgram providers where supported: | Option | Description | Default | |--------|-------------|---------| @@ -189,7 +222,7 @@ REST providers retry HTTP 429, HTTP 5xx, and transport send/stream failures when | `--segment-concurrency` | Max parallel segment requests (API providers only) | `2` | | `--vad-model` | Path to Silero VAD ONNX model (`silero_vad.onnx`) for speech-aware segmentation | `VAD_MODEL` env var | -When using `openai`, `azure`, `qwen-filetrans`, or `nvidia-riva` providers, files exceeding the conservative 25MB auto-split threshold are automatically segmented even without `--segment`. This keeps long remote requests smaller and more reliable. Gemini stays whole-file by default to preserve model-level speaker continuity; use `--segment` only when you want independent chunk requests, or let the provider fall back to segmentation if a long whole-file request fails. When using `sherpa-onnx`, segmentation is always enabled with a maximum segment length of 30 seconds. +When using `openai`, `azure`, `qwen-filetrans`, or `nvidia-riva` providers, files exceeding the conservative 25MB auto-split threshold are automatically segmented even without `--segment`. This keeps long remote requests smaller and more reliable. Gemini and Deepgram stay whole-file by default to preserve provider/model-level speaker continuity; use `--segment` only when you want independent chunk requests, or let Gemini fall back to segmented transcription if a long whole-file request fails. When using `sherpa-onnx`, segmentation is always enabled with a maximum segment length of 30 seconds. 
When `--vad-model` is set and segmentation is needed, VAD-based segmentation is used instead of FFmpeg `silencedetect`. VAD detects actual speech boundaries using Silero VAD, avoiding mid-word cuts. It pads chunks by 250ms, merges gaps shorter than 200ms, and splits long chunks at low-energy points. This requires the `sherpa-onnx` feature to be enabled. When `--vad-model` is not set, the original FFmpeg silence-based segmentation is used as a fallback. @@ -208,6 +241,8 @@ For OpenAI, `--diarize` uses provider-native diarization through `gpt-4o-transcr For NVIDIA Riva, use `--diarize` when the exact speaker count is unknown. The provider uses `--speakers N` as a maximum speaker hint; if omitted, the CLI sends a default maximum of 4 speakers. +For Deepgram, `--diarize` enables provider-native diarization with `diarize_model=latest`. `--speakers N` is treated as a request to enable diarization, but no fixed speaker count is sent. + For Gemini, speaker labels are model-generated structured output and may be present even without local diarization. For Qwen file transcription, Azure, local Whisper, and non-diarizing OpenAI models, `--diarize` requires the local Sherpa diarizer. 
## Output behavior @@ -249,19 +284,34 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | `NVIDIA_API_KEY` | NVIDIA hosted Riva API key | none | | `NVIDIA_RIVA_FUNCTION_ID` | NVIDIA hosted Riva function id | none | | `NVIDIA_RIVA_SERVER` | NVIDIA Riva gRPC server | `grpc.nvcf.nvidia.com:443` | +| `DEEPGRAM_API_KEY` | Deepgram API key | none | +| `DEEPGRAM_API_BASE_URL` | Deepgram API base URL | `https://api.deepgram.com/v1` | +| `DEEPGRAM_INTELLIGENCE` | Enable Deepgram summary/topics/intents/entities/sentiment | disabled | +| `DEEPGRAM_SUMMARIZE` | Enable Deepgram summarization | disabled | +| `DEEPGRAM_TOPICS` | Enable Deepgram topic detection | disabled | +| `DEEPGRAM_INTENTS` | Enable Deepgram intent recognition | disabled | +| `DEEPGRAM_DETECT_ENTITIES` | Enable Deepgram entity detection | disabled | +| `DEEPGRAM_SENTIMENT` | Enable Deepgram sentiment analysis | disabled | +| `DEEPGRAM_KEYTERM` | Comma-separated Deepgram keyterms | none | +| `DEEPGRAM_SEARCH` | Comma-separated Deepgram search terms | none | +| `DEEPGRAM_REDACT` | Comma-separated Deepgram redaction targets | none | +| `DEEPGRAM_REPLACE` | Comma-separated Deepgram find/replace rules | none | +| `DEEPGRAM_FILLER_WORDS` | Enable Deepgram filler words | disabled | +| `DEEPGRAM_NUMERALS` | Enable Deepgram numerals | disabled | +| `DEEPGRAM_USE_PRESIGNED_URL` | Stage Deepgram input in S3-compatible storage and submit a pre-signed URL | disabled | | `AZURE_API_KEY` | Azure API key fallback for Azure provider if `--azure-api-key` is unset | none | | `AZURE_OPENAI_ENDPOINT` | Azure OpenAI endpoint URL | none | | `AZURE_DEPLOYMENT_NAME` | Azure deployment name | `whisper` | | `AZURE_API_VERSION` | Azure API version | `2024-06-01` | | `DASHSCOPE_API_KEY` | DashScope API key for Qwen providers | none | | `DASHSCOPE_ASR_BASE_URL` | DashScope async ASR base URL for Qwen file transcription | `https://dashscope-intl.aliyuncs.com/api/v1` | -| `S3_BUCKET` | S3 bucket for Qwen 
file transcription staging | none | -| `S3_REGION` / `AWS_REGION` | S3 region for Qwen file transcription staging | none | +| `S3_BUCKET` | S3 bucket for remote-provider URL staging | none | +| `S3_REGION` / `AWS_REGION` | S3 region for remote-provider URL staging | none | | `S3_ENDPOINT_URL` | S3-compatible endpoint URL | none | | `S3_ACCESS_KEY_ID` / `AWS_ACCESS_KEY_ID` | S3 access key ID | none | | `S3_SECRET_ACCESS_KEY` / `AWS_SECRET_ACCESS_KEY` | S3 secret access key | none | | `S3_SESSION_TOKEN` / `AWS_SESSION_TOKEN` | S3 session token | none | -| `S3_PREFIX` | S3 object prefix for Qwen staging uploads | `transcribeit/qwen-filetrans` | +| `S3_PREFIX` | S3 object prefix for remote-provider uploads | Provider-specific if unset: `transcribeit/qwen-filetrans` for Qwen, `transcribeit/deepgram` for Deepgram URL mode | | `S3_PRESIGN_EXPIRES_SECS` | S3 pre-signed URL expiry in seconds | `3600` | | `S3_FORCE_PATH_STYLE` | Force path-style URLs for S3-compatible storage | `false` | | `VAD_MODEL` | Path to Silero VAD ONNX model for speech-aware segmentation | none | @@ -387,6 +437,20 @@ transcribeit run -p nvidia-riva -i recording.wav \ --nvidia-api-key "$NVIDIA_API_KEY" \ --nvidia-riva-function-id "$NVIDIA_RIVA_FUNCTION_ID" \ --language en-US -f vtt -o ./output + +# Deepgram Nova-3 ASR with provider-native diarization +transcribeit run -p deepgram --remote-model nova-3 --diarize \ + --language en -i recording.wav -f vtt -o ./output + +# Deepgram Nova-3 using S3/R2 pre-signed URL input instead of direct bytes +transcribeit run -p deepgram --remote-model nova-3 --deepgram-use-presigned-url \ + --language en -i recording.wav -f vtt -o ./output + +# Deepgram Nova-3 Medical with intelligence metadata and domain keyterms +transcribeit run -p deepgram --remote-model nova-3-medical \ + --diarize --deepgram-intelligence \ + --deepgram-keyterm Ofev --deepgram-keyterm Esbriet --deepgram-keyterm IPF \ + -i interview.wav -f vtt -o ./output ``` ### Provider behavior @@ -403,6 +467,7 
@@ transcribeit run -p nvidia-riva -i recording.wav \ - **Gemini explicit cache** (`--gemini-explicit-cache`) creates/reuses Gemini `cachedContent` objects and sends `cachedContent` in the streamed generation request. Manifests report `cache.transcription.mode = "explicit"` when this path is used. - **Analysis** (`--analysis summary`) runs a second Gemini structured JSON pass over the transcript and stores summary results in the manifest. - **NVIDIA Riva** (`-p nvidia-riva`) sends WAV audio to hosted Riva gRPC, requesting native word timestamps and optional server-side diarization. +- **Deepgram** (`-p deepgram`) posts audio to Deepgram `/listen`, requesting smart formatting, utterances, word timestamps, and optional provider-native diarization. With `--deepgram-use-presigned-url`, it stages the prepared audio in S3/R2 and submits a JSON URL request. For the full matrix and upload/auth notes, see: [Provider behavior](provider-behavior.md). For benchmark guidance and result templates, see: [Performance benchmarks](performance-benchmarks.md). diff --git a/docs/performance-benchmarks.md b/docs/performance-benchmarks.md index 3d7412a..c4dc1a0 100644 --- a/docs/performance-benchmarks.md +++ b/docs/performance-benchmarks.md @@ -162,6 +162,30 @@ Record: - manifest `quality.timing_reliable` - whether server-side speaker labels were useful +### 8. 
Deepgram + +Benchmark Deepgram as a whole-file batch provider with both plain Nova-3 and medical/intelligence options: + +```bash +time transcribeit run -p deepgram --remote-model nova-3 \ + -i <input> --diarize -f vtt -o ./output + +time transcribeit run -p deepgram --remote-model nova-3-medical \ + --diarize --deepgram-intelligence \ + --deepgram-keyterm Ofev --deepgram-keyterm Esbriet --deepgram-keyterm IPF \ + -i <input> -f vtt -o ./output +``` + +Record: +- model name and `provider_metadata.data.metadata.model_info` +- wall-clock time and realtime factor +- manifest `provider_metadata.data.response.mean_confidence` +- manifest `provider_metadata.data.intelligence.summary` +- counts for returned topics, intents, sentiments, and entities +- whether keyterm prompting improved domain terms or brand names +- diarization behavior, especially unexpected extra speakers +- whether `provider_metadata.data.response.timestamps_clamped` was triggered + ## Suggested result format ```text @@ -177,6 +201,22 @@ Output size: 4.6 MB Keep rows in a simple table (date + commit hash + environment + results) in your preferred tracker so regressions are easy to catch. +## Current provider assessment + +Based on the provider evaluations captured so far, Deepgram is currently the most advanced provider for Transcript Intelligence workflows, especially `nova-3-medical` with domain keyterms. It is the only tested provider that returned high-quality ASR together with provider-native utterances, word timestamps, diarization, summary, topics, intents, sentiment, entity extraction, model metadata, and intelligence token usage in one transcription response. + +This does not mean every Deepgram intelligence field should be treated as ground truth. In the 5-minute medical interview sample, `nova-3-medical` returned useful entities, topics, intents, and sentiment, but its summary made a role error. 
Without keyterms it also misheard `Ofev` as `OFAP`; adding keyterms such as `Ofev`, `Esbriet`, `IPF`, and `Producta` corrected the medical brand terms and improved speaker consistency in the observed run. + +Use this working ranking until broader benchmark data says otherwise: + +| Rank | Provider / Model | Current assessment | +|---|---|---| +| 1 | Deepgram `nova-3-medical` + keyterms | Best Transcript Intelligence candidate; strongest structured metadata and good ASR when keyterms are supplied. | +| 2 | Qwen `qwen3-asr-flash-filetrans` | Strong pure ASR baseline with word timestamps, but less downstream intelligence metadata. | +| 3 | OpenAI hosted transcription | Strong general ASR, but less structured transcript intelligence in the current CLI path. | +| 4 | Gemini | Useful whole-file multimodal transcription and summary path, but timestamps/speakers are model-generated rather than dedicated ASR metadata. | +| 5 | NVIDIA Riva | Provider-native timestamps/diarization through hosted Riva, but less transcript intelligence returned through the current provider path. | + ## Reference benchmark results These results were measured on a 5-minute medical interview recording. diff --git a/docs/provider-behavior.md b/docs/provider-behavior.md index cf9ca23..cb94ace 100644 --- a/docs/provider-behavior.md +++ b/docs/provider-behavior.md @@ -1,6 +1,17 @@ # Provider behavior -This project supports seven providers. They share the same input/output surface, but engine type, API shape, and credentials differ. +This project supports eight providers. They share the same input/output surface, but engine type, API shape, and credentials differ. + +## Remote URL input support + +| Provider | Pre-signed URL support in this CLI | Notes | +|----------|------------------------------------|-------| +| `qwen-filetrans` | Required | Stages prepared MP3 audio in S3/R2 and passes the pre-signed URL to DashScope async ASR. | +| `deepgram` | Optional | Direct byte upload is the default. 
`--deepgram-use-presigned-url` stages prepared WAV audio in S3/R2 and sends Deepgram `{"url":"..."}`. | +| `gemini` | Not currently implemented | The current implementation uses Gemini Files API upload/cache because that path supports file reuse and explicit cached-content workflows. | +| `openai` | No | Uses multipart transcription upload. | +| `azure` | No | Uses Azure OpenAI multipart transcription upload. | +| `nvidia-riva` | No | Uses hosted Riva gRPC audio streaming/buffers. | ## Local (`-p local`) @@ -184,6 +195,53 @@ Gemini summary analysis includes: - Riva does not expose token-cache telemetry through this path, so manifests use `cache.transcription.mode = "none"`. - The implementation targets hosted Riva gRPC. Local/self-hosted NIM REST transcription is not wired into this provider yet. +## Deepgram (`-p deepgram`) + +- Uses Deepgram batch speech-to-text through `POST {deepgram-api-base-url}/listen`. +- Authentication: + - `--deepgram-api-key` or `DEEPGRAM_API_KEY` + - `--api-key`/`OPENAI_API_KEY` is accepted as an API key fallback for scripting consistency. +- Base URL defaults to `https://api.deepgram.com/v1` and can be overridden with `--deepgram-api-base-url` or `DEEPGRAM_API_BASE_URL`. +- Default model: `nova-3`. Use `--remote-model nova-3-medical` for Deepgram's medical-domain Nova-3 model when enabled for the account. +- The request always enables `smart_format=true` and `utterances=true` so the provider returns readable utterance segments plus word-level timestamps. +- Input audio/video is converted with FFmpeg to 16 kHz mono WAV when it is not already compatible with the internal pipeline format. +- By default, the provider uploads audio bytes directly. With `--deepgram-use-presigned-url` or `DEEPGRAM_USE_PRESIGNED_URL=true`, the CLI stages the prepared WAV in S3-compatible storage, generates a pre-signed GET URL, and sends Deepgram a JSON body containing that URL. 
+- Deepgram URL mode uses the shared S3 settings: + - `S3_BUCKET` + - `S3_REGION` or `AWS_REGION` + - `S3_ACCESS_KEY_ID` or `AWS_ACCESS_KEY_ID` + - `S3_SECRET_ACCESS_KEY` or `AWS_SECRET_ACCESS_KEY` + - `S3_ENDPOINT_URL` for S3-compatible providers such as Cloudflare R2 + - `S3_SESSION_TOKEN` or `AWS_SESSION_TOKEN` + - `S3_PREFIX` (defaults to `transcribeit/deepgram` for this mode when unset) + - `S3_PRESIGN_EXPIRES_SECS` (defaults to `3600`, minimum `300`) + - `S3_FORCE_PATH_STYLE=true` for providers that require path-style URLs +- When `--diarize` is set, the request uses `diarize_model=latest`. `--speakers N` is treated as a request to enable diarization, but Deepgram does not accept a fixed speaker-count hint through this provider path. +- `--deepgram-intelligence` enables `summarize=v2`, `topics=true`, `intents=true`, `detect_entities=true`, and `sentiment=true`. +- Individual feature flags are also available: + - `--deepgram-summarize` + - `--deepgram-topics` + - `--deepgram-intents` + - `--deepgram-detect-entities` + - `--deepgram-sentiment` + - `--deepgram-filler-words` + - `--deepgram-numerals` +- Custom vocabulary and downstream-processing flags: + - `--deepgram-keyterm TERM` for Nova-3 keyterm prompting; repeat or comma-separate terms. + - `--deepgram-search TERM` for provider search hits. + - `--deepgram-redact TARGET` for redaction targets such as `pii`, `phi`, `pci`, `numbers`, or entity classes. + - `--deepgram-replace FIND:REPLACE` for find-and-replace rules. +- Manifests include Deepgram provider metadata when available: + - `provider_metadata.provider = "deepgram"` + - `provider_metadata.schema_version = "deepgram.metadata.v1"` + - `provider_metadata.data.request` with `audio_source` (`direct_upload` or `presigned_url`) and `file_url_present`; the actual pre-signed URL is not persisted. + - `provider_metadata.data.metadata` with Deepgram request id, duration, channel count, model ids, `model_info`, and intelligence token usage metadata. 
+ - `provider_metadata.data.response` with channel, utterance, and alternative counts, mean confidence, and timestamp clamping telemetry. + - `provider_metadata.data.intelligence` with returned `summary`, `topics`, `intents`, `sentiments`, `entities`, `summaries`, `search`, and warnings. +- If Deepgram returns timestamps beyond its reported media duration, the provider clamps segment and word times to the duration and records `provider_metadata.data.response.timestamps_clamped = true`. +- Deepgram intelligence output is valuable for Transcript Intelligence workflows, but summaries/topics/intents/entities are still model output. Treat them as provider metadata for downstream review rather than as validated facts. +- Deepgram does not expose token-cache telemetry through this path, so manifests use `cache.transcription.mode = "none"`. + ## Why providers differ ### Local vs Sherpa-ONNX diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 6964b2a..0e2e1e6 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -87,6 +87,8 @@ Symptoms: Fix: - When using local post-processing diarization, pass `--diarize --speakers N`; both `--diarize-segmentation-model` and `--diarize-embedding-model` are required. - When using NVIDIA Riva, `--diarize` can be used without `--speakers`; the CLI sends a default max-speaker hint of 4. +- When using Deepgram, `--diarize` enables `diarize_model=latest`; `--speakers N` is treated only as a request to enable diarization because Deepgram does not accept a fixed speaker-count hint here. +- If Deepgram URL mode fails before transcription, confirm `--deepgram-use-presigned-url` has valid `S3_*` or AWS-compatible credentials and that Deepgram can fetch the generated pre-signed URL before it expires. - When using OpenAI, `--diarize` selects `gpt-4o-transcribe-diarize` by default. If you explicitly choose another OpenAI model, local Sherpa diarization is required. 
- Qwen file transcription and Azure do not currently provide native diarization through this CLI; use local Sherpa diarization for those providers. - Ensure both model paths point to valid ONNX files: @@ -291,7 +293,7 @@ Common symptoms: Fix: - Use `--normalize` to reduce volume inconsistency from recorded content. - Ensure input is not corrupted and ffmpeg conversion succeeds. -- For OpenAI/Azure providers, MP3 conversion is used internally; local provider uses WAV input internally. Qwen file transcription stages a prepared MP3 in S3-compatible storage and passes a pre-signed URL to DashScope. +- For OpenAI/Azure providers, MP3 conversion is used internally; local provider uses WAV input internally. Qwen file transcription stages a prepared MP3 in S3-compatible storage and passes a pre-signed URL to DashScope. Deepgram and NVIDIA Riva use WAV input internally; Deepgram can optionally stage that prepared WAV in S3/R2 and submit a pre-signed URL with `--deepgram-use-presigned-url`. ### Empty or tiny transcript outputs diff --git a/src/cli.rs b/src/cli.rs index a0b0852..bdce506 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -84,6 +84,8 @@ pub(crate) enum Provider { /// NVIDIA-hosted Riva ASR over gRPC #[value(name = "nvidia-riva")] NvidiaRiva, + /// Deepgram batch transcription API + Deepgram, } #[derive(Debug, Clone, ValueEnum)] @@ -216,6 +218,10 @@ pub(crate) enum Command { #[arg(long, env = "NVIDIA_RIVA_SERVER")] nvidia_riva_server: Option, + /// Deepgram API key (or set DEEPGRAM_API_KEY) + #[arg(long, env = "DEEPGRAM_API_KEY")] + deepgram_api_key: Option, + /// Azure API key (or set AZURE_API_KEY env var) #[arg(long, env = "AZURE_API_KEY")] azure_api_key: Option, @@ -240,6 +246,66 @@ pub(crate) enum Command { )] gemini_api_base_url: String, + /// Deepgram API base URL + #[arg( + long, + env = "DEEPGRAM_API_BASE_URL", + default_value = "https://api.deepgram.com/v1" + )] + deepgram_api_base_url: String, + + /// Enable Deepgram summarization, topics, intents, entity 
detection, and sentiment + #[arg(long, env = "DEEPGRAM_INTELLIGENCE")] + deepgram_intelligence: bool, + + /// Enable Deepgram summarization (summarize=v2) + #[arg(long, env = "DEEPGRAM_SUMMARIZE")] + deepgram_summarize: bool, + + /// Enable Deepgram topic detection + #[arg(long, env = "DEEPGRAM_TOPICS")] + deepgram_topics: bool, + + /// Enable Deepgram intent recognition + #[arg(long, env = "DEEPGRAM_INTENTS")] + deepgram_intents: bool, + + /// Enable Deepgram entity detection + #[arg(long, env = "DEEPGRAM_DETECT_ENTITIES")] + deepgram_detect_entities: bool, + + /// Enable Deepgram sentiment analysis + #[arg(long, env = "DEEPGRAM_SENTIMENT")] + deepgram_sentiment: bool, + + /// Deepgram Nova-3 keyterm prompt; repeat or comma-separate up to provider limits + #[arg(long, env = "DEEPGRAM_KEYTERM", value_delimiter = ',')] + deepgram_keyterm: Vec, + + /// Deepgram search term or phrase; repeat or comma-separate + #[arg(long, env = "DEEPGRAM_SEARCH", value_delimiter = ',')] + deepgram_search: Vec, + + /// Deepgram redaction target, such as pii, phi, pci, numbers, or an entity class + #[arg(long, env = "DEEPGRAM_REDACT", value_delimiter = ',')] + deepgram_redact: Vec, + + /// Deepgram find/replace rule in FIND:REPLACE format; repeat or comma-separate + #[arg(long, env = "DEEPGRAM_REPLACE", value_delimiter = ',')] + deepgram_replace: Vec, + + /// Enable Deepgram filler word transcription + #[arg(long, env = "DEEPGRAM_FILLER_WORDS")] + deepgram_filler_words: bool, + + /// Enable Deepgram numerals formatting + #[arg(long, env = "DEEPGRAM_NUMERALS")] + deepgram_numerals: bool, + + /// Stage Deepgram input in S3-compatible storage and submit a pre-signed URL + #[arg(long, env = "DEEPGRAM_USE_PRESIGNED_URL")] + deepgram_use_presigned_url: bool, + /// Reuse Gemini Files API uploads keyed by SHA-256 of prepared upload bytes #[arg(long, env = "GEMINI_FILE_CACHE")] gemini_file_cache: bool, @@ -344,11 +410,11 @@ pub(crate) enum Command { #[arg(long, env = "VAD_MODEL")] vad_model: 
Option, - /// S3 bucket used to stage audio for Qwen file transcription + /// S3 bucket used to stage audio for providers that need pre-signed URLs #[arg(long, env = "S3_BUCKET")] s3_bucket: Option, - /// S3 region used to stage audio for Qwen file transcription + /// S3 region used to stage audio for providers that need pre-signed URLs #[arg(long, env = "S3_REGION")] s3_region: Option, @@ -368,7 +434,7 @@ pub(crate) enum Command { #[arg(long, env = "S3_SESSION_TOKEN")] s3_session_token: Option, - /// S3 object prefix for temporary Qwen uploads + /// S3 object prefix for temporary remote-provider uploads #[arg(long, env = "S3_PREFIX")] s3_prefix: Option, diff --git a/src/engines/deepgram.rs b/src/engines/deepgram.rs new file mode 100644 index 0000000..0a1fb64 --- /dev/null +++ b/src/engines/deepgram.rs @@ -0,0 +1,533 @@ +#[cfg(test)] +mod tests; + +use std::path::Path; + +use anyhow::{Context, Result}; +use async_trait::async_trait; +use reqwest::Client; +use serde_json::{Value, json}; + +use crate::audio::wav::encode_wav; +use crate::engines::rate_limit::{self, send_with_retry}; +use crate::storage::s3::S3Uploader; +use crate::transcriber::{Segment, Transcriber, Transcript, Word}; + +pub struct DeepgramApi { + base_url: String, + api_key: String, + model: String, + language: Option, + settings: rate_limit::ApiRequestSettings, + client: Client, + options: DeepgramOptions, + presigned_url_uploader: Option, +} + +#[derive(Debug, Clone, Default)] +pub struct DeepgramOptions { + pub diarize: bool, + pub intelligence: bool, + pub summarize: bool, + pub topics: bool, + pub intents: bool, + pub detect_entities: bool, + pub sentiment: bool, + pub keyterms: Vec, + pub search: Vec, + pub redact: Vec, + pub replace: Vec, + pub filler_words: bool, + pub numerals: bool, +} + +impl DeepgramApi { + pub fn new( + base_url: String, + api_key: String, + model: String, + language: Option, + settings: rate_limit::ApiRequestSettings, + options: DeepgramOptions, + 
presigned_url_uploader: Option, + ) -> Result { + let client = Client::builder() + .timeout(settings.request_timeout) + .build() + .context("Failed to build HTTP client")?; + + Ok(Self { + base_url: base_url.trim_end_matches('/').to_string(), + api_key, + model, + language, + settings, + client, + options, + presigned_url_uploader, + }) + } + + fn listen_url(&self) -> String { + let mut params = vec![ + format!("model={}", urlencoding::encode(&self.model)), + "smart_format=true".to_string(), + "utterances=true".to_string(), + ]; + + if self.options.diarize { + params.push("diarize_model=latest".to_string()); + } + if self.options.intelligence || self.options.summarize { + params.push("summarize=v2".to_string()); + } + if self.options.intelligence || self.options.topics { + params.push("topics=true".to_string()); + } + if self.options.intelligence || self.options.intents { + params.push("intents=true".to_string()); + } + if self.options.intelligence || self.options.detect_entities { + params.push("detect_entities=true".to_string()); + } + if self.options.intelligence || self.options.sentiment { + params.push("sentiment=true".to_string()); + } + if self.options.filler_words { + params.push("filler_words=true".to_string()); + } + if self.options.numerals { + params.push("numerals=true".to_string()); + } + append_list_params(&mut params, "keyterm", &self.options.keyterms); + append_list_params(&mut params, "search", &self.options.search); + append_list_params(&mut params, "redact", &self.options.redact); + append_list_params(&mut params, "replace", &self.options.replace); + if let Some(language) = self + .language + .as_deref() + .filter(|language| !language.is_empty()) + { + params.push(format!("language={}", urlencoding::encode(language))); + } + + format!("{}/listen?{}", self.base_url, params.join("&")) + } + + fn audio_mime(path: &Path) -> &'static str { + match path.extension().and_then(|ext| ext.to_str()) { + Some(ext) if ext.eq_ignore_ascii_case("mp3") => 
"audio/mpeg", + Some(ext) if ext.eq_ignore_ascii_case("m4a") => "audio/mp4", + Some(ext) if ext.eq_ignore_ascii_case("mp4") => "audio/mp4", + Some(ext) if ext.eq_ignore_ascii_case("webm") => "audio/webm", + Some(ext) if ext.eq_ignore_ascii_case("ogg") => "audio/ogg", + Some(ext) if ext.eq_ignore_ascii_case("flac") => "audio/flac", + Some(ext) if ext.eq_ignore_ascii_case("wav") => "audio/wav", + _ => "audio/wav", + } + } + + async fn transcribe_bytes(&self, bytes: Vec, mime: &'static str) -> Result { + let url = self.listen_url(); + let body = send_with_retry(&self.settings, "Deepgram listen", || { + let client = self.client.clone(); + let api_key = self.api_key.clone(); + let url = url.clone(); + let bytes = bytes.clone(); + Box::pin(async move { + client + .post(url) + .header("Authorization", format!("Token {api_key}")) + .header("Content-Type", mime) + .body(bytes) + .send() + .await + .context("Failed to send request to Deepgram listen") + }) + }) + .await + .map_err(|(status, body)| anyhow::anyhow!("Deepgram listen returned {status}: {body}"))?; + + parse_response(&body, &self.model, &self.base_url, "direct_upload") + } + + async fn transcribe_file_url(&self, file_url: String) -> Result { + let url = self.listen_url(); + let body = send_with_retry(&self.settings, "Deepgram listen", || { + let client = self.client.clone(); + let api_key = self.api_key.clone(); + let url = url.clone(); + let file_url = file_url.clone(); + Box::pin(async move { + client + .post(url) + .header("Authorization", format!("Token {api_key}")) + .json(&json!({ "url": file_url })) + .send() + .await + .context("Failed to send request to Deepgram listen") + }) + }) + .await + .map_err(|(status, body)| anyhow::anyhow!("Deepgram listen returned {status}: {body}"))?; + + parse_response(&body, &self.model, &self.base_url, "presigned_url") + } +} + +fn append_list_params(params: &mut Vec, name: &str, values: &[String]) { + params.extend( + values + .iter() + .map(|value| value.trim()) + 
.filter(|value| !value.is_empty()) + .map(|value| format!("{name}={}", urlencoding::encode(value))), + ); +} + +#[async_trait] +impl Transcriber for DeepgramApi { + async fn transcribe(&self, audio_samples: Vec) -> Result { + let wav_bytes = encode_wav(&audio_samples)?; + self.transcribe_wav(wav_bytes).await + } + + async fn transcribe_path(&self, audio_path: &Path) -> Result { + if let Some(uploader) = &self.presigned_url_uploader { + let url = uploader.upload_and_presign(audio_path).await?; + return self.transcribe_file_url(url).await; + } + + let bytes = tokio::fs::read(audio_path) + .await + .with_context(|| format!("Failed to read audio file: {}", audio_path.display()))?; + self.transcribe_bytes(bytes, Self::audio_mime(audio_path)) + .await + } + + async fn transcribe_wav(&self, wav_bytes: Vec) -> Result { + if self.presigned_url_uploader.is_some() { + let tmp = tempfile::Builder::new() + .prefix("transcribeit-deepgram-") + .suffix(".wav") + .tempfile() + .context("Failed to create temporary WAV file")?; + tokio::fs::write(tmp.path(), wav_bytes) + .await + .context("Failed to write temporary WAV file")?; + return self.transcribe_path(tmp.path()).await; + } + + self.transcribe_bytes(wav_bytes, "audio/wav").await + } +} + +fn parse_response( + body: &[u8], + model: &str, + base_url: &str, + audio_source: &str, +) -> Result { + let response: Value = serde_json::from_slice(body).context("Failed to parse Deepgram JSON")?; + let mut segments = parse_utterances(&response); + if segments.is_empty() { + segments = parse_alternatives(&response); + } + let timestamps_clamped = response + .pointer("/metadata/duration") + .and_then(|value| timestamp_ms(Some(value))) + .is_some_and(|duration_ms| clamp_segments_to_duration(&mut segments, duration_ms)); + + Ok(Transcript { + segments, + provider_metadata: Some(json!({ + "provider": "deepgram", + "schema_version": "deepgram.metadata.v1", + "data": { + "model": model, + "base_url": base_url, + "request": { + "audio_source": 
audio_source, + "file_url_present": audio_source == "presigned_url", + }, + "metadata": response.get("metadata").cloned().unwrap_or(Value::Null), + "intelligence": intelligence_payload(&response), + "response": { + "channel_count": response.pointer("/results/channels") + .and_then(Value::as_array) + .map_or(0, Vec::len), + "timestamps_clamped": timestamps_clamped, + "utterance_count": response.pointer("/results/utterances") + .and_then(Value::as_array) + .map_or(0, Vec::len), + "alternative_count": alternative_count(&response), + "mean_confidence": mean_confidence(&response), + } + } + })), + }) +} + +fn intelligence_payload(response: &Value) -> Value { + json!({ + "summary": response.pointer("/results/summary").cloned().unwrap_or(Value::Null), + "topics": response.pointer("/results/topics").cloned().unwrap_or(Value::Null), + "intents": response.pointer("/results/intents").cloned().unwrap_or(Value::Null), + "sentiments": response.pointer("/results/sentiments").cloned().unwrap_or(Value::Null), + "entities": collect_alternative_field(response, "entities"), + "summaries": collect_alternative_field(response, "summaries"), + "search": collect_channel_field(response, "search"), + "warnings": response.get("warnings").cloned().unwrap_or(Value::Null), + }) +} + +fn collect_channel_field(response: &Value, field: &str) -> Value { + let values = response + .pointer("/results/channels") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(|channel| channel.get(field).cloned()) + .filter(|value| !value.is_null()) + .collect::>(); + json!(values) +} + +fn collect_alternative_field(response: &Value, field: &str) -> Value { + let values = response + .pointer("/results/channels") + .and_then(Value::as_array) + .into_iter() + .flatten() + .flat_map(|channel| { + channel + .get("alternatives") + .and_then(Value::as_array) + .into_iter() + .flatten() + }) + .filter_map(|alternative| alternative.get(field).cloned()) + .filter(|value| !value.is_null()) + .collect::>(); 
+ json!(values) +} + +fn clamp_segments_to_duration(segments: &mut [Segment], duration_ms: i64) -> bool { + let mut clamped = false; + for segment in segments { + if segment.start_ms > duration_ms { + segment.start_ms = duration_ms; + clamped = true; + } + if segment.end_ms > duration_ms { + segment.end_ms = duration_ms; + clamped = true; + } + for word in &mut segment.words { + if word.start_ms > duration_ms { + word.start_ms = duration_ms; + clamped = true; + } + if word.end_ms > duration_ms { + word.end_ms = duration_ms; + clamped = true; + } + } + } + clamped +} + +fn parse_utterances(response: &Value) -> Vec { + response + .pointer("/results/utterances") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(|utterance| { + let text = utterance + .get("transcript") + .and_then(Value::as_str) + .map(str::trim) + .filter(|text| !text.is_empty())?; + let words = parse_words(utterance.get("words")); + let speaker = speaker_label(utterance.get("speaker")).or_else(|| { + words + .iter() + .find_map(|_| first_word_speaker(utterance.get("words"))) + }); + + Some(Segment { + start_ms: timestamp_ms(utterance.get("start")) + .unwrap_or_else(|| words.first().map_or(0, |word| word.start_ms)), + end_ms: timestamp_ms(utterance.get("end")) + .unwrap_or_else(|| words.last().map_or(0, |word| word.end_ms)), + text: text.to_string(), + speaker, + words, + ..Default::default() + }) + }) + .collect() +} + +fn parse_alternatives(response: &Value) -> Vec { + response + .pointer("/results/channels") + .and_then(Value::as_array) + .into_iter() + .flatten() + .flat_map(|channel| { + channel + .get("alternatives") + .and_then(Value::as_array) + .into_iter() + .flatten() + }) + .flat_map(parse_alternative) + .collect() +} + +fn parse_alternative(alternative: &Value) -> Vec { + let words = parse_words(alternative.get("words")); + let sentence_segments = alternative + .pointer("/paragraphs/paragraphs") + .and_then(Value::as_array) + .into_iter() + .flatten() + 
.flat_map(|paragraph| { + paragraph + .get("sentences") + .and_then(Value::as_array) + .into_iter() + .flatten() + }) + .filter_map(|sentence| sentence_segment(sentence, &words)) + .collect::>(); + + if !sentence_segments.is_empty() { + return sentence_segments; + } + + alternative + .get("transcript") + .and_then(Value::as_str) + .map(str::trim) + .filter(|text| !text.is_empty()) + .map(|text| { + vec![Segment { + start_ms: words.first().map_or(0, |word| word.start_ms), + end_ms: words.last().map_or(0, |word| word.end_ms), + text: text.to_string(), + speaker: first_word_speaker(alternative.get("words")), + words, + ..Default::default() + }] + }) + .unwrap_or_default() +} + +fn sentence_segment(sentence: &Value, all_words: &[Word]) -> Option { + let text = sentence + .get("text") + .and_then(Value::as_str) + .map(str::trim) + .filter(|text| !text.is_empty())?; + let start_ms = timestamp_ms(sentence.get("start")).unwrap_or(0); + let end_ms = timestamp_ms(sentence.get("end")).unwrap_or(start_ms); + let words = all_words + .iter() + .filter(|word| word.start_ms >= start_ms && word.end_ms <= end_ms) + .cloned() + .collect::>(); + + Some(Segment { + start_ms, + end_ms, + text: text.to_string(), + speaker: None, + words, + ..Default::default() + }) +} + +fn parse_words(value: Option<&Value>) -> Vec { + value + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(|word| { + let raw = word.get("word").and_then(Value::as_str)?; + let punctuated = word.get("punctuated_word").and_then(Value::as_str); + Some(Word { + start_ms: timestamp_ms(word.get("start")).unwrap_or(0), + end_ms: timestamp_ms(word.get("end")).unwrap_or(0), + text: punctuated.unwrap_or(raw).to_string(), + punctuation: punctuated + .filter(|punctuated| *punctuated != raw) + .map(ToOwned::to_owned), + }) + }) + .collect() +} + +fn timestamp_ms(value: Option<&Value>) -> Option { + let seconds = match value? 
{ + Value::Number(number) => number.as_f64()?, + Value::String(text) => text.parse().ok()?, + _ => return None, + }; + Some((seconds * 1000.0).round() as i64) +} + +fn speaker_label(value: Option<&Value>) -> Option { + match value? { + Value::Number(number) => number.as_i64().map(|speaker| format!("Speaker {speaker}")), + Value::String(text) if !text.trim().is_empty() => Some(format!("Speaker {}", text.trim())), + _ => None, + } +} + +fn first_word_speaker(words: Option<&Value>) -> Option { + words + .and_then(Value::as_array)? + .iter() + .find_map(|word| speaker_label(word.get("speaker"))) +} + +fn alternative_count(response: &Value) -> usize { + response + .pointer("/results/channels") + .and_then(Value::as_array) + .into_iter() + .flatten() + .map(|channel| { + channel + .get("alternatives") + .and_then(Value::as_array) + .map_or(0, Vec::len) + }) + .sum() +} + +fn mean_confidence(response: &Value) -> Value { + let confidences = response + .pointer("/results/channels") + .and_then(Value::as_array) + .into_iter() + .flatten() + .flat_map(|channel| { + channel + .get("alternatives") + .and_then(Value::as_array) + .into_iter() + .flatten() + }) + .filter_map(|alternative| alternative.get("confidence").and_then(Value::as_f64)) + .collect::>(); + + if confidences.is_empty() { + Value::Null + } else { + json!(confidences.iter().sum::() / confidences.len() as f64) + } +} diff --git a/src/engines/deepgram/tests.rs b/src/engines/deepgram/tests.rs new file mode 100644 index 0000000..85d9c76 --- /dev/null +++ b/src/engines/deepgram/tests.rs @@ -0,0 +1,217 @@ +use serde_json::json; + +use super::parse_response; + +#[test] +fn parses_utterances_with_speaker_and_words() { + let body = json!({ + "metadata": { + "request_id": "req-1", + "duration": 2.4, + "model_info": { + "model-id": { + "name": "nova-3", + "version": "2026-01-01", + "arch": "nova-3" + } + } + }, + "results": { + "channels": [ + { + "alternatives": [ + { + "transcript": "Hello there.", + "confidence": 0.98, + 
"words": [] + } + ] + } + ], + "utterances": [ + { + "start": 0.12, + "end": 1.5, + "speaker": 0, + "transcript": "Hello there.", + "words": [ + { + "word": "hello", + "punctuated_word": "Hello", + "start": 0.12, + "end": 0.6, + "speaker": 0 + }, + { + "word": "there", + "punctuated_word": "there.", + "start": 0.6, + "end": 1.5, + "speaker": 0 + } + ] + } + ] + } + }) + .to_string(); + + let transcript = parse_response( + body.as_bytes(), + "nova-3", + "https://api.deepgram.com/v1", + "direct_upload", + ) + .expect("response should parse"); + + assert_eq!(transcript.segments.len(), 1); + assert_eq!(transcript.segments[0].start_ms, 120); + assert_eq!(transcript.segments[0].end_ms, 1500); + assert_eq!(transcript.segments[0].speaker.as_deref(), Some("Speaker 0")); + assert_eq!(transcript.segments[0].words.len(), 2); + assert_eq!(transcript.segments[0].words[1].text, "there."); + assert_eq!( + transcript + .provider_metadata + .as_ref() + .and_then(|value| value.pointer("/data/response/utterance_count")) + .and_then(serde_json::Value::as_u64), + Some(1) + ); +} + +#[test] +fn falls_back_to_paragraph_sentences() { + let body = json!({ + "metadata": {"request_id": "req-2"}, + "results": { + "channels": [ + { + "alternatives": [ + { + "transcript": "First sentence. 
Second sentence.", + "confidence": 0.9, + "words": [ + {"word": "first", "punctuated_word": "First", "start": 0.0, "end": 0.4}, + {"word": "sentence", "punctuated_word": "sentence.", "start": 0.4, "end": 1.0}, + {"word": "second", "punctuated_word": "Second", "start": 1.1, "end": 1.5}, + {"word": "sentence", "punctuated_word": "sentence.", "start": 1.5, "end": 2.1} + ], + "paragraphs": { + "paragraphs": [ + { + "sentences": [ + {"text": "First sentence.", "start": 0.0, "end": 1.0}, + {"text": "Second sentence.", "start": 1.1, "end": 2.1} + ] + } + ] + } + } + ] + } + ] + } + }) + .to_string(); + + let transcript = parse_response( + body.as_bytes(), + "nova-3", + "https://api.deepgram.com/v1", + "direct_upload", + ) + .expect("response should parse"); + + assert_eq!(transcript.segments.len(), 2); + assert_eq!(transcript.segments[0].text, "First sentence."); + assert_eq!(transcript.segments[1].start_ms, 1100); + assert_eq!(transcript.segments[1].words.len(), 2); +} + +#[test] +fn clamps_timestamps_to_metadata_duration() { + let body = json!({ + "metadata": {"duration": 1.0}, + "results": { + "utterances": [ + { + "start": 0.5, + "end": 1.2, + "transcript": "Too long.", + "words": [ + {"word": "too", "start": 0.5, "end": 0.7}, + {"word": "long", "punctuated_word": "long.", "start": 0.7, "end": 1.2} + ] + } + ] + } + }) + .to_string(); + + let transcript = parse_response( + body.as_bytes(), + "nova-3", + "https://api.deepgram.com/v1", + "direct_upload", + ) + .expect("response should parse"); + + assert_eq!(transcript.segments[0].end_ms, 1000); + assert_eq!(transcript.segments[0].words[1].end_ms, 1000); + assert_eq!( + transcript + .provider_metadata + .as_ref() + .and_then(|value| value.pointer("/data/response/timestamps_clamped")) + .and_then(serde_json::Value::as_bool), + Some(true) + ); +} + +#[test] +fn records_presigned_url_request_metadata_without_url() { + let body = json!({ + "metadata": {"request_id": "req-3"}, + "results": { + "channels": [ + { + 
"alternatives": [ + { + "transcript": "Remote file.", + "confidence": 0.9, + "words": [] + } + ] + } + ] + } + }) + .to_string(); + + let transcript = parse_response( + body.as_bytes(), + "nova-3", + "https://api.deepgram.com/v1", + "presigned_url", + ) + .expect("response should parse"); + + let metadata = transcript + .provider_metadata + .as_ref() + .expect("provider metadata should exist"); + assert_eq!( + metadata + .pointer("/data/request/audio_source") + .and_then(serde_json::Value::as_str), + Some("presigned_url") + ); + assert_eq!( + metadata + .pointer("/data/request/file_url_present") + .and_then(serde_json::Value::as_bool), + Some(true) + ); + assert!(!metadata.to_string().contains("https://signed.example")); +} diff --git a/src/engines/mod.rs b/src/engines/mod.rs index d90e873..2c398a8 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -1,4 +1,5 @@ pub mod azure_openai; +pub mod deepgram; pub mod gemini; pub mod model_cache; pub mod nvidia_riva; diff --git a/src/main.rs b/src/main.rs index 9ccedb1..6dcef9d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,6 +25,7 @@ use crate::cli::{ AnalysisKind, Cli, Command, ModelFormat, OutputFormatArg, Provider, SetupComponent, }; use crate::engines::azure_openai::AzureOpenAi; +use crate::engines::deepgram::DeepgramApi; use crate::engines::gemini::GeminiApi; use crate::engines::model_cache::ModelCache; use crate::engines::nvidia_riva::NvidiaRiva; @@ -148,10 +149,25 @@ async fn main() -> Result<()> { nvidia_api_key, nvidia_riva_function_id, nvidia_riva_server, + deepgram_api_key, azure_api_key, remote_model, qwen_api_base_url, gemini_api_base_url, + deepgram_api_base_url, + deepgram_intelligence, + deepgram_summarize, + deepgram_topics, + deepgram_intents, + deepgram_detect_entities, + deepgram_sentiment, + deepgram_keyterm, + deepgram_search, + deepgram_redact, + deepgram_replace, + deepgram_filler_words, + deepgram_numerals, + deepgram_use_presigned_url, gemini_file_cache, gemini_file_cache_index, 
gemini_autoclean, @@ -326,30 +342,20 @@ async fn main() -> Result<()> { let key = dashscope_api_key.or(api_key).context( "--dashscope-api-key, --api-key, DASHSCOPE_API_KEY, or OPENAI_API_KEY is required for --provider qwen-filetrans", )?; - let s3_config = s3_config_from_input(S3ConfigInput { - bucket: s3_bucket.context( - "--s3-bucket or S3_BUCKET is required for --provider qwen-filetrans", - )?, - region: s3_region.or_else(|| std::env::var("AWS_REGION").ok()).context( - "--s3-region, S3_REGION, or AWS_REGION is required for --provider qwen-filetrans", - )?, - endpoint_url: s3_endpoint_url, - access_key_id: s3_access_key_id - .or_else(|| std::env::var("AWS_ACCESS_KEY_ID").ok()) - .context( - "--s3-access-key-id, S3_ACCESS_KEY_ID, or AWS_ACCESS_KEY_ID is required for --provider qwen-filetrans", - )?, - secret_access_key: s3_secret_access_key - .or_else(|| std::env::var("AWS_SECRET_ACCESS_KEY").ok()) - .context( - "--s3-secret-access-key, S3_SECRET_ACCESS_KEY, or AWS_SECRET_ACCESS_KEY is required for --provider qwen-filetrans", - )?, - session_token: s3_session_token.or_else(|| std::env::var("AWS_SESSION_TOKEN").ok()), - prefix: s3_prefix, - presign_expires_secs: s3_presign_expires_secs, - force_path_style: s3_force_path_style, - })?; - let uploader = S3Uploader::new(s3_config).await?; + let uploader = build_s3_uploader(S3UploaderArgs { + bucket: s3_bucket, + region: s3_region, + endpoint_url: s3_endpoint_url, + access_key_id: s3_access_key_id, + secret_access_key: s3_secret_access_key, + session_token: s3_session_token, + prefix: s3_prefix, + default_prefix: "transcribeit/qwen-filetrans", + presign_expires_secs: s3_presign_expires_secs, + force_path_style: s3_force_path_style, + context_label: "--provider qwen-filetrans", + }) + .await?; let model_name = remote_model.unwrap_or_else(|| "qwen3-asr-flash-filetrans".into()); ProviderRuntime { @@ -443,6 +449,72 @@ async fn main() -> Result<()> { model_name, } } + Provider::Deepgram => { + let key = 
deepgram_api_key.or(api_key).context( + "--deepgram-api-key, DEEPGRAM_API_KEY, --api-key, or OPENAI_API_KEY is required for --provider deepgram", + )?; + let model_name = remote_model.unwrap_or_else(|| "nova-3".into()); + if speakers.is_some() && !diarize { + eprintln!( + "--speakers was provided for Deepgram, so provider-native diarization will be enabled with diarize_model=latest." + ); + } + if speakers.is_some() { + eprintln!( + "Deepgram does not accept an exact speaker-count hint for batch diarization; --speakers is treated as a request to enable diarization." + ); + } + ProviderRuntime { + engine: Box::new(DeepgramApi::new( + deepgram_api_base_url, + key, + model_name.clone(), + language.clone(), + api_settings, + crate::engines::deepgram::DeepgramOptions { + diarize: diarize || speakers.is_some(), + intelligence: deepgram_intelligence, + summarize: deepgram_summarize, + topics: deepgram_topics, + intents: deepgram_intents, + detect_entities: deepgram_detect_entities, + sentiment: deepgram_sentiment, + keyterms: deepgram_keyterm, + search: deepgram_search, + redact: deepgram_redact, + replace: deepgram_replace, + filler_words: deepgram_filler_words, + numerals: deepgram_numerals, + }, + if deepgram_use_presigned_url { + Some( + build_s3_uploader( + S3UploaderArgs { + bucket: s3_bucket, + region: s3_region, + endpoint_url: s3_endpoint_url, + access_key_id: s3_access_key_id, + secret_access_key: s3_secret_access_key, + session_token: s3_session_token, + prefix: s3_prefix, + default_prefix: "transcribeit/deepgram", + presign_expires_secs: s3_presign_expires_secs, + force_path_style: s3_force_path_style, + context_label: + "--provider deepgram --deepgram-use-presigned-url", + }, + ) + .await?, + ) + } else { + None + }, + )?), + analyzer: None, + provider_name: "deepgram".into(), + model_name, + } + } }; let requested_diarization = diarize || speakers.is_some(); let provider_native_diarization = @@ -509,7 +581,65 @@ async fn main() -> Result<()> { fn 
provider_handles_diarization(provider_name: &str, model_name: &str) -> bool { match provider_name { "nvidia-riva" | "gemini" => true, + "deepgram" => true, "openai" => model_name.eq_ignore_ascii_case("gpt-4o-transcribe-diarize"), _ => false, } } + +struct S3UploaderArgs { + bucket: Option, + region: Option, + endpoint_url: Option, + access_key_id: Option, + secret_access_key: Option, + session_token: Option, + prefix: Option, + default_prefix: &'static str, + presign_expires_secs: u64, + force_path_style: bool, + context_label: &'static str, +} + +async fn build_s3_uploader(args: S3UploaderArgs) -> Result { + let context_label = args.context_label; + + let s3_config = s3_config_from_input(S3ConfigInput { + bucket: args + .bucket + .with_context(|| format!("--s3-bucket or S3_BUCKET is required for {context_label}"))?, + region: args + .region + .or_else(|| std::env::var("AWS_REGION").ok()) + .with_context(|| { + format!("--s3-region, S3_REGION, or AWS_REGION is required for {context_label}") + })?, + endpoint_url: args.endpoint_url, + access_key_id: args + .access_key_id + .or_else(|| std::env::var("AWS_ACCESS_KEY_ID").ok()) + .with_context(|| { + format!( + "--s3-access-key-id, S3_ACCESS_KEY_ID, or AWS_ACCESS_KEY_ID is required for {context_label}" + ) + })?, + secret_access_key: args + .secret_access_key + .or_else(|| std::env::var("AWS_SECRET_ACCESS_KEY").ok()) + .with_context(|| { + format!( + "--s3-secret-access-key, S3_SECRET_ACCESS_KEY, or AWS_SECRET_ACCESS_KEY is required for {context_label}" + ) + })?, + session_token: args + .session_token + .or_else(|| std::env::var("AWS_SESSION_TOKEN").ok()), + prefix: args + .prefix + .or_else(|| Some(args.default_prefix.to_string())), + presign_expires_secs: args.presign_expires_secs, + force_path_style: args.force_path_style, + })?; + + S3Uploader::new(s3_config).await +} diff --git a/src/pipeline_output.rs b/src/pipeline_output.rs index 076619c..9f32f05 100644 --- a/src/pipeline_output.rs +++ b/src/pipeline_output.rs 
@@ -335,7 +335,7 @@ fn cache_entry_for_provider(provider: &str, metadata: Option<&Value>, source: &s match provider { "gemini" => gemini_cache_entry(provider, metadata, source), "openai" | "azure" => openai_cache_entry(provider, metadata, source), - "qwen-filetrans" | "nvidia-riva" | "local" | "sherpa-onnx" => CacheEntry { + "qwen-filetrans" | "nvidia-riva" | "deepgram" | "local" | "sherpa-onnx" => CacheEntry { provider: provider.to_string(), mode: "none".to_string(), hit: false, @@ -497,14 +497,14 @@ fn metadata_bool(metadata: Option<&Value>, pointers: &[&str]) -> bool { fn native_timestamps(provider: &str) -> bool { matches!( provider, - "local" | "openai" | "azure" | "qwen-filetrans" | "nvidia-riva" + "local" | "openai" | "azure" | "qwen-filetrans" | "nvidia-riva" | "deepgram" ) } fn timing_source(provider: &str) -> &'static str { match provider { "gemini" => "model_generated", - "qwen-filetrans" | "openai" | "azure" | "nvidia-riva" => "provider_native", + "qwen-filetrans" | "openai" | "azure" | "nvidia-riva" | "deepgram" => "provider_native", "local" | "sherpa-onnx" => "model_native", _ => "unknown", } From 79d0de524780807050a366ba7008e690b9ab91b4 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Wed, 17 Jun 2026 15:48:21 +0300 Subject: [PATCH 2/4] Add generic cleanup for staged provider inputs --- README.md | 12 ++ docs/architecture.md | 9 +- docs/cli-reference.md | 21 +++- docs/performance-benchmarks.md | 2 + docs/provider-behavior.md | 19 +++- docs/troubleshooting.md | 16 ++- src/cli.rs | 10 +- src/engines/deepgram.rs | 77 +++++++++---- src/engines/gemini.rs | 164 ++++++++++++++++++++------- src/engines/gemini/response.rs | 67 +++++------ src/engines/gemini/response/tests.rs | 90 ++++++++------- src/engines/qwen_filetrans.rs | 38 ++++++- src/main.rs | 101 ++++++++++++----- src/storage/s3.rs | 67 ++++++++++- 14 files changed, 512 insertions(+), 181 deletions(-) diff --git a/README.md b/README.md index 681ab0c..de4a93b 100644 --- a/README.md +++ b/README.md @@ 
-81,6 +81,14 @@ transcribeit run -p gemini --gemini-file-cache \ transcribeit run -p gemini --gemini-explicit-cache --gemini-cache-ttl-secs 3600 \ -i recording.mp3 -f vtt -o ./output +# Use S3/R2 pre-signed URL input for a one-off Gemini run +transcribeit run -p gemini --gemini-use-presigned-url \ + -i recording.mp3 -f vtt -o ./output + +# Delete temporary staged provider resources after the provider consumes them +transcribeit run -p qwen-filetrans --autoclean \ + -i recording.mp3 -f vtt -o ./output + # Transcribe with Gemini and add a structured summary to the manifest transcribeit run -p gemini --analysis summary \ -i interview.mp4 -f vtt -o ./output @@ -128,8 +136,10 @@ transcribeit run -i interview.mp3 -m base --diarize --speakers 2 \ - **Qwen model guardrails** — Accidental short-audio `qwen3-asr-flash` model selection is rejected before conversion and S3 upload; use `qwen3-asr-flash-filetrans` for this provider. - **Gemini whole-file transcription** — `gemini` uploads prepared audio through Gemini Files API, streams `generateContent` response chunks with structured JSON output, and maps segment timestamps, speaker labels, language, and emotion when returned. - **Gemini file reuse** — `--gemini-file-cache` keeps a local index of Gemini Files API uploads keyed by SHA-256 of the prepared 16 kHz mono MP3 bytes, verifies the remote file before reuse, and records reuse metadata in the manifest. +- **Gemini signed URL input** — `--gemini-use-presigned-url` stages prepared MP3 audio in S3/R2 and sends the signed URL as Gemini `file_uri` for one-off inputs up to 100 MB. Files API cache and explicit cached content remain Files API-only. - **Gemini explicit cache** — `--gemini-explicit-cache` creates and reuses Gemini `cachedContent` objects with a configurable TTL, producing deterministic `cachedContentTokenCount` telemetry when Gemini accepts the cache. 
- **Gemini summary analysis** — `--analysis summary` runs a second Gemini JSON pass over the transcript and stores a provider-neutral summary, key points, topics, questions, and follow-ups in the manifest. +- **Temporary resource cleanup** — `--autoclean` performs best-effort cleanup of temporary provider resources created by the run, including S3/R2 staged objects for Qwen, Gemini signed URL mode, and Deepgram signed URL mode. - **NVIDIA hosted Riva ASR** — `nvidia-riva` calls hosted NVIDIA Riva gRPC endpoints with provider-native word timestamps, optional server-side diarization, and manifest metadata. - **Deepgram Nova batch ASR** — `deepgram` calls Deepgram's `/listen` API, defaults to `nova-3`, requests utterances and smart formatting, supports provider-native diarization through `--diarize`, and can submit either direct audio bytes or an S3/R2 pre-signed URL with `--deepgram-use-presigned-url`. - **Deepgram audio intelligence** — `--deepgram-intelligence` captures Deepgram summary, topics, intents, entity detection, and sentiment in `provider_metadata.data.intelligence`; `--deepgram-keyterm` passes Nova-3 keyterm prompts for domain terminology. 
@@ -162,6 +172,7 @@ SHERPA_ONNX_LIB_DIR=/path/to/sherpa-onnx/lib OPENAI_API_KEY=sk-your_key_here GEMINI_API_KEY=your_gemini_key_here GEMINI_API_BASE_URL=https://generativelanguage.googleapis.com/v1beta +GEMINI_USE_PRESIGNED_URL=false NVIDIA_API_KEY=your_nvidia_key_here NVIDIA_RIVA_FUNCTION_ID=your_hosted_riva_function_id NVIDIA_RIVA_SERVER=grpc.nvcf.nvidia.com:443 @@ -185,6 +196,7 @@ S3_SECRET_ACCESS_KEY=your_s3_secret_key S3_PREFIX=transcribeit/qwen-filetrans S3_PRESIGN_EXPIRES_SECS=3600 S3_FORCE_PATH_STYLE=false +TRANSCRIBEIT_AUTOCLEAN=false TRANSCRIBEIT_MAX_RETRIES=5 TRANSCRIBEIT_REQUEST_TIMEOUT_SECS=120 TRANSCRIBEIT_RETRY_WAIT_BASE_SECS=10 diff --git a/docs/architecture.md b/docs/architecture.md index 5649862..bc16e85 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -58,7 +58,7 @@ pub trait Transcriber: Send + Sync { - **Sherpa-ONNX engine** (`sherpa_onnx`) uses `transcribe()` — it needs decoded samples for the ONNX runtime. - **OpenAI/Azure API engines** override `transcribe_path()` to upload files directly via multipart, and `transcribe_wav()` to upload in-memory bytes — avoiding the decode→re-encode round-trip. - **Qwen file transcription** overrides `transcribe_path()` to upload prepared audio to S3-compatible storage, generate a pre-signed URL, and submit that URL to DashScope. -- **Gemini** overrides `transcribe_path()` to upload prepared audio through Gemini Files API and call streamed `streamGenerateContent` with structured JSON output. +- **Gemini** overrides `transcribe_path()` to upload prepared audio through Gemini Files API and call streamed `streamGenerateContent` with structured JSON output. In signed URL mode, it stages the prepared MP3 in S3-compatible storage and sends the pre-signed URL as Gemini `file_uri` instead. - **NVIDIA Riva** overrides `transcribe_path()` and `transcribe_wav()` to send WAV bytes to a hosted Riva gRPC endpoint with provider-native timestamps. 
- **Deepgram** overrides `transcribe_path()` and `transcribe_wav()` to post WAV bytes to Deepgram's `/listen` endpoint with utterances, word timestamps, optional diarization, and optional audio intelligence flags. In URL mode, it stages the prepared WAV in S3-compatible storage and sends Deepgram a pre-signed URL JSON request instead. @@ -192,9 +192,14 @@ Uses Gemini Files API and streamed `streamGenerateContent` for whole-file multim - deletes the temporary Gemini file after the transcription request by default - optionally reuses Gemini Files API uploads with `--gemini-file-cache`, using a local index keyed by SHA-256 of the exact prepared upload bytes - optionally creates and reuses Gemini explicit `cachedContent` objects with `--gemini-explicit-cache` +- optionally bypasses Gemini Files API upload with `--gemini-use-presigned-url`, staging the prepared MP3 in S3/R2 and passing the signed URL as `file_uri` Gemini is not a dedicated ASR endpoint. Timestamp, speaker, language, and emotion values come from the model's structured output, so benchmark quality before relying on them for subtitle workflows. The default path keeps Gemini whole-file for speaker continuity; explicit segmentation and long-input fallback are available with the expected risk that speakers may not remain stable between chunks. +Gemini signed URL mode is for one-off prepared inputs up to 100 MB. It is rejected for Gemini 2.0 family models and cannot be combined with Gemini Files API cache or explicit cached content. + +`--autoclean` deletes temporary provider resources created during a run when the provider lifecycle makes that safe. For S3/R2 URL-staging providers, cleanup runs after the provider has consumed the URL and records best-effort cleanup metadata without failing a successful transcription. + ### NVIDIA Riva (`nvidia_riva.rs`) Uses hosted NVIDIA Riva ASR over gRPC through generated protobuf bindings in `proto/riva/proto/`. 
The provider: @@ -281,7 +286,7 @@ All settings (timeout, retries, wait times) are configurable via CLI flags and e ### Shared WAV encoding -OpenAI/Azure engines can send file uploads directly and choose the correct container format for compatibility (WAV for local transcribe path, MP3 for API provider uploads). Qwen file transcription stages MP3 in S3-compatible storage and sends DashScope a pre-signed URL. Gemini uploads MP3 through Gemini Files API. NVIDIA Riva sends WAV bytes through gRPC. Deepgram posts WAV bytes to `/listen` by default, or stages WAV in S3-compatible storage and sends a pre-signed URL when URL mode is enabled. The `audio::wav::encode_wav()` helper is still used by local engines and non-file upload paths. +OpenAI/Azure engines can send file uploads directly and choose the correct container format for compatibility (WAV for local transcribe path, MP3 for API provider uploads). Qwen file transcription stages MP3 in S3-compatible storage and sends DashScope a pre-signed URL. Gemini uploads MP3 through Gemini Files API by default, or stages MP3 in S3-compatible storage and sends a pre-signed URL when signed URL mode is enabled. NVIDIA Riva sends WAV bytes through gRPC. Deepgram posts WAV bytes to `/listen` by default, or stages WAV in S3-compatible storage and sends a pre-signed URL when URL mode is enabled. The `audio::wav::encode_wav()` helper is still used by local engines and non-file upload paths. 
## Model cache (`model_cache.rs`) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index cbbe2f3..bd78c63 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -125,14 +125,17 @@ If a short-audio `qwen3-asr-flash` model is selected with `-p qwen-filetrans`, t | `--gemini-api-base-url` | Gemini API base URL | `GEMINI_API_BASE_URL` env var, or `https://generativelanguage.googleapis.com/v1beta` | | `--remote-model` | Gemini model name | `gemini-3.5-flash` | | `--gemini-file-cache` | Reuse Gemini Files API uploads keyed by SHA-256 of prepared upload bytes | disabled | +| `--gemini-use-presigned-url` | Stage prepared audio in S3-compatible storage and pass the pre-signed URL as Gemini `file_uri` | disabled | | `--gemini-file-cache-index` | Local Gemini file cache index path | `GEMINI_FILE_CACHE_INDEX` env var, or `.cache/transcribeit/gemini-files.json` | -| `--gemini-autoclean` | Delete Gemini Files API uploads after transcription even when file cache is enabled | disabled | +| `--gemini-autoclean` | Deprecated alias for `--autoclean` for Gemini temporary uploads | disabled | | `--gemini-explicit-cache` | Create and reuse Gemini explicit `cachedContent` objects for prepared audio | disabled | | `--gemini-cache-ttl-secs` | TTL in seconds for Gemini explicit `cachedContent` objects | `3600` | The Gemini provider uses the Gemini Files API plus streamed `streamGenerateContent` with structured JSON output. It converts input audio/video to 16 kHz mono MP3 before upload, then asks Gemini for a transcript object with `text`, `segments`, timestamps, speaker, language, and emotion fields. -By default, Gemini uploads are deleted after each run. With `--gemini-file-cache`, the CLI stores a local JSON index for the uploaded Gemini file reference and keeps the remote file for reuse within the Gemini Files API retention window. The cache key is the SHA-256 hash of the exact prepared 16 kHz mono MP3 bytes, not the input path. 
Before reuse, the CLI calls `files.get` and only reuses files that still exist and are `ACTIVE`. Use `--gemini-autoclean` to force deletion after a run while keeping the same command shape for experiments. +With `--gemini-use-presigned-url`, the CLI uploads the prepared MP3 to S3-compatible storage, generates a pre-signed GET URL, and sends that URL as Gemini `file_uri` instead of using the Gemini Files API. This mode is useful for one-off inputs up to 100 MB when S3/R2 staging is already configured. It is rejected for Gemini 2.0 family models and cannot be combined with `--gemini-file-cache` or `--gemini-explicit-cache`; use the Files API path when Gemini file reuse or explicit cached content is required. + +By default, Gemini Files API uploads are deleted after each run. With `--gemini-file-cache`, the CLI stores a local JSON index for the uploaded Gemini file reference and keeps the remote file for reuse within the Gemini Files API retention window. The cache key is the SHA-256 hash of the exact prepared 16 kHz mono MP3 bytes, not the input path. Before reuse, the CLI calls `files.get` and only reuses files that still exist and are `ACTIVE`. Use `--autoclean` to force deletion after a run while keeping the same command shape for experiments; `--gemini-autoclean` remains as a deprecated Gemini-only alias. With `--gemini-explicit-cache`, the CLI also creates or reuses a Gemini `cachedContent` object for the prepared audio and passes its name as `cachedContent` in the streamed generation request. This automatically enables the local Gemini file cache index because the cached-content handle must be persisted between runs. Explicit cached content has its own TTL and provider billing behavior; it is separate from the 48-hour Files API upload retention window. @@ -194,6 +197,7 @@ Manifests include `provider_metadata.provider = "deepgram"`, Deepgram request me | `-f, --output-format` | `text`, `vtt`, or `srt` | `vtt` | | `--language` | Language hint (e.g. 
`en`, `es`, `auto`) | `auto` | | `--normalize` | Normalize audio with ffmpeg `loudnorm` before transcription | disabled | +| `--autoclean` | Best-effort cleanup of temporary provider resources created during the run | disabled | | `--analysis summary` | Add post-transcription summary analysis to the manifest | disabled | `--analysis summary` currently requires `--provider gemini` and `--output-dir`. It runs after transcription, uses the transcript text as input, and writes a provider-neutral `analysis` object into the manifest without changing the VTT/SRT/text transcript output. @@ -277,8 +281,9 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | `GEMINI_API_KEY` | Gemini API key | none | | `GEMINI_API_BASE_URL` | Gemini API base URL | `https://generativelanguage.googleapis.com/v1beta` | | `GEMINI_FILE_CACHE` | Enable Gemini Files API upload reuse | disabled | +| `GEMINI_USE_PRESIGNED_URL` | Stage Gemini input in S3-compatible storage and submit a pre-signed URL | disabled | | `GEMINI_FILE_CACHE_INDEX` | Local Gemini file cache index path | `.cache/transcribeit/gemini-files.json` | -| `GEMINI_AUTOCLEAN` | Delete Gemini uploads after each run, even with file cache enabled | disabled | +| `GEMINI_AUTOCLEAN` | Deprecated Gemini cleanup alias; prefer `TRANSCRIBEIT_AUTOCLEAN` / `--autoclean` | disabled | | `GEMINI_EXPLICIT_CACHE` | Enable Gemini explicit `cachedContent` reuse | disabled | | `GEMINI_CACHE_TTL_SECS` | Gemini explicit `cachedContent` TTL in seconds | `3600` | | `NVIDIA_API_KEY` | NVIDIA hosted Riva API key | none | @@ -311,7 +316,7 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | `S3_ACCESS_KEY_ID` / `AWS_ACCESS_KEY_ID` | S3 access key ID | none | | `S3_SECRET_ACCESS_KEY` / `AWS_SECRET_ACCESS_KEY` | S3 secret access key | none | | `S3_SESSION_TOKEN` / `AWS_SESSION_TOKEN` | S3 session token | none | -| `S3_PREFIX` | S3 object prefix for remote-provider uploads | Provider-specific if 
unset: `transcribeit/qwen-filetrans` for Qwen, `transcribeit/deepgram` for Deepgram URL mode | +| `S3_PREFIX` | S3 object prefix for remote-provider uploads | Provider-specific if unset: `transcribeit/qwen-filetrans` for Qwen, `transcribeit/gemini` for Gemini URL mode, `transcribeit/deepgram` for Deepgram URL mode | | `S3_PRESIGN_EXPIRES_SECS` | S3 pre-signed URL expiry in seconds | `3600` | | `S3_FORCE_PATH_STYLE` | Force path-style URLs for S3-compatible storage | `false` | | `VAD_MODEL` | Path to Silero VAD ONNX model for speech-aware segmentation | none | @@ -321,6 +326,7 @@ When `--input` resolves to multiple files (directory or glob), all files are pro | `TRANSCRIBEIT_REQUEST_TIMEOUT_SECS` | API request timeout in seconds | `120` | | `TRANSCRIBEIT_RETRY_WAIT_BASE_SECS` | Base retry wait time in seconds | `10` | | `TRANSCRIBEIT_RETRY_WAIT_MAX_SECS` | Maximum retry wait time in seconds | `120` | +| `TRANSCRIBEIT_AUTOCLEAN` | Enable best-effort cleanup of temporary provider resources created during the run | disabled | All variables can be set in a `.env` file in the project root. @@ -432,6 +438,11 @@ transcribeit run -p gemini --gemini-explicit-cache \ --remote-model gemini-3.5-flash \ -i interview.mp4 -f vtt -o ./output +# Gemini using S3/R2 pre-signed URL input instead of Gemini Files API upload +transcribeit run -p gemini --gemini-use-presigned-url \ + --remote-model gemini-3.5-flash \ + -i interview.mp4 -f vtt -o ./output + # NVIDIA hosted Riva ASR transcribeit run -p nvidia-riva -i recording.wav \ --nvidia-api-key "$NVIDIA_API_KEY" \ @@ -462,7 +473,7 @@ transcribeit run -p deepgram --remote-model nova-3-medical \ - **Azure** (`-p azure`) uses `--azure-deployment` and calls: `POST {base-url}/openai/deployments/{deployment}/audio/transcriptions?api-version={version}`. - **Qwen file transcription** (`-p qwen-filetrans`) uploads audio to S3-compatible storage, passes a pre-signed URL to DashScope, and polls the async transcription task. 
-- **Gemini** (`-p gemini`) uploads audio through Gemini Files API, calls streamed `streamGenerateContent`, and parses structured transcript JSON defensively. +- **Gemini** (`-p gemini`) uploads audio through Gemini Files API by default, calls streamed `streamGenerateContent`, and parses structured transcript JSON defensively. With `--gemini-use-presigned-url`, it stages the prepared MP3 in S3/R2 and sends the signed URL as `file_uri`; Files API cache and explicit cached content are unavailable in that mode. - **Gemini file cache** (`--gemini-file-cache`) reuses verified Gemini Files API uploads by prepared-byte SHA-256 hash; this avoids repeated uploads and may improve implicit cache locality, but provider cache hits still depend on Gemini returning `cachedContentTokenCount`. - **Gemini explicit cache** (`--gemini-explicit-cache`) creates/reuses Gemini `cachedContent` objects and sends `cachedContent` in the streamed generation request. Manifests report `cache.transcription.mode = "explicit"` when this path is used. - **Analysis** (`--analysis summary`) runs a second Gemini structured JSON pass over the transcript and stores summary results in the manifest. 
diff --git a/docs/performance-benchmarks.md b/docs/performance-benchmarks.md index c4dc1a0..de8565c 100644 --- a/docs/performance-benchmarks.md +++ b/docs/performance-benchmarks.md @@ -117,10 +117,12 @@ Gemini is a whole-file multimodal provider with streamed response tokens and mod ```bash time transcribeit run -p gemini --remote-model gemini-3.5-flash -i -f vtt -o ./output time transcribeit run -p gemini --remote-model gemini-3.1-pro-preview -i -f vtt -o ./output +time transcribeit run -p gemini --remote-model gemini-3.5-flash --gemini-use-presigned-url -i -f vtt -o ./output ``` Record: - model name +- upload method from `provider_metadata.data.upload_method` - wall-clock time - manifest `quality.timing_reliable` - manifest `quality.timestamps_clamped` diff --git a/docs/provider-behavior.md b/docs/provider-behavior.md index cb94ace..8189187 100644 --- a/docs/provider-behavior.md +++ b/docs/provider-behavior.md @@ -8,11 +8,13 @@ This project supports eight providers. They share the same input/output surface, |----------|------------------------------------|-------| | `qwen-filetrans` | Required | Stages prepared MP3 audio in S3/R2 and passes the pre-signed URL to DashScope async ASR. | | `deepgram` | Optional | Direct byte upload is the default. `--deepgram-use-presigned-url` stages prepared WAV audio in S3/R2 and sends Deepgram `{"url":"..."}`. | -| `gemini` | Not currently implemented | The current implementation uses Gemini Files API upload/cache because that path supports file reuse and explicit cached-content workflows. | +| `gemini` | Optional | Files API upload/cache remains the default. `--gemini-use-presigned-url` stages prepared MP3 audio in S3/R2 and sends the signed URL as `file_uri`; this path is limited to 100 MB and excludes Gemini 2.0 family models. | | `openai` | No | Uses multipart transcription upload. | | `azure` | No | Uses Azure OpenAI multipart transcription upload. | | `nvidia-riva` | No | Uses hosted Riva gRPC audio streaming/buffers. 
| +For URL-staging providers, `--autoclean` / `TRANSCRIBEIT_AUTOCLEAN=true` enables best-effort deletion of the S3/R2 object after the provider has consumed it. Cleanup status is written to provider metadata. Cleanup errors are warnings and do not fail an otherwise successful transcription. + ## Local (`-p local`) - Input audio/video is converted with FFmpeg to 16 kHz mono WAV. @@ -106,6 +108,7 @@ This project supports eight providers. They share the same input/output surface, - `S3_FORCE_PATH_STYLE=true` for providers that require path-style URLs - Input audio/video is converted with FFmpeg to 16 kHz mono MP3 before upload. - The engine uploads the prepared file, generates a pre-signed GET URL, submits the Qwen async task, polls until completion, downloads the transcription JSON, and maps Qwen sentence timestamps into the project transcript model. +- With `--autoclean`, the staged S3/R2 object is deleted after the Qwen result JSON is downloaded. - Manifests include Qwen provider metadata when available: - `provider_metadata.provider = "qwen-filetrans"` - `provider_metadata.schema_version = "qwen-filetrans.metadata.v1"` @@ -113,6 +116,7 @@ This project supports eight providers. They share the same input/output surface, - `provider_metadata.data.result` with audio info and transcript/sentence/word counts - per-segment `language`, `emotion`, and `words` with word-level timestamps - Temporary pre-signed URLs are not persisted in the manifest; only `file_url_present` is recorded. +- S3/R2 staging cleanup, when attempted, is recorded under `provider_metadata.data.staging.cleanup`. - Qwen file transcription does not expose token-cache telemetry through this path, so manifests use `cache.transcription.mode = "none"`. - Qwen file transcription is intended for whole-file processing. Do not enable segmentation unless you explicitly want multiple independent remote tasks. 
- If a short-audio `qwen3-asr-flash` model is accidentally selected with `-p qwen-filetrans`, the CLI validates the local file before upload and fails without staging it to S3. Short flash models have a 10 MB and 300 second limit and use a different API path. @@ -125,12 +129,20 @@ This project supports eight providers. They share the same input/output surface, - Base URL defaults to `https://generativelanguage.googleapis.com/v1beta` and can be overridden with `--gemini-api-base-url` or `GEMINI_API_BASE_URL`. - Default model: `gemini-3.5-flash`. - Useful benchmark candidates include `gemini-3.1-pro-preview`, `gemini-3-flash-preview`, `gemini-3-pro-preview`, and `gemini-2.5-flash`. +- By default, Gemini uses the Files API. With `--gemini-use-presigned-url` or `GEMINI_USE_PRESIGNED_URL=true`, the CLI stages the prepared MP3 in S3-compatible storage and sends the pre-signed HTTPS URL directly as `file_uri`. +- Gemini signed URL mode: + - is intended for prepared files up to 100 MB + - is rejected for Gemini 2.0 family models + - cannot be combined with `--gemini-file-cache` or `--gemini-explicit-cache` + - uses `S3_PREFIX=transcribeit/gemini` when no explicit `S3_PREFIX` is provided + - records `provider_metadata.data.upload_method = "signed_url"` and `provider_metadata.data.request.file_url_present = true`, but does not persist the pre-signed URL + - deletes the staged S3/R2 object after streamed generation when `--autoclean` is set - Endpoint flow: - `POST {upload-base-url}/files` to start a resumable file upload. - Upload bytes to the returned `x-goog-upload-url`. - Poll `GET {base-url}/files/{id}` until the file is `ACTIVE`. - `POST {base-url}/models/{model}:streamGenerateContent?alt=sse`. - - `DELETE {base-url}/files/{id}` after transcription unless `--gemini-file-cache` is enabled without `--gemini-autoclean`. + - `DELETE {base-url}/files/{id}` after transcription unless `--gemini-file-cache` is enabled without `--autoclean`. 
- Input audio/video is converted with FFmpeg to 16 kHz mono MP3 before upload. - `--gemini-file-cache` stores a local index of Gemini Files API uploads keyed by SHA-256 of the prepared 16 kHz mono MP3 bytes. The CLI verifies an indexed file with `files.get` before reuse and uploads again if the file is missing, expired, failed, or mismatched. - The default index path is `.cache/transcribeit/gemini-files.json`, or `--gemini-file-cache-index` / `GEMINI_FILE_CACHE_INDEX`. @@ -157,6 +169,7 @@ This project supports eight providers. They share the same input/output surface, - `provider_metadata.data.file.cache_enabled` - `provider_metadata.data.file.cache_reused` - `provider_metadata.data.file.cache_hash` + - `provider_metadata.data.staging.cleanup` when signed URL S3/R2 staging is used - `provider_metadata.data.cached_content` when explicit cached content is used - Gemini cache telemetry is normalized into `cache.transcription` from `usageMetadata.cachedContentTokenCount` and `usageMetadata.cacheTokensDetails` when returned. @@ -216,6 +229,7 @@ Gemini summary analysis includes: - `S3_PREFIX` (defaults to `transcribeit/deepgram` for this mode when unset) - `S3_PRESIGN_EXPIRES_SECS` (defaults to `3600`, minimum `300`) - `S3_FORCE_PATH_STYLE=true` for providers that require path-style URLs +- With `--autoclean`, the staged S3/R2 object is deleted after the Deepgram `/listen` response is received. - When `--diarize` is set, the request uses `diarize_model=latest`. `--speakers N` is treated as a request to enable diarization, but Deepgram does not accept a fixed speaker-count hint through this provider path. - `--deepgram-intelligence` enables `summarize=v2`, `topics=true`, `intents=true`, `detect_entities=true`, and `sentiment=true`. 
- Individual feature flags are also available: @@ -235,6 +249,7 @@ Gemini summary analysis includes: - `provider_metadata.provider = "deepgram"` - `provider_metadata.schema_version = "deepgram.metadata.v1"` - `provider_metadata.data.request` with `audio_source` (`direct_upload` or `presigned_url`) and `file_url_present`; the actual pre-signed URL is not persisted. + - `provider_metadata.data.staging.cleanup` when signed URL S3/R2 staging is used. - `provider_metadata.data.metadata` with Deepgram request id, duration, channel count, model ids, `model_info`, and intelligence token usage metadata. - `provider_metadata.data.response` with channel, utterance, and alternative counts, mean confidence, and timestamp clamping telemetry. - `provider_metadata.data.intelligence` with returned `summary`, `topics`, `intents`, `sentiments`, `entities`, `summaries`, `search`, and warnings. diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 0e2e1e6..a2028b4 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -204,6 +204,18 @@ Fix: - Use `--analysis summary` only with Gemini for now. - Always provide `-o` / `--output-dir`; analysis is written into `.manifest.json`. +### Gemini signed URL mode fails before transcription + +Symptoms: +- `--gemini-use-presigned-url` fails before or during the Gemini request +- error mentions S3 credentials, Gemini 2.0, file cache, explicit cache, or a 100 MB prepared input limit + +Fix: +- Confirm `S3_BUCKET`, `S3_REGION`, `S3_ACCESS_KEY_ID`, and `S3_SECRET_ACCESS_KEY` are set, plus `S3_ENDPOINT_URL` when using Cloudflare R2. +- Use a supported non-Gemini-2.0 model such as `gemini-3.5-flash` or `gemini-2.5-flash`. +- Do not combine `--gemini-use-presigned-url` with `--gemini-file-cache` or `--gemini-explicit-cache`; signed URL mode does not create reusable Gemini Files API handles. +- Keep the prepared 16 kHz mono MP3 under 100 MB, or use the default Gemini Files API path for larger or reusable files. 
+ Example: ```bash @@ -237,7 +249,7 @@ Explanation: - A missing `usage_metadata.cachedContentTokenCount` means Gemini did not report a token-cache hit for that request. Fix: -- For upload reuse, keep `--gemini-file-cache` enabled and avoid `--gemini-autoclean`. +- For upload reuse, keep `--gemini-file-cache` enabled and avoid `--autoclean`. - For deterministic token-cache reuse, run with `--gemini-explicit-cache`. This creates or reuses a Gemini `cachedContent` object and should produce `cache.transcription.mode = "explicit"` plus `cachedContentTokenCount` when Gemini accepts the cache. - Explicit cached content has TTL and billing behavior. Use `--gemini-cache-ttl-secs` to control how long the cache is retained by Gemini. @@ -293,7 +305,7 @@ Common symptoms: Fix: - Use `--normalize` to reduce volume inconsistency from recorded content. - Ensure input is not corrupted and ffmpeg conversion succeeds. -- For OpenAI/Azure providers, MP3 conversion is used internally; local provider uses WAV input internally. Qwen file transcription stages a prepared MP3 in S3-compatible storage and passes a pre-signed URL to DashScope. Deepgram and NVIDIA Riva use WAV input internally; Deepgram can optionally stage that prepared WAV in S3/R2 and submit a pre-signed URL with `--deepgram-use-presigned-url`. +- For OpenAI/Azure providers, MP3 conversion is used internally; local provider uses WAV input internally. Qwen file transcription stages a prepared MP3 in S3-compatible storage and passes a pre-signed URL to DashScope. Gemini uses Gemini Files API by default, but can optionally stage prepared MP3 in S3/R2 and submit a signed URL with `--gemini-use-presigned-url`. Deepgram and NVIDIA Riva use WAV input internally; Deepgram can optionally stage that prepared WAV in S3/R2 and submit a pre-signed URL with `--deepgram-use-presigned-url`. 
### Empty or tiny transcript outputs diff --git a/src/cli.rs b/src/cli.rs index bdce506..5b9c1f2 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -310,11 +310,15 @@ pub(crate) enum Command { #[arg(long, env = "GEMINI_FILE_CACHE")] gemini_file_cache: bool, + /// Stage Gemini input in S3-compatible storage and pass a pre-signed URL as file_uri + #[arg(long, env = "GEMINI_USE_PRESIGNED_URL")] + gemini_use_presigned_url: bool, + /// Local Gemini Files API cache index path #[arg(long, env = "GEMINI_FILE_CACHE_INDEX")] gemini_file_cache_index: Option, - /// Delete the Gemini Files API upload after transcription, even when file cache is enabled + /// Deprecated alias for --autoclean for Gemini Files API uploads #[arg(long, env = "GEMINI_AUTOCLEAN")] gemini_autoclean: bool, @@ -390,6 +394,10 @@ pub(crate) enum Command { #[arg(long)] normalize: bool, + /// Best-effort cleanup of temporary provider resources created during the run + #[arg(long, env = "TRANSCRIBEIT_AUTOCLEAN")] + autoclean: bool, + /// Enable speaker diarization #[arg(long)] diarize: bool, diff --git a/src/engines/deepgram.rs b/src/engines/deepgram.rs index 0a1fb64..c9f52f4 100644 --- a/src/engines/deepgram.rs +++ b/src/engines/deepgram.rs @@ -10,7 +10,7 @@ use serde_json::{Value, json}; use crate::audio::wav::encode_wav; use crate::engines::rate_limit::{self, send_with_retry}; -use crate::storage::s3::S3Uploader; +use crate::storage::s3::{S3CleanupResult, S3Uploader}; use crate::transcriber::{Segment, Transcriber, Transcript, Word}; pub struct DeepgramApi { @@ -22,6 +22,18 @@ pub struct DeepgramApi { client: Client, options: DeepgramOptions, presigned_url_uploader: Option, + autoclean: bool, +} + +pub struct DeepgramConfig { + pub base_url: String, + pub api_key: String, + pub model: String, + pub language: Option, + pub settings: rate_limit::ApiRequestSettings, + pub options: DeepgramOptions, + pub presigned_url_uploader: Option, + pub autoclean: bool, } #[derive(Debug, Clone, Default)] @@ -42,29 +54,22 @@ pub 
struct DeepgramOptions { } impl DeepgramApi { - pub fn new( - base_url: String, - api_key: String, - model: String, - language: Option, - settings: rate_limit::ApiRequestSettings, - options: DeepgramOptions, - presigned_url_uploader: Option, - ) -> Result { + pub fn new(config: DeepgramConfig) -> Result { let client = Client::builder() - .timeout(settings.request_timeout) + .timeout(config.settings.request_timeout) .build() .context("Failed to build HTTP client")?; Ok(Self { - base_url: base_url.trim_end_matches('/').to_string(), - api_key, - model, - language, - settings, + base_url: config.base_url.trim_end_matches('/').to_string(), + api_key: config.api_key, + model: config.model, + language: config.language, + settings: config.settings, client, - options, - presigned_url_uploader, + options: config.options, + presigned_url_uploader: config.presigned_url_uploader, + autoclean: config.autoclean, }) } @@ -194,8 +199,21 @@ impl Transcriber for DeepgramApi { async fn transcribe_path(&self, audio_path: &Path) -> Result { if let Some(uploader) = &self.presigned_url_uploader { - let url = uploader.upload_and_presign(audio_path).await?; - return self.transcribe_file_url(url).await; + let upload = uploader.upload_and_presign_object(audio_path).await?; + let mut transcript = self.transcribe_file_url(upload.url.clone()).await?; + let cleanup = if self.autoclean { + uploader.cleanup_uploaded_object(&upload).await + } else { + S3CleanupResult::skipped(&upload) + }; + if let Some(error) = cleanup.error.as_deref() { + eprintln!( + "Failed to delete staged Deepgram object s3://{}/{}: {error}", + cleanup.bucket, cleanup.key + ); + } + add_deepgram_staging_metadata(&mut transcript, cleanup); + return Ok(transcript); } let bytes = tokio::fs::read(audio_path) @@ -222,6 +240,25 @@ impl Transcriber for DeepgramApi { } } +fn add_deepgram_staging_metadata(transcript: &mut Transcript, cleanup: S3CleanupResult) { + let metadata = transcript.provider_metadata.get_or_insert_with(|| { + 
serde_json::json!({ + "provider": "deepgram", + "schema_version": "deepgram.metadata.v1", + "data": {} + }) + }); + if let Some(data) = metadata.get_mut("data").and_then(Value::as_object_mut) { + data.insert( + "staging".to_string(), + serde_json::json!({ + "provider": "s3", + "cleanup": cleanup.to_metadata(), + }), + ); + } +} + fn parse_response( body: &[u8], model: &str, diff --git a/src/engines/gemini.rs b/src/engines/gemini.rs index a558670..31e6970 100644 --- a/src/engines/gemini.rs +++ b/src/engines/gemini.rs @@ -7,6 +7,7 @@ use reqwest::Client; use crate::audio::segment::get_duration; use crate::audio::wav::encode_wav; use crate::engines::rate_limit; +use crate::storage::s3::{S3CleanupResult, S3Uploader}; use crate::transcriber::{Transcriber, Transcript}; mod analysis; @@ -21,7 +22,7 @@ use cached_content::add_cached_content_metadata; use file_cache::GeminiFileCache; pub use file_cache::GeminiFileCacheConfig; use files::{GeminiUploadRef, with_file_cleanup_metadata}; -use response::parse_stream_generate_response; +use response::{GeminiResponseContext, parse_stream_generate_response}; use schema::{ audio_mime, generate_payload, generate_payload_with_cached_content, prompt_text, upload_base_url, @@ -36,33 +37,53 @@ pub struct GeminiApi { settings: rate_limit::ApiRequestSettings, client: Client, file_cache: Option, + signed_url_uploader: Option, + autoclean: bool, +} + +pub struct GeminiConfig { + pub api_base_url: String, + pub api_key: String, + pub model: String, + pub language: Option, + pub settings: rate_limit::ApiRequestSettings, + pub file_cache: Option, + pub signed_url_uploader: Option, + pub autoclean: bool, +} + +const GEMINI_SIGNED_URL_MAX_BYTES: u64 = 100 * 1024 * 1024; + +struct GeminiGenerateInput<'a> { + file_uri: &'a str, + mime_type: &'a str, + input_bytes: u64, + duration_secs: Option, + upload: Option<&'a GeminiUploadRef>, + upload_method: &'a str, + file_url_present: bool, } impl GeminiApi { - pub fn new( - api_base_url: String, - api_key: 
String, - model: String, - language: Option, - settings: rate_limit::ApiRequestSettings, - file_cache: Option, - ) -> Result { + pub fn new(config: GeminiConfig) -> Result { let client = Client::builder() - .timeout(settings.request_timeout) + .timeout(config.settings.request_timeout) .build() .context("Failed to build HTTP client")?; - let api_base_url = api_base_url.trim_end_matches('/').to_string(); + let api_base_url = config.api_base_url.trim_end_matches('/').to_string(); let upload_base_url = upload_base_url(&api_base_url); Ok(Self { api_base_url, upload_base_url, - api_key, - model, - language, - settings, + api_key: config.api_key, + model: config.model, + language: config.language, + settings: config.settings, client, - file_cache: file_cache.map(GeminiFileCache::new), + file_cache: config.file_cache.map(GeminiFileCache::new), + signed_url_uploader: config.signed_url_uploader, + autoclean: config.autoclean, }) } @@ -72,15 +93,54 @@ impl GeminiApi { .with_context(|| format!("Failed to read audio file: {}", audio_path.display()))?; let mime_type = audio_mime(audio_path); let duration_secs = get_duration(audio_path).await.ok(); + if let Some(uploader) = &self.signed_url_uploader { + anyhow::ensure!( + bytes.len() as u64 <= GEMINI_SIGNED_URL_MAX_BYTES, + "Gemini signed URL input supports files up to 100 MB; prepared input is {:.2} MB", + bytes.len() as f64 / (1024.0 * 1024.0) + ); + anyhow::ensure!( + !is_gemini_2_0_model(&self.model), + "Gemini signed URL input is not supported for Gemini 2.0 family models; use Gemini Files API mode instead" + ); + let upload = uploader.upload_and_presign_object(audio_path).await?; + let mut transcript = self + .generate_transcript(GeminiGenerateInput { + file_uri: &upload.url, + mime_type, + input_bytes: bytes.len() as u64, + duration_secs, + upload: None, + upload_method: "signed_url", + file_url_present: true, + }) + .await?; + let cleanup = if self.autoclean { + uploader.cleanup_uploaded_object(&upload).await + } else { + 
S3CleanupResult::skipped(&upload) + }; + if let Some(error) = cleanup.error.as_deref() { + eprintln!( + "Failed to delete staged Gemini object s3://{}/{}: {error}", + cleanup.bucket, cleanup.key + ); + } + add_gemini_staging_metadata(&mut transcript, cleanup); + return Ok(transcript); + } + let upload = self.resolve_file(audio_path, &bytes, mime_type).await?; let response = self - .generate_transcript( - &upload.file.uri, + .generate_transcript(GeminiGenerateInput { + file_uri: &upload.file.uri, mime_type, - bytes.len() as u64, + input_bytes: bytes.len() as u64, duration_secs, - &upload, - ) + upload: Some(&upload), + upload_method: "files_api", + file_url_present: false, + }) .await; let cleanup = self.cleanup_file_after_run(&upload).await; @@ -95,37 +155,37 @@ impl GeminiApi { Ok(transcript) } - async fn generate_transcript( - &self, - file_uri: &str, - mime_type: &str, - input_bytes: u64, - duration_secs: Option, - upload: &GeminiUploadRef, - ) -> Result { + async fn generate_transcript(&self, input: GeminiGenerateInput<'_>) -> Result { let url = format!( "{}/models/{}:streamGenerateContent?alt=sse", self.api_base_url, urlencoding::encode(&self.model) ); - let prompt = prompt_text(self.language.as_deref(), duration_secs); - let cached_content = self - .resolve_cached_content(upload, file_uri, mime_type) - .await?; + let prompt = prompt_text(self.language.as_deref(), input.duration_secs); + let cached_content = if let Some(upload) = input.upload { + self.resolve_cached_content(upload, input.file_uri, input.mime_type) + .await? 
+ } else { + None + }; let payload = if let Some(cached_content) = &cached_content { generate_payload_with_cached_content(&cached_content.name, &prompt) } else { - generate_payload(file_uri, mime_type, &prompt) + generate_payload(input.file_uri, input.mime_type, &prompt) }; let chunks = self.stream_generate_chunks(&url, &payload).await?; let mut transcript = parse_stream_generate_response( &chunks, - &self.model, - &self.api_base_url, - mime_type, - input_bytes, - duration_secs, + GeminiResponseContext { + model: &self.model, + api_base_url: &self.api_base_url, + mime_type: input.mime_type, + input_bytes: input.input_bytes, + duration_secs: input.duration_secs, + upload_method: input.upload_method, + file_url_present: input.file_url_present, + }, ); if let Some(cached_content) = cached_content { add_cached_content_metadata(&mut transcript, cached_content); @@ -134,6 +194,32 @@ impl GeminiApi { } } +fn add_gemini_staging_metadata(transcript: &mut Transcript, cleanup: S3CleanupResult) { + let metadata = transcript + .provider_metadata + .get_or_insert_with(|| serde_json::json!({ "gemini": {} })); + if let Some(gemini) = metadata + .get_mut("gemini") + .and_then(serde_json::Value::as_object_mut) + { + gemini.insert( + "staging".to_string(), + serde_json::json!({ + "provider": "s3", + "cleanup": cleanup.to_metadata(), + }), + ); + } +} + +fn is_gemini_2_0_model(model: &str) -> bool { + let model = model + .strip_prefix("models/") + .unwrap_or(model) + .to_ascii_lowercase(); + model.starts_with("gemini-2.0") +} + #[async_trait] impl Transcriber for GeminiApi { async fn transcribe(&self, audio_samples: Vec) -> Result { diff --git a/src/engines/gemini/response.rs b/src/engines/gemini/response.rs index 64dd82f..41bc027 100644 --- a/src/engines/gemini/response.rs +++ b/src/engines/gemini/response.rs @@ -2,15 +2,19 @@ use serde_json::{Map, Value, json}; use crate::transcriber::{Segment, Transcript}; +#[derive(Clone, Copy)] +pub(super) struct GeminiResponseContext<'a> { + 
pub(super) model: &'a str, + pub(super) api_base_url: &'a str, + pub(super) mime_type: &'a str, + pub(super) input_bytes: u64, + pub(super) duration_secs: Option, + pub(super) upload_method: &'a str, + pub(super) file_url_present: bool, +} + #[cfg(test)] -pub fn parse_generate_response( - body: &[u8], - model: &str, - api_base_url: &str, - mime_type: &str, - input_bytes: u64, - duration_secs: Option, -) -> Transcript { +pub fn parse_generate_response(body: &[u8], context: GeminiResponseContext<'_>) -> Transcript { let response_value = serde_json::from_slice::(body).unwrap_or_else(|_| { json!({ "raw_text": String::from_utf8_lossy(body).to_string() @@ -48,22 +52,14 @@ pub fn parse_generate_response( build_transcript_from_generated_text( &generated_text(&response_value).unwrap_or_default(), - model, - api_base_url, - mime_type, - input_bytes, - duration_secs, + context, response_metadata, ) } pub fn parse_stream_generate_response( chunks: &[Value], - model: &str, - api_base_url: &str, - mime_type: &str, - input_bytes: u64, - duration_secs: Option, + context: GeminiResponseContext<'_>, ) -> Transcript { let generated_text = chunks .iter() @@ -104,30 +100,18 @@ pub fn parse_stream_generate_response( prompt_feedback.unwrap_or(Value::Null), ); - build_transcript_from_generated_text( - &generated_text, - model, - api_base_url, - mime_type, - input_bytes, - duration_secs, - response_metadata, - ) + build_transcript_from_generated_text(&generated_text, context, response_metadata) } fn build_transcript_from_generated_text( generated_text: &str, - model: &str, - api_base_url: &str, - mime_type: &str, - input_bytes: u64, - duration_secs: Option, + context: GeminiResponseContext<'_>, mut response_metadata: Map, ) -> Transcript { let generated_json = parse_generated_json(generated_text); let segments = generated_json .as_ref() - .and_then(|value| parse_transcript_segments(value, duration_secs)) + .and_then(|value| parse_transcript_segments(value, context.duration_secs)) 
.filter(|segments| !segments.is_empty()) .or_else(|| { generated_json @@ -146,7 +130,7 @@ fn build_transcript_from_generated_text( Value::Bool( generated_json .as_ref() - .is_some_and(|value| timestamps_need_clamp(value, duration_secs)), + .is_some_and(|value| timestamps_need_clamp(value, context.duration_secs)), ), ); @@ -154,13 +138,16 @@ fn build_transcript_from_generated_text( segments, provider_metadata: Some(json!({ "gemini": { - "model": model, - "api_base_url": api_base_url, - "upload_method": "files_api", + "model": context.model, + "api_base_url": context.api_base_url, + "upload_method": context.upload_method, + "request": { + "file_url_present": context.file_url_present, + }, "input": { - "mime_type": mime_type, - "bytes": input_bytes, - "duration_secs": duration_secs, + "mime_type": context.mime_type, + "bytes": context.input_bytes, + "duration_secs": context.duration_secs, }, "response": Value::Object(response_metadata) } diff --git a/src/engines/gemini/response/tests.rs b/src/engines/gemini/response/tests.rs index 3f8e236..db031a4 100644 --- a/src/engines/gemini/response/tests.rs +++ b/src/engines/gemini/response/tests.rs @@ -1,6 +1,18 @@ -use super::{parse_generate_response, parse_stream_generate_response}; +use super::{GeminiResponseContext, parse_generate_response, parse_stream_generate_response}; use serde_json::json; +fn context(upload_method: &'static str, file_url_present: bool) -> GeminiResponseContext<'static> { + GeminiResponseContext { + model: "gemini-test", + api_base_url: "https://example.com", + mime_type: "audio/mp3", + input_bytes: 12, + duration_secs: None, + upload_method, + file_url_present, + } +} + #[test] fn parses_structured_transcript_segments() { let body = br#"{ @@ -15,14 +27,7 @@ fn parses_structured_transcript_segments() { "usageMetadata": {"totalTokenCount": 42} }"#; - let transcript = parse_generate_response( - body, - "gemini-test", - "https://example.com", - "audio/mp3", - 12, - None, - ); + let transcript = 
parse_generate_response(body, context("files_api", false)); assert_eq!(transcript.segments.len(), 1); assert_eq!(transcript.segments[0].text, "hello"); assert_eq!(transcript.segments[0].speaker.as_deref(), Some("A")); @@ -44,14 +49,7 @@ fn falls_back_to_top_level_text_when_segments_are_invalid() { }] }"#; - let transcript = parse_generate_response( - body, - "gemini-test", - "https://example.com", - "audio/mp3", - 12, - None, - ); + let transcript = parse_generate_response(body, context("files_api", false)); assert_eq!(transcript.segments.len(), 1); assert_eq!(transcript.segments[0].text, "fallback text"); assert_eq!(transcript.segments[0].start_ms, 0); @@ -67,14 +65,7 @@ fn falls_back_to_raw_generated_text_when_json_is_invalid() { }] }"#; - let transcript = parse_generate_response( - body, - "gemini-test", - "https://example.com", - "audio/mp3", - 12, - None, - ); + let transcript = parse_generate_response(body, context("files_api", false)); assert_eq!(transcript.segments.len(), 1); assert_eq!(transcript.segments[0].text, "plain transcript"); } @@ -93,11 +84,10 @@ fn clamps_timestamps_to_known_audio_duration() { let transcript = parse_generate_response( body, - "gemini-test", - "https://example.com", - "audio/mp3", - 12, - Some(300.0), + GeminiResponseContext { + duration_secs: Some(300.0), + ..context("files_api", false) + }, ); assert_eq!(transcript.segments[0].start_ms, 299000); assert_eq!(transcript.segments[0].end_ms, 300000); @@ -128,14 +118,7 @@ fn parses_streamed_response_chunks() { }), ]; - let transcript = parse_stream_generate_response( - &chunks, - "gemini-test", - "https://example.com", - "audio/mp3", - 12, - None, - ); + let transcript = parse_stream_generate_response(&chunks, context("files_api", false)); assert_eq!(transcript.segments.len(), 1); assert_eq!(transcript.segments[0].text, "hello world"); assert_eq!(transcript.segments[0].speaker.as_deref(), Some("A")); @@ -156,3 +139,34 @@ fn parses_streamed_response_chunks() { Some(2) ); } + +#[test] 
+fn records_signed_url_upload_method_without_persisting_url() { + let body = br#"{ + "candidates": [{ + "content": { + "parts": [{"text": "{\"text\":\"signed url transcript\",\"segments\":[{\"text\":\"signed url transcript\"}]}"}] + } + }] + }"#; + + let transcript = parse_generate_response(body, context("signed_url", true)); + let metadata = transcript + .provider_metadata + .as_ref() + .expect("metadata should exist"); + + assert_eq!( + metadata + .pointer("/gemini/upload_method") + .and_then(serde_json::Value::as_str), + Some("signed_url") + ); + assert_eq!( + metadata + .pointer("/gemini/request/file_url_present") + .and_then(serde_json::Value::as_bool), + Some(true) + ); + assert!(!metadata.to_string().contains("X-Amz-Signature")); +} diff --git a/src/engines/qwen_filetrans.rs b/src/engines/qwen_filetrans.rs index 7042ef5..64c031a 100644 --- a/src/engines/qwen_filetrans.rs +++ b/src/engines/qwen_filetrans.rs @@ -7,6 +7,7 @@ use std::time::Duration; use anyhow::{Context, Result}; use async_trait::async_trait; use reqwest::Client; +use serde_json::Value; use crate::audio::wav::encode_wav; use crate::engines::qwen_filetrans::limits::validate_model_for_path; @@ -15,7 +16,7 @@ use crate::engines::qwen_filetrans::types::{ TaskResult, normalize_api_base_url, }; use crate::engines::rate_limit::{self, send_with_retry}; -use crate::storage::s3::S3Uploader; +use crate::storage::s3::{S3CleanupResult, S3Uploader}; use crate::transcriber::{Transcriber, Transcript}; pub struct QwenFileTrans { @@ -26,6 +27,7 @@ pub struct QwenFileTrans { settings: rate_limit::ApiRequestSettings, client: Client, uploader: S3Uploader, + autoclean: bool, poll_interval: Duration, max_polls: u32, } @@ -38,6 +40,7 @@ impl QwenFileTrans { language: Option, settings: rate_limit::ApiRequestSettings, uploader: S3Uploader, + autoclean: bool, ) -> Result { let client = Client::builder() .timeout(settings.request_timeout) @@ -52,6 +55,7 @@ impl QwenFileTrans { settings, client, uploader, + autoclean, 
poll_interval: Duration::from_secs(2), max_polls: 900, }) @@ -189,8 +193,21 @@ impl Transcriber for QwenFileTrans { async fn transcribe_path(&self, audio_path: &Path) -> Result { validate_model_for_path(&self.model, audio_path).await?; - let url = self.uploader.upload_and_presign(audio_path).await?; - self.transcribe_file_url(url).await + let upload = self.uploader.upload_and_presign_object(audio_path).await?; + let mut transcript = self.transcribe_file_url(upload.url.clone()).await?; + let cleanup = if self.autoclean { + self.uploader.cleanup_uploaded_object(&upload).await + } else { + S3CleanupResult::skipped(&upload) + }; + if let Some(error) = cleanup.error.as_deref() { + eprintln!( + "Failed to delete staged Qwen object s3://{}/{}: {error}", + cleanup.bucket, cleanup.key + ); + } + add_qwen_staging_metadata(&mut transcript, cleanup); + Ok(transcript) } async fn transcribe_wav(&self, wav_bytes: Vec) -> Result { @@ -205,3 +222,18 @@ impl Transcriber for QwenFileTrans { self.transcribe_path(tmp.path()).await } } + +fn add_qwen_staging_metadata(transcript: &mut Transcript, cleanup: S3CleanupResult) { + let metadata = transcript + .provider_metadata + .get_or_insert_with(|| serde_json::json!({ "qwen": {} })); + if let Some(qwen) = metadata.get_mut("qwen").and_then(Value::as_object_mut) { + qwen.insert( + "staging".to_string(), + serde_json::json!({ + "provider": "s3", + "cleanup": cleanup.to_metadata(), + }), + ); + } +} diff --git a/src/main.rs b/src/main.rs index 6dcef9d..cc02303 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,8 +25,8 @@ use crate::cli::{ AnalysisKind, Cli, Command, ModelFormat, OutputFormatArg, Provider, SetupComponent, }; use crate::engines::azure_openai::AzureOpenAi; -use crate::engines::deepgram::DeepgramApi; -use crate::engines::gemini::GeminiApi; +use crate::engines::deepgram::{DeepgramApi, DeepgramConfig}; +use crate::engines::gemini::{GeminiApi, GeminiConfig}; use crate::engines::model_cache::ModelCache; use 
crate::engines::nvidia_riva::NvidiaRiva; use crate::engines::openai_api::OpenAiApi; @@ -169,6 +169,7 @@ async fn main() -> Result<()> { deepgram_numerals, deepgram_use_presigned_url, gemini_file_cache, + gemini_use_presigned_url, gemini_file_cache_index, gemini_autoclean, gemini_explicit_cache, @@ -185,6 +186,7 @@ async fn main() -> Result<()> { max_segment_secs, segment_concurrency, normalize, + autoclean, max_retries, request_timeout_secs, retry_wait_base_secs, @@ -366,6 +368,7 @@ async fn main() -> Result<()> { language.clone(), api_settings, uploader, + autoclean, )?), analyzer: None, provider_name: "qwen-filetrans".into(), @@ -379,6 +382,37 @@ async fn main() -> Result<()> { "--gemini-api-key, GEMINI_API_KEY, --api-key, or OPENAI_API_KEY is required for --provider gemini", )?; let model_name = remote_model.unwrap_or_else(|| "gemini-3.5-flash".into()); + let gemini_autoclean = autoclean || gemini_autoclean; + if gemini_use_presigned_url && (gemini_file_cache || gemini_explicit_cache) { + anyhow::bail!( + "--gemini-use-presigned-url cannot be combined with --gemini-file-cache or --gemini-explicit-cache because signed URLs do not create reusable Gemini Files API handles" + ); + } + if gemini_use_presigned_url && is_gemini_2_0_model(&model_name) { + anyhow::bail!( + "--gemini-use-presigned-url is not supported for Gemini 2.0 family models; use Gemini Files API mode instead" + ); + } + let signed_url_uploader = if gemini_use_presigned_url { + Some( + build_s3_uploader(S3UploaderArgs { + bucket: s3_bucket, + region: s3_region, + endpoint_url: s3_endpoint_url, + access_key_id: s3_access_key_id, + secret_access_key: s3_secret_access_key, + session_token: s3_session_token, + prefix: s3_prefix, + default_prefix: "transcribeit/gemini", + presign_expires_secs: s3_presign_expires_secs, + force_path_style: s3_force_path_style, + context_label: "--provider gemini --gemini-use-presigned-url", + }) + .await?, + ) + } else { + None + }; let gemini_file_cache = if 
gemini_file_cache || gemini_explicit_cache { Some(crate::engines::gemini::GeminiFileCacheConfig { index_path: gemini_file_cache_index, @@ -392,26 +426,30 @@ async fn main() -> Result<()> { let analyzer = analysis_config .is_enabled() .then(|| { - GeminiApi::new( - gemini_api_base_url.clone(), - key.clone(), - model_name.clone(), - language.clone(), - api_settings, - None, - ) + GeminiApi::new(GeminiConfig { + api_base_url: gemini_api_base_url.clone(), + api_key: key.clone(), + model: model_name.clone(), + language: language.clone(), + settings: api_settings, + file_cache: None, + signed_url_uploader: None, + autoclean: false, + }) .map(|api| Box::new(api) as Box) }) .transpose()?; ProviderRuntime { - engine: Box::new(GeminiApi::new( - gemini_api_base_url, - key, - model_name.clone(), - language.clone(), - api_settings, - gemini_file_cache, - )?), + engine: Box::new(GeminiApi::new(GeminiConfig { + api_base_url: gemini_api_base_url, + api_key: key, + model: model_name.clone(), + language: language.clone(), + settings: api_settings, + file_cache: gemini_file_cache, + signed_url_uploader, + autoclean: gemini_autoclean, + })?), analyzer, provider_name: "gemini".into(), model_name, @@ -465,13 +503,13 @@ async fn main() -> Result<()> { ); } ProviderRuntime { - engine: Box::new(DeepgramApi::new( - deepgram_api_base_url, - key, - model_name.clone(), - language.clone(), - api_settings, - crate::engines::deepgram::DeepgramOptions { + engine: Box::new(DeepgramApi::new(DeepgramConfig { + base_url: deepgram_api_base_url, + api_key: key, + model: model_name.clone(), + language: language.clone(), + settings: api_settings, + options: crate::engines::deepgram::DeepgramOptions { diarize: diarize || speakers.is_some(), intelligence: deepgram_intelligence, summarize: deepgram_summarize, @@ -486,7 +524,7 @@ async fn main() -> Result<()> { filler_words: deepgram_filler_words, numerals: deepgram_numerals, }, - if deepgram_use_presigned_url { + presigned_url_uploader: if 
deepgram_use_presigned_url { Some( build_s3_uploader( S3UploaderArgs { @@ -509,7 +547,8 @@ async fn main() -> Result<()> { } else { None }, - )?), + autoclean, + })?), analyzer: None, provider_name: "deepgram".into(), model_name, @@ -587,6 +626,14 @@ fn provider_handles_diarization(provider_name: &str, model_name: &str) -> bool { } } +fn is_gemini_2_0_model(model: &str) -> bool { + let model = model + .strip_prefix("models/") + .unwrap_or(model) + .to_ascii_lowercase(); + model.starts_with("gemini-2.0") +} + struct S3UploaderArgs { bucket: Option, region: Option, diff --git a/src/storage/s3.rs b/src/storage/s3.rs index d5d1e38..8a5ed82 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -9,6 +9,7 @@ use aws_sdk_s3::Client; use aws_sdk_s3::config::Builder; use aws_sdk_s3::presigning::PresigningConfig; use aws_sdk_s3::primitives::ByteStream; +use serde_json::{Value, json}; use uuid::Uuid; #[derive(Debug, Clone)] @@ -36,6 +37,7 @@ pub struct S3ConfigInput { pub force_path_style: bool, } +#[derive(Clone)] pub struct S3Uploader { client: Client, bucket: String, @@ -43,6 +45,44 @@ pub struct S3Uploader { presign_expires: Duration, } +#[derive(Debug, Clone)] +pub struct S3UploadedObject { + pub url: String, + pub bucket: String, + pub key: String, +} + +#[derive(Debug, Clone)] +pub struct S3CleanupResult { + pub attempted: bool, + pub deleted: bool, + pub bucket: String, + pub key: String, + pub error: Option, +} + +impl S3CleanupResult { + pub fn skipped(upload: &S3UploadedObject) -> Self { + Self { + attempted: false, + deleted: false, + bucket: upload.bucket.clone(), + key: upload.key.clone(), + error: None, + } + } + + pub fn to_metadata(&self) -> Value { + json!({ + "attempted": self.attempted, + "deleted": self.deleted, + "bucket": self.bucket, + "key": self.key, + "error": self.error, + }) + } +} + impl S3Uploader { pub async fn new(config: S3Config) -> Result { let credentials = Credentials::new( @@ -74,7 +114,7 @@ impl S3Uploader { }) } - pub async fn 
upload_and_presign(&self, path: &Path) -> Result { + pub async fn upload_and_presign_object(&self, path: &Path) -> Result { let key = self.object_key(path); let body = ByteStream::from_path(path) .await @@ -102,7 +142,30 @@ impl S3Uploader { format!("Failed to presign S3 object: s3://{}/{}", self.bucket, key) })?; - Ok(presigned.uri().to_string()) + Ok(S3UploadedObject { + url: presigned.uri().to_string(), + bucket: self.bucket.clone(), + key, + }) + } + + pub async fn cleanup_uploaded_object(&self, upload: &S3UploadedObject) -> S3CleanupResult { + let delete_result = self + .client + .delete_object() + .bucket(&upload.bucket) + .key(&upload.key) + .send() + .await; + + let deleted = delete_result.is_ok(); + S3CleanupResult { + attempted: true, + deleted, + bucket: upload.bucket.clone(), + key: upload.key.clone(), + error: delete_result.err().map(|err| err.to_string()), + } } fn object_key(&self, path: &Path) -> String { From 03460046f8ac2eeb0630b9d8a641ea02d994b8c3 Mon Sep 17 00:00:00 2001 From: skitsanos Date: Wed, 17 Jun 2026 15:53:23 +0300 Subject: [PATCH 3/4] Document clean remote provider benchmark --- docs/performance-benchmarks.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/performance-benchmarks.md b/docs/performance-benchmarks.md index de8565c..b66b039 100644 --- a/docs/performance-benchmarks.md +++ b/docs/performance-benchmarks.md @@ -219,6 +219,18 @@ Use this working ranking until broader benchmark data says otherwise: | 4 | Gemini | Useful whole-file multimodal transcription and summary path, but timestamps/speakers are model-generated rather than dedicated ASR metadata. | | 5 | NVIDIA Riva | Provider-native timestamps/diarization through hosted Riva, but less transcript intelligence returned through the current provider path. 
| +### Clean 5-minute remote provider comparison (2026-06-17) + +Measured on `samples/4289US19IPFSegA17Apr20256.45am_5m.wav` after the Deepgram, Gemini signed URL, and generic `--autoclean` provider updates. + +| Provider / model | Processing time | RTF | Segments | Timing | Speakers | Word timestamps | Assessment | +|---|---:|---:|---:|---|---|---|---| +| Deepgram `nova-3-medical` + keyterms | 23.33s | 0.078 | 68 | provider-native, clamped | provider-native | yes | Best overall Transcript Intelligence candidate; preserved key medical terms and returned rich intelligence metadata. Summary still had a role error. | +| Qwen `qwen3-asr-flash-filetrans` | 11.15s | 0.037 | 71 | provider-native, reliable | none | yes | Strong pure ASR baseline; preserved key terms including `Producta`; no speaker labels or intelligence metadata. | +| OpenAI `gpt-4o-transcribe-diarize` | 115.21s | 0.384 | 85 | provider-native, reliable | provider-native | no | Good timing and diarization, but slowest hosted run in this pass. | +| Gemini `gemini-3.5-flash` | 35.29s | 0.118 | 29 | model-generated, clamped | model-generated | no | Useful role labels and multimodal path, but timestamps remain unreliable for subtitle-grade output. | +| NVIDIA Riva hosted function | 5.83s | 0.019 | 38 | provider-native, reliable | provider-native | yes | Fastest run, but weaker domain term recognition and speaker separation on this sample. | + ## Reference benchmark results These results were measured on a 5-minute medical interview recording. 
From 8b5a2f8fd7730feab2e663b37a9a39ce3b9664bb Mon Sep 17 00:00:00 2001 From: skitsanos Date: Wed, 17 Jun 2026 16:12:16 +0300 Subject: [PATCH 4/4] Bump version to 1.6.0 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f9dcf7b..abb015c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3411,7 +3411,7 @@ dependencies = [ [[package]] name = "transcribeit" -version = "1.5.0" +version = "1.6.0" dependencies = [ "anyhow", "async-trait", diff --git a/Cargo.toml b/Cargo.toml index 774c5f1..7eec2d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "transcribeit" -version = "1.5.0" +version = "1.6.0" edition = "2024" rust-version = "1.96" license-file = "LICENSE"