From 9d00fad02e7b053f5bfbe01e64cf11ccb6b11213 Mon Sep 17 00:00:00 2001 From: Hunter Hillman Date: Thu, 23 Apr 2026 11:48:36 -0700 Subject: [PATCH 01/19] Add product-tests: gated retry/close instrumentation + scenario/chaos suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a self-contained test system at product-tests/ that treats the three failure modes the current suite tolerates (silent retries, unexpected session closes, UI regressions) as hard fails, and runs on every PR as the "ship/no-ship" gate. - Gated RetryCounter at /api/v1/_debug/retry_stats (src/scope/server/ retry_counter.py) instrumenting livepeer connect, cloud_relay drops, and frontend reconnects. No-op unless SCOPE_TEST_INSTRUMENTATION=1. - Python pytest + playwright harness (product-tests/harness/) with ScopeHarness, PlaywrightDriver, RetryProbe, FailureWatcher, TestReport, ChaosDriver (seeded), reusable flows/gates/baselines helpers, and a cloud auth localStorage bypass for headless cloud tests. - Cross-cutting contracts (product-tests/contracts/) auto-applied at teardown: no banned retry counter > 0, no unexpected session close. - 12 tests across scenarios (onboarding local/cloud, parameter apply, stop-restart, release full-matrix) and chaos (rapid stop/start, parameter spam, reload mid-stream, workflow switching, session churn). - ~25 data-testid attrs on onboarding, graph toolbar, workflow cards, tour popover, video sink — no behavior changes. - GitHub Actions: PR gate (CPU, ubuntu-latest, 25min) + nightly (GPU self-hosted, 60min) + PR-comment summary via sticky-pull-request- comment. - Retires .agents/skills/onboarding-test/ (Claude-in-Chrome) and the unused e2e/ TypeScript scaffold; migration pointers in their READMEs. 
Co-Authored-By: Claude Opus 4.7 Signed-off-by: Hunter Hillman --- .agents/skills/onboarding-test/SKILL.md | 90 +++--- .github/workflows/product-tests.yml | 197 +++++++++++++ e2e/README.md | 128 ++------- frontend/src/components/VideoOutput.tsx | 1 + .../src/components/graph/GraphToolbar.tsx | 2 + .../graph/GraphWorkflowImportDialog.tsx | 1 + .../onboarding/InferenceModeStep.tsx | 2 + .../onboarding/TelemetryDisclosure.tsx | 2 + .../src/components/onboarding/TourPopover.tsx | 9 +- .../onboarding/WorkflowPickerStep.tsx | 2 + frontend/src/hooks/useUnifiedWebRTC.ts | 24 ++ product-tests/.gitignore | 4 + product-tests/README.md | 56 ++++ product-tests/baselines/cloud.json | 11 + product-tests/baselines/local.json | 15 + product-tests/chaos/__init__.py | 0 product-tests/chaos/test_input_switching.py | 59 ++++ product-tests/chaos/test_parameter_spam.py | 85 ++++++ product-tests/chaos/test_rapid_stop_start.py | 73 +++++ product-tests/chaos/test_session_churn.py | 119 ++++++++ .../chaos/test_workflow_switching.py | 101 +++++++ product-tests/conftest.py | 269 ++++++++++++++++++ product-tests/contracts/__init__.py | 27 ++ product-tests/contracts/no_retries.py | 52 ++++ .../contracts/no_unexpected_session_close.py | 39 +++ product-tests/harness/__init__.py | 5 + product-tests/harness/baselines.py | 45 +++ product-tests/harness/chaos.py | 75 +++++ product-tests/harness/cloud_auth.py | 60 ++++ product-tests/harness/driver.py | 91 ++++++ product-tests/harness/failure_watcher.py | 126 ++++++++ product-tests/harness/flows.py | 107 +++++++ product-tests/harness/gates.py | 75 +++++ product-tests/harness/report.py | 107 +++++++ product-tests/harness/retry_probe.py | 67 +++++ product-tests/harness/scope_process.py | 155 ++++++++++ product-tests/pytest.ini | 17 ++ product-tests/release/README.md | 17 ++ product-tests/release/__init__.py | 0 .../release/test_cloud_full_matrix.py | 62 ++++ product-tests/scenarios/__init__.py | 0 .../scenarios/test_onboarding_cloud.py | 77 +++++ 
.../scenarios/test_onboarding_local.py | 47 +++ .../scenarios/test_parameter_apply.py | 75 +++++ product-tests/scenarios/test_stop_restart.py | 50 ++++ pyproject.toml | 8 + src/scope/server/app.py | 64 +++++ src/scope/server/cloud_relay.py | 6 + src/scope/server/livepeer.py | 10 + src/scope/server/retry_counter.py | 81 ++++++ uv.lock | 204 +++++++++++-- 51 files changed, 2814 insertions(+), 185 deletions(-) create mode 100644 .github/workflows/product-tests.yml create mode 100644 product-tests/.gitignore create mode 100644 product-tests/README.md create mode 100644 product-tests/baselines/cloud.json create mode 100644 product-tests/baselines/local.json create mode 100644 product-tests/chaos/__init__.py create mode 100644 product-tests/chaos/test_input_switching.py create mode 100644 product-tests/chaos/test_parameter_spam.py create mode 100644 product-tests/chaos/test_rapid_stop_start.py create mode 100644 product-tests/chaos/test_session_churn.py create mode 100644 product-tests/chaos/test_workflow_switching.py create mode 100644 product-tests/conftest.py create mode 100644 product-tests/contracts/__init__.py create mode 100644 product-tests/contracts/no_retries.py create mode 100644 product-tests/contracts/no_unexpected_session_close.py create mode 100644 product-tests/harness/__init__.py create mode 100644 product-tests/harness/baselines.py create mode 100644 product-tests/harness/chaos.py create mode 100644 product-tests/harness/cloud_auth.py create mode 100644 product-tests/harness/driver.py create mode 100644 product-tests/harness/failure_watcher.py create mode 100644 product-tests/harness/flows.py create mode 100644 product-tests/harness/gates.py create mode 100644 product-tests/harness/report.py create mode 100644 product-tests/harness/retry_probe.py create mode 100644 product-tests/harness/scope_process.py create mode 100644 product-tests/pytest.ini create mode 100644 product-tests/release/README.md create mode 100644 product-tests/release/__init__.py 
create mode 100644 product-tests/release/test_cloud_full_matrix.py create mode 100644 product-tests/scenarios/__init__.py create mode 100644 product-tests/scenarios/test_onboarding_cloud.py create mode 100644 product-tests/scenarios/test_onboarding_local.py create mode 100644 product-tests/scenarios/test_parameter_apply.py create mode 100644 product-tests/scenarios/test_stop_restart.py create mode 100644 src/scope/server/retry_counter.py diff --git a/.agents/skills/onboarding-test/SKILL.md b/.agents/skills/onboarding-test/SKILL.md index dc18828c1..f30417e1a 100644 --- a/.agents/skills/onboarding-test/SKILL.md +++ b/.agents/skills/onboarding-test/SKILL.md @@ -1,73 +1,51 @@ --- name: onboarding-test -description: Pre-release onboarding test via Chrome browser automation. Tests the full new-user flow — provider selection, workflow picker, and streaming all three starter workflows. Use when asked to test onboarding, first-run experience, or starter workflows. +description: RETIRED — superseded by product-tests/. Use those scenarios instead of Claude-in-Chrome automation. --- -# Onboarding Browser Test +# Onboarding Test — Retired -## Prerequisites +This Claude-in-Chrome-driven onboarding test has been replaced by the +self-contained Python/Playwright product-tests system. -- Chrome browser automation tools (claude-in-chrome MCP) -- Build frontend first: `cd frontend && npm run build` +## Where to go instead -## Server Setup +- **Run the local onboarding scenario locally:** -Use port **8080** (not 8000 — the OSC server binds to the same port as the HTTP server and port 8000 is commonly in use). 
+ ```bash + uv sync --group product-tests + uv run playwright install chromium + cd product-tests && uv run pytest scenarios/test_onboarding_local.py + ``` -```bash -mkdir -p /tmp/scope-onboarding-test/data /tmp/scope-onboarding-test/models -lsof -ti:8080 | xargs kill -9 2>/dev/null -DAYDREAM_SCOPE_DIR=/tmp/scope-onboarding-test/data \ -DAYDREAM_SCOPE_MODELS_DIR=/tmp/scope-onboarding-test/models \ -SCOPE_CLOUD_APP_ID="daydream/scope-livepeer/ws" \ -uv run daydream-scope --port 8080 > /tmp/scope-onboarding.log 2>&1 & -for i in $(seq 1 30); do curl -s http://localhost:8080/health > /dev/null 2>&1 && break; sleep 1; done -``` +- **Run the cloud onboarding scenario:** -## Onboarding UI Flow (exact sequence) + ```bash + SCOPE_CLOUD_APP_ID=daydream/scope-livepeer/ws \ + uv run pytest product-tests/scenarios/test_onboarding_cloud.py + ``` -Navigate to `http://localhost:8080`. The onboarding screens appear in this order: +- **CI coverage:** `.github/workflows/product-tests.yml` runs the PR gate + on every push and a nightly with GPU + full models. -1. **Provider selection** — "Welcome to Daydream Scope" with "Use Daydream Cloud" and "Run Locally" cards. Select Cloud, click **Continue**. -2. **Usage Analytics dialog** — appears as a modal overlay. Click **No thanks** (privacy-preserving default). -3. **Onboarding style** — "Teaching Mode" vs "Simple". Pick either, click **Continue**. -4. **Workflow picker** — "Pick a workflow to get started" showing 3 starter workflows: - - **Mythical Creature** (Style LoRA) - - **Dissolving Sunflower** (Depth Map) - - **LTX 2.3** (Text to Video) - - Select one, click **Get Started**. +## Why it was retired -5. **Graph editor with onboarding tooltips** — Two tooltip popups appear sequentially over the Sink/Run area: - - Tooltip 1: "Click Play to start generation" (1 of 2) — click **Next** - - Tooltip 2: "Explore Workflows" (2 of 2) — click **Done** - - **IMPORTANT:** These tooltips intercept clicks on the Run button. 
You MUST dismiss both tooltips (using `read_page` to find the Next/Done button refs) BEFORE clicking Run. +The old skill drove a real Chrome browser through Claude's MCP tools and +had no way to: -6. **Click Run** — use `read_page(filter="interactive")` to find the Run button ref and click it. Do NOT click by coordinates near the tooltip area. +1. Count retries as hard failures (flaky/"eventually worked" runs passed). +2. Detect unexpected session closes that happen silently in logs. +3. Simulate chaotic user behavior with reproducible seeds. +4. Gate PRs — it ran only when Claude was asked to run it. -## Streaming Each Workflow +The new system (see `product-tests/README.md`) treats the onboarding +workflows on both local and cloud mode as the #1 gate and runs them on +every PR. -- After clicking Run, the status bar shows "Loading diffusion model..." / "Starting..." -- Cloud model loading takes **30-60 seconds** on first run. Wait in 10s increments, then screenshot. -- When ready, the Sink node shows video output with FPS/bitrate overlay. -- Click **Stop** to end the stream. +## Source of truth for the old flow -### Switching workflows - -Click **Workflows** in the top nav bar to reopen the workflow panel. The "Getting Started" section shows all three starter workflows. Click a different one to load it, then click Run. 
- -## Expected Results - -| Workflow | Nodes | Notes | -|----------|-------|-------| -| Mythical Creature | Source, VACE, LoRA, longlive, rife, Sink | Style LoRA, video input | -| Dissolving Sunflower | Source, video-depth-anything, VACE, LoRA, longlive, rife, Sink | Depth map, video input | -| LTX 2.3 | Primitive (String), ltx2, Sink | Text-to-video, no Source node | - -## Cleanup - -```bash -lsof -ti:8080 | xargs kill -9 2>/dev/null -rm -rf /tmp/scope-onboarding-test -``` +The old skill's step-by-step click map lives in git history; the +product-tests equivalent is in +[product-tests/harness/flows.py](../../../product-tests/harness/flows.py) +in the `complete_onboarding_local` and `complete_onboarding_cloud` +helpers. diff --git a/.github/workflows/product-tests.yml b/.github/workflows/product-tests.yml new file mode 100644 index 000000000..d97e520c5 --- /dev/null +++ b/.github/workflows/product-tests.yml @@ -0,0 +1,197 @@ +name: Product Tests + +on: + pull_request: + branches: [main, dev] + push: + branches: [main, dev] + schedule: + # Nightly GPU ring — 09:00 UTC every day + - cron: '0 9 * * *' + +concurrency: + group: product-tests-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pull-requests: write + +jobs: + # --------------------------------------------------------------------------- + # PR gate: CPU-only, passthrough pipeline, <25 min budget + # --------------------------------------------------------------------------- + pr-gate: + if: github.event_name != 'schedule' + runs-on: ubuntu-latest + name: Product Tests (PR gate, CPU) + timeout-minutes: 25 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22.19.0' + cache: 'npm' + cache-dependency-path: frontend/package-lock.json + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + version: "0.9.11" + + - name: Build frontend + working-directory: frontend + run: 
| + npm ci + npm run build + + - name: Install product-tests deps + run: uv sync --group product-tests + + - name: Install Playwright browser + run: uv run playwright install --with-deps chromium + + - name: Run PR-gate scenarios (local mode) + env: + SCOPE_TEST_INSTRUMENTATION: "1" + CUDA_VISIBLE_DEVICES: "" + SCOPE_CLOUD_RING: "pr" + run: | + uv run pytest product-tests/scenarios/ \ + -v --tb=short -m "not cloud" + + - name: Run PR-gate chaos (local mode, fast subset) + env: + SCOPE_TEST_INSTRUMENTATION: "1" + CUDA_VISIBLE_DEVICES: "" + run: | + uv run pytest product-tests/chaos/ \ + -v --tb=short -m "not slow" --chaos-seed="${{ github.sha }}" + + - name: Run PR-gate cloud smoke + # The test fixture pytest.skips when SCOPE_CLOUD_APP_ID is empty, + # so forks / PRs without cloud-app secret access pass by skipping. + env: + SCOPE_TEST_INSTRUMENTATION: "1" + CUDA_VISIBLE_DEVICES: "" + SCOPE_CLOUD_RING: "pr" + SCOPE_CLOUD_APP_ID: ${{ secrets.SCOPE_PR_FAL_APP_ID }} + run: | + uv run pytest product-tests/scenarios/test_onboarding_cloud.py \ + -v --tb=short -m cloud + + - name: Aggregate summary + if: always() + id: summary + run: | + summary=$(find product-tests/reports -name summary.md | head -1) + if [ -n "$summary" ]; then + echo "SUMMARY_PATH=$summary" >> "$GITHUB_ENV" + { + echo "summary<<EOF" + cat "$summary" + echo "EOF" + } >> "$GITHUB_OUTPUT" + cat "$summary" + else + echo "No summary.md emitted" | tee -a "$GITHUB_STEP_SUMMARY" + fi + + - name: Post summary as PR comment + if: always() && github.event_name == 'pull_request' && steps.summary.outputs.summary != '' + uses: marocchino/sticky-pull-request-comment@v2 + with: + header: product-tests-summary + message: | + ### Product Tests — ${{ job.status }} + + ${{ steps.summary.outputs.summary }} + + Run: [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) + + - name: Upload reports on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: product-tests-reports-${{ github.run_id
}} + path: product-tests/reports/ + retention-days: 14 + + # --------------------------------------------------------------------------- + # Nightly ring: GPU, full model pipelines, <60 min budget + # --------------------------------------------------------------------------- + nightly: + if: github.event_name == 'schedule' + runs-on: [self-hosted, gpu] + name: Product Tests (Nightly, GPU) + timeout-minutes: 60 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + version: "0.9.11" + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '22.19.0' + + - name: Build frontend + working-directory: frontend + run: | + npm ci + npm run build + + - name: Install product-tests deps + run: uv sync --group product-tests + + - name: Install Playwright browser + run: uv run playwright install --with-deps chromium + + - name: Run scenarios + chaos (GPU) + env: + SCOPE_TEST_INSTRUMENTATION: "1" + SCOPE_CLOUD_APP_ID: ${{ secrets.SCOPE_NIGHTLY_FAL_APP_ID }} + SCOPE_CLOUD_RING: "nightly" + SCOPE_CHURN_DURATION_SEC: "180" + run: | + uv run pytest product-tests/scenarios/ product-tests/chaos/ \ + -v --tb=short --chaos-seed="${{ github.run_id }}" + + - name: Run release full-matrix (cloud, all starter workflows) + env: + SCOPE_TEST_INSTRUMENTATION: "1" + SCOPE_CLOUD_APP_ID: ${{ secrets.SCOPE_NIGHTLY_FAL_APP_ID }} + SCOPE_CLOUD_RING: "nightly" + run: | + uv run pytest product-tests/release/ -v --tb=short -m cloud + + - name: Run regression suite + env: + SCOPE_TEST_INSTRUMENTATION: "1" + SCOPE_CLOUD_APP_ID: ${{ secrets.SCOPE_NIGHTLY_FAL_APP_ID }} + SCOPE_CLOUD_RING: "nightly" + run: | + if [ -d product-tests/regression ] && ls product-tests/regression/test_*.py >/dev/null 2>&1; then + uv run pytest product-tests/regression/ -v --tb=short + else + echo "No regression tests yet — skipping." 
+ fi + + - name: Upload reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: product-tests-nightly-${{ github.run_id }} + path: product-tests/reports/ + retention-days: 30 diff --git a/e2e/README.md b/e2e/README.md index 40bfc88b3..eb4aa1dc6 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -1,113 +1,45 @@ -# Scope E2E Tests +# e2e/ — RETIRED -End-to-end tests for Scope onboarding and Livepeer cloud workflows. +This TypeScript Playwright scaffold has been superseded by the Python +product-tests system at [`../product-tests/`](../product-tests/README.md). -## Overview +## Where to go instead -These tests verify the full cloud flow: -1. Login to Daydream web app -2. Connect to Livepeer cloud mode -3. Start a stream with the passthrough model -4. Verify frames are being processed -5. Stop stream +- **PR-gate cloud smoke:** `product-tests/scenarios/test_onboarding_cloud.py` +- **Nightly full-matrix cloud:** `product-tests/release/test_cloud_full_matrix.py` +- **CI wiring:** `.github/workflows/product-tests.yml` -## Prerequisites +## Why it was retired -- Node.js 22+ -- A Daydream test account -- A deployed Livepeer runner to test against +The old scaffold had TypeScript + `@playwright/test` infrastructure but +no actual test bodies, no retry-counter gating, no chaos simulation, and +no PR-comment integration. The new system treats onboarding (local + +cloud) as the #1 gate, counts retries/unexpected closes as hard fails, +and scores runs across multiple product-quality dimensions. 
-## Setup +## Running the migrated tests ```bash -cd e2e -npm install -npx playwright install --with-deps chromium -``` - -## Running Tests +# Install the product-tests dep group: +uv sync --group product-tests +uv run playwright install chromium -### Environment Variables +# Local PR gate: +cd product-tests && uv run pytest scenarios/ chaos/ -| Variable | Required | Description | -|----------|----------|-------------| -| `SCOPE_CLOUD_APP_ID` | Yes | Livepeer fal app ID (e.g., `daydream/scope-livepeer-pr-123--preview/ws`) | -| `DAYDREAM_TEST_EMAIL` | Yes | Test user email for Daydream login | -| `DAYDREAM_TEST_PASSWORD` | Yes | Test user password | -| `DAYDREAM_BASE_URL` | No | Base URL for Daydream app (default: `https://app.daydream.live`) | +# Cloud (PR-deployed fal app): +SCOPE_CLOUD_APP_ID=daydream/scope-livepeer-pr-123--preview/ws \ + uv run pytest product-tests/scenarios/test_onboarding_cloud.py -### Run Tests - -```bash -# Headless mode (CI) +# Nightly full matrix: +SCOPE_CLOUD_RING=nightly \ SCOPE_CLOUD_APP_ID=daydream/scope-livepeer--prod/ws \ -DAYDREAM_TEST_EMAIL=test@example.com \ -DAYDREAM_TEST_PASSWORD=secret \ -npm test - -# With browser visible (debugging) -npm run test:headed - -# Interactive UI mode -npm run test:ui - -# Debug mode (step through) -npm run test:debug + uv run pytest product-tests/release/ ``` -### View Report - -After running tests: - -```bash -npm run report -``` - -## CI Integration - -These tests run automatically on every PR via GitHub Actions: - -1. **Docker Build** workflow builds the image -2. **Deploy PR to fal** workflow deploys a PR-specific Livepeer runner -3. **E2E Tests** workflow runs these tests against the deployment +## Leftover files -Results are posted as comments on the PR. - -## Test Structure - -``` -e2e/ -├── playwright.config.ts # Playwright configuration -├── package.json -└── README.md -``` - -## Debugging Failed Tests - -When tests fail in CI: -1. Check the workflow run for logs -2. 
Download the `test-artifacts` artifact for: - - Screenshots on failure - - Video recordings - - Playwright traces - -To view traces locally: -```bash -npx playwright show-trace path/to/trace.zip -``` - -## Writing New Tests - -```typescript -import { test, expect } from "@playwright/test"; - -test("my new cloud test", async ({ page }) => { - // Tests use saved auth state, so you're already logged in - await page.goto("/"); - - // Your test logic here - // Use data-testid attributes for reliable selectors - const element = page.locator('[data-testid="my-element"]'); - await expect(element).toBeVisible(); -}); -``` +`package.json`, `package-lock.json`, and `playwright.config.ts` remain +in place to avoid breaking any in-flight CI references. They can be +removed in a follow-up cleanup PR once the product-tests CI rings have +run green for a cycle. diff --git a/frontend/src/components/VideoOutput.tsx b/frontend/src/components/VideoOutput.tsx index 2fa704ad1..e5ee867a7 100644 --- a/frontend/src/components/VideoOutput.tsx +++ b/frontend/src/components/VideoOutput.tsx @@ -213,6 +213,7 @@ export function VideoOutput({ For audio-only streams it acts as an invisible audio sink. */}