diff --git a/.claude/launch.json b/.claude/launch.json index 9ccce1cb4..0d123c94e 100644 --- a/.claude/launch.json +++ b/.claude/launch.json @@ -6,6 +6,24 @@ "runtimeExecutable": "bash", "runtimeArgs": ["-c", "source ~/.nvm/nvm.sh && cd frontend && npm run dev"], "port": 5173 + }, + { + "name": "backend", + "runtimeExecutable": "bash", + "runtimeArgs": [ + "-c", + "CUDA_VISIBLE_DEVICES='' uv run daydream-scope --port 8033" + ], + "port": 8033 + }, + { + "name": "scope-cloud", + "runtimeExecutable": "bash", + "runtimeArgs": [ + "-c", + "CUDA_VISIBLE_DEVICES='' SCOPE_CLOUD_MODE=livepeer SCOPE_CLOUD_APP_ID='daydream/scope-livepeer-pr-971--preview/ws' uv run daydream-scope" + ], + "port": 8000 } ] } diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml new file mode 100644 index 000000000..012aa0094 --- /dev/null +++ b/.github/workflows/eval.yml @@ -0,0 +1,65 @@ +name: Agent Evals + +# Manual-dispatch only. These evals hit the live Anthropic API, so we do NOT +# run them on push/pull_request — they cost money and are inherently noisy. 
+on: + workflow_dispatch: + inputs: + case: + description: "Case name to run (blank = all cases)" + required: false + default: "" + runs: + description: "Samples per case" + required: false + default: "5" + model: + description: "Model id override (blank = default)" + required: false + default: "" + fail_threshold: + description: "Overall pass-rate threshold (0-100; blank = no gate)" + required: false + default: "" + +jobs: + evals: + runs-on: ubuntu-latest + name: Run Scope agent evals + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + version: "0.9.11" + + - name: Install dependencies + run: uv sync --group dev + + - name: Run evals + shell: bash + run: | + args=(--runs "${{ inputs.runs }}") + if [ -n "${{ inputs.case }}" ]; then + args+=(--case "${{ inputs.case }}") + fi + if [ -n "${{ inputs.model }}" ]; then + args+=(--model "${{ inputs.model }}") + fi + if [ -n "${{ inputs.fail_threshold }}" ]; then + args+=(--fail-threshold "${{ inputs.fail_threshold }}") + fi + uv run python -m evals "${args[@]}" + + - name: Upload artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-artifacts + path: evals/outputs/ + retention-days: 14 diff --git a/.gitignore b/.gitignore index e89385653..629a0789d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,6 @@ notes/ .cursor/ .specstory/ *.local* + +# Eval harness artifacts +evals/outputs/ diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 000000000..173130a4e --- /dev/null +++ b/evals/README.md @@ -0,0 +1,138 @@ +# Scope Agent Eval Harness + +Measures how often the agentic workflow builder produces a workflow that +matches the user's intent from a single natural-language prompt. + +Each **case** = one prompt + structural checks. 
The runner drives the real +agent via in-process ASGI (no uvicorn, no port) N times and prints a +pass-rate table per case. + +## Quickstart + +```bash +# Install deps (one-time): +uv sync --group dev + +# Ensure an Anthropic key is set: +export ANTHROPIC_API_KEY=sk-ant-... + +# Run everything, 5 samples per case (default): +uv run python -m evals + +# Run just one case, 1 sample (fast smoke): +uv run python -m evals --case starter-ltx-text-to-video --runs 1 + +# Cheaper iteration: +uv run python -m evals --model claude-haiku-4-5 + +# Enforce a bar in CI-like mode: +uv run python -m evals --runs 10 --fail-threshold 90 +``` + +Artifacts land in `evals/outputs//r/`: + +- `proposal.json` — the full graph the agent proposed. +- `meta.json` — pass/fail, failures, rationale, wall time. +- `trace.jsonl` — every SSE event the agent emitted (one per line). + +## Authoring a case + +Drop a file in `evals/cases/my-case.yaml`: + +```yaml +name: my-case +description: one-line explanation of what good looks like +prompt: | + A natural-language prompt — as if a user typed it into the agent chat. +runs: 5 +expect: + # Each entry is a single-key mapping: {check_name: argument}. + - pipelines_include: [longlive] + - wire_present: { kind: vace_to_pipeline } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" +``` + +### Available checks + +Registered in [`grader.py`](grader.py): + +| Check | Argument | Passes when… | +| ----- | -------- | ------------ | +| `pipelines_equal` | `[ids]` | Pipeline nodes' `pipeline_id`s exactly equal the set. | +| `pipelines_include` | `[ids]` | Pipeline nodes include every id in the list (extras ok). | +| `pipelines_count_at_least` | `int` | At least N pipeline nodes exist (any ids). Good for vague prompts. | +| `lora_count_at_least` | `int` | Total LoRA entries across `lora` UI nodes ≥ N. | +| `wire_present` | `{kind, …}` | An edge of the named kind exists. See below. 
| +| `node_present` | `{type, count?, min_items?}` | ≥ `count` UI nodes of `type`. For `prompt_list`, `min_items` asserts list length. | +| `no_validator_errors` | _(any)_ | `_validate_proposal()` returns zero errors on the graph. | +| `bad_handle_prefix` | `"parameter:"` | (Forbid) No edge handle starts with the prefix. | +| `orphan_sinks` | _(any)_ | (Forbid) Every top-level `sink` node has at least one incoming top-level `stream` edge. Catches cases where the agent tacks on an extra sink that isn't wired to anything. | + +`wire_present` kinds: + +| Kind | Extra args | Matches | +| ---- | ---------- | ------- | +| `slider_to_pipeline_param` | `target_handle: "param:noise_scale"` | UI-value node → pipeline's `targetHandle`. | +| `vace_to_pipeline` | — | VACE UI node → pipeline's `param:__vace`. | +| `image_to_vace` | — | Image (or value) node → VACE node's `param:ref_image`/`first_frame`/`last_frame`. | +| `prompt_to_pipeline` | — | Any source → pipeline's `param:__prompt`. | +| `lora_to_pipeline` | — | LoRA node → pipeline's `param:__loras`. | +| `prompt_list_to_pipeline` | — | `prompt_list` UI node → pipeline's `param:__prompt`. | +| `trigger_to_prompt_list` | — | Value source → `prompt_list`'s `param:trigger`/`param:cycle`. | +| `pipeline_to_record` | — | A pipeline's stream output → a `record` UI node. | + +Adding a new check type = adding a function to `grader.py` and registering +it in `CHECKS`. The YAML format picks it up automatically. + +### Case tone: precise vs. vague + +Real users send prompts across a wide range of specificity. Cases should +cover that range: + +- **Precise** (`complex-krea-prompt-switch-record`) — the prompt names the + pipeline, exact counts, specific behaviors. Graders assert the precise + structure: `pipelines_include: [krea-realtime-video]`, + `node_present: { type: prompt_list, min_items: 5 }`, specific wires. +- **Vague** (`vague-capture-moments`) — the prompt says what the user + wants to *do*, not how. 
Graders assert only what the intent clearly + implies (`pipelines_count_at_least: 1`, `node_present: { type: record }`). + The agent gets latitude on everything else; the eval measures whether + it makes reasonable choices. + +Prefer more vague cases as pass-rate on precise ones improves — vague +ones surface filling-the-gaps failures that don't show up when every +detail is spelled out. + +## Pytest integration + +A single smoke test at `tests/test_evals_smoke.py` runs one case under +`@pytest.mark.eval`. Default `pytest` skips it (pytest-ini addopts +`-m 'not eval'`). To include it: + +```bash +uv run pytest -m eval +``` + +This only verifies the harness wires up end-to-end — it doesn't enforce +pass-rates. For pass-rate enforcement, use `python -m evals`. + +## CI + +There is a `.github/workflows/eval.yml` that runs on manual dispatch only +(`workflow_dispatch`). It is **not** hooked into `pull_request` or `push` +— LLM evals cost money and are inherently noisy at the edges. Gate launch +decisions on the number, not on PR green. + +## Design notes + +- The driver uses `httpx.ASGITransport` + `asgi-lifespan` so we hit the + real `/api/v1/agent/chat` endpoint without spawning a server. This is + the same endpoint the frontend uses, so behavior is identical to + production. +- Each case spins up an isolated `AgentSession`; no cross-case + contamination. Conversation history does not leak between runs. +- Grading is deterministic and structural. No LLM-as-judge in v1. +- Model/provider overrides flow through the on-disk agent config file so + runs respect the same resolution order the server uses. diff --git a/evals/__init__.py b/evals/__init__.py new file mode 100644 index 000000000..42cc95f24 --- /dev/null +++ b/evals/__init__.py @@ -0,0 +1,10 @@ +"""Eval harness for the Scope agentic workflow builder. 
+ +Each "case" is a YAML file in ``evals/cases/`` describing a natural-language +prompt, how many times to sample the model, and structural checks to run on +the resulting workflow proposal. The runner drives the real agent via an +in-process ASGI transport and grades proposals deterministically. + +This package is NOT imported by the running server; it is only exercised by +``python -m evals`` (CLI) and the opt-in ``pytest -m eval`` smoke test. +""" diff --git a/evals/__main__.py b/evals/__main__.py new file mode 100644 index 000000000..dda39d6e0 --- /dev/null +++ b/evals/__main__.py @@ -0,0 +1,10 @@ +"""``python -m evals`` entry point.""" + +from __future__ import annotations + +import sys + +from .runner import main + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/evals/case.py b/evals/case.py new file mode 100644 index 000000000..8ccd4b5de --- /dev/null +++ b/evals/case.py @@ -0,0 +1,104 @@ +"""YAML → Case dataclass loader for the eval harness. + +A case file looks like:: + + name: starter-mythical-creature + description: | + Reproduces the Mythical Creature teaching starter. + prompt: | + I want a slime creature ... + runs: 5 + expect: + - pipelines_equal: [longlive] + - wire_present: { kind: vace_to_pipeline } + forbid: + - bad_handle_prefix: "parameter:" + +Each entry under ``expect`` / ``forbid`` is a single-key mapping whose key is +the name of a check in :mod:`evals.grader` and whose value is the check +argument. We deliberately keep the format flat and declarative so adding a +case is just dropping a new YAML file. 
@dataclass
class CheckSpec:
    """One graded check: ``(name, arg)`` where ``name`` resolves to a function
    in :mod:`evals.grader`."""

    # Key of the single-key YAML mapping this spec was parsed from.
    name: str
    # Value of that mapping, stored untouched (list, dict, scalar, ...).
    arg: Any


@dataclass
class Case:
    """One eval case as loaded from a YAML file (see :func:`load_case`)."""

    name: str
    prompt: str
    description: str = ""
    # Number of samples requested by the case file.
    runs: int = 5
    expect: list[CheckSpec] = field(default_factory=list)
    forbid: list[CheckSpec] = field(default_factory=list)
    # File the case was loaded from; ``None`` when built programmatically.
    source_path: Path | None = None
    # When true, the case passes iff the agent did NOT emit a
    # ``workflow_proposal`` SSE event. Used for runtime-tweak cases where
    # the right tool is ``update_parameters`` and re-proposing the graph
    # is the regression we want to catch.
    forbid_proposal: bool = False


def _parse_check_list(raw: list[Any], context: str) -> list[CheckSpec]:
    """Convert a list of single-key mappings to ``CheckSpec``s.

    Args:
        raw: The value found under ``expect`` / ``forbid``. Any falsy value
            (``None``, ``[]``) yields an empty list.
        context: Prefix used in error messages, e.g. ``"<path>:expect"``.

    Returns:
        One :class:`CheckSpec` per entry, in input order.

    Raises:
        ValueError: If ``raw`` is not a list, an entry is not a single-key
            mapping, or a check name is not a string.
    """
    if not raw:
        return []
    # A mapping or bare string under expect/forbid would otherwise be iterated
    # key-by-key / char-by-char and produce a misleading per-entry error.
    if not isinstance(raw, (list, tuple)):
        raise ValueError(
            f"{context} must be a list of single-key mappings, got: {raw!r}"
        )

    out: list[CheckSpec] = []
    for idx, entry in enumerate(raw):
        if not isinstance(entry, dict) or len(entry) != 1:
            raise ValueError(
                f"{context}[{idx}] must be a single-key mapping, got: {entry!r}"
            )
        ((name, arg),) = entry.items()
        if not isinstance(name, str):
            raise ValueError(f"{context}[{idx}] check name must be a string")
        out.append(CheckSpec(name=name, arg=arg))
    return out


def load_case(path: Path) -> Case:
    """Load a single case YAML file into a :class:`Case`.

    ``name`` falls back to the file stem when absent; ``runs`` defaults to 5.

    Raises:
        ValueError: If the top level is not a mapping, ``prompt`` is missing
            or blank, ``runs`` is not a positive integer, or a check list is
            malformed.
    """
    # Case files contain non-ASCII text (≥, →, —); decode as UTF-8 explicitly
    # instead of relying on the platform's locale encoding.
    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"{path}: expected a mapping at top level")

    name = data.get("name") or path.stem
    prompt = data.get("prompt")
    if not isinstance(prompt, str) or not prompt.strip():
        raise ValueError(f"{path}: 'prompt' is required and must be a non-empty string")

    runs = data.get("runs", 5)
    # ``bool`` is a subclass of ``int``; ``runs: true`` must not count as 1.
    if isinstance(runs, bool) or not isinstance(runs, int) or runs < 1:
        raise ValueError(f"{path}: 'runs' must be a positive integer")

    return Case(
        name=str(name),
        prompt=prompt,
        description=str(data.get("description") or ""),
        runs=runs,
        expect=_parse_check_list(data.get("expect") or [], f"{path}:expect"),
        forbid=_parse_check_list(data.get("forbid") or [], f"{path}:forbid"),
        source_path=path,
        forbid_proposal=bool(data.get("forbid_proposal", False)),
    )


def discover_cases(cases_dir: Path) -> list[Case]:
    """Load every ``*.yaml`` / ``*.yml`` case in ``cases_dir``, alpha-sorted.

    Only regular files directly inside ``cases_dir`` are considered (no
    recursion). Errors from :func:`load_case` propagate, so one invalid file
    aborts discovery.
    """
    paths = sorted(
        p for p in cases_dir.iterdir() if p.suffix in (".yaml", ".yml") and p.is_file()
    )
    return [load_case(p) for p in paths]
+ - node_present: { type: record } + - wire_present: { kind: pipeline_to_record } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/complex-pipeline-name-respect.yaml b/evals/cases/complex-pipeline-name-respect.yaml new file mode 100644 index 000000000..9b9f06090 --- /dev/null +++ b/evals/cases/complex-pipeline-name-respect.yaml @@ -0,0 +1,16 @@ +name: complex-pipeline-name-respect +description: | + User explicitly names krea. Agent must pick krea-realtime-video, not + substitute a different pipeline. Pressure-tests the "honor the user's + pipeline name" rule added to CORE PRINCIPLES. +prompt: | + Give me a krea workflow for my webcam with a slider for noise_scale. +runs: 5 +expect: + - pipelines_include: [krea-realtime-video] + - wire_present: { kind: slider_to_pipeline_param, target_handle: "param:noise_scale" } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/complex-reference-image-no-invented-handles.yaml b/evals/cases/complex-reference-image-no-invented-handles.yaml new file mode 100644 index 000000000..e6f3771c1 --- /dev/null +++ b/evals/cases/complex-reference-image-no-invented-handles.yaml @@ -0,0 +1,19 @@ +name: complex-reference-image-no-invented-handles +description: | + Asks for reference-image conditioning on a VACE-capable pipeline. The + only correct route is image → vace → pipeline.param:__vace — NOT an + invented param:i2v_image / param:ref handle. The backend validator + catches invented handles, but we assert the positive VACE path here + so the agent can't silently skip reference-image support either. +prompt: | + Set up longlive with my webcam and a reference image I can swap out. 
+runs: 5 +expect: + - pipelines_include: [longlive] + - wire_present: { kind: image_to_vace } + - wire_present: { kind: vace_to_pipeline } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/layout-nodes-spaced.yaml b/evals/cases/layout-nodes-spaced.yaml new file mode 100644 index 000000000..07a6abdb2 --- /dev/null +++ b/evals/cases/layout-nodes-spaced.yaml @@ -0,0 +1,27 @@ +name: layout-nodes-spaced +description: | + Regression guard for node positioning. Observed failure mode: the agent + placed trigger/subgraph/image/slider UI nodes at x=0 and x=320, which + collides with the frontend's top-level auto-layout strip (sources at x=50, + pipelines at x=350). Result: nodes visually stacked on top of each other + on the canvas even though edges were correct. + + This case exercises the failure surface (many UI-state nodes alongside + multiple top-level nodes) and grades that NO two nodes overlap — relying + on either the agent placing them correctly OR the server-side + ``_reflow_ui_nodes`` safety net kicking in. +prompt: | + Build me a longlive workflow where I can control three things live: a + noise_scale slider, a num_steps slider, and a prompt_list with at least + 3 prompts that I can switch between with separate trigger buttons. Wire + them all up so I can tweak each one while it's running. 
+runs: 5 +expect: + - pipelines_include: [longlive] + - node_present: { type: slider } + - node_present: { type: prompt_list, min_items: 3 } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/runtime-tweak-no-repropose.yaml b/evals/cases/runtime-tweak-no-repropose.yaml new file mode 100644 index 000000000..ae9092251 --- /dev/null +++ b/evals/cases/runtime-tweak-no-repropose.yaml @@ -0,0 +1,19 @@ +name: runtime-tweak-no-repropose +description: | + Regression guard for "I should never have to reload a workflow just to + change a parameter." Observed failure: after the user asks to tweak a + single runtime-settable param on an already-running pipeline, the agent + calls ``propose_workflow`` to rebuild the entire graph, forcing the user + to approve and reload. The right tool is ``update_parameters``, which is + a silent live-tweak. + + We frame the prompt so the only reasonable interpretation is "change this + one number on the running graph" — no new nodes, no rewiring. The grader + then forbids ``workflow_proposal`` entirely for this case via the + ``forbid_proposal`` Case field. +prompt: | + My longlive workflow is already loaded and streaming. I just want to + change noise_scale to 0.85. Don't rebuild or reload the workflow — + just update that parameter live. +runs: 5 +forbid_proposal: true diff --git a/evals/cases/starter-dissolving-sunflower.yaml b/evals/cases/starter-dissolving-sunflower.yaml new file mode 100644 index 000000000..249fe043f --- /dev/null +++ b/evals/cases/starter-dissolving-sunflower.yaml @@ -0,0 +1,23 @@ +name: starter-dissolving-sunflower +description: | + Reproduces the "Dissolving Sunflower" teaching starter: a camera feed runs + through video-depth-anything first, then into longlive with a dissolve + LoRA and VACE reference, then out through a passthrough pipeline. +prompt: | + Build me a depth-driven dissolve effect on my camera. 
Chain + video-depth-anything into longlive into passthrough. On the longlive node, + load the dissolve LoRA ("daydream-scope-dissolve.safetensors") at weight + around 1.5, and enable VACE with the depth output used as the reference + video. The main prompt should describe a "dissolving sunflower in + abstract particles". +runs: 5 +expect: + - pipelines_include: [video-depth-anything, longlive, passthrough] + - lora_count_at_least: 1 + - wire_present: { kind: vace_to_pipeline } + - wire_present: { kind: prompt_to_pipeline } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/starter-ltx-text-to-video.yaml b/evals/cases/starter-ltx-text-to-video.yaml new file mode 100644 index 000000000..eb46ad524 --- /dev/null +++ b/evals/cases/starter-ltx-text-to-video.yaml @@ -0,0 +1,19 @@ +name: starter-ltx-text-to-video +description: | + Reproduces the "LTX 2.3" teaching starter: a single ltx2 pipeline running + in text-to-video mode driven by a prompt. +prompt: | + I want a simple text-to-video workflow using the ltx2 pipeline. Just one + pipeline node, wired straight to an output sink. Set up the prompt input + so I can type what I want to generate — for now, have it read + "a majestic lion striding across an open savannah". No LoRAs, no VACE, + no camera input needed; ltx2 is generating from text. 
+runs: 5 +expect: + - pipelines_equal: [ltx2] + - wire_present: { kind: prompt_to_pipeline } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/starter-mythical-creature.yaml b/evals/cases/starter-mythical-creature.yaml new file mode 100644 index 000000000..61a89fd2d --- /dev/null +++ b/evals/cases/starter-mythical-creature.yaml @@ -0,0 +1,24 @@ +name: starter-mythical-creature +description: | + Reproduces the "Mythical Creature" teaching starter: longlive pipeline with + two style LoRAs (acid-lime + dissolve) and VACE enabled, plus a slider for + noise_scale that the user can tweak live. +prompt: | + I want to turn my webcam feed into a morphing slime creature. Use the + longlive pipeline. Load two LoRAs I already have by filename — + "diffslime_acidzlime-000016.safetensors" and + "daydream-scope-dissolve.safetensors" — both at weight ~1.5 in permanent + merge mode. Turn VACE on so I can feed the input video back in as a + reference. Give me a live slider wired to noise_scale so I can tweak it + while it's running. +runs: 5 +expect: + - pipelines_include: [longlive] + - lora_count_at_least: 2 + - wire_present: { kind: slider_to_pipeline_param, target_handle: "param:noise_scale" } + - wire_present: { kind: lora_to_pipeline } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/vague-capture-moments.yaml b/evals/cases/vague-capture-moments.yaml new file mode 100644 index 000000000..0e7e5a353 --- /dev/null +++ b/evals/cases/vague-capture-moments.yaml @@ -0,0 +1,22 @@ +name: vague-capture-moments +description: | + Deliberately vague prompt — user says what they want to *do*, not which + pipeline or what kind of node graph. Graders only check the bits that the + intent clearly implies: camera/webcam source, *some* pipeline, and a way + to capture output. 
This pressure-tests the agent's ability to fill gaps. +prompt: | + I want to play around with my webcam and capture anything cool that happens. + Set up something simple I can tweak live. +runs: 5 +expect: + # Intent is clear: there must be at least one generative pipeline. We do + # NOT assert *which* pipeline — the agent gets to pick. + - pipelines_count_at_least: 1 + # Intent is clear: a way to record / capture output. + - node_present: { type: record } + - wire_present: { kind: pipeline_to_record } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/cases/vague-save-the-output.yaml b/evals/cases/vague-save-the-output.yaml new file mode 100644 index 000000000..e839a33e0 --- /dev/null +++ b/evals/cases/vague-save-the-output.yaml @@ -0,0 +1,20 @@ +name: vague-save-the-output +description: | + "Save" / "keep what I make" / "capture" all mean "add a record node". + This exercises the recording completeness-check item added to the + SYSTEM_PROMPT. Uses natural phrasing ("saves whatever I make") rather + than the literal word "record" to test intent-matching, not keyword- + matching. +prompt: | + Set me up with a passthrough pipeline on my camera and make sure it + saves whatever I make. +runs: 5 +expect: + - pipelines_include: [passthrough] + - node_present: { type: record } + - wire_present: { kind: pipeline_to_record } + - no_validator_errors: true +forbid: + - bad_handle_prefix: "parameter:" + - orphan_sinks: true + - overlapping_nodes: true diff --git a/evals/driver.py b/evals/driver.py new file mode 100644 index 000000000..cb48da2ac --- /dev/null +++ b/evals/driver.py @@ -0,0 +1,148 @@ +"""In-process driver for the Scope agent. + +Hits ``POST /api/v1/agent/chat`` via ``httpx.ASGITransport`` + ``asgi-lifespan`` +so no uvicorn server or port is needed. 
logger = logging.getLogger(__name__)


@dataclass
class DriveResult:
    """Everything captured from one driven agent turn."""

    proposal: dict | None = None  # the 'graph' from workflow_proposal SSE
    proposal_id: str | None = None
    rationale: str = ""
    # Every SSE event seen, in arrival order, as ``{"event": ..., "data": ...}``.
    trace: list[dict] = field(default_factory=list)
    # Agent ``error`` events and/or a transport failure; ``None`` when clean.
    error: str | None = None
    # Value of the ``x-agent-session-id`` response header, if present.
    session_id: str | None = None


async def _parse_sse_stream(resp: httpx.Response) -> AsyncIterator[dict]:
    """Yield ``{event, data}`` dicts. Swallows malformed lines.

    One dict is yielded per ``data:`` line. ``event`` is the most recent
    ``event:`` field since the last blank line, or ``"message"`` when none
    was given. ``data`` is the JSON-decoded payload, or ``{"_raw": payload}``
    when the payload is not valid JSON. Any other line is ignored.
    """
    current_event: str | None = None
    async for raw_line in resp.aiter_lines():
        # Strip a trailing CR *before* testing for the blank separator so a
        # "\r"-only line also ends the current event.
        line = raw_line.rstrip("\r")
        if line == "":
            current_event = None
            continue
        if line.startswith("event:"):
            current_event = line.split(":", 1)[1].strip()
        elif line.startswith("data:"):
            payload = line.split(":", 1)[1].strip()
            try:
                data = json.loads(payload)
            except (ValueError, RecursionError):
                # Not JSON (or pathologically nested): keep the raw text.
                data = {"_raw": payload}
            yield {"event": current_event or "message", "data": data}


async def run_case(
    app: Any,
    prompt: str,
    *,
    model_override: str | None = None,
    provider_override: str | None = None,
) -> DriveResult:
    """Drive one agent turn with ``prompt`` and return the captured result.

    ``app`` is the FastAPI app instance (usually ``scope.server.app.app``).
    The request body carries only the message — no session id — so each call
    is expected to get a freshly minted session (no cross-case contamination).

    Args:
        app: ASGI application to drive in-process.
        prompt: User message sent to ``POST /api/v1/agent/chat``.
        model_override: Optional model id written to the agent config first.
        provider_override: Optional provider written to the agent config first.

    Returns:
        A :class:`DriveResult`. This function does not raise for HTTP,
        transport or agent errors; they are reported via ``result.error``.
    """
    # Apply provider/model overrides by writing to the config file on disk
    # (that's what the app reads). We rely on the caller to have scoped this
    # via EnvOverride if they want to reset it after.
    if model_override or provider_override:
        _patch_agent_config(model=model_override, provider=provider_override)

    result = DriveResult()
    transport = httpx.ASGITransport(app=app)
    try:
        # Scope's startup runs plugin installs, pipeline registration, WebRTC
        # setup, and OSC init — way past asgi-lifespan's 5s default. Give it
        # plenty of headroom; a cold first run on CI can take >30s.
        async with LifespanManager(app, startup_timeout=180, shutdown_timeout=30):
            async with httpx.AsyncClient(
                transport=transport,
                base_url="http://scope-eval.local",
                timeout=httpx.Timeout(300.0, connect=10.0),
            ) as client:
                async with client.stream(
                    "POST",
                    "/api/v1/agent/chat",
                    json={"message": prompt},
                ) as resp:
                    if resp.status_code != 200:
                        body = await resp.aread()
                        result.error = (
                            f"HTTP {resp.status_code}: "
                            f"{body.decode('utf-8', errors='replace')[:500]}"
                        )
                        return result
                    result.session_id = resp.headers.get("x-agent-session-id")
                    async for evt in _parse_sse_stream(resp):
                        result.trace.append(evt)
                        name = evt["event"]
                        data = evt["data"]
                        # A payload may decode to a JSON array/scalar; only
                        # mappings carry the fields read below.
                        fields = data if isinstance(data, dict) else {}
                        if name == "workflow_proposal":
                            # First proposal wins — agent should only emit one.
                            if result.proposal is None:
                                result.proposal = fields.get("graph")
                                result.proposal_id = fields.get("proposal_id")
                                result.rationale = fields.get("rationale") or ""
                        elif name == "error":
                            # Don't short-circuit — the turn_end still arrives
                            # and the trace is useful for debugging.
                            msg = fields.get("message") or str(data)
                            result.error = (result.error or "") + msg + "\n"
                        elif name == "turn_end":
                            # Agent finished. We don't need more events.
                            break
    except Exception as e:
        logger.exception("driver transport error")
        # Append rather than overwrite so agent errors collected before the
        # failure are not lost.
        result.error = (result.error or "") + f"{type(e).__name__}: {e}"
    return result


def _patch_agent_config(*, model: str | None, provider: str | None) -> None:
    """Update provider/model in the on-disk agent config. Safe to call repeatedly.

    Loads the current config with the server's own helpers, replaces
    ``provider`` and/or ``model`` with the given non-empty values, and saves
    it back. The change is persistent; nothing here restores the old values.

    NOTE(review): the config is rebuilt from ``provider``, ``model`` and
    ``base_url`` only. If ``AgentConfig`` has other fields they would revert
    to their defaults here — confirm against ``scope.server.agent_state``.
    """
    from scope.server.agent_state import (
        AgentConfig,
        load_agent_config,
        save_agent_config,
    )

    cfg = load_agent_config()
    if provider or model:
        cfg = AgentConfig(
            provider=provider or cfg.provider,  # type: ignore[arg-type]
            model=model or cfg.model,
            base_url=cfg.base_url,
        )
    save_agent_config(cfg)
@dataclass
class CheckResult:
    """Outcome of one check: ``ok`` plus a human-readable ``detail``."""

    ok: bool
    detail: str

    @classmethod
    def ok_(cls, detail: str = "") -> CheckResult:
        """Build a passing result (trailing underscore avoids the ``ok`` field)."""
        return cls(ok=True, detail=detail)

    @classmethod
    def fail(cls, detail: str) -> CheckResult:
        """Build a failing result carrying the reason."""
        return cls(ok=False, detail=detail)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _as_id_set(arg: Any) -> set[str]:
    """Normalise a check argument to a set of ids.

    A bare string counts as ONE id, so ``pipelines_include: longlive`` in a
    case file is not split into characters by ``set()``. Falsy arguments
    (``None``, ``[]``, ``""``) yield the empty set.
    """
    if not arg:
        return set()
    if isinstance(arg, str):
        return {arg}
    return set(arg)


def _pipeline_ids(graph: dict) -> list[str]:
    """``pipeline_id`` of every top-level pipeline node that has one (with duplicates)."""
    return [
        n["pipeline_id"]
        for n in _top_level_nodes(graph)
        if n.get("type") == "pipeline" and n.get("pipeline_id")
    ]


def _pipeline_node_ids(graph: dict) -> set[str]:
    """Node ``id`` of every top-level pipeline node that has one."""
    return {
        n["id"]
        for n in _top_level_nodes(graph)
        if n.get("type") == "pipeline" and n.get("id")
    }


def _ui_nodes(graph: dict) -> list[dict]:
    """Nodes stored under ``graph["ui_state"]["nodes"]`` (``[]`` when absent)."""
    return (graph.get("ui_state") or {}).get("nodes") or []


def _ui_edges(graph: dict) -> list[dict]:
    """Edges stored under ``graph["ui_state"]["edges"]`` (``[]`` when absent)."""
    return (graph.get("ui_state") or {}).get("edges") or []


def _ui_node_type(graph: dict, node_id: str) -> str | None:
    """``type`` of the first ui_state node whose id is ``node_id``, else ``None``."""
    for n in _ui_nodes(graph):
        if n.get("id") == node_id:
            return n.get("type")
    return None


# Top-level (backend) graph helpers. The backend graph only accepts these
# four node types; anything else lives in ui_state (see SYSTEM_PROMPT's
# GRAPH SHAPE section).
_TOP_LEVEL_TYPES = {"source", "pipeline", "sink", "record"}


def _top_level_nodes(graph: dict) -> list[dict]:
    """Nodes stored under ``graph["nodes"]`` (``[]`` when absent or null)."""
    return graph.get("nodes") or []


def _top_level_edges(graph: dict) -> list[dict]:
    """Edges stored under ``graph["edges"]`` (``[]`` when absent or null)."""
    return graph.get("edges") or []


def _top_level_node_type(graph: dict, node_id: str) -> str | None:
    """``type`` of the first top-level node whose id is ``node_id``, else ``None``."""
    for n in _top_level_nodes(graph):
        if n.get("id") == node_id:
            return n.get("type")
    return None


def _nodes_of_type(graph: dict, want_type: str) -> list[dict]:
    """Return all nodes of ``want_type`` from wherever they legally live.

    Top-level kinds (source/pipeline/sink/record) are searched in the
    backend graph; everything else (slider, vace, prompt_list, ...)
    lives in ui_state. This matches the producer-side split enforced by
    the SYSTEM_PROMPT + backend validator.
    """
    pool = _top_level_nodes(graph) if want_type in _TOP_LEVEL_TYPES else _ui_nodes(graph)
    return [n for n in pool if n.get("type") == want_type]


# ---------------------------------------------------------------------------
# Checks — expect / forbid semantics are both `ok=True` means "assertion
# holds". The runner inverts for forbid.
# ---------------------------------------------------------------------------


def pipelines_equal(graph: dict, arg: Any) -> CheckResult:
    """Pass when the set of pipeline ids in ``graph`` equals ``arg``.

    Comparison is set-based: duplicate pipeline nodes with the same id do not
    affect the result. ``arg`` may be a list of ids or a single id string.
    """
    want = _as_id_set(arg)
    got = set(_pipeline_ids(graph))
    if got == want:
        return CheckResult.ok_(f"pipelines={sorted(got)}")
    missing = sorted(want - got)
    extra = sorted(got - want)
    parts = []
    if missing:
        parts.append(f"missing={missing}")
    if extra:
        parts.append(f"extra={extra}")
    return CheckResult.fail(", ".join(parts) or f"got={sorted(got)}")


def pipelines_include(graph: dict, arg: Any) -> CheckResult:
    """Pass when every id in ``arg`` appears among the graph's pipeline ids.

    Extra pipelines are allowed. ``arg`` may be a list of ids or a single id
    string.
    """
    want = _as_id_set(arg)
    got = set(_pipeline_ids(graph))
    missing = sorted(want - got)
    if missing:
        return CheckResult.fail(f"missing={missing}, got={sorted(got)}")
    return CheckResult.ok_(f"got={sorted(got)}")


def pipelines_count_at_least(graph: dict, arg: Any) -> CheckResult:
    """Assert at least N pipeline nodes exist, without pinning which ones.

    Useful for vague prompts where the agent gets to pick the pipeline.
    ``arg`` is coerced with ``int()``; duplicates count individually.
    """
    min_count = int(arg)
    got = _pipeline_ids(graph)
    if len(got) >= min_count:
        return CheckResult.ok_(f"{len(got)} pipeline(s): {sorted(set(got))}")
    return CheckResult.fail(f"need >= {min_count} pipeline(s), got {len(got)}")
+ """ + min_count = int(arg) + got = _pipeline_ids(graph) + if len(got) >= min_count: + return CheckResult.ok_(f"{len(got)} pipeline(s): {sorted(set(got))}") + return CheckResult.fail(f"need >= {min_count} pipeline(s), got {len(got)}") + + +def lora_count_at_least(graph: dict, arg: Any) -> CheckResult: + min_count = int(arg) + # Two reasonable places: a dedicated `lora` UI node, or a `lora` node + # with multiple entries in data.loras[]. Sum across both. + total = 0 + lora_node_count = 0 + for n in _ui_nodes(graph): + if n.get("type") == "lora": + lora_node_count += 1 + inner = (n.get("data") or {}).get("loras") or [] + total += max(1, len(inner)) + if total >= min_count: + return CheckResult.ok_( + f"{total} lora entr(ies) across {lora_node_count} node(s)" + ) + return CheckResult.fail( + f"need >= {min_count}, found {total} across {lora_node_count} lora node(s)" + ) + + +def no_validator_errors(graph: dict, _arg: Any) -> CheckResult: + """Re-run the backend validator. Getting a proposal at all implies this + passed once, but we re-assert so a silent regression in the validator is + still surfaced by the harness.""" + # Build a minimal handles lookup so the validator can check pipeline + # targets. For pipelines we don't know, fall back to an empty shape; + # validator will treat that as "unknown" and only report errors on + # clearly-malformed edges rather than on handle-existence. + handles: dict[str, dict] = {} + for pid in set(_pipeline_ids(graph)): + # We don't have the live registry here; synthesize a permissive shape + # by deriving from an empty schema. Unknown-handle checks still fire + # for bad prefixes but not for handle names we can't verify. 
+ handles[pid] = _derive_pipeline_handles( + pid, + { + "supports_prompts": True, + "supports_vace": True, + "supports_lora": True, + "produces_video": True, + "config_schema": {"properties": {}}, + }, + ) + issues = _validate_proposal(graph, handles) + errs = [i for i in issues if i.get("severity") == "error"] + if errs: + first = errs[0].get("message", "") + return CheckResult.fail(f"{len(errs)} validator error(s); first: {first}") + return CheckResult.ok_("0 validator errors") + + +# --------------------------------------------------------------------------- +# wire_present — one check with a `kind` discriminator. +# --------------------------------------------------------------------------- + + +_VALUE_SOURCE_TYPES = { + "slider", + "knobs", + "primitive", + "trigger", + "control", + "subgraph", + "math", +} + + +def _edges_into( + graph: dict, target_id: str, target_handle: str | None = None +) -> list[dict]: + out = [] + for e in _ui_edges(graph): + if e.get("target") != target_id: + continue + if target_handle is not None and e.get("targetHandle") != target_handle: + continue + out.append(e) + return out + + +def _edges_into_any_pipeline(graph: dict, target_handle: str) -> list[dict]: + pipe_ids = _pipeline_node_ids(graph) + out = [] + for e in _ui_edges(graph): + if e.get("target") in pipe_ids and e.get("targetHandle") == target_handle: + out.append(e) + return out + + +def wire_present(graph: dict, arg: Any) -> CheckResult: + if not isinstance(arg, dict) or "kind" not in arg: + return CheckResult.fail(f"wire_present needs {{kind: ...}}, got {arg!r}") + kind = arg["kind"] + + if kind == "slider_to_pipeline_param": + target_handle = arg.get("target_handle") + if not target_handle: + return CheckResult.fail("slider_to_pipeline_param needs target_handle") + hits = _edges_into_any_pipeline(graph, target_handle) + if not hits: + return CheckResult.fail( + f"no ui_state edge targets a pipeline's {target_handle}" + ) + # Source must be a value-producing UI node 
type. + for e in hits: + src_t = _ui_node_type(graph, e.get("source")) + if src_t in _VALUE_SOURCE_TYPES: + return CheckResult.ok_(f"{src_t}({e.get('source')}) -> {target_handle}") + return CheckResult.fail( + f"edge(s) into {target_handle} exist but none originate from " + f"a value-producing node (types: {sorted(_VALUE_SOURCE_TYPES)})" + ) + + if kind == "vace_to_pipeline": + hits = _edges_into_any_pipeline(graph, "param:__vace") + if not hits: + return CheckResult.fail("no edge targets pipeline's param:__vace") + for e in hits: + if _ui_node_type(graph, e.get("source")) == "vace": + return CheckResult.ok_(f"vace({e.get('source')}) -> param:__vace") + return CheckResult.fail( + "param:__vace edge exists but source is not a vace node" + ) + + if kind == "image_to_vace": + vace_handles = {"param:ref_image", "param:first_frame", "param:last_frame"} + for e in _ui_edges(graph): + tgt_t = _ui_node_type(graph, e.get("target")) + if tgt_t == "vace" and e.get("targetHandle") in vace_handles: + src_t = _ui_node_type(graph, e.get("source")) + # Accept either a dedicated 'image' node or a generic value + # source (primitive holding a path). 
+ if src_t in {"image"} | _VALUE_SOURCE_TYPES: + return CheckResult.ok_( + f"{src_t}({e.get('source')}) -> vace.{e.get('targetHandle')}" + ) + return CheckResult.fail( + "no edge into a vace node's ref_image/first_frame/last_frame" + ) + + if kind == "prompt_to_pipeline": + hits = _edges_into_any_pipeline(graph, "param:__prompt") + if hits: + return CheckResult.ok_(f"{len(hits)} edge(s) -> param:__prompt") + return CheckResult.fail("no edge targets pipeline's param:__prompt") + + if kind == "lora_to_pipeline": + hits = _edges_into_any_pipeline(graph, "param:__loras") + if hits: + return CheckResult.ok_(f"{len(hits)} edge(s) -> param:__loras") + return CheckResult.fail("no edge targets pipeline's param:__loras") + + if kind == "pipeline_to_record": + # A record node is a top-level node type; the canonical wiring is a + # top-level stream edge `pipeline -> record`. We also accept a + # ui_state-shaped edge from a pipeline to a record node, since + # either is permissible at the schema level. + pipe_ids = _pipeline_node_ids(graph) + + # Top-level form: {"from": <pipeline id>, "to_node": <record id>, "kind": "stream"}. + for e in _top_level_edges(graph): + if e.get("from") not in pipe_ids: + continue + if _top_level_node_type(graph, e.get("to_node")) != "record": + continue + if e.get("kind") != "stream": + continue + return CheckResult.ok_( + f"pipeline({e.get('from')}) -> record({e.get('to_node')}) " + f"(top-level stream edge)" + ) + + # ui_state form (less common but legal for composed graphs). 
+ for e in _ui_edges(graph): + if e.get("source") not in pipe_ids: + continue + if _ui_node_type(graph, e.get("target")) != "record": + continue + sh = e.get("sourceHandle") or "" + if isinstance(sh, str) and sh.startswith("stream:"): + return CheckResult.ok_( + f"pipeline({e.get('source')}) -> record({e.get('target')}) " + f"via ui_state {sh}" + ) + return CheckResult.fail( + "no stream edge (top-level or ui_state) wires a pipeline " + "output into a record node" + ) + + if kind == "prompt_list_to_pipeline": + # prompt_list node's param:prompt output → pipeline's param:__prompt. + hits = _edges_into_any_pipeline(graph, "param:__prompt") + if not hits: + return CheckResult.fail("no edge targets pipeline's param:__prompt") + for e in hits: + if _ui_node_type(graph, e.get("source")) == "prompt_list": + return CheckResult.ok_( + f"prompt_list({e.get('source')}) -> param:__prompt" + ) + return CheckResult.fail( + "param:__prompt edge exists but source is not a prompt_list node" + ) + + if kind == "trigger_to_prompt_list": + # Some value source → prompt_list's param:trigger (or param:cycle). + accepted = {"param:trigger", "param:cycle"} + for e in _ui_edges(graph): + if _ui_node_type(graph, e.get("target")) != "prompt_list": + continue + if e.get("targetHandle") not in accepted: + continue + src_t = _ui_node_type(graph, e.get("source")) + if src_t in _VALUE_SOURCE_TYPES: + return CheckResult.ok_( + f"{src_t}({e.get('source')}) -> prompt_list.{e.get('targetHandle')}" + ) + return CheckResult.fail( + "no edge from a value-producing source into a prompt_list's " + "param:trigger or param:cycle" + ) + + return CheckResult.fail(f"unknown wire_present kind: {kind!r}") + + +def node_present(graph: dict, arg: Any) -> CheckResult: + """Assert at least N nodes of a given type exist. + + arg: ``{type: "record", count: 1, min_items: 5}`` + - ``type`` (required) — node type. 
Top-level kinds + (source/pipeline/sink/record) are searched in the backend graph; + everything else (slider, vace, prompt_list, ...) in ui_state. + - ``count`` (default 1) — minimum number of nodes of that type. + - ``min_items`` (optional) — if set AND type=="prompt_list", at least one + such node must have ``data.promptListItems`` of length ≥ min_items. + """ + if not isinstance(arg, dict) or "type" not in arg: + return CheckResult.fail(f"node_present needs {{type: ...}}, got {arg!r}") + want_type = arg["type"] + want_count = int(arg.get("count", 1)) + min_items = arg.get("min_items") + + nodes = _nodes_of_type(graph, want_type) + if len(nodes) < want_count: + return CheckResult.fail( + f"need >= {want_count} node(s) of type {want_type!r}, got {len(nodes)}" + ) + + if min_items is not None: + # Look for at least one node whose item list is long enough. + threshold = int(min_items) + max_seen = 0 + for n in nodes: + items = (n.get("data") or {}).get("promptListItems") or [] + if isinstance(items, list): + max_seen = max(max_seen, len(items)) + if max_seen < threshold: + return CheckResult.fail( + f"{want_type} exists but longest promptListItems is {max_seen}, " + f"need >= {threshold}" + ) + return CheckResult.ok_( + f"{len(nodes)} {want_type} node(s); longest list has {max_seen} item(s)" + ) + + return CheckResult.ok_(f"{len(nodes)} {want_type} node(s)") + + +# --------------------------------------------------------------------------- +# forbid checks +# --------------------------------------------------------------------------- + + +def bad_handle_prefix(graph: dict, arg: Any) -> CheckResult: + """Forbid check: returns ok=True if NO edge uses the given prefix.""" + prefix = str(arg) + for e in _ui_edges(graph): + for side in ("sourceHandle", "targetHandle"): + h = e.get(side) + if isinstance(h, str) and h.startswith(prefix): + return CheckResult.fail( + f"edge {e.get('id', '?')} {side}={h!r} starts with forbidden {prefix!r}" + ) + return CheckResult.ok_(f"no 
edge handle starts with {prefix!r}") + + +def overlapping_nodes(graph: dict, _arg: Any) -> CheckResult: + """Forbid check: no two nodes on the canvas may overlap. + + Observed failure: the agent picks UI-node positions like (0,0), (0,80), + (320,40) that look "neat" in isolation but collide with the frontend's + top-level auto-layout strip (sources at x=50, pipelines at x=350, sinks + at x=650, records at x=950). The server-side ``_reflow_ui_nodes`` should + catch this and reassign, so this check is a regression detector: if it + ever fires in an eval, either the agent is producing new layout patterns + reflow doesn't cover OR reflow has a bug. + + We use the same bounding-box logic as ``_reflow_ui_nodes``: UI nodes are + 240×140 (280 tall for image/vace/subgraph), top-level nodes are the + 200×60 that ``graphConfigToFlow`` drops at x=50/350/650/950, row-spaced + by 160 starting at y=50. + """ + # Mirror the constants used by the server-side reflow (keeping them + # duplicated here is intentional — if either set drifts, the eval is + # exactly the place we want to catch it). + FE_START_X = 50 + FE_START_Y = 50 + FE_COLUMN_GAP = 300 + FE_ROW_GAP = 100 + FE_NODE_W = 200 + FE_NODE_H = 60 + + UI_NODE_W = 240 + UI_NODE_H_DEFAULT = 140 + UI_NODE_H_TALL = 280 + TALL_TYPES = {"image", "vace", "subgraph"} + + type_to_col = {"source": 0, "pipeline": 1, "sink": 2, "record": 3} + + def rects_overlap( + a: tuple[float, float, float, float], + b: tuple[float, float, float, float], + ) -> bool: + ax, ay, aw, ah = a + bx, by, bw, bh = b + return not (ax + aw <= bx or bx + bw <= ax or ay + ah <= by or by + bh <= ay) + + # Predict top-level rectangles the frontend will render. 
+ top_by_col: dict[int, list[str]] = {} + for n in _top_level_nodes(graph): + col = type_to_col.get(n.get("type")) + if col is None or not n.get("id"): + continue + top_by_col.setdefault(col, []).append(n["id"]) + + rects: list[tuple[tuple[float, float, float, float], str]] = [] + for col, ids in top_by_col.items(): + for i, nid in enumerate(ids): + rects.append( + ( + ( + float(FE_START_X + col * FE_COLUMN_GAP), + float(FE_START_Y + i * (FE_NODE_H + FE_ROW_GAP)), + float(FE_NODE_W), + float(FE_NODE_H), + ), + f"top:{nid}", + ) + ) + + # UI-state rectangles use whatever position the agent (or reflow) set. + for n in _ui_nodes(graph): + pos = n.get("position") or {} + try: + x = float(pos.get("x", 0)) + y = float(pos.get("y", 0)) + except (TypeError, ValueError): + return CheckResult.fail( + f"ui node {n.get('id')!r} has invalid position {pos!r}" + ) + h = UI_NODE_H_TALL if n.get("type") in TALL_TYPES else UI_NODE_H_DEFAULT + rects.append(((x, y, float(UI_NODE_W), float(h)), f"ui:{n.get('id') or '?'}")) + + for i, (ra, ida) in enumerate(rects): + for j in range(i + 1, len(rects)): + rb, idb = rects[j] + if rects_overlap(ra, rb): + return CheckResult.fail(f"{ida} overlaps {idb}") + + return CheckResult.ok_(f"no overlaps among {len(rects)} node(s)") + + +def orphan_sinks(graph: dict, _arg: Any) -> CheckResult: + """Forbid check: every top-level sink must have an incoming stream edge. + + Observed failure: agent occasionally emits a second ``sink`` node not + wired to anything, producing a valid-but-dead canvas element. Passes + validation (disconnected sinks aren't illegal) but is obviously wrong. + + We scan top-level ``graph.edges`` for any ``stream`` edge whose + ``to_node`` is each top-level sink. A sink with zero such edges is an + orphan. + """ + sinks = [n for n in _top_level_nodes(graph) if n.get("type") == "sink"] + if not sinks: + # No sinks at all isn't what this check is about — other checks + # can assert presence if they need to. 
+ return CheckResult.ok_("no sinks to inspect") + + orphans: list[str] = [] + for s in sinks: + sink_id = s.get("id") + has_incoming = any( + e.get("to_node") == sink_id and e.get("kind") == "stream" + for e in _top_level_edges(graph) + ) + if not has_incoming: + orphans.append(str(sink_id)) + + if orphans: + return CheckResult.fail( + f"{len(orphans)}/{len(sinks)} sink(s) have no incoming stream edge: " + f"{orphans}" + ) + return CheckResult.ok_(f"all {len(sinks)} sink(s) wired") + + +# --------------------------------------------------------------------------- +# Registry +# --------------------------------------------------------------------------- + + +CHECKS: dict[str, Callable[[dict, Any], CheckResult]] = { + "pipelines_equal": pipelines_equal, + "pipelines_include": pipelines_include, + "pipelines_count_at_least": pipelines_count_at_least, + "lora_count_at_least": lora_count_at_least, + "no_validator_errors": no_validator_errors, + "wire_present": wire_present, + "node_present": node_present, + "bad_handle_prefix": bad_handle_prefix, + "orphan_sinks": orphan_sinks, + "overlapping_nodes": overlapping_nodes, +} + + +def run_check(name: str, graph: dict, arg: Any) -> CheckResult: + fn = CHECKS.get(name) + if fn is None: + return CheckResult.fail(f"unknown check: {name!r}") + try: + return fn(graph, arg) + except Exception as e: # defensive — a buggy check must not kill the run + return CheckResult.fail(f"{type(e).__name__} in check {name}: {e}") diff --git a/evals/runner.py b/evals/runner.py new file mode 100644 index 000000000..863db3021 --- /dev/null +++ b/evals/runner.py @@ -0,0 +1,343 @@ +"""Runner: execute cases, grade proposals, print a summary, dump artifacts.""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import sys +from dataclasses import dataclass, field +from pathlib import Path +from time import perf_counter + +from .case import Case, discover_cases, load_case +from .driver import DriveResult, 
run_case +from .grader import run_check + +logger = logging.getLogger("evals") + + +EVALS_ROOT = Path(__file__).resolve().parent +DEFAULT_CASES_DIR = EVALS_ROOT / "cases" +DEFAULT_OUTPUT_DIR = EVALS_ROOT / "outputs" + + +@dataclass +class RunResult: + case_name: str + run_index: int + passed: bool + failures: list[str] = field(default_factory=list) + drive: DriveResult | None = None + wall_seconds: float = 0.0 + + +@dataclass +class CaseSummary: + case: Case + runs: list[RunResult] + + @property + def pass_count(self) -> int: + return sum(1 for r in self.runs if r.passed) + + @property + def rate_pct(self) -> float: + if not self.runs: + return 0.0 + return 100.0 * self.pass_count / len(self.runs) + + @property + def grouped_failures(self) -> list[str]: + """Human-readable failure labels grouped by run index.""" + return [ + f"r{r.run_index}: {'; '.join(r.failures)}" + for r in self.runs + if not r.passed and r.failures + ] + + +# --------------------------------------------------------------------------- +# Grading +# --------------------------------------------------------------------------- + + +def _grade(case: Case, drive: DriveResult) -> tuple[bool, list[str]]: + """Return ``(passed, failure_reasons)`` for a single run.""" + failures: list[str] = [] + + # Runtime-tweak cases: the agent must NOT emit workflow_proposal. If + # it does, that's a regression ("the user asked for a param change + # but we rebuilt the whole graph"). No graph-based checks are run — + # there shouldn't be a graph to check. 
+ if case.forbid_proposal: + if drive.error and drive.proposal is None: + return False, [f"driver error: {drive.error.strip()[:200]}"] + if drive.proposal is not None: + return False, [ + "forbid_proposal: agent emitted workflow_proposal when it " + "should have used update_parameters instead" + ] + return True, [] + + if drive.error and drive.proposal is None: + return False, [f"driver error: {drive.error.strip()[:200]}"] + + if drive.proposal is None: + return False, [ + "no workflow_proposal SSE event seen — agent likely gave a " + "text-only response or failed before proposing" + ] + + graph = drive.proposal + + def _fail(name: str, arg: object, detail: str) -> None: + arg_repr = json.dumps(arg, default=str) if not isinstance(arg, str) else arg + failures.append(f"{name}({arg_repr}): {detail}") + + for spec in case.expect: + res = run_check(spec.name, graph, spec.arg) + if not res.ok: + _fail(spec.name, spec.arg, res.detail) + + # `forbid`: check returning ok=True means the forbidden pattern was + # NOT present, which is the success condition. Checks in forbid are + # the same named functions as in expect; we invert nothing — the + # `bad_handle_prefix` etc. are themselves phrased as "ok if absent". 
+ for spec in case.forbid: + res = run_check(spec.name, graph, spec.arg) + if not res.ok: + _fail(f"forbid.{spec.name}", spec.arg, res.detail) + + return (not failures), failures + + +# --------------------------------------------------------------------------- +# Artifact writing +# --------------------------------------------------------------------------- + + +def _write_artifacts( + output_dir: Path, case_name: str, run_index: int, run: RunResult +) -> None: + out = output_dir / case_name / f"r{run_index:02d}" + out.mkdir(parents=True, exist_ok=True) + drive = run.drive or DriveResult() + (out / "proposal.json").write_text( + json.dumps(drive.proposal or {}, indent=2, default=str) + ) + (out / "meta.json").write_text( + json.dumps( + { + "case": case_name, + "run_index": run_index, + "passed": run.passed, + "failures": run.failures, + "rationale": drive.rationale, + "proposal_id": drive.proposal_id, + "session_id": drive.session_id, + "wall_seconds": round(run.wall_seconds, 3), + "error": drive.error, + }, + indent=2, + default=str, + ) + ) + # SSE trace as JSONL for easy grepping. 
+ with (out / "trace.jsonl").open("w") as f: + for evt in drive.trace: + f.write(json.dumps(evt, default=str) + "\n") + + +# --------------------------------------------------------------------------- +# Orchestration +# --------------------------------------------------------------------------- + + +async def _run_single( + app, + case: Case, + run_index: int, + *, + model_override: str | None, + provider_override: str | None, +) -> RunResult: + t0 = perf_counter() + drive = await run_case( + app, + case.prompt, + model_override=model_override, + provider_override=provider_override, + ) + wall = perf_counter() - t0 + passed, failures = _grade(case, drive) + return RunResult( + case_name=case.name, + run_index=run_index, + passed=passed, + failures=failures, + drive=drive, + wall_seconds=wall, + ) + + +async def run_cases( + cases: list[Case], + *, + runs_override: int | None = None, + model_override: str | None = None, + provider_override: str | None = None, + output_dir: Path = DEFAULT_OUTPUT_DIR, +) -> list[CaseSummary]: + # Import here so a plain ``python -m evals --help`` doesn't pay the + # Scope import cost. 
+ from scope.server.app import app # noqa: PLC0415 + + summaries: list[CaseSummary] = [] + for case in cases: + n = runs_override or case.runs + run_results: list[RunResult] = [] + for i in range(1, n + 1): + logger.info(f"[{case.name}] run {i}/{n}...") + rr = await _run_single( + app, + case, + i, + model_override=model_override, + provider_override=provider_override, + ) + _write_artifacts(output_dir, case.name, i, rr) + run_results.append(rr) + status = "PASS" if rr.passed else "FAIL" + detail = "" if rr.passed else f" — {'; '.join(rr.failures)[:160]}" + logger.info( + f"[{case.name}] run {i}/{n} {status} ({rr.wall_seconds:.1f}s){detail}" + ) + summaries.append(CaseSummary(case=case, runs=run_results)) + return summaries + + +# --------------------------------------------------------------------------- +# Printing +# --------------------------------------------------------------------------- + + +def print_summary(summaries: list[CaseSummary], output_dir: Path) -> tuple[int, int]: + """Return ``(total_pass, total_runs)``.""" + # Column widths + name_w = max((len(s.case.name) for s in summaries), default=4) + name_w = max(name_w, 4) + + header = f"{'case'.ljust(name_w)} runs pass rate failures" + print(header) + total_pass = total_runs = 0 + for s in summaries: + failures = "; ".join(s.grouped_failures)[:200] + total_pass += s.pass_count + total_runs += len(s.runs) + print( + f"{s.case.name.ljust(name_w)} " + f"{len(s.runs):>4} " + f"{s.pass_count:>4} " + f"{s.rate_pct:>4.0f}% " + f"{failures}" + ) + rule_w = max(len(header), 60) + print("─" * rule_w) + overall_rate = 100.0 * total_pass / total_runs if total_runs else 0.0 + print( + f"{'overall'.ljust(name_w)} {total_runs:>4} {total_pass:>4} " + f"{overall_rate:>4.0f}%" + ) + print(f"\nArtifacts: {output_dir}/<case>/r<NN>/{{proposal.json,trace.jsonl}}") + return total_pass, total_runs + + +# --------------------------------------------------------------------------- +# Entry points +# 
--------------------------------------------------------------------------- + + +def _resolve_cases(cases_dir: Path, selected: list[str] | None) -> list[Case]: + if not selected: + return discover_cases(cases_dir) + out: list[Case] = [] + for s in selected: + candidate = cases_dir / (s if s.endswith((".yaml", ".yml")) else f"{s}.yaml") + if not candidate.exists(): + raise FileNotFoundError(f"no such case: {candidate}") + out.append(load_case(candidate)) + return out + + +def main(argv: list[str] | None = None) -> int: + import argparse + + logging.basicConfig( + level=os.environ.get("EVALS_LOG_LEVEL", "INFO"), + format="%(asctime)s %(levelname)s %(message)s", + ) + p = argparse.ArgumentParser(prog="python -m evals") + p.add_argument( + "--case", + action="append", + default=[], + help="Case name (with or without .yaml). Repeatable. Omit for all cases.", + ) + p.add_argument("--runs", type=int, default=None, help="Override runs per case.") + p.add_argument("--model", default=None, help="Override model id.") + p.add_argument("--provider", default=None, help="Override provider.") + p.add_argument( + "--cases-dir", + default=str(DEFAULT_CASES_DIR), + help="Directory containing case YAMLs.", + ) + p.add_argument( + "--output-dir", + default=str(DEFAULT_OUTPUT_DIR), + help="Where to write per-run artifacts.", + ) + p.add_argument( + "--fail-threshold", + type=float, + default=None, + help="Exit non-zero if overall pass-rate < this percentage.", + ) + args = p.parse_args(argv) + + cases_dir = Path(args.cases_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + try: + cases = _resolve_cases(cases_dir, args.case) + except FileNotFoundError as e: + print(f"error: {e}", file=sys.stderr) + return 2 + + if not cases: + print(f"no cases found in {cases_dir}", file=sys.stderr) + return 2 + + summaries = asyncio.run( + run_cases( + cases, + runs_override=args.runs, + model_override=args.model, + provider_override=args.provider, + 
output_dir=output_dir, + ) + ) + print() + total_pass, total_runs = print_summary(summaries, output_dir) + + if args.fail_threshold is not None and total_runs: + rate = 100.0 * total_pass / total_runs + if rate < args.fail_threshold: + print( + f"\nFAIL: overall {rate:.1f}% < threshold {args.fail_threshold:.1f}%", + file=sys.stderr, + ) + return 1 + return 0 diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index cf289cc1d..06e6cac4f 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -9,6 +9,8 @@ import { CloudProvider } from "./lib/cloudContext"; import { CloudStatusProvider } from "./hooks/useCloudStatus"; import { OnboardingProvider } from "./contexts/OnboardingContext"; import { BillingProvider } from "./contexts/BillingContext"; +import { AgentProvider } from "./contexts/AgentContext"; +import { AgentDrawer } from "./components/agent/AgentDrawer"; import { handleOAuthCallback, initElectronAuthListener, @@ -115,7 +117,18 @@ function App() { - + + {/* Row: main app (StreamPage) + optional agent + drawer. Drawer is a flex sibling, not a fixed + overlay, so the graph/perform panels resize to + fill remaining width and nothing is obscured. */} +
+
+ +
+ +
+
diff --git a/frontend/src/components/Header.tsx b/frontend/src/components/Header.tsx index c3927d901..9661c2f94 100644 --- a/frontend/src/components/Header.tsx +++ b/frontend/src/components/Header.tsx @@ -70,7 +70,7 @@ export function Header({ const [settingsOpen, setSettingsOpen] = useState(false); const [pluginsOpen, setPluginsOpen] = useState(false); const [initialTab, setInitialTab] = useState< - "general" | "account" | "api-keys" | "loras" | "osc" | "billing" + "general" | "account" | "api-keys" | "agent" | "loras" | "osc" | "billing" >("general"); const [initialPluginPath, setInitialPluginPath] = useState(""); const [pluginsInitialTab, setPluginsInitialTab] = useState< diff --git a/frontend/src/components/SettingsDialog.tsx b/frontend/src/components/SettingsDialog.tsx index 83cc6fcef..e0fed58be 100644 --- a/frontend/src/components/SettingsDialog.tsx +++ b/frontend/src/components/SettingsDialog.tsx @@ -2,6 +2,7 @@ import { useState, useEffect } from "react"; import { Dialog, DialogContent } from "./ui/dialog"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "./ui/tabs"; import { AccountTab } from "./settings/AccountTab"; +import { AgentProviderTab } from "./settings/AgentProviderTab"; import { ApiKeysTab } from "./settings/ApiKeysTab"; import { GeneralTab } from "./settings/GeneralTab"; import { ReportBugDialog } from "./ReportBugDialog"; @@ -24,6 +25,7 @@ interface SettingsDialogProps { | "account" | "billing" | "api-keys" + | "agent" | "loras" | "osc" | "dmx" @@ -160,6 +162,12 @@ export function SettingsDialog({ > API Keys + + Agent + + + + (() => { + const stored = localStorage.getItem(DRAWER_WIDTH_KEY); + const parsed = stored ? parseInt(stored, 10) : NaN; + return Number.isFinite(parsed) + ? 
Math.min(MAX_WIDTH, Math.max(MIN_WIDTH, parsed)) + : DEFAULT_WIDTH; + }); + const draggingRef = useRef(false); + + useEffect(() => { + localStorage.setItem(DRAWER_WIDTH_KEY, String(width)); + }, [width]); + + const onDragStart = useCallback( + (e: React.MouseEvent) => { + e.preventDefault(); + draggingRef.current = true; + const startX = e.clientX; + const startWidth = width; + const onMove = (me: MouseEvent) => { + if (!draggingRef.current) return; + const delta = startX - me.clientX; + const next = Math.min( + MAX_WIDTH, + Math.max(MIN_WIDTH, startWidth + delta) + ); + setWidth(next); + }; + const onUp = () => { + draggingRef.current = false; + window.removeEventListener("mousemove", onMove); + window.removeEventListener("mouseup", onUp); + }; + window.addEventListener("mousemove", onMove); + window.addEventListener("mouseup", onUp); + }, + [width] + ); + + if (!drawerOpen) return null; + + const needsKey = + !!config && + !configError && + config.key_sources[config.provider] == null && + config.provider !== "self_hosted"; + + return ( +
+ {/* Resize handle — dragging grows the drawer toward the graph (the + delta is inverted because the handle sits on the LEFT edge of the + drawer but we track the mouse moving LEFTWARD as "wider drawer"). */} +
+ + {/* Header */} +
+
+
Scope Agent
+ {config && ( + + {config.provider === "anthropic" + ? `Claude • ${config.model}` + : config.provider === "openai_compatible" + ? `OpenAI • ${config.model}` + : `Local • ${config.model}`} + + )} +
+
+ {isStreaming && ( + + )} + + +
+
+ + {/* Banner: missing key / config error */} + {configError && ( +
+ Failed to load agent config: {configError} +
+ )} + {needsKey && ( +
+ No API key configured for{" "} + {config?.provider === "anthropic" ? "Anthropic" : "OpenAI-compatible"} + . Open Settings → API Keys to add one. +
+ )} + + {/* Transcript */} + + + {/* Composer */} + +
+ ); +} diff --git a/frontend/src/components/agent/ChatTranscript.tsx b/frontend/src/components/agent/ChatTranscript.tsx new file mode 100644 index 000000000..11976ec21 --- /dev/null +++ b/frontend/src/components/agent/ChatTranscript.tsx @@ -0,0 +1,73 @@ +import { useEffect, useRef } from "react"; +import type { AgentMessage, AgentProposal } from "@/contexts/AgentContext"; +import { MessageBubble } from "./MessageBubble"; +import { WorkflowProposalCard } from "./WorkflowProposalCard"; + +interface ChatTranscriptProps { + messages: AgentMessage[]; + pendingProposal: AgentProposal | null; + onDecide: (approved: boolean, reason?: string) => Promise<void>; +} + +export function ChatTranscript({ + messages, + pendingProposal, + onDecide, +}: ChatTranscriptProps) { + const scrollRef = useRef<HTMLDivElement>(null); + const stickyBottomRef = useRef(true); + + // Track whether user is at the bottom. If so, auto-scroll; otherwise leave + // their scroll position alone. + const onScroll = (e: React.UIEvent<HTMLDivElement>) => { + const el = e.currentTarget; + stickyBottomRef.current = + el.scrollHeight - el.scrollTop - el.clientHeight < 48; + }; + + useEffect(() => { + if (!stickyBottomRef.current) return; + const el = scrollRef.current; + if (el) el.scrollTop = el.scrollHeight; + }, [messages, pendingProposal]); + + if (messages.length === 0 && !pendingProposal) { + return ( +
+

+ Tell me what you want to build. +

+

+ I can pick pipelines, compose workflows, and tune parameters by + watching the output. +

+
    +
  • + • "Hyperrealistic scene with 3–5 switchable prompts" +
  • +
  • + • "It's not recognizing depth well" +
  • +
  • + • "Help me record what I'm seeing" +
  • +
+
+ ); + } + + return ( +
+ {messages.map(m => ( + + ))} + {pendingProposal && !pendingProposal.decision && ( + + )} +
+ ); +} diff --git a/frontend/src/components/agent/Composer.tsx b/frontend/src/components/agent/Composer.tsx new file mode 100644 index 000000000..294f8beb6 --- /dev/null +++ b/frontend/src/components/agent/Composer.tsx @@ -0,0 +1,56 @@ +import { useRef, useState, type KeyboardEvent } from "react"; +import { Send } from "lucide-react"; +import { Button } from "@/components/ui/button"; + +interface ComposerProps { + onSend: (text: string) => Promise<void>; + disabled?: boolean; + placeholder?: string; +} + +export function Composer({ onSend, disabled, placeholder }: ComposerProps) { + const [value, setValue] = useState(""); + const textareaRef = useRef<HTMLTextAreaElement>(null); + + const send = async () => { + const text = value.trim(); + if (!text || disabled) return; + setValue(""); + await onSend(text); + textareaRef.current?.focus(); + }; + + const onKeyDown = (e: KeyboardEvent<HTMLTextAreaElement>) => { + // Cmd/Ctrl+Enter or bare Enter to send (Shift+Enter inserts newline). + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + void send(); + } + }; + + return ( +
+