From 81a07a41e461046e1f8cefc796c85477a3fd1fc7 Mon Sep 17 00:00:00 2001 From: ahmido Date: Sat, 18 Apr 2026 07:36:05 +0000 Subject: [PATCH] feat: implement runtime trend recalibration reporting (#244) ## Summary - implement Spec 211 runtime trend reporting with bounded lane history, drift classification, hotspot trend output, and recalibration evidence handling - extend the repo-truth governance seams and workflow wrappers for comparable-bundle hydration, trend artifact publication, and contract-backed reporting - add the Spec 211 planning artifacts, data model, quickstart, tasks, and repository contract documents ## Validation - parsed `specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json` - parsed `specs/211-runtime-trend-recalibration/contracts/test-runtime-trend.logical.openapi.yaml` - re-ran cross-artifact consistency analysis for the Spec 211 artifact set until no material findings remained - no application test suite was re-run as part of this final commit/push/PR step Co-authored-by: Ahmed Darrazi Reviewed-on: https://git.cloudarix.de/ahmido/TenantAtlas/pulls/244 --- .gitea/workflows/test-browser.yml | 10 +- .gitea/workflows/test-heavy-governance.yml | 10 +- .gitea/workflows/test-main-confidence.yml | 10 +- .gitea/workflows/test-pr-fast-feedback.yml | 10 +- .github/agents/copilot-instructions.md | 4 +- README.md | 18 +- apps/platform/composer.json | 3 + .../CiConfidenceWorkflowContractTest.php | 6 +- .../CiFastFeedbackWorkflowContractTest.php | 6 +- .../CiHeavyBrowserWorkflowContractTest.php | 12 +- .../Guards/TestLaneArtifactsContractTest.php | 14 +- .../Guards/TestLaneCommandContractTest.php | 11 +- .../TestLaneHistoryHydrationContractTest.php | 107 +++ .../TestLaneHotspotTrendContractTest.php | 94 ++ .../Feature/Guards/TestLaneManifestTest.php | 9 +- ...tLaneRecalibrationEvidenceContractTest.php | 127 +++ .../TestLaneRecalibrationPolicyTest.php | 111 +++ .../TestLaneTrendClassificationTest.php | 78 ++ 
.../TestLaneTrendContractSchemaTest.php | 137 +++ .../TestLaneTrendLogicalContractTest.php | 71 ++ .../TestLaneTrendSummaryContractTest.php | 97 ++ .../platform/tests/Support/TestLaneBudget.php | 256 ++++- .../tests/Support/TestLaneManifest.php | 114 ++- .../platform/tests/Support/TestLaneReport.php | 901 +++++++++++++++++- .../tests/Support/TestLaneTrendFixtures.php | 96 ++ scripts/platform-test-report | 273 +++++- .../checklists/requirements.md | 39 + .../test-runtime-trend-history.schema.json | 540 +++++++++++ .../test-runtime-trend.logical.openapi.yaml | 641 +++++++++++++ .../data-model.md | 192 ++++ specs/211-runtime-trend-recalibration/plan.md | 174 ++++ .../quickstart.md | 133 +++ .../research.md | 73 ++ specs/211-runtime-trend-recalibration/spec.md | 351 +++++++ .../211-runtime-trend-recalibration/tasks.md | 202 ++++ 35 files changed, 4898 insertions(+), 32 deletions(-) create mode 100644 apps/platform/tests/Feature/Guards/TestLaneHistoryHydrationContractTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneTrendContractSchemaTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneTrendLogicalContractTest.php create mode 100644 apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php create mode 100644 apps/platform/tests/Support/TestLaneTrendFixtures.php create mode 100644 specs/211-runtime-trend-recalibration/checklists/requirements.md create mode 100644 specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json create mode 100644 specs/211-runtime-trend-recalibration/contracts/test-runtime-trend.logical.openapi.yaml 
create mode 100644 specs/211-runtime-trend-recalibration/data-model.md create mode 100644 specs/211-runtime-trend-recalibration/plan.md create mode 100644 specs/211-runtime-trend-recalibration/quickstart.md create mode 100644 specs/211-runtime-trend-recalibration/research.md create mode 100644 specs/211-runtime-trend-recalibration/spec.md create mode 100644 specs/211-runtime-trend-recalibration/tasks.md diff --git a/.gitea/workflows/test-browser.yml b/.gitea/workflows/test-browser.yml index 3721cc9f..4e3c5174 100644 --- a/.gitea/workflows/test-browser.yml +++ b/.gitea/workflows/test-browser.yml @@ -5,6 +5,10 @@ on: schedule: - cron: '43 4 * * 1-5' +permissions: + actions: read + contents: read + jobs: browser: if: ${{ github.event_name != 'schedule' || vars.TENANTATLAS_ENABLE_BROWSER_SCHEDULE == '1' }} @@ -53,7 +57,9 @@ jobs: - name: Refresh Browser report if: always() - run: ./scripts/platform-test-report browser --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }} + env: + TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} + run: ./scripts/platform-test-report browser --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }} --fetch-latest-history - name: Stage Browser artifacts if: always() @@ -71,4 +77,4 @@ jobs: if: always() run: | cd apps/platform - ./vendor/bin/sail stop \ No newline at end of file + ./vendor/bin/sail stop diff --git a/.gitea/workflows/test-heavy-governance.yml b/.gitea/workflows/test-heavy-governance.yml index ec3c1623..9cecd7c7 100644 --- a/.gitea/workflows/test-heavy-governance.yml +++ b/.gitea/workflows/test-heavy-governance.yml @@ -5,6 +5,10 @@ on: schedule: - cron: '17 4 * * 1-5' +permissions: + actions: read + contents: read + jobs: heavy-governance: if: ${{ github.event_name != 'schedule' || vars.TENANTATLAS_ENABLE_HEAVY_GOVERNANCE_SCHEDULE == '1' }} @@ -53,7 +57,9 @@ jobs: - name: Refresh Heavy Governance report if: 
always() - run: ./scripts/platform-test-report heavy-governance --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }} + env: + TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} + run: ./scripts/platform-test-report heavy-governance --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }} --fetch-latest-history - name: Stage Heavy Governance artifacts if: always() @@ -71,4 +77,4 @@ jobs: if: always() run: | cd apps/platform - ./vendor/bin/sail stop \ No newline at end of file + ./vendor/bin/sail stop diff --git a/.gitea/workflows/test-main-confidence.yml b/.gitea/workflows/test-main-confidence.yml index 62a9bb01..a3fb1a39 100644 --- a/.gitea/workflows/test-main-confidence.yml +++ b/.gitea/workflows/test-main-confidence.yml @@ -5,6 +5,10 @@ on: branches: - dev +permissions: + actions: read + contents: read + jobs: confidence: runs-on: ubuntu-latest @@ -41,7 +45,9 @@ jobs: - name: Refresh Confidence report if: always() - run: ./scripts/platform-test-report confidence --workflow-id=main-confidence --trigger-class=mainline-push + env: + TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} + run: ./scripts/platform-test-report confidence --workflow-id=main-confidence --trigger-class=mainline-push --fetch-latest-history - name: Stage Confidence artifacts if: always() @@ -59,4 +65,4 @@ jobs: if: always() run: | cd apps/platform - ./vendor/bin/sail stop \ No newline at end of file + ./vendor/bin/sail stop diff --git a/.gitea/workflows/test-pr-fast-feedback.yml b/.gitea/workflows/test-pr-fast-feedback.yml index b66eed72..6f446631 100644 --- a/.gitea/workflows/test-pr-fast-feedback.yml +++ b/.gitea/workflows/test-pr-fast-feedback.yml @@ -7,6 +7,10 @@ on: - reopened - synchronize +permissions: + actions: read + contents: read + jobs: fast-feedback: runs-on: ubuntu-latest @@ -43,7 +47,9 @@ jobs: - name: Refresh Fast Feedback report if: always() - run: 
./scripts/platform-test-report fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request + env: + TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }} + run: ./scripts/platform-test-report fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request --fetch-latest-history - name: Stage Fast Feedback artifacts if: always() @@ -61,4 +67,4 @@ jobs: if: always() run: | cd apps/platform - ./vendor/bin/sail stop \ No newline at end of file + ./vendor/bin/sail stop diff --git a/.github/agents/copilot-instructions.md b/.github/agents/copilot-instructions.md index 89544d81..f4eb6456 100644 --- a/.github/agents/copilot-instructions.md +++ b/.github/agents/copilot-instructions.md @@ -198,6 +198,8 @@ ## Active Technologies - SQLite `:memory:` for the default test environment, mixed database strategy for some heavy-governance families as declared in `TestLaneManifest`, and existing lane artifacts under the app-root contract path `storage/logs/test-lanes` (209-heavy-governance-cost) - PHP 8.4.15 for repo-truth test governance, Bash for repo-root wrappers, and GitHub-compatible Gitea Actions workflow YAML under `.gitea/workflows/` + Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail, Gitea Actions backed by `act_runner`, and the existing `Tests\Support\TestLaneManifest`, `TestLaneBudget`, and `TestLaneReport` seams (210-ci-matrix-budget-enforcement) - SQLite `:memory:` for default lane execution, filesystem artifacts under the app-root contract path `storage/logs/test-lanes`, checked-in workflow YAML under `.gitea/workflows/`, and no new product database persistence (210-ci-matrix-budget-enforcement) +- PHP 8.4.15 for repo-truth governance logic, Bash for repo-root wrappers, GitHub-compatible Gitea Actions workflow YAML under `.gitea/workflows/`, plus JSON Schema and logical OpenAPI for repository contracts + Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail, Gitea Actions backed by `act_runner`, uploaded 
artifact bundles, and the existing `Tests\Support\TestLaneManifest`, `TestLaneBudget`, and `TestLaneReport` seams (211-runtime-trend-recalibration) +- SQLite `:memory:` for lane execution, filesystem artifacts under `apps/platform/storage/logs/test-lanes`, staged CI bundles under `.gitea-artifacts/`, bounded derived trend/history artifacts adjacent to current lane artifacts, and no new product database persistence (211-runtime-trend-recalibration) - PHP 8.4.15 (feat/005-bulk-operations) @@ -232,8 +234,8 @@ ## Code Style PHP 8.4.15: Follow standard conventions ## Recent Changes +- 211-runtime-trend-recalibration: Added PHP 8.4.15 for repo-truth governance logic, Bash for repo-root wrappers, GitHub-compatible Gitea Actions workflow YAML under `.gitea/workflows/`, plus JSON Schema and logical OpenAPI for repository contracts + Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail, Gitea Actions backed by `act_runner`, uploaded artifact bundles, and the existing `Tests\Support\TestLaneManifest`, `TestLaneBudget`, and `TestLaneReport` seams - 210-ci-matrix-budget-enforcement: Added PHP 8.4.15 for repo-truth test governance, Bash for repo-root wrappers, and GitHub-compatible Gitea Actions workflow YAML under `.gitea/workflows/` + Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail, Gitea Actions backed by `act_runner`, and the existing `Tests\Support\TestLaneManifest`, `TestLaneBudget`, and `TestLaneReport` seams - 209-heavy-governance-cost: Added PHP 8.4.15 + Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail -- 208-heavy-suite-segmentation: Added PHP 8.4.15 + Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail diff --git a/README.md b/README.md index 54526711..acc4d930 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,11 @@ ### Canonical Lane Commands - `./scripts/platform-test-report browser` - `./scripts/platform-test-report profiling` - `./scripts/platform-test-report junit` +- 
Trend-aware report refresh options: + - `--history-file=/absolute/path/to/<lane>-latest.trend-history.json` seeds one prior comparable window explicitly. + - `--history-bundle=/absolute/path/to/bundle-or-zip` hydrates the newest matching `trend-history.json` from a staged artifact bundle. + - `--fetch-latest-history` asks the wrapper to download the most recent comparable bundle from Gitea when `TENANTATLAS_GITEA_TOKEN` or `GITEA_TOKEN` is available. + - `--skip-latest-history` keeps the run intentionally cold-start so the summary reports `unstable` instead of guessing at trend state. - App-local equivalents remain available through Sail Composer scripts: - `cd apps/platform && ./vendor/bin/sail composer run test` - `cd apps/platform && ./vendor/bin/sail composer run test:confidence` @@ -64,6 +69,16 @@ ### Canonical Lane Commands - `cd apps/platform && ./vendor/bin/sail composer run test:junit` - The root wrapper is the safer default for long lanes because it pins Composer to `--timeout=0`. +### Trend Summary Reading + +- `healthy`: enough comparable samples exist, the lane is comfortably under budget, and recent variance stays inside the documented noise floor. +- `budget-near`: the lane is still within budget, but headroom has entered the lane's near-budget band and needs attention before it becomes a repeated blocker. +- `trending-worse`: multiple comparable samples are worsening above the lane variance floor even though the lane is not yet clearly over budget. +- `regressed`: the lane is over budget or repeatedly worsening enough that ordinary noise is no longer a credible explanation. +- `unstable`: the report intentionally refuses a stronger label because history is too short, the comparison fingerprint changed, or the recent window is noisy. +- Recalibration is separate from health. Reports can emit candidate, approved, or rejected baseline or budget decisions, but repository truth never moves automatically. +- Hotspot evidence may be unavailable on a given cycle. 
When that happens the summary must say so explicitly, and `profiling` or `junit` remain the preferred support-lane follow-up paths. + ### Workflow Expectation - Every runtime-changing spec, plan, and task set MUST record the target validation lane(s), fixture-cost risks, any heavy-governance or browser expansion, and any budget/baseline follow-up. @@ -81,7 +96,8 @@ ### CI Artifact Bundles - Lane-local artifacts are still generated in `apps/platform/storage/logs/test-lanes` as `*-latest.*` files. - CI workflows stage deterministic upload bundles through `./scripts/platform-test-artifacts` into `.gitea-artifacts/` before upload. -- Every governed CI lane publishes `summary.md`, `budget.json`, `report.json`, and `junit.xml`. `profiling` may additionally publish `profile.txt`. +- Every governed CI lane now publishes `summary.md`, `budget.json`, `report.json`, `junit.xml`, and `trend-history.json`. `profiling` may additionally publish `profile.txt`. +- The report refresh step hydrates the most recent comparable `trend-history.json` before regenerating the current summary when CI credentials allow it, then republishes the refreshed bounded history for the next run. - Artifact publication failures are first-class blocking failures for pull request and `dev` workflows. 
### Recorded Baselines diff --git a/apps/platform/composer.json b/apps/platform/composer.json index 6e21c3c1..5186552c 100644 --- a/apps/platform/composer.json +++ b/apps/platform/composer.json @@ -95,6 +95,9 @@ "test:report:profile": [ "@php -r \"require 'vendor/autoload.php'; exit(\\Tests\\Support\\TestLaneManifest::renderLatestReport('profiling', ''));\"" ], + "test:report:junit": [ + "@php -r \"require 'vendor/autoload.php'; exit(\\Tests\\Support\\TestLaneManifest::renderLatestReport('junit', ''));\"" + ], "test:pgsql": [ "Composer\\Config::disableProcessTimeout", "@php vendor/bin/pest -c phpunit.pgsql.xml" diff --git a/apps/platform/tests/Feature/Guards/CiConfidenceWorkflowContractTest.php b/apps/platform/tests/Feature/Guards/CiConfidenceWorkflowContractTest.php index 46251ba2..9264cd8a 100644 --- a/apps/platform/tests/Feature/Guards/CiConfidenceWorkflowContractTest.php +++ b/apps/platform/tests/Feature/Guards/CiConfidenceWorkflowContractTest.php @@ -13,8 +13,12 @@ ->and($workflowProfile['branchFilters'])->toBe(['dev']) ->and($workflowContents)->toContain('push:') ->and($workflowContents)->toContain('- dev') + ->and($workflowContents)->toContain('permissions:') + ->and($workflowContents)->toContain('actions: read') + ->and($workflowContents)->toContain('contents: read') ->and($workflowContents)->toContain('./scripts/platform-test-lane confidence --workflow-id=main-confidence --trigger-class=mainline-push') - ->and($workflowContents)->toContain('./scripts/platform-test-report confidence --workflow-id=main-confidence --trigger-class=mainline-push') + ->and($workflowContents)->toContain('TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}') + ->and($workflowContents)->toContain('./scripts/platform-test-report confidence --workflow-id=main-confidence --trigger-class=mainline-push --fetch-latest-history') ->and($workflowContents)->toContain('./scripts/platform-test-artifacts confidence .gitea-artifacts/main-confidence --workflow-id=main-confidence 
--trigger-class=mainline-push') ->and($workflowContents)->toContain('name: confidence-artifacts') ->and($workflowContents)->not->toContain('test:junit', './scripts/platform-test-lane fast-feedback', './scripts/platform-test-lane heavy-governance'); diff --git a/apps/platform/tests/Feature/Guards/CiFastFeedbackWorkflowContractTest.php b/apps/platform/tests/Feature/Guards/CiFastFeedbackWorkflowContractTest.php index 4419d11c..10286234 100644 --- a/apps/platform/tests/Feature/Guards/CiFastFeedbackWorkflowContractTest.php +++ b/apps/platform/tests/Feature/Guards/CiFastFeedbackWorkflowContractTest.php @@ -12,9 +12,13 @@ ->and($workflowProfile['triggerClass'])->toBe('pull-request') ->and($workflowProfile['laneBindings'])->toBe(['fast-feedback']) ->and($workflowContents)->toContain('pull_request:') + ->and($workflowContents)->toContain('permissions:') + ->and($workflowContents)->toContain('actions: read') + ->and($workflowContents)->toContain('contents: read') ->and($workflowContents)->toContain('opened', 'reopened', 'synchronize') ->and($workflowContents)->toContain('./scripts/platform-test-lane fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request') - ->and($workflowContents)->toContain('./scripts/platform-test-report fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request') + ->and($workflowContents)->toContain('TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}') + ->and($workflowContents)->toContain('./scripts/platform-test-report fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request --fetch-latest-history') ->and($workflowContents)->toContain('./scripts/platform-test-artifacts fast-feedback .gitea-artifacts/pr-fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request') ->and($workflowContents)->toContain('name: fast-feedback-artifacts') ->and($workflowContents)->not->toContain('confidence --workflow-id=pr-fast-feedback', 'heavy-governance', 'browser --workflow-id=pr-fast-feedback'); diff 
--git a/apps/platform/tests/Feature/Guards/CiHeavyBrowserWorkflowContractTest.php b/apps/platform/tests/Feature/Guards/CiHeavyBrowserWorkflowContractTest.php index 7333c550..a7e56d2c 100644 --- a/apps/platform/tests/Feature/Guards/CiHeavyBrowserWorkflowContractTest.php +++ b/apps/platform/tests/Feature/Guards/CiHeavyBrowserWorkflowContractTest.php @@ -15,11 +15,16 @@ ->and($scheduledProfile['scheduleCron'])->toBe('17 4 * * 1-5') ->and($workflowContents)->toContain('workflow_dispatch:') ->and($workflowContents)->toContain('schedule:') + ->and($workflowContents)->toContain('permissions:') + ->and($workflowContents)->toContain('actions: read') + ->and($workflowContents)->toContain('contents: read') ->and($workflowContents)->toContain('17 4 * * 1-5') ->and($workflowContents)->toContain("vars.TENANTATLAS_ENABLE_HEAVY_GOVERNANCE_SCHEDULE == '1'") ->and($workflowContents)->toContain('workflow_id=heavy-governance-manual') ->and($workflowContents)->toContain('workflow_id=heavy-governance-scheduled') ->and($workflowContents)->toContain('./scripts/platform-test-lane heavy-governance --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }}') + ->and($workflowContents)->toContain('TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}') + ->and($workflowContents)->toContain('./scripts/platform-test-report heavy-governance --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }} --fetch-latest-history') ->and($workflowContents)->toContain('./scripts/platform-test-artifacts heavy-governance .gitea-artifacts/heavy-governance --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }}') ->and($workflowContents)->not->toContain('pull_request:', './scripts/platform-test-lane browser'); }); @@ -35,11 +40,16 @@ ->and($scheduledProfile['scheduleCron'])->toBe('43 4 * * 1-5') 
->and($workflowContents)->toContain('workflow_dispatch:') ->and($workflowContents)->toContain('schedule:') + ->and($workflowContents)->toContain('permissions:') + ->and($workflowContents)->toContain('actions: read') + ->and($workflowContents)->toContain('contents: read') ->and($workflowContents)->toContain('43 4 * * 1-5') ->and($workflowContents)->toContain("vars.TENANTATLAS_ENABLE_BROWSER_SCHEDULE == '1'") ->and($workflowContents)->toContain('workflow_id=browser-manual') ->and($workflowContents)->toContain('workflow_id=browser-scheduled') ->and($workflowContents)->toContain('./scripts/platform-test-lane browser --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }}') + ->and($workflowContents)->toContain('TENANTATLAS_GITEA_TOKEN: ${{ secrets.GITEA_TOKEN }}') + ->and($workflowContents)->toContain('./scripts/platform-test-report browser --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }} --fetch-latest-history') ->and($workflowContents)->toContain('./scripts/platform-test-artifacts browser .gitea-artifacts/browser --workflow-id=${{ steps.context.outputs.workflow_id }} --trigger-class=${{ steps.context.outputs.trigger_class }}') ->and($workflowContents)->not->toContain('pull_request:', './scripts/platform-test-lane confidence'); -}); \ No newline at end of file +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php b/apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php index 261099b8..1a66e00f 100644 --- a/apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php +++ b/apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php @@ -25,9 +25,9 @@ function heavyGovernanceSyntheticHotspots(): array $artifacts = TestLaneReport::artifactPaths('fast-feedback'); $artifactContract = TestLaneManifest::artifactPublicationContract('fast-feedback'); - expect($artifacts)->toHaveKeys(['junit', 
'summary', 'budget', 'report', 'profile']); + expect($artifacts)->toHaveKeys(['junit', 'summary', 'budget', 'report', 'profile', 'trendHistory']); - expect($artifactContract['requiredFiles'])->toEqualCanonicalizing(['summary.md', 'budget.json', 'report.json', 'junit.xml']) + expect($artifactContract['requiredFiles'])->toEqualCanonicalizing(['summary.md', 'budget.json', 'report.json', 'junit.xml', 'trend-history.json']) ->and($artifactContract['stagedNamePattern'])->toBe('{laneId}.{artifactFile}'); foreach (array_values($artifacts) as $relativePath) { @@ -73,6 +73,11 @@ function heavyGovernanceSyntheticHotspots(): array 'artifactPublicationContract', 'knownWorkflowProfiles', 'failureClasses', + 'trendHistoryArtifact', + 'trendCurrentAssessment', + 'trendHotspotSnapshot', + 'trendRecalibrationDecisions', + 'trendWarnings', 'budgetContract', 'hotspotInventory', 'decompositionRecords', @@ -157,13 +162,14 @@ function heavyGovernanceSyntheticHotspots(): array expect($stagingResult['complete'])->toBeTrue() ->and(collect($stagingResult['stagedArtifacts'])->pluck('artifactType')->all()) - ->toEqualCanonicalizing(['summary.md', 'budget.json', 'report.json', 'junit.xml']) + ->toEqualCanonicalizing(['summary.md', 'budget.json', 'report.json', 'junit.xml', 'trend-history.json']) ->and(collect($stagingResult['stagedArtifacts'])->pluck('relativePath')->all()) ->toContain( $stagingDirectory.'/fast-feedback.summary.md', $stagingDirectory.'/fast-feedback.budget.json', $stagingDirectory.'/fast-feedback.report.json', $stagingDirectory.'/fast-feedback.junit.xml', + $stagingDirectory.'/fast-feedback.trend-history.json', ); }); @@ -187,4 +193,4 @@ function heavyGovernanceSyntheticHotspots(): array expect($fastFeedback)->toHaveKey('sharedFixtureSlimmingComparison') ->and($heavyGovernance)->not->toHaveKey('sharedFixtureSlimmingComparison'); -}); \ No newline at end of file +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneCommandContractTest.php 
b/apps/platform/tests/Feature/Guards/TestLaneCommandContractTest.php index bf97947d..7e3f0f23 100644 --- a/apps/platform/tests/Feature/Guards/TestLaneCommandContractTest.php +++ b/apps/platform/tests/Feature/Guards/TestLaneCommandContractTest.php @@ -21,6 +21,7 @@ 'test:report:browser', 'test:report:heavy', 'test:report:profile', + 'test:report:junit', 'sail:test', ]) ->and(TestLaneManifest::commandRef('fast-feedback'))->toBe('test') @@ -42,7 +43,13 @@ $reportRunner = (string) file_get_contents(repo_path('scripts/platform-test-report')); expect($laneRunner)->toContain('--capture-baseline', 'copy_heavy_baseline_artifacts', 'heavy-governance-baseline.${suffix}') - ->and($reportRunner)->toContain('--capture-baseline', 'copy_heavy_baseline_artifacts', 'heavy-governance-baseline.${suffix}'); + ->and($reportRunner)->toContain('--capture-baseline', 'copy_heavy_baseline_artifacts', 'heavy-governance-baseline.${suffix}') + ->and($reportRunner)->toContain('junit)', 'test:report:junit') + ->and($reportRunner)->toContain('--history-file=') + ->and($reportRunner)->toContain('--history-bundle=') + ->and($reportRunner)->toContain('--fetch-latest-history') + ->and($reportRunner)->toContain('TENANTATLAS_GITEA_TOKEN') + ->and($reportRunner)->toContain('trend-history.json'); }); it('avoids expanding an empty forwarded-argument array in the lane runner', function (): void { @@ -120,4 +127,4 @@ ->and($heavyContents)->toContain('tests/Feature/Findings/FindingBulkActionsTest.php') ->and($heavyContents)->toContain('tests/Feature/Guards/ActionSurfaceContractTest.php') ->and(TestLaneManifest::buildCommand('junit'))->toContain('--parallel'); -}); \ No newline at end of file +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneHistoryHydrationContractTest.php b/apps/platform/tests/Feature/Guards/TestLaneHistoryHydrationContractTest.php new file mode 100644 index 00000000..8a1c61b4 --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneHistoryHydrationContractTest.php @@ -0,0 
+1,107 @@ + 14.2, + 'tests/Feature/Guards/TestLaneArtifactsContractTest.php' => 9.8, + ], + artifactDirectory: $sourceArtifactDirectory, + ciContext: [ + 'workflowId' => 'pr-fast-feedback', + 'triggerClass' => 'pull-request', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: 'shared-test-fixture-slimming', + ); + + $sourcePath = TestLaneManifest::absolutePath($sourceArtifactDirectory.'/seeded-fast-feedback.trend-history.json'); + + if (! is_dir(dirname($sourcePath))) { + mkdir(dirname($sourcePath), 0777, true); + } + + file_put_contents($sourcePath, json_encode($report['trendHistoryArtifact'], JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR)); + + $result = TestLaneReport::hydrateTrendHistory( + laneId: 'fast-feedback', + historyFile: $sourcePath, + artifactDirectory: $targetArtifactDirectory, + ); + + $hydratedArtifact = TestLaneTrendFixtures::readTrendHistory('fast-feedback', $targetArtifactDirectory); + + expect($result['hydrated'])->toBeTrue() + ->and($result['sourceType'])->toBe('history-file') + ->and($hydratedArtifact['laneId'])->toBe('fast-feedback') + ->and($hydratedArtifact['workflowProfile'])->toBe('pr-fast-feedback') + ->and($hydratedArtifact['history'][0]['artifactRefs']['trendHistory'])->toBe($sourceArtifactDirectory.'/fast-feedback-latest.trend-history.json'); +}); + +it('hydrates staged bundle directories and zip bundles using the canonical trend-history artifact name', function (): void { + $sourceArtifactDirectory = TestLaneTrendFixtures::artifactDirectory('trend-history-hydration/bundle-source'); + $bundleDirectory = TestLaneManifest::absolutePath($sourceArtifactDirectory.'/bundle'); + $targetArtifactDirectory = TestLaneTrendFixtures::artifactDirectory('trend-history-hydration/bundle-target'); + $report = TestLaneTrendFixtures::buildReport( + laneId: 'confidence', + wallClockSeconds: 431.2, + durationsByFile: [ + 'tests/Feature/Baselines/BaselineCompareMatrixCompareAllActionTest.php' => 24.7, + 
'tests/Feature/Baselines/BaselineCompareMatrixBuilderTest.php' => 20.4, + ], + artifactDirectory: $sourceArtifactDirectory, + ciContext: [ + 'workflowId' => 'main-confidence', + 'triggerClass' => 'mainline-push', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + ); + + if (! is_dir($bundleDirectory)) { + mkdir($bundleDirectory, 0777, true); + } + + file_put_contents( + $bundleDirectory.'/confidence.trend-history.json', + json_encode($report['trendHistoryArtifact'], JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR), + ); + + $directoryResult = TestLaneReport::hydrateTrendHistory( + laneId: 'confidence', + bundlePath: $bundleDirectory, + artifactDirectory: $targetArtifactDirectory, + ); + + $zipPath = $bundleDirectory.'/confidence-artifacts.zip'; + $zip = new ZipArchive(); + $zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE); + $zip->addFile($bundleDirectory.'/confidence.trend-history.json', 'confidence.trend-history.json'); + $zip->close(); + + $zipResult = TestLaneReport::hydrateTrendHistory( + laneId: 'confidence', + bundlePath: $zipPath, + artifactDirectory: $targetArtifactDirectory, + ); + + $hydratedArtifact = TestLaneTrendFixtures::readTrendHistory('confidence', $targetArtifactDirectory); + + expect($directoryResult['hydrated'])->toBeTrue() + ->and($directoryResult['sourceType'])->toBe('bundle-directory') + ->and($zipResult['hydrated'])->toBeTrue() + ->and($zipResult['sourceType'])->toBe('bundle-zip') + ->and($hydratedArtifact['laneId'])->toBe('confidence') + ->and($hydratedArtifact['history'][0]['workflowId'])->toBe('main-confidence'); +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php b/apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php new file mode 100644 index 00000000..ad4953f1 --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php @@ -0,0 +1,94 @@ + 24.0, + 'tests/Feature/Baselines/BaselineCompareMatrixBuilderTest.php' => 18.0, + 
'tests/Feature/Rbac/OnboardingWizardUiEnforcementTest.php' => 12.0, + ]; + $currentDurations = [ + 'tests/Feature/Baselines/BaselineCompareMatrixCompareAllActionTest.php' => 41.0, + 'tests/Feature/Filament/BackupSetAdminTenantParityTest.php' => 15.0, + ]; + $previousReport = TestLaneTrendFixtures::buildReport( + laneId: 'confidence', + wallClockSeconds: 424.0, + durationsByFile: $previousDurations, + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => 'main-confidence', + 'triggerClass' => 'mainline-push', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: 'shared-test-fixture-slimming', + ); + + TestLaneTrendFixtures::writeTrendHistory('confidence', $previousReport['trendHistoryArtifact'], $artifactDirectory); + + $report = TestLaneTrendFixtures::buildReport( + laneId: 'confidence', + wallClockSeconds: 438.0, + durationsByFile: $currentDurations, + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => 'main-confidence', + 'triggerClass' => 'mainline-push', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: 'shared-test-fixture-slimming', + ); + + $hotspotSnapshot = $report['trendHotspotSnapshot']; + + expect($hotspotSnapshot['evidenceAvailability'])->toBe('available') + ->and($hotspotSnapshot['familyDeltas'])->not->toBeEmpty() + ->and(collect($hotspotSnapshot['familyDeltas'])->pluck('name')->all()) + ->toContain('baseline-compare-matrix-workflow', 'backup-set-admin-tenant-parity') + ->and(collect($hotspotSnapshot['fileHotspots'])->pluck('name')->all()) + ->toContain( + 'tests/Feature/Baselines/BaselineCompareMatrixCompareAllActionTest.php', + 'tests/Feature/Filament/BackupSetAdminTenantParityTest.php', + ) + ->and($hotspotSnapshot['newEntrants'])->toContain('tests/Feature/Filament/BackupSetAdminTenantParityTest.php') + ->and($hotspotSnapshot['droppedEntrants'])->toContain('tests/Feature/Rbac/OnboardingWizardUiEnforcementTest.php'); +}); + 
+it('discloses unavailable hotspot evidence instead of silently omitting the hotspot section', function (): void { + $artifactDirectory = TestLaneTrendFixtures::artifactDirectory('trend-hotspots/unavailable'); + $report = TestLaneTrendFixtures::buildReport( + laneId: 'fast-feedback', + wallClockSeconds: 189.0, + durationsByFile: [], + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => 'pr-fast-feedback', + 'triggerClass' => 'pull-request', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: 'shared-test-fixture-slimming', + ); + + TestLaneReport::writeArtifacts( + laneId: 'fast-feedback', + report: $report, + artifactDirectory: $artifactDirectory, + ); + + $summary = (string) file_get_contents(TestLaneManifest::absolutePath( + TestLaneReport::artifactPaths('fast-feedback', $artifactDirectory)['summary'], + )); + + expect($report['trendHotspotSnapshot']['evidenceAvailability'])->toBe('unavailable') + ->and($report['trendWarnings'])->toContain('Hotspot evidence is unavailable for this cycle.') + ->and($summary)->toContain('Hotspot evidence: unavailable'); +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneManifestTest.php b/apps/platform/tests/Feature/Guards/TestLaneManifestTest.php index 882118c4..bba2b8bc 100644 --- a/apps/platform/tests/Feature/Guards/TestLaneManifestTest.php +++ b/apps/platform/tests/Feature/Guards/TestLaneManifestTest.php @@ -12,7 +12,7 @@ static fn (array $lane): bool => $lane['defaultEntryPoint'] === true, )); - expect($manifest['version'])->toBe(2) + expect($manifest['version'])->toBe(3) ->and($manifest['artifactDirectory'])->toBe('storage/logs/test-lanes') ->and($manifest['mainlineBranch'])->toBe('dev') ->and($manifest)->toHaveKeys([ @@ -27,6 +27,8 @@ 'laneBindings', 'budgetEnforcementProfiles', 'artifactPublicationContracts', + 'trendContractVersion', + 'laneTrendPolicies', 'failureClasses', 'familyBudgets', 'heavyGovernanceBudgetContract', @@ -68,8 +70,9 @@ 
->and($workflowProfiles->get('heavy-governance-scheduled')['scheduleCron'])->toBe('17 4 * * 1-5') ->and($workflowProfiles->get('browser-scheduled')['scheduleCron'])->toBe('43 4 * * 1-5') ->and($laneBindings->get('fast-feedback')['executionWrapper'])->toBe('scripts/platform-test-lane') - ->and($laneBindings->get('confidence')['requiredArtifacts'])->toEqualCanonicalizing(['summary.md', 'budget.json', 'report.json', 'junit.xml']) + ->and($laneBindings->get('confidence')['requiredArtifacts'])->toEqualCanonicalizing(['summary.md', 'budget.json', 'report.json', 'junit.xml', 'trend-history.json']) ->and($artifactContracts->get('fast-feedback')['retentionClass'])->toBe('pr-short') + ->and($artifactContracts->get('fast-feedback')['requiredFiles'])->toContain('trend-history.json') ->and($artifactContracts->get('browser')['uploadGroupName'])->toBe('browser-artifacts') ->and($failureClasses->keys()->all())->toEqualCanonicalizing([ 'test-failure', @@ -192,4 +195,4 @@ 'retain-intentional-heavy-depth-explicitly', 'record-helper-or-fixture-residuals', ]); -}); \ No newline at end of file +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php b/apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php new file mode 100644 index 00000000..a8107d3f --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php @@ -0,0 +1,127 @@ + 11.0, + ]; + $workflowId = $laneId === 'confidence' ? 'main-confidence' : 'pr-fast-feedback'; + $triggerClass = $laneId === 'confidence' ? 'mainline-push' : 'pull-request'; + $comparisonProfile = in_array($laneId, ['fast-feedback', 'confidence'], true) + ? 
'shared-test-fixture-slimming' + : null; + $baseReport = TestLaneTrendFixtures::buildReport( + laneId: $laneId, + wallClockSeconds: $seededHistory[0], + durationsByFile: $durationsByFile, + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => $workflowId, + 'triggerClass' => $triggerClass, + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: $comparisonProfile, + ); + + $artifact = $baseReport['trendHistoryArtifact']; + $templateRecord = $artifact['history'][0]; + $artifact['history'] = array_values(array_map( + static function (float $seconds, int $index) use ($templateRecord): array { + $record = $templateRecord; + $record['runRef'] = sprintf('%s-recalibration-%d', $templateRecord['laneId'], $index + 1); + $record['generatedAt'] = sprintf('2026-04-%02dT08:30:00+00:00', $index + 1); + $record['wallClockSeconds'] = round($seconds, 6); + + return $record; + }, + $seededHistory, + array_keys($seededHistory), + )); + + TestLaneTrendFixtures::writeTrendHistory($laneId, $artifact, $artifactDirectory); + + return TestLaneTrendFixtures::buildReport( + laneId: $laneId, + wallClockSeconds: $currentSeconds, + durationsByFile: $durationsByFile, + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => $workflowId, + 'triggerClass' => $triggerClass, + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: $comparisonProfile, + ); +} + +it('emits candidate, approved, and rejected recalibration records with explicit summary disclosure', function (): void { + $candidateReport = reportWithSeededHistory('confidence', 'candidate', [420.0, 380.0, 340.0, 300.0, 260.0], 460.0); + $approvedReport = reportWithSeededHistory('fast-feedback', 'approved', [176.0, 176.3, 176.1, 176.4, 176.2], 176.3); + $rejectedReport = reportWithSeededHistory('fast-feedback', 'rejected', [176.0, 191.0, 177.0, 192.0, 178.0], 193.0); + + $approvedDecision = 
TestLaneBudget::buildRecalibrationDecisionRecord( + laneId: 'fast-feedback', + targetType: 'baseline', + assessment: ['recalibrationRecommendation' => 'review-baseline'], + historyRecords: $approvedReport['trendHistoryArtifact']['history'], + decisionStatus: 'approved', + rationaleCode: 'post-improvement-reset', + recordedIn: 'specs/211-runtime-trend-recalibration/spec.md', + proposedValueSeconds: 182.0, + notes: 'Approved baseline reset after the suite stabilized following a deliberate improvement pass.', + ); + + $candidateDecision = $candidateReport['trendRecalibrationDecisions'][0] ?? null; + $rejectedDecision = $rejectedReport['trendRecalibrationDecisions'][0] ?? null; + + $approvedReport['trendRecalibrationDecisions'][] = $approvedDecision; + $approvedReport['trendHistoryArtifact']['recalibrationDecisions'][] = $approvedDecision; + + TestLaneReport::writeArtifacts( + laneId: 'confidence', + report: $candidateReport, + artifactDirectory: $candidateReport['artifactDirectory'], + ); + TestLaneReport::writeArtifacts( + laneId: 'fast-feedback', + report: $approvedReport, + artifactDirectory: $approvedReport['artifactDirectory'], + ); + TestLaneReport::writeArtifacts( + laneId: 'fast-feedback', + report: $rejectedReport, + artifactDirectory: $rejectedReport['artifactDirectory'], + ); + + $candidateSummary = (string) file_get_contents(TestLaneManifest::absolutePath( + TestLaneReport::artifactPaths('confidence', $candidateReport['artifactDirectory'])['summary'], + )); + $approvedSummary = (string) file_get_contents(TestLaneManifest::absolutePath( + TestLaneReport::artifactPaths('fast-feedback', $approvedReport['artifactDirectory'])['summary'], + )); + $rejectedSummary = (string) file_get_contents(TestLaneManifest::absolutePath( + TestLaneReport::artifactPaths('fast-feedback', $rejectedReport['artifactDirectory'])['summary'], + )); + + expect($candidateDecision)->toBeArray() + ->and($candidateDecision['decisionStatus'])->toBe('candidate') + 
->and($candidateDecision['targetType'])->toBe('budget') + ->and($approvedDecision['decisionStatus'])->toBe('approved') + ->and($approvedDecision['targetType'])->toBe('baseline') + ->and($rejectedDecision)->toBeArray() + ->and($rejectedDecision['decisionStatus'])->toBe('rejected') + ->and($rejectedDecision['rationaleCode'])->toBe('noise-rejected') + ->and($candidateSummary)->toContain('Recalibration: budget candidate') + ->and($approvedSummary)->toContain('Recalibration: baseline approved') + ->and($rejectedSummary)->toContain('Recalibration: budget rejected', 'noise-rejected'); +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php b/apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php new file mode 100644 index 00000000..f7ede028 --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php @@ -0,0 +1,111 @@ + 12.0, + ], + ciContext: [ + 'workflowId' => $laneId === 'confidence' ? 'main-confidence' : 'pr-fast-feedback', + 'triggerClass' => $laneId === 'confidence' ? 'mainline-push' : 'pull-request', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: in_array($laneId, ['fast-feedback', 'confidence'], true) ? 
'shared-test-fixture-slimming' : null, + ); + + $templateRecord = $report['trendHistoryArtifact']['history'][0]; + + return array_values(array_map( + static function (int $index) use ($templateRecord): array { + $record = $templateRecord; + $record['runRef'] = sprintf('%s-evidence-%d', $templateRecord['laneId'], $index + 1); + $record['generatedAt'] = sprintf('2026-04-%02dT10:00:00+00:00', $index + 1); + + return $record; + }, + range(0, $count - 1), + )); +} + +it('keeps baseline and budget recalibration rules separate and enforces the stronger budget evidence window', function (): void { + $assessment = [ + 'recalibrationRecommendation' => 'review-baseline', + ]; + $baselineHistory = recalibrationEvidenceHistory('fast-feedback', 3); + $budgetHistory = recalibrationEvidenceHistory('confidence', 5); + + $baselineDecision = TestLaneBudget::buildRecalibrationDecisionRecord( + laneId: 'fast-feedback', + targetType: 'baseline', + assessment: $assessment, + historyRecords: $baselineHistory, + decisionStatus: 'approved', + rationaleCode: 'lane-scope-change', + recordedIn: 'specs/211-runtime-trend-recalibration/spec.md', + proposedValueSeconds: 184.0, + ); + $budgetDecision = TestLaneBudget::buildRecalibrationDecisionRecord( + laneId: 'confidence', + targetType: 'budget', + assessment: ['recalibrationRecommendation' => 'review-budget'], + historyRecords: $budgetHistory, + decisionStatus: 'approved', + rationaleCode: 'sustained-erosion', + recordedIn: 'specs/211-runtime-trend-recalibration/spec.md', + proposedValueSeconds: 470.0, + ); + + expect($baselineDecision['targetType'])->toBe('baseline') + ->and($baselineDecision['decisionStatus'])->toBe('approved') + ->and($budgetDecision['targetType'])->toBe('budget') + ->and($budgetDecision['decisionStatus'])->toBe('approved') + ->and($budgetDecision['evidenceRunRefs'])->toHaveCount(5); + + expect(static fn () => TestLaneBudget::buildRecalibrationDecisionRecord( + laneId: 'confidence', + targetType: 'budget', + assessment: 
['recalibrationRecommendation' => 'review-budget'], + historyRecords: recalibrationEvidenceHistory('confidence', 4), + decisionStatus: 'approved', + rationaleCode: 'sustained-erosion', + recordedIn: 'specs/211-runtime-trend-recalibration/spec.md', + proposedValueSeconds: 470.0, + ))->toThrow(InvalidArgumentException::class); +}); + +it('requires approved versus rejected rationale handling that matches the policy', function (): void { + $history = recalibrationEvidenceHistory('fast-feedback', 3); + + $rejectedDecision = TestLaneBudget::buildRecalibrationDecisionRecord( + laneId: 'fast-feedback', + targetType: 'budget', + assessment: ['recalibrationRecommendation' => 'investigate'], + historyRecords: [$history[0]], + decisionStatus: 'rejected', + rationaleCode: 'noise-rejected', + recordedIn: 'specs/211-runtime-trend-recalibration/spec.md', + ); + + expect($rejectedDecision['decisionStatus'])->toBe('rejected') + ->and($rejectedDecision['rationaleCode'])->toBe('noise-rejected'); + + expect(static fn () => TestLaneBudget::buildRecalibrationDecisionRecord( + laneId: 'fast-feedback', + targetType: 'baseline', + assessment: ['recalibrationRecommendation' => 'review-baseline'], + historyRecords: $history, + decisionStatus: 'approved', + rationaleCode: 'sustained-erosion', + recordedIn: 'specs/211-runtime-trend-recalibration/spec.md', + proposedValueSeconds: 184.0, + ))->toThrow(InvalidArgumentException::class); +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php b/apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php new file mode 100644 index 00000000..d25b8b5c --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php @@ -0,0 +1,78 @@ + 10.0, + 'tests/Feature/Guards/TestLaneTrendSummaryContractTest.php' => 8.0, + ]; + $workflowId = $laneId === 'confidence' ? 'main-confidence' : 'pr-fast-feedback'; + $triggerClass = $laneId === 'confidence' ? 
'mainline-push' : 'pull-request'; + $comparisonProfile = in_array($laneId, ['fast-feedback', 'confidence'], true) + ? 'shared-test-fixture-slimming' + : null; + $baseReport = TestLaneTrendFixtures::buildReport( + laneId: $laneId, + wallClockSeconds: $seededHistory[0], + durationsByFile: $durationsByFile, + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => $workflowId, + 'triggerClass' => $triggerClass, + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: $comparisonProfile, + ); + + $artifact = $baseReport['trendHistoryArtifact']; + $templateRecord = $artifact['history'][0]; + $artifact['history'] = array_values(array_map( + static function (float $seconds, int $index) use ($templateRecord): array { + $record = $templateRecord; + $record['runRef'] = sprintf('%s-history-%d', $templateRecord['laneId'], $index + 1); + $record['generatedAt'] = sprintf('2026-04-%02dT09:00:00+00:00', $index + 1); + $record['wallClockSeconds'] = round($seconds, 6); + + return $record; + }, + $seededHistory, + array_keys($seededHistory), + )); + + TestLaneTrendFixtures::writeTrendHistory($laneId, $artifact, $artifactDirectory); + + return TestLaneTrendFixtures::buildReport( + laneId: $laneId, + wallClockSeconds: $currentSeconds, + durationsByFile: $durationsByFile, + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => $workflowId, + 'triggerClass' => $triggerClass, + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: $comparisonProfile, + ); +} + +it('classifies healthy, budget-near, trending-worse, regressed, and unstable runtime states', function (): void { + $healthy = classifiedTrendReport('fast-feedback', 'healthy', [176.1, 175.6, 176.2, 175.4, 176.0], 176.4); + $budgetNear = classifiedTrendReport('confidence', 'budget-near', [433.0, 430.0, 427.0, 424.0, 420.0], 438.5); + $trendingWorse = classifiedTrendReport('fast-feedback', 'trending-worse', [175.0, 150.0, 125.0, 
100.0, 75.0], 195.0); + $regressed = classifiedTrendReport('fast-feedback', 'regressed', [200.0, 180.0, 160.0, 140.0, 120.0], 225.0); + $unstable = classifiedTrendReport('fast-feedback', 'unstable', [170.0, 195.0, 168.0, 193.0, 166.0], 194.0); + + expect($healthy['trendCurrentAssessment']['healthClass'])->toBe('healthy') + ->and($budgetNear['trendCurrentAssessment']['healthClass'])->toBe('budget-near') + ->and($trendingWorse['trendCurrentAssessment']['healthClass'])->toBe('trending-worse') + ->and($regressed['trendCurrentAssessment']['healthClass'])->toBe('regressed') + ->and($unstable['trendCurrentAssessment']['healthClass'])->toBe('unstable') + ->and($unstable['trendCurrentAssessment']['windowStatus'])->toBe('noisy'); +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneTrendContractSchemaTest.php b/apps/platform/tests/Feature/Guards/TestLaneTrendContractSchemaTest.php new file mode 100644 index 00000000..c95443a5 --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneTrendContractSchemaTest.php @@ -0,0 +1,137 @@ + var_export($item, true), $schema['enum']))); + } + + if (is_string($types)) { + $types = [$types]; + } + + if (is_array($types)) { + $typeValid = false; + + foreach ($types as $type) { + $typeValid = match ($type) { + 'array' => is_array($value) && array_is_list($value), + 'object' => is_array($value) && ! array_is_list($value), + 'string' => is_string($value), + 'integer' => is_int($value), + 'number' => is_int($value) || is_float($value), + 'boolean' => is_bool($value), + 'null' => $value === null, + default => false, + }; + + if ($typeValid) { + break; + } + } + + if (! 
$typeValid) { + $errors[] = sprintf('%s has the wrong type.', $path); + + return $errors; + } + } + + if (is_array($value) && array_is_list($value)) { + if (isset($schema['minItems']) && count($value) < (int) $schema['minItems']) { + $errors[] = sprintf('%s must contain at least %d item(s).', $path, (int) $schema['minItems']); + } + + if (isset($schema['items']) && is_array($schema['items'])) { + foreach ($value as $index => $item) { + $errors = array_merge($errors, validateTrendSchema($item, $schema['items'], $defs, sprintf('%s[%d]', $path, $index))); + } + } + + return $errors; + } + + if (is_array($value) && ! array_is_list($value)) { + foreach ($schema['required'] ?? [] as $requiredKey) { + if (! array_key_exists((string) $requiredKey, $value)) { + $errors[] = sprintf('%s is missing required key [%s].', $path, $requiredKey); + } + } + + if (($schema['additionalProperties'] ?? true) === false) { + $allowedKeys = array_keys($schema['properties'] ?? []); + + foreach (array_keys($value) as $key) { + if (! in_array($key, $allowedKeys, true)) { + $errors[] = sprintf('%s contains unsupported key [%s].', $path, $key); + } + } + } + + foreach ($schema['properties'] ?? [] as $key => $propertySchema) { + if (! 
array_key_exists($key, $value)) { + continue; + } + + $errors = array_merge($errors, validateTrendSchema($value[$key], $propertySchema, $defs, $path.'.'.$key)); + } + + return $errors; + } + + if ((is_int($value) || is_float($value)) && isset($schema['minimum']) && $value < $schema['minimum']) { + $errors[] = sprintf('%s must be >= %s.', $path, $schema['minimum']); + } + + return $errors; +} + +it('keeps the generated trend-history artifact synchronized with the checked-in JSON schema contract', function (): void { + $schemaPath = repo_path('specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json'); + $artifactDirectory = TestLaneTrendFixtures::artifactDirectory('trend-schema-contract'); + $report = TestLaneTrendFixtures::buildReport( + laneId: 'fast-feedback', + wallClockSeconds: 184.6, + durationsByFile: [ + 'tests/Feature/Guards/TestLaneTrendContractSchemaTest.php' => 16.4, + 'tests/Feature/Guards/TestLaneArtifactsContractTest.php' => 11.2, + ], + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => 'pr-fast-feedback', + 'triggerClass' => 'pull-request', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: 'shared-test-fixture-slimming', + ); + + /** @var array $schema */ + $schema = json_decode((string) file_get_contents($schemaPath), true, 512, JSON_THROW_ON_ERROR); + $artifact = $report['trendHistoryArtifact']; + $errors = validateTrendSchema($artifact, $schema, $schema['$defs'] ?? 
[]); + + expect($artifact['schemaVersion'])->toBe('1.0.0') + ->and($artifact['laneId'])->toBe('fast-feedback') + ->and($artifact['workflowProfile'])->toBe('pr-fast-feedback') + ->and($artifact['history'][0]['artifactRefs']['trendHistory'])->toBe($artifactDirectory.'/fast-feedback-latest.trend-history.json') + ->and($errors)->toBe([]); +}); diff --git a/apps/platform/tests/Feature/Guards/TestLaneTrendLogicalContractTest.php b/apps/platform/tests/Feature/Guards/TestLaneTrendLogicalContractTest.php new file mode 100644 index 00000000..795ca60c --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneTrendLogicalContractTest.php @@ -0,0 +1,71 @@ + 22.8, + 'tests/Feature/Baselines/BaselineCompareMatrixBuilderTest.php' => 19.4, + 'tests/Feature/Rbac/OnboardingWizardUiEnforcementTest.php' => 17.6, + ], + artifactDirectory: $artifactDirectory, + ciContext: [ + 'workflowId' => 'main-confidence', + 'triggerClass' => 'mainline-push', + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + ); + + /** @var array $contract */ + $contract = Yaml::parseFile($contractPath); + $paths = $contract['paths'] ?? []; + $schemas = $contract['components']['schemas'] ?? []; + $assessment = $report['trendCurrentAssessment']; + $decision = $report['trendRecalibrationDecisions'][0] ?? 
null; + + expect(array_keys($paths))->toEqualCanonicalizing([ + '/test-governance/lanes/{laneId}/trend-history', + '/test-governance/lanes/{laneId}/trend-assessment', + '/test-governance/lanes/{laneId}/recalibration', + '/test-governance/cycles/{cycleId}/summary', + ]) + ->and($paths['/test-governance/lanes/{laneId}/trend-history']['post']['operationId'])->toBe('updateLaneTrendHistory') + ->and($paths['/test-governance/lanes/{laneId}/trend-assessment']['post']['operationId'])->toBe('evaluateLaneTrendAssessment') + ->and($paths['/test-governance/lanes/{laneId}/recalibration']['post']['operationId'])->toBe('evaluateLaneRecalibration') + ->and($paths['/test-governance/cycles/{cycleId}/summary']['get']['operationId'])->toBe('getTrendSummaryCycle') + ->and($schemas['DriftAssessment']['properties']['healthClass']['enum'])->toEqualCanonicalizing([ + 'healthy', + 'budget-near', + 'trending-worse', + 'regressed', + 'unstable', + ]) + ->and($schemas['DriftAssessment']['properties']['recalibrationRecommendation']['enum'])->toEqualCanonicalizing([ + 'none', + 'investigate', + 'review-baseline', + 'review-budget', + ]) + ->and($schemas['RecalibrationDecision']['properties']['decisionStatus']['enum'])->toEqualCanonicalizing([ + 'candidate', + 'approved', + 'rejected', + ]) + ->and($schemas['LaneTrendHistoryArtifact']['required'])->toContain('schemaVersion', 'policy', 'history', 'currentAssessment') + ->and($assessment['healthClass'])->toBeIn($schemas['DriftAssessment']['properties']['healthClass']['enum']) + ->and($assessment['recalibrationRecommendation'])->toBeIn($schemas['DriftAssessment']['properties']['recalibrationRecommendation']['enum']); + + if (is_array($decision)) { + expect($decision['decisionStatus'])->toBeIn($schemas['RecalibrationDecision']['properties']['decisionStatus']['enum']) + ->and($decision['targetType'])->toBeIn($schemas['RecalibrationDecision']['properties']['targetType']['enum']); + } +}); diff --git 
a/apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php b/apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php new file mode 100644 index 00000000..1f98e570 --- /dev/null +++ b/apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php @@ -0,0 +1,97 @@ + $seconds) { + $record = $templateRecord; + $record['runRef'] = sprintf('%s-seeded-%d', $templateRecord['laneId'], $index + 1); + $record['generatedAt'] = sprintf('2026-04-%02dT12:00:00+00:00', $index + 1); + $record['wallClockSeconds'] = round((float) $seconds, 6); + $seededHistory[] = $record; + } + + $baseArtifact['history'] = $seededHistory; + + return $baseArtifact; +} + +it('keeps the trend summary bounded while exposing current, previous, baseline, and budget fields for fast-feedback and confidence', function (): void { + $scenarios = [ + 'fast-feedback' => [ + 'artifactDirectory' => TestLaneTrendFixtures::artifactDirectory('trend-summary/fast-feedback'), + 'workflowId' => 'pr-fast-feedback', + 'triggerClass' => 'pull-request', + 'comparisonProfile' => 'shared-test-fixture-slimming', + 'durationsByFile' => [ + 'tests/Feature/Guards/TestLaneTrendSummaryContractTest.php' => 12.6, + 'tests/Feature/Guards/TestLaneArtifactsContractTest.php' => 8.4, + ], + 'seededHistory' => [176.73, 178.91, 181.22, 183.41, 185.07, 186.64], + 'currentSeconds' => 188.18, + ], + 'confidence' => [ + 'artifactDirectory' => TestLaneTrendFixtures::artifactDirectory('trend-summary/confidence'), + 'workflowId' => 'main-confidence', + 'triggerClass' => 'mainline-push', + 'comparisonProfile' => 'shared-test-fixture-slimming', + 'durationsByFile' => [ + 'tests/Feature/Baselines/BaselineCompareMatrixCompareAllActionTest.php' => 24.3, + 'tests/Feature/Baselines/BaselineCompareMatrixBuilderTest.php' => 20.4, + 'tests/Feature/Rbac/OnboardingWizardUiEnforcementTest.php' => 16.1, + ], + 'seededHistory' => [394.38, 401.12, 407.05, 411.61, 419.24, 424.77], + 'currentSeconds' => 431.81, + ], + ]; + + 
foreach ($scenarios as $laneId => $scenario) { + $baseReport = TestLaneTrendFixtures::buildReport( + laneId: $laneId, + wallClockSeconds: $scenario['seededHistory'][0], + durationsByFile: $scenario['durationsByFile'], + artifactDirectory: $scenario['artifactDirectory'], + ciContext: [ + 'workflowId' => $scenario['workflowId'], + 'triggerClass' => $scenario['triggerClass'], + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: $scenario['comparisonProfile'], + ); + + TestLaneTrendFixtures::writeTrendHistory( + $laneId, + seededTrendHistoryArtifact($baseReport['trendHistoryArtifact'], $scenario['seededHistory']), + $scenario['artifactDirectory'], + ); + + $report = TestLaneTrendFixtures::buildReport( + laneId: $laneId, + wallClockSeconds: $scenario['currentSeconds'], + durationsByFile: $scenario['durationsByFile'], + artifactDirectory: $scenario['artifactDirectory'], + ciContext: [ + 'workflowId' => $scenario['workflowId'], + 'triggerClass' => $scenario['triggerClass'], + 'entryPointResolved' => true, + 'workflowLaneMatched' => true, + ], + comparisonProfile: $scenario['comparisonProfile'], + ); + + expect($report['trendHistoryArtifact']['history'])->toHaveCount(7) + ->and($report['trendCurrentAssessment']['sampleCount'])->toBe(5) + ->and($report['trendCurrentAssessment']['previousComparableRunRef'])->not->toBeNull() + ->and($report['trendHistoryArtifact']['history'][0]['baselineSeconds'])->not->toBeNull() + ->and($report['trendHistoryArtifact']['history'][0]['budgetSeconds'])->toEqual((float) $report['budgetThresholdSeconds']) + ->and($report['trendCurrentAssessment']['summaryLine'])->not->toBe('') + ->and($report['trendWarnings'])->toBeArray(); + } +}); diff --git a/apps/platform/tests/Support/TestLaneBudget.php b/apps/platform/tests/Support/TestLaneBudget.php index c9fb43d5..1be880e2 100644 --- a/apps/platform/tests/Support/TestLaneBudget.php +++ b/apps/platform/tests/Support/TestLaneBudget.php @@ -367,4 +367,258 @@ public 
static function buildOutcomeRecord( ], static fn (mixed $value): bool => $value !== null); } -} \ No newline at end of file + public static function trendVarianceFloorSeconds(string $laneId, ?string $triggerClass = null): int + { + $matchingProfiles = array_values(array_filter( + self::enforcementProfiles(), + static fn (array $profile): bool => $profile['laneId'] === $laneId + && ($triggerClass === null || $profile['triggerClass'] === $triggerClass), + )); + + if ($matchingProfiles === [] && $triggerClass !== null) { + $matchingProfiles = array_values(array_filter( + self::enforcementProfiles(), + static fn (array $profile): bool => $profile['laneId'] === $laneId, + )); + } + + if ($matchingProfiles !== []) { + return (int) max(array_map( + static fn (array $profile): int => (int) ($profile['varianceAllowanceSeconds'] ?? 0), + $matchingProfiles, + )); + } + + return match ($laneId) { + 'junit' => 30, + 'profiling' => 45, + default => 15, + }; + } + + public static function nearBudgetHeadroomSeconds(string $laneId): int + { + return match ($laneId) { + 'fast-feedback' => 20, + 'confidence', 'junit' => 45, + 'browser' => 25, + 'heavy-governance' => 30, + 'profiling' => 120, + default => max(self::trendVarianceFloorSeconds($laneId), 15), + }; + } + + /** + * @return array + */ + public static function recalibrationPolicy(string $laneId): array + { + return [ + 'laneId' => $laneId, + 'baselineRequiresExplicitReview' => true, + 'budgetRequiresExplicitReview' => true, + 'minimumBaselineEvidenceSamples' => 3, + 'minimumBudgetEvidenceSamples' => $laneId === 'fast-feedback' ? 
4 : 5, + 'baselineAllowedRationales' => [ + 'lane-scope-change', + 'infrastructure-shift', + 'post-improvement-reset', + 'manual-hold', + ], + 'approvedBaselineRationales' => [ + 'lane-scope-change', + 'infrastructure-shift', + 'post-improvement-reset', + ], + 'budgetAllowedRationales' => [ + 'infrastructure-shift', + 'sustained-erosion', + 'manual-hold', + ], + 'approvedBudgetRationales' => [ + 'infrastructure-shift', + 'sustained-erosion', + ], + 'rejectedRationales' => [ + 'noise-rejected', + 'manual-hold', + ], + ]; + } + + /** + * @param list> $historyRecords + * @return array + */ + public static function buildRecalibrationDecisionRecord( + string $laneId, + string $targetType, + array $assessment, + array $historyRecords, + string $decisionStatus, + string $rationaleCode, + string $recordedIn, + ?float $proposedValueSeconds = null, + ?string $notes = null, + ): array { + if (! in_array($targetType, ['baseline', 'budget'], true)) { + throw new InvalidArgumentException(sprintf('Unknown recalibration target type [%s].', $targetType)); + } + + if (! in_array($decisionStatus, ['candidate', 'approved', 'rejected'], true)) { + throw new InvalidArgumentException(sprintf('Unknown recalibration decision status [%s].', $decisionStatus)); + } + + $policy = self::recalibrationPolicy($laneId); + $minimumEvidenceSamples = $targetType === 'budget' + ? (int) $policy['minimumBudgetEvidenceSamples'] + : (int) $policy['minimumBaselineEvidenceSamples']; + $minimumEvidenceSamples = $decisionStatus === 'rejected' + ? 1 + : $minimumEvidenceSamples; + $evidenceRunRefs = array_values(array_filter(array_map( + static fn (array $record): ?string => is_string($record['runRef'] ?? null) && $record['runRef'] !== '' + ? 
(string) $record['runRef'] + : null, + $historyRecords, + ))); + $evidenceRunRefs = array_slice(array_values(array_unique($evidenceRunRefs)), 0, $minimumEvidenceSamples); + + if (count($evidenceRunRefs) < $minimumEvidenceSamples) { + throw new InvalidArgumentException(sprintf( + 'Recalibration decisions for [%s] require at least %d evidence samples.', + $targetType, + $minimumEvidenceSamples, + )); + } + + if ($decisionStatus === 'approved') { + $allowedRationales = $targetType === 'baseline' + ? $policy['approvedBaselineRationales'] + : $policy['approvedBudgetRationales']; + + if (! in_array($rationaleCode, $allowedRationales, true)) { + throw new InvalidArgumentException(sprintf( + 'Approved %s recalibration decisions must use one of [%s].', + $targetType, + implode(', ', $allowedRationales), + )); + } + } elseif ($decisionStatus === 'rejected') { + if (! in_array($rationaleCode, $policy['rejectedRationales'], true)) { + throw new InvalidArgumentException('Rejected recalibration decisions must use a rejected rationale.'); + } + } else { + $allowedRationales = $targetType === 'baseline' + ? $policy['baselineAllowedRationales'] + : $policy['budgetAllowedRationales']; + + if (! in_array($rationaleCode, $allowedRationales, true)) { + throw new InvalidArgumentException(sprintf( + 'Candidate %s recalibration decisions must use one of [%s].', + $targetType, + implode(', ', $allowedRationales), + )); + } + } + + $currentRecord = $historyRecords[0] ?? []; + $previousValueSeconds = $targetType === 'baseline' + ? (float) ($currentRecord['baselineSeconds'] ?? $currentRecord['wallClockSeconds'] ?? 0.0) + : (float) ($currentRecord['budgetSeconds'] ?? 
0.0); + + $defaultNotes = match ($decisionStatus) { + 'approved' => sprintf( + 'Approved %s recalibration for lane [%s] after reviewing %d comparable samples.', + $targetType, + $laneId, + count($evidenceRunRefs), + ), + 'rejected' => sprintf( + 'Rejected %s recalibration for lane [%s] because current evidence is not strong enough to move repository truth.', + $targetType, + $laneId, + ), + default => sprintf( + 'Candidate %s recalibration for lane [%s]. Review the active spec or PR before changing repository truth.', + $targetType, + $laneId, + ), + }; + + return [ + 'targetType' => $targetType, + 'decisionStatus' => $decisionStatus, + 'evidenceRunRefs' => $evidenceRunRefs, + 'previousValueSeconds' => round($previousValueSeconds, 6), + 'proposedValueSeconds' => $proposedValueSeconds !== null ? round($proposedValueSeconds, 6) : null, + 'rationaleCode' => $rationaleCode, + 'recordedIn' => $recordedIn, + 'notes' => $notes ?? $defaultNotes, + ]; + } + + /** + * @param list> $historyRecords + * @return list> + */ + public static function automaticRecalibrationDecisions( + string $laneId, + array $assessment, + array $historyRecords, + string $recordedIn, + ): array { + $recommendation = (string) ($assessment['recalibrationRecommendation'] ?? 'none'); + $windowStatus = (string) ($assessment['windowStatus'] ?? 'stable'); + $currentRecord = $historyRecords[0] ?? []; + $decisionRecords = []; + + if ($recommendation === 'review-baseline') { + $decisionRecords[] = self::buildRecalibrationDecisionRecord( + laneId: $laneId, + targetType: 'baseline', + assessment: $assessment, + historyRecords: $historyRecords, + decisionStatus: 'candidate', + rationaleCode: 'manual-hold', + recordedIn: $recordedIn, + proposedValueSeconds: isset($currentRecord['wallClockSeconds']) ? (float) $currentRecord['wallClockSeconds'] : null, + notes: 'Candidate baseline review. 
Confirm lane-scope, infrastructure, or post-improvement evidence before approving any baseline reset.', + ); + } + + if ($recommendation === 'review-budget') { + $proposedBudgetSeconds = isset($currentRecord['wallClockSeconds']) + ? (float) $currentRecord['wallClockSeconds'] + self::nearBudgetHeadroomSeconds($laneId) + : null; + + $decisionRecords[] = self::buildRecalibrationDecisionRecord( + laneId: $laneId, + targetType: 'budget', + assessment: $assessment, + historyRecords: $historyRecords, + decisionStatus: 'candidate', + rationaleCode: 'sustained-erosion', + recordedIn: $recordedIn, + proposedValueSeconds: $proposedBudgetSeconds, + notes: 'Candidate budget review. Only approve after sustained erosion is confirmed and the active spec or PR records why the budget should move.', + ); + } + + if ($decisionRecords === [] && in_array($windowStatus, ['insufficient-history', 'noisy', 'scope-changed'], true)) { + $decisionRecords[] = self::buildRecalibrationDecisionRecord( + laneId: $laneId, + targetType: 'budget', + assessment: $assessment, + historyRecords: $historyRecords, + decisionStatus: 'rejected', + rationaleCode: $windowStatus === 'noisy' ? 
'noise-rejected' : 'manual-hold', + recordedIn: $recordedIn, + notes: 'Recalibration is rejected for this cycle because the comparison window is not stable enough to justify moving repository truth.', + ); + } + + return $decisionRecords; + } + +} diff --git a/apps/platform/tests/Support/TestLaneManifest.php b/apps/platform/tests/Support/TestLaneManifest.php index 3d71e334..029aa3eb 100644 --- a/apps/platform/tests/Support/TestLaneManifest.php +++ b/apps/platform/tests/Support/TestLaneManifest.php @@ -56,7 +56,7 @@ final class TestLaneManifest public static function manifest(): array { return [ - 'version' => 2, + 'version' => 3, 'artifactDirectory' => self::artifactDirectory(), 'mainlineBranch' => self::mainlineBranch(), 'classifications' => self::classifications(), @@ -70,6 +70,8 @@ public static function manifest(): array 'laneBindings' => self::laneBindings(), 'budgetEnforcementProfiles' => TestLaneBudget::enforcementProfiles(), 'artifactPublicationContracts' => self::artifactPublicationContracts(), + 'trendContractVersion' => self::laneTrendContractVersion(), + 'laneTrendPolicies' => self::laneTrendPolicies(), 'failureClasses' => self::failureClasses(), 'familyBudgets' => self::familyBudgets(), 'heavyGovernanceBudgetContract' => self::heavyGovernanceBudgetContract(), @@ -2059,7 +2061,7 @@ public static function artifactPublicationContract(string $laneId): array { self::lane($laneId); - $requiredFiles = ['summary.md', 'budget.json', 'report.json', 'junit.xml']; + $requiredFiles = ['summary.md', 'budget.json', 'report.json', 'junit.xml', 'trend-history.json']; $optionalFiles = $laneId === 'profiling' ? 
['profile.txt'] : []; $sourcePatterns = array_map( static fn (string $artifactFile): string => sprintf('%s-latest.%s', $laneId, $artifactFile), @@ -2084,6 +2086,112 @@ public static function artifactPublicationContract(string $laneId): array ]; } + public static function laneTrendContractVersion(): string + { + return '1.0.0'; + } + + /** + * @return list> + */ + public static function laneTrendPolicies(): array + { + return array_map( + static fn (array $lane): array => self::laneTrendPolicy((string) $lane['id']), + self::lanes(), + ); + } + + /** + * @return array + */ + public static function laneTrendPolicy(string $laneId, ?string $workflowId = null, ?string $triggerClass = null): array + { + self::lane($laneId); + + $policy = TestLaneBudget::recalibrationPolicy($laneId); + $workflowProfile = null; + + if ($workflowId !== null && $workflowId !== '') { + try { + $workflowProfile = self::workflowProfile($workflowId); + } catch (InvalidArgumentException) { + $workflowProfile = null; + } + } + + $workflowProfile ??= self::workflowProfilesForLane($laneId)[0] ?? null; + $resolvedTriggerClass = $triggerClass + ?? (is_array($workflowProfile) ? (string) ($workflowProfile['triggerClass'] ?? '') : ''); + + return [ + 'retentionLimit' => in_array($laneId, ['fast-feedback', 'confidence', 'browser', 'heavy-governance'], true) ? 20 : 10, + 'comparisonWindowSize' => $laneId === 'profiling' ? 4 : 5, + 'minimumComparableSamples' => 3, + 'varianceFloorSeconds' => TestLaneBudget::trendVarianceFloorSeconds($laneId, $resolvedTriggerClass !== '' ? 
$resolvedTriggerClass : null), + 'nearBudgetHeadroomSeconds' => TestLaneBudget::nearBudgetHeadroomSeconds($laneId), + 'hotspotFamilyLimit' => 5, + 'hotspotFileLimit' => 3, + 'slowestEntryRetention' => 10, + 'recalibrationPolicy' => [ + 'baselineRequiresExplicitReview' => (bool) $policy['baselineRequiresExplicitReview'], + 'budgetRequiresExplicitReview' => (bool) $policy['budgetRequiresExplicitReview'], + 'minimumBudgetEvidenceSamples' => (int) $policy['minimumBudgetEvidenceSamples'], + ], + ]; + } + + public static function laneScopeSignature(string $laneId): string + { + $lane = self::lane($laneId); + $payload = [ + 'laneId' => $laneId, + 'governanceClass' => $lane['governanceClass'], + 'parallelMode' => $lane['parallelMode'], + 'includedFamilies' => $lane['includedFamilies'], + 'excludedFamilies' => $lane['excludedFamilies'], + 'selectors' => $lane['selectors'], + 'artifacts' => $lane['artifacts'], + 'budget' => [ + 'baselineSource' => $lane['budget']['baselineSource'], + 'thresholdSeconds' => $lane['budget']['thresholdSeconds'], + ], + 'contractVersion' => self::laneTrendContractVersion(), + ]; + + return sha1(json_encode($payload, JSON_THROW_ON_ERROR)); + } + + /** + * @return array + */ + public static function comparisonFingerprintInputs(string $laneId, ?string $workflowId = null, ?string $triggerClass = null): array + { + $lane = self::lane($laneId); + $workflowProfile = null; + + if ($workflowId !== null && $workflowId !== '') { + try { + $workflowProfile = self::workflowProfile($workflowId); + } catch (InvalidArgumentException) { + $workflowProfile = null; + } + } + + $workflowProfile ??= self::workflowProfilesForLane($laneId)[0] ?? null; + + return [ + 'laneId' => $laneId, + 'workflowId' => $workflowId + ?? (is_array($workflowProfile) ? (string) ($workflowProfile['workflowId'] ?? '') : sprintf('local-%s', $laneId)), + 'triggerClass' => $triggerClass + ?? (is_array($workflowProfile) ? (string) ($workflowProfile['triggerClass'] ?? 
'') : 'local'), + 'contractVersion' => self::laneTrendContractVersion(), + 'baselineSource' => (string) ($lane['budget']['baselineSource'] ?? 'measured-lane'), + 'laneScopeSignature' => self::laneScopeSignature($laneId), + ]; + } + /** * @return list> */ @@ -3199,4 +3307,4 @@ private static function familyMatchScore(array $family, string $filePath): int return $score; } -} \ No newline at end of file +} diff --git a/apps/platform/tests/Support/TestLaneReport.php b/apps/platform/tests/Support/TestLaneReport.php index 2c82d2c6..878d1cda 100644 --- a/apps/platform/tests/Support/TestLaneReport.php +++ b/apps/platform/tests/Support/TestLaneReport.php @@ -5,11 +5,12 @@ namespace Tests\Support; use SimpleXMLElement; +use ZipArchive; final class TestLaneReport { /** - * @return array{junit: string, summary: string, budget: string, report: string, profile: string} + * @return array{junit: string, summary: string, budget: string, report: string, profile: string, trendHistory: string} */ public static function artifactPaths(string $laneId, ?string $artifactDirectory = null): array { @@ -21,6 +22,7 @@ public static function artifactPaths(string $laneId, ?string $artifactDirectory 'budget' => sprintf('%s/%s-latest.budget.json', $directory, $laneId), 'report' => sprintf('%s/%s-latest.report.json', $directory, $laneId), 'profile' => sprintf('%s/%s-latest.profile.txt', $directory, $laneId), + 'trendHistory' => sprintf('%s/%s-latest.trend-history.json', $directory, $laneId), ]; } @@ -205,6 +207,92 @@ public static function stageArtifacts(string $laneId, string $stagingDirectory, ]; } + /** + * @return array + */ + public static function hydrateTrendHistory( + string $laneId, + ?string $historyFile = null, + ?string $bundlePath = null, + ?string $artifactDirectory = null, + ): array { + $artifactPaths = self::artifactPaths($laneId, $artifactDirectory); + $targetPath = TestLaneManifest::absolutePath($artifactPaths['trendHistory']); + + self::ensureDirectory(dirname($targetPath)); + + 
$resolvedHistoryFile = is_string($historyFile) && trim($historyFile) !== '' + ? self::resolveInputPath($historyFile) + : null; + $resolvedBundlePath = is_string($bundlePath) && trim($bundlePath) !== '' + ? self::resolveInputPath($bundlePath) + : null; + + if (is_string($resolvedHistoryFile) && is_file($resolvedHistoryFile)) { + copy($resolvedHistoryFile, $targetPath); + + return [ + 'laneId' => $laneId, + 'targetPath' => $targetPath, + 'hydrated' => true, + 'sourceType' => 'history-file', + 'sourcePath' => $resolvedHistoryFile, + ]; + } + + if (is_string($resolvedBundlePath) && $resolvedBundlePath !== '') { + if (is_dir($resolvedBundlePath)) { + $bundleHistoryPath = self::findTrendHistoryInDirectory($laneId, $resolvedBundlePath); + + if (is_string($bundleHistoryPath)) { + copy($bundleHistoryPath, $targetPath); + + return [ + 'laneId' => $laneId, + 'targetPath' => $targetPath, + 'hydrated' => true, + 'sourceType' => 'bundle-directory', + 'sourcePath' => $bundleHistoryPath, + ]; + } + } elseif (is_file($resolvedBundlePath) && str_ends_with(strtolower($resolvedBundlePath), '.zip')) { + $zip = new ZipArchive(); + + if ($zip->open($resolvedBundlePath) === true) { + $entryName = self::findTrendHistoryInZip($laneId, $zip); + + if (is_string($entryName)) { + $contents = $zip->getFromName($entryName); + $zip->close(); + + if (is_string($contents) && $contents !== '') { + file_put_contents($targetPath, $contents); + + return [ + 'laneId' => $laneId, + 'targetPath' => $targetPath, + 'hydrated' => true, + 'sourceType' => 'bundle-zip', + 'sourcePath' => $resolvedBundlePath, + 'sourceEntry' => $entryName, + ]; + } + } + + $zip->close(); + } + } + } + + return [ + 'laneId' => $laneId, + 'targetPath' => $targetPath, + 'hydrated' => false, + 'sourceType' => null, + 'sourcePath' => $resolvedHistoryFile ?? 
$resolvedBundlePath, + ]; + } + /** * @return array{slowestEntries: list>, durationsByFile: array} */ @@ -426,12 +514,19 @@ classificationAttribution: $attribution['classificationAttribution'], $report['ciBudgetEvaluation'] = $ciBudgetEvaluation; } + $trendHistoryArtifact = self::buildTrendHistoryArtifact($report, $artifactPaths); + $report['trendHistoryArtifact'] = $trendHistoryArtifact; + $report['trendCurrentAssessment'] = $trendHistoryArtifact['currentAssessment']; + $report['trendHotspotSnapshot'] = $trendHistoryArtifact['hotspotSnapshot'] ?? null; + $report['trendRecalibrationDecisions'] = $trendHistoryArtifact['recalibrationDecisions'] ?? []; + $report['trendWarnings'] = $trendHistoryArtifact['warnings'] ?? []; + return $report; } /** * @param array $report - * @return array{summary: string, budget: string, report: string, profile: string} + * @return array{summary: string, budget: string, report: string, profile: string, trendHistory: string} */ public static function writeArtifacts( string $laneId, @@ -463,6 +558,11 @@ public static function writeArtifacts( json_encode($report, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR), ); + file_put_contents( + TestLaneManifest::absolutePath($artifactPaths['trendHistory']), + json_encode($report['trendHistoryArtifact'] ?? [], JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR), + ); + $report['artifactPublication'] = self::artifactPublicationStatus($laneId, $artifactDirectory); $report['ciSummary'] = self::buildCiSummary( report: $report, @@ -531,6 +631,18 @@ private static function buildSummaryMarkdown(array $report): string sprintf('- Wall clock: %.2f seconds', (float) $report['wallClockSeconds']), sprintf('- Budget: %d seconds (%s)', (int) $report['budgetThresholdSeconds'], $report['budgetStatus']), ]; + $trendHistory = is_array($report['trendHistoryArtifact'] ?? null) + ? $report['trendHistoryArtifact'] + : null; + $trendAssessment = is_array($report['trendCurrentAssessment'] ?? null) + ? 
$report['trendCurrentAssessment'] + : ($trendHistory['currentAssessment'] ?? null); + $currentTrendRecord = is_array($trendHistory['history'][0] ?? null) + ? $trendHistory['history'][0] + : null; + $previousTrendRecord = is_array($trendHistory['history'][1] ?? null) + ? $trendHistory['history'][1] + : null; if (isset($report['ciSummary']) && is_array($report['ciSummary'])) { $lines[] = sprintf( @@ -584,6 +696,70 @@ private static function buildSummaryMarkdown(array $report): string ); } + $lines[] = ''; + $lines[] = '## Lane trend'; + + if (is_array($trendAssessment)) { + $previousRuntime = is_array($previousTrendRecord) + ? sprintf('%.2fs', (float) ($previousTrendRecord['wallClockSeconds'] ?? 0.0)) + : 'n/a'; + $baselineRuntime = isset($currentTrendRecord['baselineSeconds']) && $currentTrendRecord['baselineSeconds'] !== null + ? sprintf('%.2fs', (float) $currentTrendRecord['baselineSeconds']) + : 'n/a'; + + $lines[] = sprintf( + '- Window: current %.2fs | previous %s | baseline %s | budget %.2fs', + (float) $report['wallClockSeconds'], + $previousRuntime, + $baselineRuntime, + (float) $report['budgetThresholdSeconds'], + ); + $lines[] = sprintf('- Health class: %s', (string) $trendAssessment['healthClass']); + $lines[] = sprintf('- Window status: %s', (string) $trendAssessment['windowStatus']); + $lines[] = sprintf('- Recalibration recommendation: %s', (string) $trendAssessment['recalibrationRecommendation']); + $lines[] = sprintf('- Budget headroom: %.2fs', (float) $trendAssessment['budgetHeadroomSeconds']); + $lines[] = sprintf('- Summary: %s', (string) $trendAssessment['summaryLine']); + + if (array_key_exists('deltaToPreviousSeconds', $trendAssessment) && $trendAssessment['deltaToPreviousSeconds'] !== null) { + $lines[] = sprintf('- Delta to previous: %+0.2fs', (float) $trendAssessment['deltaToPreviousSeconds']); + } + + if (array_key_exists('deltaToBaselineSeconds', $trendAssessment) && $trendAssessment['deltaToBaselineSeconds'] !== null) { + $lines[] = 
sprintf('- Delta to baseline: %+0.2fs', (float) $trendAssessment['deltaToBaselineSeconds']); + } + } else { + $lines[] = '- Trend assessment unavailable.'; + } + + $hotspotSnapshot = is_array($report['trendHotspotSnapshot'] ?? null) + ? $report['trendHotspotSnapshot'] + : ($trendHistory['hotspotSnapshot'] ?? null); + + if (is_array($hotspotSnapshot)) { + $lines[] = sprintf('- Hotspot evidence: %s', (string) $hotspotSnapshot['evidenceAvailability']); + + foreach (array_slice($hotspotSnapshot['familyDeltas'] ?? [], 0, 3) as $delta) { + $lines[] = sprintf( + '- Family delta: %s %+0.2fs', + (string) $delta['name'], + (float) $delta['deltaSeconds'], + ); + } + } + + foreach ($report['trendRecalibrationDecisions'] ?? [] as $decision) { + $lines[] = sprintf( + '- Recalibration: %s %s (%s)', + (string) $decision['targetType'], + (string) $decision['decisionStatus'], + (string) $decision['rationaleCode'], + ); + } + + foreach ($report['trendWarnings'] ?? [] as $warning) { + $lines[] = sprintf('- Warning: %s', $warning); + } + $lines[] = ''; $lines[] = '## Slowest entries'; @@ -929,6 +1105,11 @@ private static function budgetPayload(array $report): array 'remainingOpenFamilies' => $report['remainingOpenFamilies'] ?? null, 'stabilizedFamilies' => $report['stabilizedFamilies'] ?? null, 'sharedFixtureSlimmingComparison' => $report['sharedFixtureSlimmingComparison'] ?? null, + 'trendHistoryArtifact' => $report['trendHistoryArtifact'] ?? null, + 'trendCurrentAssessment' => $report['trendCurrentAssessment'] ?? null, + 'trendHotspotSnapshot' => $report['trendHotspotSnapshot'] ?? null, + 'trendRecalibrationDecisions' => $report['trendRecalibrationDecisions'] ?? null, + 'trendWarnings' => $report['trendWarnings'] ?? null, 'ciBudgetEvaluation' => $report['ciBudgetEvaluation'] ?? null, 'artifactPublication' => $report['artifactPublication'] ?? null, 'ciSummary' => $report['ciSummary'] ?? 
null, @@ -936,7 +1117,718 @@ private static function budgetPayload(array $report): array } /** - * @param array{junit: string, summary: string, budget: string, report: string, profile: string} $artifactPaths + * @param array $report + * @param array{junit: string, summary: string, budget: string, report: string, profile: string, trendHistory: string} $artifactPaths + * @return array + */ + private static function buildTrendHistoryArtifact(array $report, array $artifactPaths): array + { + $laneId = (string) $report['laneId']; + $existingArtifact = self::loadTrendHistoryArtifact($laneId, (string) $report['artifactDirectory']); + $existingDecisions = array_values(array_filter( + $existingArtifact['recalibrationDecisions'] ?? [], + static fn (mixed $decision): bool => is_array($decision), + )); + $policy = TestLaneManifest::laneTrendPolicy( + $laneId, + $report['ciContext']['workflowId'] ?? null, + $report['ciContext']['triggerClass'] ?? null, + ); + $baselineReference = self::resolveBaselineReference($report, $existingArtifact['history'] ?? [], $existingDecisions); + $currentRecord = self::buildTrendRecord($report, $artifactPaths, $baselineReference); + $history = self::mergeTrendHistory($currentRecord, $existingArtifact['history'] ?? [], (int) $policy['retentionLimit']); + $comparisonWindow = self::buildComparisonWindow($currentRecord, $history, $policy); + $assessment = self::buildTrendAssessment($currentRecord, $comparisonWindow, $policy); + $hotspotSnapshot = self::buildHotspotSnapshot( + $currentRecord, + $comparisonWindow['previousComparableRecord'] ?? 
null, + $policy, + ); + $recordedIn = 'specs/211-runtime-trend-recalibration/spec.md'; + $recalibrationDecisions = self::mergeRecalibrationDecisions( + $existingDecisions, + TestLaneBudget::automaticRecalibrationDecisions($laneId, $assessment, $history, $recordedIn), + ); + $warnings = self::buildTrendWarnings($assessment, $hotspotSnapshot, $comparisonWindow); + + return [ + 'schemaVersion' => TestLaneManifest::laneTrendContractVersion(), + 'laneId' => $laneId, + 'workflowProfile' => (string) $currentRecord['workflowId'], + 'generatedAt' => (string) $report['finishedAt'], + 'policy' => $policy, + 'history' => $history, + 'currentAssessment' => $assessment, + 'hotspotSnapshot' => $hotspotSnapshot, + 'recalibrationDecisions' => $recalibrationDecisions, + 'warnings' => $warnings, + ]; + } + + /** + * @param list> $history + * @param list> $recalibrationDecisions + * @return array{seconds: float|null, source: string|null} + */ + private static function resolveBaselineReference(array $report, array $history, array $recalibrationDecisions): array + { + foreach ($recalibrationDecisions as $decision) { + if (($decision['targetType'] ?? null) !== 'baseline' || ($decision['decisionStatus'] ?? null) !== 'approved') { + continue; + } + + if (($decision['proposedValueSeconds'] ?? null) === null) { + continue; + } + + return [ + 'seconds' => round((float) $decision['proposedValueSeconds'], 6), + 'source' => (string) ($decision['recordedIn'] ?? 'approved-baseline'), + ]; + } + + if (is_array($report['sharedFixtureSlimmingComparison'] ?? null)) { + return [ + 'seconds' => round((float) $report['sharedFixtureSlimmingComparison']['baselineSeconds'], 6), + 'source' => (string) ($report['sharedFixtureSlimmingComparison']['comparisonProfile'] ?? 'shared-fixture-baseline'), + ]; + } + + foreach (array_reverse($history) as $record) { + if (! is_array($record)) { + continue; + } + + if (($record['baselineSeconds'] ?? 
null) !== null) { + return [ + 'seconds' => round((float) $record['baselineSeconds'], 6), + 'source' => (string) ($record['baselineSource'] ?? 'trend-history-anchor'), + ]; + } + + if (($record['wallClockSeconds'] ?? null) !== null) { + return [ + 'seconds' => round((float) $record['wallClockSeconds'], 6), + 'source' => 'trend-history-anchor', + ]; + } + } + + return [ + 'seconds' => null, + 'source' => null, + ]; + } + + /** + * @param array $report + * @param array{junit: string, summary: string, budget: string, report: string, profile: string, trendHistory: string} $artifactPaths + * @param array{seconds: float|null, source: string|null} $baselineReference + * @return array + */ + private static function buildTrendRecord(array $report, array $artifactPaths, array $baselineReference): array + { + $laneId = (string) $report['laneId']; + $workflowId = (string) ($report['ciContext']['workflowId'] ?? sprintf('local-%s', $laneId)); + $triggerClass = (string) ($report['ciContext']['triggerClass'] ?? 'local'); + $runRef = self::currentRunRef($laneId, (string) $report['finishedAt'], (float) $report['wallClockSeconds']); + $budgetEvaluation = is_array($report['ciBudgetEvaluation'] ?? null) + ? $report['ciBudgetEvaluation'] + : [ + 'budgetStatus' => (string) ($report['budgetStatus'] ?? 'within-budget'), + 'blockingStatus' => 'informational', + ]; + + return array_filter([ + 'runRef' => $runRef, + 'laneId' => $laneId, + 'workflowId' => $workflowId, + 'triggerClass' => $triggerClass, + 'generatedAt' => (string) $report['finishedAt'], + 'wallClockSeconds' => round((float) $report['wallClockSeconds'], 6), + 'baselineSeconds' => $baselineReference['seconds'] !== null ? round((float) $baselineReference['seconds'], 6) : null, + 'baselineSource' => $baselineReference['source'], + 'budgetSeconds' => round((float) $report['budgetThresholdSeconds'], 6), + 'budgetStatus' => (string) ($budgetEvaluation['budgetStatus'] ?? $report['budgetStatus'] ?? 
'within-budget'), + 'blockingStatus' => (string) ($budgetEvaluation['blockingStatus'] ?? 'informational'), + 'comparisonFingerprint' => self::comparisonFingerprint($laneId, $workflowId, $triggerClass), + 'classificationTotals' => self::runtimeBucketsFromAttribution( + $report['classificationAttribution'] ?? [], + 'classificationId', + ), + 'familyTotals' => self::runtimeBucketsFromAttribution( + $report['familyAttribution'] ?? [], + 'familyId', + ), + 'hotspotFiles' => self::hotspotFileBuckets($report['slowestEntries'] ?? []), + 'slowestEntries' => self::trendSlowestEntries($report['slowestEntries'] ?? []), + 'artifactRefs' => [ + 'summary' => $artifactPaths['summary'], + 'report' => $artifactPaths['report'], + 'budget' => $artifactPaths['budget'], + 'junit' => $artifactPaths['junit'], + 'trendHistory' => $artifactPaths['trendHistory'], + ], + ], static fn (mixed $value): bool => $value !== null); + } + + private static function currentRunRef(string $laneId, string $finishedAt, float $wallClockSeconds): string + { + $ciRunId = getenv('GITEA_RUN_ID') ?: getenv('GITHUB_RUN_ID') ?: null; + + if (is_string($ciRunId) && $ciRunId !== '') { + return sprintf('%s-%s', $laneId, $ciRunId); + } + + return sprintf( + '%s-%s-%s', + $laneId, + str_replace([':', '+'], ['-', '_'], $finishedAt), + str_replace('.', '-', sprintf('%0.6f', $wallClockSeconds)), + ); + } + + private static function comparisonFingerprint(string $laneId, string $workflowId, string $triggerClass): string + { + $inputs = TestLaneManifest::comparisonFingerprintInputs($laneId, $workflowId, $triggerClass); + + return sha1(json_encode($inputs, JSON_THROW_ON_ERROR)); + } + + /** + * @param array $currentRecord + * @param list> $existingHistory + * @return list> + */ + private static function mergeTrendHistory(array $currentRecord, array $existingHistory, int $retentionLimit): array + { + $merged = [$currentRecord]; + $seenRunRefs = [(string) $currentRecord['runRef'] => true]; + + foreach ($existingHistory as 
$record) { + if (! is_array($record)) { + continue; + } + + $runRef = (string) ($record['runRef'] ?? ''); + + if ($runRef === '' || array_key_exists($runRef, $seenRunRefs)) { + continue; + } + + $merged[] = $record; + $seenRunRefs[$runRef] = true; + + if (count($merged) >= $retentionLimit) { + break; + } + } + + return $merged; + } + + /** + * @param array $currentRecord + * @param list> $history + * @param array $policy + * @return array + */ + private static function buildComparisonWindow(array $currentRecord, array $history, array $policy): array + { + $comparableRecords = []; + $excludedRecords = []; + $fingerprint = (string) $currentRecord['comparisonFingerprint']; + $comparisonWindowSize = (int) $policy['comparisonWindowSize']; + + foreach ($history as $record) { + if (($record['comparisonFingerprint'] ?? null) !== $fingerprint) { + $excludedRecords[] = [ + 'runRef' => (string) ($record['runRef'] ?? 'unknown'), + 'comparisonFingerprint' => (string) ($record['comparisonFingerprint'] ?? ''), + ]; + + continue; + } + + $comparableRecords[] = $record; + + if (count($comparableRecords) >= $comparisonWindowSize) { + break; + } + } + + $windowStatus = 'stable'; + + if (count($comparableRecords) < (int) $policy['minimumComparableSamples']) { + $windowStatus = $excludedRecords !== [] + ? 'scope-changed' + : 'insufficient-history'; + } + + return [ + 'currentRecord' => $currentRecord, + 'previousComparableRecord' => $comparableRecords[1] ?? 
null, + 'comparableRecords' => $comparableRecords, + 'excludedRecords' => $excludedRecords, + 'windowStatus' => $windowStatus, + 'sampleCount' => count($comparableRecords), + ]; + } + + /** + * @param array $currentRecord + * @param array $comparisonWindow + * @param array $policy + * @return array + */ + private static function buildTrendAssessment(array $currentRecord, array $comparisonWindow, array $policy): array + { + $comparableRecords = $comparisonWindow['comparableRecords']; + $previousComparableRecord = $comparisonWindow['previousComparableRecord']; + $sampleCount = (int) $comparisonWindow['sampleCount']; + $varianceFloorSeconds = (float) $policy['varianceFloorSeconds']; + $nearBudgetHeadroomSeconds = (float) $policy['nearBudgetHeadroomSeconds']; + $currentSeconds = (float) $currentRecord['wallClockSeconds']; + $budgetSeconds = (float) $currentRecord['budgetSeconds']; + $budgetHeadroomSeconds = round($budgetSeconds - $currentSeconds, 6); + $previousSeconds = is_array($previousComparableRecord) + ? (float) ($previousComparableRecord['wallClockSeconds'] ?? 0.0) + : null; + $baselineSeconds = ($currentRecord['baselineSeconds'] ?? null) !== null + ? (float) $currentRecord['baselineSeconds'] + : null; + $deltaToPreviousSeconds = $previousSeconds !== null + ? round($currentSeconds - $previousSeconds, 6) + : null; + $deltaToBaselineSeconds = $baselineSeconds !== null + ? round($currentSeconds - $baselineSeconds, 6) + : null; + $deltaToPreviousPercent = $previousSeconds !== null && $previousSeconds > 0.0 + ? round(($deltaToPreviousSeconds / $previousSeconds) * 100, 6) + : null; + $deltaToBaselinePercent = $baselineSeconds !== null && $baselineSeconds > 0.0 + ? 
round(($deltaToBaselineSeconds / $baselineSeconds) * 100, 6) + : null; + $worseningStreak = self::worseningStreak($comparableRecords, $varianceFloorSeconds); + $varianceObservedSeconds = self::varianceObservedSeconds($comparableRecords); + $windowStatus = (string) $comparisonWindow['windowStatus']; + $noiseDetected = self::isNoisyWindow($comparableRecords, $varianceFloorSeconds); + $stablePlateau = $sampleCount >= (int) $policy['minimumComparableSamples'] + && $varianceObservedSeconds <= $varianceFloorSeconds + && ($deltaToPreviousSeconds === null || abs($deltaToPreviousSeconds) <= $varianceFloorSeconds) + && $deltaToBaselineSeconds !== null + && abs($deltaToBaselineSeconds) > $varianceFloorSeconds; + $healthClass = 'healthy'; + $recalibrationRecommendation = 'none'; + $summaryLine = 'Lane runtime is stable and comfortably inside the documented budget.'; + + if ($windowStatus !== 'stable') { + $healthClass = 'unstable'; + $recalibrationRecommendation = 'investigate'; + $summaryLine = $windowStatus === 'scope-changed' + ? 'Lane scope or workflow context changed, so older runs are not directly comparable yet.' + : 'Lane history is still building, so trend classification remains intentionally unstable.'; + } elseif ($noiseDetected) { + $healthClass = 'unstable'; + $windowStatus = 'noisy'; + $recalibrationRecommendation = 'investigate'; + $summaryLine = 'Recent samples disagree with each other, so the latest spike is treated as noise instead of a structural regression.'; + } elseif ($budgetHeadroomSeconds < 0.0 && $worseningStreak >= 2) { + $healthClass = 'regressed'; + $recalibrationRecommendation = 'review-budget'; + $summaryLine = 'Comparable runs show repeated worsening and the lane is now over budget.'; + } elseif ($worseningStreak >= 2) { + $healthClass = 'trending-worse'; + $recalibrationRecommendation = $budgetHeadroomSeconds <= $nearBudgetHeadroomSeconds ? 
'review-budget' : 'investigate'; + $summaryLine = 'Comparable runs show sustained worsening above the documented variance floor.'; + } elseif ($budgetHeadroomSeconds <= $nearBudgetHeadroomSeconds) { + $healthClass = 'budget-near'; + $recalibrationRecommendation = 'investigate'; + $summaryLine = 'Lane runtime remains under budget, but headroom is now thin enough to warrant attention.'; + } elseif ($stablePlateau) { + $recalibrationRecommendation = 'review-baseline'; + $summaryLine = 'Lane runtime has stabilized at a new level, so baseline review is reasonable if scope or infrastructure truth changed.'; + } + + return [ + 'healthClass' => $healthClass, + 'recalibrationRecommendation' => $recalibrationRecommendation, + 'budgetHeadroomSeconds' => $budgetHeadroomSeconds, + 'deltaToPreviousSeconds' => $deltaToPreviousSeconds, + 'deltaToPreviousPercent' => $deltaToPreviousPercent, + 'deltaToBaselineSeconds' => $deltaToBaselineSeconds, + 'deltaToBaselinePercent' => $deltaToBaselinePercent, + 'worseningStreak' => $worseningStreak, + 'varianceObservedSeconds' => $varianceObservedSeconds, + 'windowStatus' => $windowStatus, + 'sampleCount' => $sampleCount, + 'previousComparableRunRef' => is_array($previousComparableRecord) + ? (string) ($previousComparableRecord['runRef'] ?? '') + : null, + 'summaryLine' => $summaryLine, + ]; + } + + /** + * @param array $currentRecord + * @param array|null $previousComparableRecord + * @param array $policy + * @return array + */ + private static function buildHotspotSnapshot(array $currentRecord, ?array $previousComparableRecord, array $policy): array + { + if (! is_array($previousComparableRecord) + || ($currentRecord['familyTotals'] ?? []) === [] + || ($previousComparableRecord['familyTotals'] ?? 
[]) === []) { + return [ + 'evidenceAvailability' => 'unavailable', + 'familyDeltas' => [], + 'fileHotspots' => [], + 'newEntrants' => [], + 'droppedEntrants' => [], + ]; + } + + $familyDeltas = self::deltaBuckets( + $currentRecord['familyTotals'], + $previousComparableRecord['familyTotals'] ?? [], + (int) $policy['hotspotFamilyLimit'], + ); + $fileHotspots = self::deltaBuckets( + $currentRecord['hotspotFiles'] ?? [], + $previousComparableRecord['hotspotFiles'] ?? [], + (int) $policy['hotspotFileLimit'], + ); + $currentHotspots = array_column($fileHotspots, 'name'); + $previousHotspots = array_map( + static fn (array $bucket): string => (string) ($bucket['name'] ?? ''), + array_slice($previousComparableRecord['hotspotFiles'] ?? [], 0, (int) $policy['hotspotFileLimit']), + ); + + return [ + 'evidenceAvailability' => 'available', + 'familyDeltas' => $familyDeltas, + 'fileHotspots' => $fileHotspots, + 'newEntrants' => array_values(array_diff($currentHotspots, $previousHotspots)), + 'droppedEntrants' => array_values(array_diff($previousHotspots, $currentHotspots)), + ]; + } + + /** + * @param array $assessment + * @param array $hotspotSnapshot + * @param array $comparisonWindow + * @return list + */ + private static function buildTrendWarnings(array $assessment, array $hotspotSnapshot, array $comparisonWindow): array + { + $warnings = []; + + if (($assessment['windowStatus'] ?? 'stable') !== 'stable') { + $warnings[] = sprintf('Trend window status is %s.', (string) $assessment['windowStatus']); + } + + if (($hotspotSnapshot['evidenceAvailability'] ?? 'unavailable') !== 'available') { + $warnings[] = 'Hotspot evidence is unavailable for this cycle.'; + } + + if (($comparisonWindow['excludedRecords'] ?? 
[]) !== []) { + $warnings[] = 'One or more recent records were excluded because the comparison fingerprint changed.'; + } + + return $warnings; + } + + /** + * @param list> $existingDecisions + * @param list> $newDecisions + * @return list> + */ + private static function mergeRecalibrationDecisions(array $existingDecisions, array $newDecisions): array + { + $merged = []; + $seen = []; + + foreach (array_merge($newDecisions, $existingDecisions) as $decision) { + if (! is_array($decision)) { + continue; + } + + $signature = implode('|', [ + (string) ($decision['targetType'] ?? ''), + (string) ($decision['decisionStatus'] ?? ''), + (string) ($decision['rationaleCode'] ?? ''), + (string) ($decision['recordedIn'] ?? ''), + ]); + + if (isset($seen[$signature])) { + continue; + } + + $merged[] = $decision; + $seen[$signature] = true; + + if (count($merged) >= 6) { + break; + } + } + + return $merged; + } + + /** + * @param list> $attribution + * @return list> + */ + private static function runtimeBucketsFromAttribution(array $attribution, string $nameKey): array + { + return array_values(array_map( + static fn (array $entry): array => [ + 'name' => (string) ($entry[$nameKey] ?? 'unknown'), + 'runtimeSeconds' => round((float) ($entry['totalWallClockSeconds'] ?? 0.0), 6), + ], + $attribution, + )); + } + + /** + * @param list> $slowestEntries + * @return list> + */ + private static function hotspotFileBuckets(array $slowestEntries): array + { + $durations = []; + + foreach ($slowestEntries as $entry) { + $file = (string) ($entry['filePath'] ?? ''); + + if ($file === '') { + continue; + } + + $durations[$file] = round(($durations[$file] ?? 0.0) + (float) ($entry['wallClockSeconds'] ?? $entry['durationSeconds'] ?? 
0.0), 6); + } + + arsort($durations); + + return array_values(array_map( + static fn (string $file, float $seconds): array => [ + 'name' => $file, + 'runtimeSeconds' => $seconds, + ], + array_keys($durations), + $durations, + )); + } + + /** + * @param list> $slowestEntries + * @return list> + */ + private static function trendSlowestEntries(array $slowestEntries): array + { + return array_values(array_map( + static fn (array $entry): array => [ + 'label' => (string) ($entry['label'] ?? $entry['subject'] ?? 'unknown'), + 'runtimeSeconds' => round((float) ($entry['wallClockSeconds'] ?? $entry['durationSeconds'] ?? 0.0), 6), + 'file' => isset($entry['filePath']) ? (string) $entry['filePath'] : null, + ], + array_slice($slowestEntries, 0, 10), + )); + } + + /** + * @param list> $currentBuckets + * @param list> $previousBuckets + * @return list> + */ + private static function deltaBuckets(array $currentBuckets, array $previousBuckets, int $limit): array + { + $currentMap = []; + $previousMap = []; + + foreach ($currentBuckets as $bucket) { + $currentMap[(string) ($bucket['name'] ?? '')] = (float) ($bucket['runtimeSeconds'] ?? 0.0); + } + + foreach ($previousBuckets as $bucket) { + $previousMap[(string) ($bucket['name'] ?? '')] = (float) ($bucket['runtimeSeconds'] ?? 0.0); + } + + $names = array_values(array_filter(array_unique(array_merge(array_keys($currentMap), array_keys($previousMap))))); + $deltas = []; + + foreach ($names as $name) { + $currentSeconds = round((float) ($currentMap[$name] ?? 0.0), 6); + $previousSeconds = round((float) ($previousMap[$name] ?? 0.0), 6); + $deltaSeconds = round($currentSeconds - $previousSeconds, 6); + $deltaPercent = $previousSeconds > 0.0 + ? round(($deltaSeconds / $previousSeconds) * 100, 6) + : null; + + $deltas[] = [ + 'name' => $name, + 'currentSeconds' => $currentSeconds, + 'previousSeconds' => $previousSeconds, + 'deltaSeconds' => $deltaSeconds, + 'deltaPercent' => $deltaPercent, + 'direction' => $deltaSeconds > 0.0 ? 
'up' : ($deltaSeconds < 0.0 ? 'down' : 'flat'), + ]; + } + + usort($deltas, static fn (array $left, array $right): int => abs((float) $right['deltaSeconds']) <=> abs((float) $left['deltaSeconds'])); + + return array_slice($deltas, 0, $limit); + } + + /** + * @param list> $comparableRecords + */ + private static function worseningStreak(array $comparableRecords, float $varianceFloorSeconds): int + { + $streak = 0; + + for ($index = 0; $index < count($comparableRecords) - 1; $index++) { + $currentSeconds = (float) ($comparableRecords[$index]['wallClockSeconds'] ?? 0.0); + $previousSeconds = (float) ($comparableRecords[$index + 1]['wallClockSeconds'] ?? 0.0); + + if (($currentSeconds - $previousSeconds) <= $varianceFloorSeconds) { + break; + } + + $streak++; + } + + return $streak; + } + + /** + * @param list> $comparableRecords + */ + private static function varianceObservedSeconds(array $comparableRecords): float + { + $seconds = array_values(array_map( + static fn (array $record): float => (float) ($record['wallClockSeconds'] ?? 0.0), + $comparableRecords, + )); + + if ($seconds === []) { + return 0.0; + } + + return round(max($seconds) - min($seconds), 6); + } + + /** + * @param list> $comparableRecords + */ + private static function isNoisyWindow(array $comparableRecords, float $varianceFloorSeconds): bool + { + if (count($comparableRecords) < 3) { + return false; + } + + $directions = []; + + for ($index = 0; $index < count($comparableRecords) - 1; $index++) { + $currentSeconds = (float) ($comparableRecords[$index]['wallClockSeconds'] ?? 0.0); + $previousSeconds = (float) ($comparableRecords[$index + 1]['wallClockSeconds'] ?? 0.0); + $deltaSeconds = round($currentSeconds - $previousSeconds, 6); + + if (abs($deltaSeconds) <= $varianceFloorSeconds) { + continue; + } + + $directions[] = $deltaSeconds > 0.0 ? 
'up' : 'down'; + } + + return in_array('up', $directions, true) && in_array('down', $directions, true); + } + + /** + * @return array + */ + private static function loadTrendHistoryArtifact(string $laneId, ?string $artifactDirectory = null): array + { + $artifactPaths = self::artifactPaths($laneId, $artifactDirectory); + $trendHistoryPath = TestLaneManifest::absolutePath($artifactPaths['trendHistory']); + + if (! is_file($trendHistoryPath)) { + return []; + } + + $decoded = json_decode((string) file_get_contents($trendHistoryPath), true); + + return is_array($decoded) ? $decoded : []; + } + + private static function resolveInputPath(string $path): string + { + if (str_starts_with($path, DIRECTORY_SEPARATOR)) { + return $path; + } + + return self::repositoryRoot().DIRECTORY_SEPARATOR.ltrim($path, DIRECTORY_SEPARATOR); + } + + private static function findTrendHistoryInDirectory(string $laneId, string $bundleDirectory): ?string + { + $candidates = [ + sprintf('%s.trend-history.json', $laneId), + sprintf('%s-latest.trend-history.json', $laneId), + ]; + + foreach ($candidates as $candidate) { + $candidatePath = rtrim($bundleDirectory, DIRECTORY_SEPARATOR).DIRECTORY_SEPARATOR.$candidate; + + if (is_file($candidatePath)) { + return $candidatePath; + } + } + + $iterator = new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator($bundleDirectory)); + + foreach ($iterator as $file) { + if (! $file->isFile()) { + continue; + } + + if (! in_array($file->getFilename(), $candidates, true)) { + continue; + } + + return $file->getPathname(); + } + + return null; + } + + private static function findTrendHistoryInZip(string $laneId, ZipArchive $zip): ?string + { + $candidates = [ + sprintf('%s.trend-history.json', $laneId), + sprintf('%s-latest.trend-history.json', $laneId), + ]; + + for ($index = 0; $index < $zip->numFiles; $index++) { + $entryName = $zip->getNameIndex($index); + + if (! 
is_string($entryName)) { + continue; + } + + foreach ($candidates as $candidate) { + if (str_ends_with($entryName, $candidate)) { + return $entryName; + } + } + } + + return null; + } + + /** + * @param array{junit: string, summary: string, budget: string, report: string, profile: string, trendHistory: string} $artifactPaths * @return array */ private static function artifactFileMap(array $artifactPaths): array @@ -947,6 +1839,7 @@ private static function artifactFileMap(array $artifactPaths): array 'budget.json' => $artifactPaths['budget'], 'report.json' => $artifactPaths['report'], 'profile.txt' => $artifactPaths['profile'], + 'trend-history.json' => $artifactPaths['trendHistory'], ]; } @@ -981,4 +1874,4 @@ private static function ensureDirectory(string $directory): void mkdir($directory, 0777, true); } -} \ No newline at end of file +} diff --git a/apps/platform/tests/Support/TestLaneTrendFixtures.php b/apps/platform/tests/Support/TestLaneTrendFixtures.php new file mode 100644 index 00000000..41630e36 --- /dev/null +++ b/apps/platform/tests/Support/TestLaneTrendFixtures.php @@ -0,0 +1,96 @@ + $durationsByFile + * @return list> + */ + public static function slowestEntries(array $durationsByFile, string $laneId): array + { + $entries = array_values(array_map( + static fn (float $seconds, string $filePath): array => [ + 'label' => $filePath.'::synthetic', + 'subject' => $filePath.'::synthetic', + 'filePath' => $filePath, + 'durationSeconds' => $seconds, + 'wallClockSeconds' => $seconds, + 'laneId' => $laneId, + ], + $durationsByFile, + array_keys($durationsByFile), + )); + + usort($entries, static fn (array $left, array $right): int => $right['wallClockSeconds'] <=> $left['wallClockSeconds']); + + return $entries; + } + + /** + * @param array $durationsByFile + * @param array|null $ciContext + * @return array + */ + public static function buildReport( + string $laneId, + float $wallClockSeconds, + array $durationsByFile = [], + ?string $artifactDirectory = null, + 
?array $ciContext = null, + ?string $comparisonProfile = null, + ): array { + return TestLaneReport::buildReport( + laneId: $laneId, + wallClockSeconds: $wallClockSeconds, + slowestEntries: self::slowestEntries($durationsByFile, $laneId), + durationsByFile: $durationsByFile, + artifactDirectory: $artifactDirectory, + comparisonProfile: $comparisonProfile, + ciContext: $ciContext, + ); + } + + /** + * @param array $artifact + */ + public static function writeTrendHistory(string $laneId, array $artifact, ?string $artifactDirectory = null): string + { + $artifactPaths = TestLaneReport::artifactPaths($laneId, $artifactDirectory); + $absolutePath = TestLaneManifest::absolutePath($artifactPaths['trendHistory']); + $directory = dirname($absolutePath); + + if (! is_dir($directory)) { + mkdir($directory, 0777, true); + } + + file_put_contents($absolutePath, json_encode($artifact, JSON_PRETTY_PRINT | JSON_THROW_ON_ERROR)); + + return $absolutePath; + } + + /** + * @return array + */ + public static function readTrendHistory(string $laneId, ?string $artifactDirectory = null): array + { + $artifactPaths = TestLaneReport::artifactPaths($laneId, $artifactDirectory); + $absolutePath = TestLaneManifest::absolutePath($artifactPaths['trendHistory']); + + if (! is_file($absolutePath)) { + return []; + } + + $decoded = json_decode((string) file_get_contents($absolutePath), true); + + return is_array($decoded) ? 
$decoded : []; + } +} diff --git a/scripts/platform-test-report b/scripts/platform-test-report index 75b459dc..b5c6aec2 100755 --- a/scripts/platform-test-report +++ b/scripts/platform-test-report @@ -9,12 +9,15 @@ LANE="${1:-fast-feedback}" CAPTURE_BASELINE=false WORKFLOW_ID="" TRIGGER_CLASS="" +HISTORY_FILE="" +HISTORY_BUNDLE="" +FETCH_LATEST_HISTORY=auto copy_heavy_baseline_artifacts() { local artifact_root="${APP_DIR}/storage/logs/test-lanes" local suffix - for suffix in summary.md budget.json report.json; do + for suffix in summary.md budget.json report.json trend-history.json; do local latest_path="${artifact_root}/heavy-governance-latest.${suffix}" local baseline_path="${artifact_root}/heavy-governance-baseline.${suffix}" @@ -40,6 +43,9 @@ case "${LANE}" in profiling|profile) COMPOSER_SCRIPT="test:report:profile" ;; + junit) + COMPOSER_SCRIPT="test:report:junit" + ;; *) echo "Unknown test lane: ${LANE}" >&2 exit 1 @@ -64,9 +70,29 @@ for arg in "$@"; do continue fi + if [[ "${arg}" == --history-file=* ]]; then + HISTORY_FILE="${arg#--history-file=}" + continue + fi + + if [[ "${arg}" == --history-bundle=* ]]; then + HISTORY_BUNDLE="${arg#--history-bundle=}" + continue + fi + + if [[ "${arg}" == "--fetch-latest-history" ]]; then + FETCH_LATEST_HISTORY=true + continue + fi + + if [[ "${arg}" == "--skip-latest-history" ]]; then + FETCH_LATEST_HISTORY=false + continue + fi + echo "Unknown option: ${arg}" >&2 exit 1 - done +done if [[ "${CAPTURE_BASELINE}" == true && "${LANE}" != "heavy-governance" && "${LANE}" != "heavy" ]]; then echo "--capture-baseline is only supported for heavy-governance" >&2 @@ -81,8 +107,251 @@ if [[ -n "${TRIGGER_CLASS}" ]]; then export TENANTATLAS_CI_TRIGGER_CLASS="${TRIGGER_CLASS}" fi +trend_history_target_path() { + echo "${APP_DIR}/storage/logs/test-lanes/${LANE}-latest.trend-history.json" +} + +resolve_input_path() { + local path="${1:-}" + + if [[ -z "${path}" ]]; then + return 1 + fi + + if [[ "${path}" = /* ]]; then + echo 
"${path}" + return 0 + fi + + echo "${ROOT_DIR}/${path#./}" +} + +hydrate_trend_history_from_file() { + local source_path="${1:-}" + local target_path + + target_path="$(trend_history_target_path)" + + if [[ -z "${source_path}" || ! -f "${source_path}" ]]; then + return 1 + fi + + mkdir -p "$(dirname "${target_path}")" + cp "${source_path}" "${target_path}" + return 0 +} + +hydrate_trend_history_from_bundle() { + local bundle_path="${1:-}" + local target_path + local candidate + + target_path="$(trend_history_target_path)" + + if [[ -z "${bundle_path}" ]]; then + return 1 + fi + + if [[ -d "${bundle_path}" ]]; then + for candidate in \ + "${bundle_path}/${LANE}.trend-history.json" \ + "${bundle_path}/${LANE}-latest.trend-history.json" + do + if [[ -f "${candidate}" ]]; then + mkdir -p "$(dirname "${target_path}")" + cp "${candidate}" "${target_path}" + return 0 + fi + done + + candidate="$(find "${bundle_path}" -type f \( -name "${LANE}.trend-history.json" -o -name "${LANE}-latest.trend-history.json" \) | head -n 1)" + + if [[ -n "${candidate}" && -f "${candidate}" ]]; then + mkdir -p "$(dirname "${target_path}")" + cp "${candidate}" "${target_path}" + return 0 + fi + fi + + if [[ -f "${bundle_path}" && "${bundle_path,,}" == *.zip ]]; then + candidate="$(python3 - "${bundle_path}" "${LANE}" <<'PY' +import sys +import zipfile + +bundle_path, lane = sys.argv[1], sys.argv[2] +candidates = [f"{lane}.trend-history.json", f"{lane}-latest.trend-history.json"] + +with zipfile.ZipFile(bundle_path) as archive: + for name in archive.namelist(): + if any(name.endswith(candidate) for candidate in candidates): + print(name) + break +PY +)" + + if [[ -n "${candidate}" ]]; then + mkdir -p "$(dirname "${target_path}")" + unzip -p "${bundle_path}" "${candidate}" > "${target_path}" + return 0 + fi + fi + + return 1 +} + +parse_remote_origin() { + local origin_url + + origin_url="$(git -C "${ROOT_DIR}" config --get remote.origin.url 2>/dev/null || true)" + + if [[ -z "${origin_url}" 
]]; then + return 1 + fi + + python3 - "${origin_url}" <<'PY' +import re +import sys + +origin = sys.argv[1].strip() +patterns = [ + re.compile(r'^(https?://[^/]+)/([^/]+)/([^/]+?)(?:\.git)?$'), + re.compile(r'^git@([^:]+):([^/]+)/([^/]+?)(?:\.git)?$'), + re.compile(r'^ssh://git@([^/]+)/([^/]+)/([^/]+?)(?:\.git)?$'), +] + +for pattern in patterns: + match = pattern.match(origin) + if not match: + continue + + groups = match.groups() + + if origin.startswith("http://") or origin.startswith("https://"): + host, owner, repo = groups + else: + host, owner, repo = groups + host = f"https://{host}" + + print(host) + print(owner) + print(repo) + sys.exit(0) + +sys.exit(1) +PY +} + +download_latest_history_bundle() { + local token + local artifact_name + local remote_parts + local host + local owner + local repo + local listing_path + local artifact_id + local bundle_dir + local bundle_path + + token="${TENANTATLAS_GITEA_TOKEN:-${GITEA_TOKEN:-}}" + + if [[ -z "${token}" ]]; then + return 1 + fi + + mapfile -t remote_parts < <(parse_remote_origin) || return 1 + + if [[ "${#remote_parts[@]}" -ne 3 ]]; then + return 1 + fi + + host="${remote_parts[0]}" + owner="${remote_parts[1]}" + repo="${remote_parts[2]}" + artifact_name="${LANE}-artifacts" + listing_path="${ROOT_DIR}/.gitea-artifacts/_history/${LANE}-artifacts.json" + + mkdir -p "${ROOT_DIR}/.gitea-artifacts/_history" + + curl --fail --silent --show-error \ + -H "Authorization: token ${token}" \ + -H "Accept: application/json" \ + "${host}/api/v1/repos/${owner}/${repo}/actions/artifacts?name=${artifact_name}" \ + -o "${listing_path}" || return 1 + + artifact_id="$(php -r ' + $data = json_decode((string) file_get_contents($argv[1]), true); + $currentRunId = getenv("GITEA_RUN_ID") ?: getenv("GITHUB_RUN_ID") ?: ""; + foreach (($data["artifacts"] ?? []) as $artifact) { + if (($artifact["expired"] ?? false) === true) { + continue; + } + + $artifactRunId = (string) ($artifact["workflow_run"]["id"] ?? 
""); + + if ($currentRunId !== "" && $artifactRunId === (string) $currentRunId) { + continue; + } + + echo (string) ($artifact["id"] ?? ""); + break; + } + ' "${listing_path}")" + + if [[ -z "${artifact_id}" ]]; then + return 1 + fi + + bundle_dir="${ROOT_DIR}/.gitea-artifacts/_history/${LANE}" + bundle_path="${bundle_dir}/artifact.zip" + rm -rf "${bundle_dir}" + mkdir -p "${bundle_dir}" + + curl --fail --silent --show-error --location \ + -H "Authorization: token ${token}" \ + "${host}/api/v1/repos/${owner}/${repo}/actions/artifacts/${artifact_id}/zip" \ + -o "${bundle_path}" || return 1 + + echo "${bundle_path}" + return 0 +} + +hydrate_trend_history() { + local resolved_history_file="" + local resolved_history_bundle="" + local downloaded_bundle="" + + if [[ -n "${HISTORY_FILE}" ]]; then + resolved_history_file="$(resolve_input_path "${HISTORY_FILE}")" + fi + + if [[ -n "${HISTORY_BUNDLE}" ]]; then + resolved_history_bundle="$(resolve_input_path "${HISTORY_BUNDLE}")" + fi + + if [[ -n "${resolved_history_file}" ]] && hydrate_trend_history_from_file "${resolved_history_file}"; then + return 0 + fi + + if [[ -n "${resolved_history_bundle}" ]] && hydrate_trend_history_from_bundle "${resolved_history_bundle}"; then + return 0 + fi + + if [[ "${FETCH_LATEST_HISTORY}" == true || ( "${FETCH_LATEST_HISTORY}" == auto && -n "${WORKFLOW_ID}" ) ]]; then + downloaded_bundle="$(download_latest_history_bundle || true)" + + if [[ -n "${downloaded_bundle}" ]] && hydrate_trend_history_from_bundle "${downloaded_bundle}"; then + return 0 + fi + fi + + return 0 +} + cd "${APP_DIR}" +hydrate_trend_history + ./vendor/bin/sail composer run --timeout=0 "${COMPOSER_SCRIPT}" if [[ "${CAPTURE_BASELINE}" == true ]]; then diff --git a/specs/211-runtime-trend-recalibration/checklists/requirements.md b/specs/211-runtime-trend-recalibration/checklists/requirements.md new file mode 100644 index 00000000..39cda073 --- /dev/null +++ 
b/specs/211-runtime-trend-recalibration/checklists/requirements.md @@ -0,0 +1,39 @@ +# Specification Quality Checklist: Test Runtime Trend Reporting & Baseline Recalibration + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-04-17 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- Validation run: 2026-04-17 +- No template placeholders or [NEEDS CLARIFICATION] markers remain. +- The spec stays repository-governance-focused: it defines trend visibility, drift semantics, recalibration policy, and contributor behavior without prescribing language-, framework-, or API-level implementation. +- Repository-specific nouns such as lane, baseline, budget, hotspot, and summary are treated as domain requirements for the test-governance contract rather than low-level implementation detail. +- The scope remains intentionally narrow: it extends the governed lane system from Specs 206 through 210 with historical observability instead of inventing a broader analytics platform. 
+- Items marked incomplete require spec updates before `/speckit.clarify` or `/speckit.plan`. \ No newline at end of file diff --git a/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json b/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json new file mode 100644 index 00000000..861784fc --- /dev/null +++ b/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json @@ -0,0 +1,540 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://tenantatlas.local/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json", + "title": "LaneTrendHistoryArtifact", + "type": "object", + "additionalProperties": false, + "required": [ + "schemaVersion", + "laneId", + "workflowProfile", + "generatedAt", + "policy", + "history", + "currentAssessment" + ], + "properties": { + "schemaVersion": { + "type": "string", + "const": "1.0.0" + }, + "laneId": { + "type": "string", + "enum": [ + "fast-feedback", + "confidence", + "heavy-governance", + "browser", + "junit", + "profiling" + ] + }, + "workflowProfile": { + "type": "string" + }, + "generatedAt": { + "type": "string", + "format": "date-time" + }, + "policy": { + "$ref": "#/$defs/policy" + }, + "history": { + "type": "array", + "minItems": 1, + "items": { + "$ref": "#/$defs/historyRecord" + } + }, + "currentAssessment": { + "$ref": "#/$defs/assessment" + }, + "hotspotSnapshot": { + "$ref": "#/$defs/hotspotSnapshot" + }, + "recalibrationDecisions": { + "type": "array", + "items": { + "$ref": "#/$defs/recalibrationDecision" + }, + "default": [] + }, + "warnings": { + "type": "array", + "items": { + "type": "string" + }, + "default": [] + } + }, + "$defs": { + "policy": { + "type": "object", + "additionalProperties": false, + "required": [ + "retentionLimit", + "comparisonWindowSize", + "minimumComparableSamples", + "varianceFloorSeconds", + "nearBudgetHeadroomSeconds", + 
"hotspotFamilyLimit", + "hotspotFileLimit", + "slowestEntryRetention" + ], + "properties": { + "retentionLimit": { + "type": "integer", + "minimum": 1 + }, + "comparisonWindowSize": { + "type": "integer", + "minimum": 1 + }, + "minimumComparableSamples": { + "type": "integer", + "minimum": 3 + }, + "varianceFloorSeconds": { + "type": "integer", + "minimum": 0 + }, + "nearBudgetHeadroomSeconds": { + "type": "integer", + "minimum": 0 + }, + "hotspotFamilyLimit": { + "type": "integer", + "minimum": 1 + }, + "hotspotFileLimit": { + "type": "integer", + "minimum": 1 + }, + "slowestEntryRetention": { + "type": "integer", + "minimum": 1 + }, + "recalibrationPolicy": { + "type": "object", + "additionalProperties": false, + "properties": { + "baselineRequiresExplicitReview": { + "type": "boolean" + }, + "budgetRequiresExplicitReview": { + "type": "boolean" + }, + "minimumBudgetEvidenceSamples": { + "type": "integer", + "minimum": 1 + } + } + } + } + }, + "historyRecord": { + "type": "object", + "additionalProperties": false, + "required": [ + "runRef", + "laneId", + "workflowId", + "triggerClass", + "generatedAt", + "wallClockSeconds", + "budgetSeconds", + "budgetStatus", + "blockingStatus", + "comparisonFingerprint" + ], + "properties": { + "runRef": { + "type": "string" + }, + "laneId": { + "type": "string" + }, + "workflowId": { + "type": "string" + }, + "triggerClass": { + "type": "string", + "enum": [ + "pull-request", + "mainline-push", + "manual", + "scheduled", + "local" + ] + }, + "generatedAt": { + "type": "string", + "format": "date-time" + }, + "wallClockSeconds": { + "type": "number", + "minimum": 0 + }, + "baselineSeconds": { + "type": [ + "number", + "null" + ], + "minimum": 0 + }, + "baselineSource": { + "type": [ + "string", + "null" + ] + }, + "budgetSeconds": { + "type": "number", + "minimum": 0 + }, + "budgetStatus": { + "type": "string" + }, + "blockingStatus": { + "type": "string" + }, + "comparisonFingerprint": { + "type": "string" + }, + 
"classificationTotals": { + "type": "array", + "items": { + "$ref": "#/$defs/runtimeBucket" + }, + "default": [] + }, + "familyTotals": { + "type": "array", + "items": { + "$ref": "#/$defs/runtimeBucket" + }, + "default": [] + }, + "hotspotFiles": { + "type": "array", + "items": { + "$ref": "#/$defs/runtimeBucket" + }, + "default": [] + }, + "slowestEntries": { + "type": "array", + "items": { + "$ref": "#/$defs/slowestEntry" + }, + "default": [] + }, + "artifactRefs": { + "type": "object", + "additionalProperties": false, + "properties": { + "summary": { + "type": "string" + }, + "report": { + "type": "string" + }, + "budget": { + "type": "string" + }, + "junit": { + "type": "string" + }, + "trendHistory": { + "type": "string" + } + } + } + } + }, + "assessment": { + "type": "object", + "additionalProperties": false, + "required": [ + "healthClass", + "recalibrationRecommendation", + "budgetHeadroomSeconds", + "summaryLine", + "windowStatus", + "sampleCount" + ], + "properties": { + "healthClass": { + "type": "string", + "enum": [ + "healthy", + "budget-near", + "trending-worse", + "regressed", + "unstable" + ] + }, + "recalibrationRecommendation": { + "type": "string", + "enum": [ + "none", + "investigate", + "review-baseline", + "review-budget" + ] + }, + "budgetHeadroomSeconds": { + "type": "number" + }, + "deltaToPreviousSeconds": { + "type": [ + "number", + "null" + ] + }, + "deltaToPreviousPercent": { + "type": [ + "number", + "null" + ] + }, + "deltaToBaselineSeconds": { + "type": [ + "number", + "null" + ] + }, + "deltaToBaselinePercent": { + "type": [ + "number", + "null" + ] + }, + "worseningStreak": { + "type": "integer", + "minimum": 0 + }, + "varianceObservedSeconds": { + "type": "number", + "minimum": 0 + }, + "windowStatus": { + "type": "string", + "enum": [ + "stable", + "insufficient-history", + "scope-changed", + "noisy" + ] + }, + "sampleCount": { + "type": "integer", + "minimum": 0 + }, + "previousComparableRunRef": { + "type": [ + "string", + 
"null" + ] + }, + "summaryLine": { + "type": "string" + } + } + }, + "hotspotSnapshot": { + "type": "object", + "additionalProperties": false, + "required": [ + "evidenceAvailability", + "familyDeltas", + "fileHotspots" + ], + "properties": { + "evidenceAvailability": { + "type": "string", + "enum": [ + "available", + "unavailable" + ] + }, + "familyDeltas": { + "type": "array", + "items": { + "$ref": "#/$defs/deltaBucket" + } + }, + "fileHotspots": { + "type": "array", + "items": { + "$ref": "#/$defs/deltaBucket" + } + }, + "newEntrants": { + "type": "array", + "items": { + "type": "string" + }, + "default": [] + }, + "droppedEntrants": { + "type": "array", + "items": { + "type": "string" + }, + "default": [] + } + } + }, + "recalibrationDecision": { + "type": "object", + "additionalProperties": false, + "required": [ + "targetType", + "decisionStatus", + "evidenceRunRefs", + "previousValueSeconds", + "rationaleCode", + "recordedIn", + "notes" + ], + "properties": { + "targetType": { + "type": "string", + "enum": [ + "baseline", + "budget" + ] + }, + "decisionStatus": { + "type": "string", + "enum": [ + "candidate", + "approved", + "rejected" + ] + }, + "evidenceRunRefs": { + "type": "array", + "minItems": 1, + "items": { + "type": "string" + } + }, + "previousValueSeconds": { + "type": "number", + "minimum": 0 + }, + "proposedValueSeconds": { + "type": [ + "number", + "null" + ], + "minimum": 0 + }, + "rationaleCode": { + "type": "string", + "enum": [ + "lane-scope-change", + "infrastructure-shift", + "post-improvement-reset", + "sustained-erosion", + "noise-rejected", + "manual-hold" + ] + }, + "recordedIn": { + "type": "string", + "description": "Active spec path or implementation PR reference for the approved or rejected recalibration decision." 
+ }, + "notes": { + "type": "string" + } + } + }, + "runtimeBucket": { + "type": "object", + "additionalProperties": false, + "required": [ + "name", + "runtimeSeconds" + ], + "properties": { + "name": { + "type": "string" + }, + "runtimeSeconds": { + "type": "number", + "minimum": 0 + } + } + }, + "slowestEntry": { + "type": "object", + "additionalProperties": false, + "required": [ + "label", + "runtimeSeconds" + ], + "properties": { + "label": { + "type": "string" + }, + "runtimeSeconds": { + "type": "number", + "minimum": 0 + }, + "file": { + "type": [ + "string", + "null" + ] + } + } + }, + "deltaBucket": { + "type": "object", + "additionalProperties": false, + "required": [ + "name", + "currentSeconds", + "previousSeconds", + "deltaSeconds" + ], + "properties": { + "name": { + "type": "string" + }, + "currentSeconds": { + "type": "number", + "minimum": 0 + }, + "previousSeconds": { + "type": "number", + "minimum": 0 + }, + "deltaSeconds": { + "type": "number" + }, + "deltaPercent": { + "type": [ + "number", + "null" + ] + }, + "direction": { + "type": [ + "string", + "null" + ], + "enum": [ + "up", + "down", + "flat", + null + ] + } + } + } + } +} \ No newline at end of file diff --git a/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend.logical.openapi.yaml b/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend.logical.openapi.yaml new file mode 100644 index 00000000..bb3db67b --- /dev/null +++ b/specs/211-runtime-trend-recalibration/contracts/test-runtime-trend.logical.openapi.yaml @@ -0,0 +1,641 @@ +openapi: 3.1.0 +info: + title: Test Runtime Trend Reporting & Baseline Recalibration + version: 1.0.0 + description: | + Logical contract for the repository-owned workflow that updates bounded lane + history, evaluates drift status, emits hotspot deltas, and records explicit + recalibration evidence. This file documents wrapper/support-class semantics, + not a public HTTP API. 
+servers: + - url: https://tenantatlas.local/logical +paths: + /test-governance/lanes/{laneId}/trend-history: + post: + summary: Update one lane's bounded trend history artifact + operationId: updateLaneTrendHistory + parameters: + - $ref: '#/components/parameters/LaneId' + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/TrendHistoryUpdateRequest' + responses: + '200': + description: Updated bounded history artifact for the lane + content: + application/json: + schema: + $ref: '#/components/schemas/LaneTrendHistoryArtifact' + /test-governance/lanes/{laneId}/trend-assessment: + post: + summary: Evaluate drift status and hotspot deltas for one lane + operationId: evaluateLaneTrendAssessment + parameters: + - $ref: '#/components/parameters/LaneId' + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/LaneTrendAssessmentRequest' + responses: + '200': + description: Current lane assessment including health class and hotspot snapshot + content: + application/json: + schema: + $ref: '#/components/schemas/LaneTrendAssessmentResponse' + /test-governance/lanes/{laneId}/recalibration: + post: + summary: Evaluate or record an explicit recalibration decision for one lane + operationId: evaluateLaneRecalibration + parameters: + - $ref: '#/components/parameters/LaneId' + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/RecalibrationEvaluationRequest' + responses: + '200': + description: Structured recalibration decision or candidate record + content: + application/json: + schema: + $ref: '#/components/schemas/RecalibrationDecision' + /test-governance/cycles/{cycleId}/summary: + get: + summary: Read the trend-aware summary for one reporting cycle + operationId: getTrendSummaryCycle + parameters: + - name: cycleId + in: path + required: true + schema: + type: string + responses: + '200': + description: Trend-aware cycle summary 
spanning the relevant lanes + content: + application/json: + schema: + $ref: '#/components/schemas/TrendSummaryCycle' +components: + parameters: + LaneId: + name: laneId + in: path + required: true + schema: + type: string + enum: + - fast-feedback + - confidence + - heavy-governance + - browser + - junit + - profiling + schemas: + TrendHistoryUpdateRequest: + type: object + additionalProperties: false + required: + - currentRecord + properties: + currentRecord: + $ref: '#/components/schemas/TrendRecord' + priorHistory: + type: array + items: + $ref: '#/components/schemas/TrendRecord' + description: | + Previously retained history records, typically hydrated from the + most recent comparable uploaded artifact bundle. + policyOverride: + $ref: '#/components/schemas/TrendPolicy' + LaneTrendAssessmentRequest: + type: object + additionalProperties: false + required: + - policy + - history + properties: + policy: + $ref: '#/components/schemas/TrendPolicy' + history: + type: array + items: + $ref: '#/components/schemas/TrendRecord' + includeHotspots: + type: boolean + default: true + LaneTrendAssessmentResponse: + type: object + additionalProperties: false + required: + - assessment + properties: + assessment: + $ref: '#/components/schemas/DriftAssessment' + hotspotSnapshot: + $ref: '#/components/schemas/HotspotSnapshot' + warnings: + type: array + items: + type: string + RecalibrationEvaluationRequest: + type: object + additionalProperties: false + required: + - targetType + - assessment + - evidenceRunRefs + properties: + targetType: + type: string + enum: + - baseline + - budget + assessment: + $ref: '#/components/schemas/DriftAssessment' + evidenceRunRefs: + type: array + minItems: 1 + items: + type: string + proposedValueSeconds: + type: + - number + - 'null' + rationaleCode: + type: + - string + - 'null' + enum: + - lane-scope-change + - infrastructure-shift + - post-improvement-reset + - sustained-erosion + - noise-rejected + - manual-hold + - null + 
recordLocation: + type: + - string + - 'null' + description: Active spec path or implementation PR reference for the human-reviewed decision record. + LaneTrendHistoryArtifact: + type: object + additionalProperties: false + required: + - schemaVersion + - laneId + - workflowProfile + - generatedAt + - policy + - history + - currentAssessment + properties: + schemaVersion: + type: string + laneId: + type: string + workflowProfile: + type: string + generatedAt: + type: string + format: date-time + policy: + $ref: '#/components/schemas/TrendPolicy' + history: + type: array + items: + $ref: '#/components/schemas/TrendRecord' + currentAssessment: + $ref: '#/components/schemas/DriftAssessment' + hotspotSnapshot: + $ref: '#/components/schemas/HotspotSnapshot' + recalibrationDecisions: + type: array + items: + $ref: '#/components/schemas/RecalibrationDecision' + warnings: + type: array + items: + type: string + TrendPolicy: + type: object + additionalProperties: false + required: + - retentionLimit + - comparisonWindowSize + - minimumComparableSamples + - varianceFloorSeconds + - nearBudgetHeadroomSeconds + - hotspotFamilyLimit + - hotspotFileLimit + - slowestEntryRetention + properties: + retentionLimit: + type: integer + minimum: 1 + comparisonWindowSize: + type: integer + minimum: 1 + minimumComparableSamples: + type: integer + minimum: 3 + varianceFloorSeconds: + type: integer + minimum: 0 + nearBudgetHeadroomSeconds: + type: integer + minimum: 0 + hotspotFamilyLimit: + type: integer + minimum: 1 + hotspotFileLimit: + type: integer + minimum: 1 + slowestEntryRetention: + type: integer + minimum: 1 + recalibrationPolicy: + type: object + additionalProperties: false + properties: + baselineRequiresExplicitReview: + type: boolean + budgetRequiresExplicitReview: + type: boolean + minimumBudgetEvidenceSamples: + type: integer + minimum: 1 + TrendRecord: + type: object + additionalProperties: false + required: + - runRef + - laneId + - workflowId + - triggerClass + - 
generatedAt + - wallClockSeconds + - budgetSeconds + - budgetStatus + - blockingStatus + - comparisonFingerprint + properties: + runRef: + type: string + laneId: + type: string + workflowId: + type: string + triggerClass: + type: string + enum: + - pull-request + - mainline-push + - manual + - scheduled + - local + generatedAt: + type: string + format: date-time + wallClockSeconds: + type: number + minimum: 0 + baselineSeconds: + type: + - number + - 'null' + baselineSource: + type: + - string + - 'null' + budgetSeconds: + type: number + minimum: 0 + budgetStatus: + type: string + blockingStatus: + type: string + comparisonFingerprint: + type: string + classificationTotals: + type: array + items: + $ref: '#/components/schemas/RuntimeBucket' + familyTotals: + type: array + items: + $ref: '#/components/schemas/RuntimeBucket' + hotspotFiles: + type: array + items: + $ref: '#/components/schemas/RuntimeBucket' + slowestEntries: + type: array + items: + $ref: '#/components/schemas/SlowestEntry' + artifactRefs: + type: object + additionalProperties: false + properties: + summary: + type: string + report: + type: string + budget: + type: string + junit: + type: string + trendHistory: + type: string + DriftAssessment: + type: object + additionalProperties: false + required: + - healthClass + - recalibrationRecommendation + - budgetHeadroomSeconds + - summaryLine + - windowStatus + - sampleCount + properties: + healthClass: + type: string + enum: + - healthy + - budget-near + - trending-worse + - regressed + - unstable + recalibrationRecommendation: + type: string + enum: + - none + - investigate + - review-baseline + - review-budget + budgetHeadroomSeconds: + type: number + deltaToPreviousSeconds: + type: + - number + - 'null' + deltaToPreviousPercent: + type: + - number + - 'null' + deltaToBaselineSeconds: + type: + - number + - 'null' + deltaToBaselinePercent: + type: + - number + - 'null' + worseningStreak: + type: integer + minimum: 0 + varianceObservedSeconds: + type: 
number + minimum: 0 + windowStatus: + type: string + enum: + - stable + - insufficient-history + - scope-changed + - noisy + sampleCount: + type: integer + minimum: 0 + previousComparableRunRef: + type: + - string + - 'null' + summaryLine: + type: string + HotspotSnapshot: + type: object + additionalProperties: false + required: + - evidenceAvailability + - familyDeltas + - fileHotspots + properties: + evidenceAvailability: + type: string + enum: + - available + - unavailable + familyDeltas: + type: array + items: + $ref: '#/components/schemas/DeltaBucket' + fileHotspots: + type: array + items: + $ref: '#/components/schemas/DeltaBucket' + newEntrants: + type: array + items: + type: string + droppedEntrants: + type: array + items: + type: string + DeltaBucket: + type: object + additionalProperties: false + required: + - name + - currentSeconds + - previousSeconds + - deltaSeconds + properties: + name: + type: string + currentSeconds: + type: number + minimum: 0 + previousSeconds: + type: number + minimum: 0 + deltaSeconds: + type: number + deltaPercent: + type: + - number + - 'null' + direction: + type: + - string + - 'null' + enum: + - up + - down + - flat + - null + RuntimeBucket: + type: object + additionalProperties: false + required: + - name + - runtimeSeconds + properties: + name: + type: string + runtimeSeconds: + type: number + minimum: 0 + SlowestEntry: + type: object + additionalProperties: false + required: + - label + - runtimeSeconds + properties: + label: + type: string + runtimeSeconds: + type: number + minimum: 0 + file: + type: + - string + - 'null' + RecalibrationDecision: + type: object + additionalProperties: false + required: + - targetType + - decisionStatus + - evidenceRunRefs + - previousValueSeconds + - rationaleCode + - recordedIn + - notes + properties: + targetType: + type: string + enum: + - baseline + - budget + decisionStatus: + type: string + enum: + - candidate + - approved + - rejected + evidenceRunRefs: + type: array + minItems: 1 
+ items: + type: string + previousValueSeconds: + type: number + minimum: 0 + proposedValueSeconds: + type: + - number + - 'null' + rationaleCode: + type: string + enum: + - lane-scope-change + - infrastructure-shift + - post-improvement-reset + - sustained-erosion + - noise-rejected + - manual-hold + recordedIn: + type: string + description: Active spec path or implementation PR reference for the approved or rejected decision. + notes: + type: string + TrendSummaryCycle: + type: object + additionalProperties: false + required: + - cycleId + - generatedAt + - laneSummaries + - laneAssessments + properties: + cycleId: + type: string + generatedAt: + type: string + format: date-time + laneSummaries: + type: array + items: + $ref: '#/components/schemas/CycleLaneSummary' + laneAssessments: + type: array + items: + $ref: '#/components/schemas/DriftAssessment' + hotspotSnapshots: + type: array + items: + $ref: '#/components/schemas/HotspotSnapshot' + recalibrationDecisions: + type: array + items: + $ref: '#/components/schemas/RecalibrationDecision' + artifactPublicationStatus: + type: array + items: + type: string + warnings: + type: array + items: + type: string + CycleLaneSummary: + type: object + additionalProperties: false + required: + - laneId + - currentRuntimeSeconds + - budgetSeconds + - assessment + properties: + laneId: + type: string + enum: + - fast-feedback + - confidence + - heavy-governance + - browser + - junit + - profiling + currentRuntimeSeconds: + type: number + minimum: 0 + previousComparableSeconds: + type: + - number + - 'null' + baselineSeconds: + type: + - number + - 'null' + budgetSeconds: + type: number + minimum: 0 + assessment: + $ref: '#/components/schemas/DriftAssessment' + hotspotSnapshot: + $ref: '#/components/schemas/HotspotSnapshot' + warnings: + type: array + items: + type: string \ No newline at end of file diff --git a/specs/211-runtime-trend-recalibration/data-model.md b/specs/211-runtime-trend-recalibration/data-model.md new file 
mode 100644 index 00000000..c15e73c9 --- /dev/null +++ b/specs/211-runtime-trend-recalibration/data-model.md @@ -0,0 +1,192 @@ +# Data Model: Test Runtime Trend Reporting & Baseline Recalibration + +This feature adds repository-owned governance artifacts only. It does not add product database tables. All objects below are implemented as manifest metadata, generated JSON payloads, markdown summaries, or guard-test fixtures derived from the existing lane report outputs. + +## 1. LaneTrendPolicy + +**Purpose**: Defines the lane-specific rules for bounded history retention, comparable-window evaluation, hotspot visibility, and recalibration guidance. + +| Field | Type | Description | +|-------|------|-------------| +| `laneId` | string | Canonical lane identifier (`fast-feedback`, `confidence`, `heavy-governance`, `browser`, `junit`, `profiling`). | +| `workflowProfile` | string | Workflow profile that owns the lane history source in CI. | +| `retentionLimit` | integer | Max history records retained for the lane. | +| `comparisonWindowSize` | integer | Number of recent comparable records used for drift evaluation. | +| `minimumComparableSamples` | integer | Required sample count before a stable non-`unstable` health class is allowed. | +| `varianceFloorSeconds` | integer | Minimum meaningful delta for the lane, aligned with current enforcement tolerance. | +| `nearBudgetHeadroomSeconds` | integer | Headroom threshold for `budget-near`. | +| `hotspotFamilyLimit` | integer | Max family deltas shown in readable summaries. | +| `hotspotFileLimit` | integer | Max file hotspots shown in readable summaries. | +| `slowestEntryRetention` | integer | Max slowest test entries retained in JSON evidence. | +| `recalibrationPolicy` | object | Rule summary for acceptable baseline and budget recalibration triggers (`baselineRequiresExplicitReview`, `budgetRequiresExplicitReview`, `minimumBudgetEvidenceSamples`). | + +**Relationships** + +- One `LaneTrendPolicy` governs many `LaneTrendRecord` entries for the same lane.
+- One `LaneTrendPolicy` informs one `TrendComparisonWindow`, one `LaneDriftAssessment`, and zero or more `RecalibrationDecisionRecord` entries per reporting cycle. + +**Validation Rules** + +- `retentionLimit` must be greater than or equal to `comparisonWindowSize`. +- `minimumComparableSamples` must be at least 3. +- `varianceFloorSeconds` must align with or exceed the lane's existing enforcement tolerance. +- Primary lanes use a larger retention window than support lanes. + +## 2. LaneTrendRecord + +**Purpose**: Captures the per-run evidence snapshot that can safely be compared over time. + +| Field | Type | Description | +|-------|------|-------------| +| `runRef` | string | Stable run reference from CI or local execution. | +| `laneId` | string | Governed lane identifier. | +| `workflowId` | string | Workflow profile or logical workflow owner for the run. | +| `triggerClass` | string | Pull request, mainline push, manual, scheduled, or local classification. | +| `generatedAt` | datetime | When the record was emitted. | +| `wallClockSeconds` | number | Current lane runtime in seconds. | +| `baselineSeconds` | number or null | Current comparison baseline for the lane if defined. | +| `baselineSource` | string or null | Manifest source or comparison source that supplied the baseline; null when no baseline is defined. | +| `budgetSeconds` | number | Current lane budget threshold in seconds. | +| `budgetStatus` | string | Current lane budget status from the existing budget evaluator. | +| `blockingStatus` | string | Whether the current CI context blocks on this outcome. | +| `comparisonFingerprint` | string | Hash or structured fingerprint capturing comparability boundaries. | +| `classificationTotals` | array | Runtime grouped by current classification totals. | +| `familyTotals` | array | Runtime grouped by current family totals. | +| `hotspotFiles` | array | Current dominant hotspot files. | +| `slowestEntries` | array | Current slowest test entries, capped by policy. 
| +| `artifactRefs` | object | Keyed references (`summary`, `report`, `budget`, `junit`, `trendHistory`) to the summary, report, budget, JUnit, and history artifacts backing the record. | + +**Validation Rules** + +- A record must derive from the same lane's current `summary.md`, `report.json`, `budget.json`, and available JUnit output. +- `comparisonFingerprint` must be present for any record eligible for comparison. +- `wallClockSeconds`, `budgetSeconds`, and `generatedAt` are required. +- `slowestEntries` must not exceed the lane policy retention cap. + +## 3. TrendComparisonWindow + +**Purpose**: Represents the bounded comparable history used to evaluate one lane in one reporting cycle. + +| Field | Type | Description | +|-------|------|-------------| +| `laneId` | string | Governed lane identifier. | +| `policyRef` | string | Reference to the governing `LaneTrendPolicy`. | +| `currentRecord` | object | The latest `LaneTrendRecord`. | +| `previousComparableRecord` | object or null | The most recent prior comparable record, if one exists. | +| `comparableRecords` | array | Ordered comparable records used for trend evaluation. | +| `excludedRecords` | array | Recent records skipped because of fingerprint mismatch or invalid evidence. | +| `windowStatus` | enum | `stable`, `insufficient-history`, `scope-changed`, or `noisy`. | +| `sampleCount` | integer | Number of comparable records in the active window. | + +**Validation Rules** + +- Every comparable record must share the same `comparisonFingerprint`. +- `sampleCount` may not exceed `comparisonWindowSize`. +- `previousComparableRecord` must be the immediately preceding entry in `comparableRecords` when present. +- `windowStatus` becomes `insufficient-history` whenever `sampleCount` is below `minimumComparableSamples`. + +## 4. LaneDriftAssessment + +**Purpose**: Summarizes the current drift verdict for one lane using the bounded comparison window. + +| Field | Type | Description | +|-------|------|-------------| +| `laneId` | string | Governed lane identifier. 
| +| `healthClass` | enum | `healthy`, `budget-near`, `trending-worse`, `regressed`, or `unstable`. | +| `deltaToPreviousSeconds` | number or null | Current runtime delta vs previous comparable run. | +| `deltaToPreviousPercent` | number or null | Percent delta vs previous comparable run. | +| `deltaToBaselineSeconds` | number or null | Current runtime delta vs lane baseline. | +| `deltaToBaselinePercent` | number or null | Percent delta vs lane baseline. | +| `budgetHeadroomSeconds` | number | Remaining headroom before budget breach. | +| `worseningStreak` | integer | Count of recent comparable records showing meaningful worsening. | +| `varianceObservedSeconds` | number | Effective variance observed across the active window. | +| `recalibrationRecommendation` | enum | `none`, `investigate`, `review-baseline`, or `review-budget`. | +| `summaryLine` | string | Human-readable explanation emitted into markdown summaries. | + +**Validation Rules** + +- `healthClass` may only be non-`unstable` when the comparison window has at least `minimumComparableSamples` comparable records. +- `recalibrationRecommendation` must remain separate from `healthClass`. +- `budgetHeadroomSeconds` may be negative only when the lane is over budget. + +## 5. HotspotTrendSnapshot + +**Purpose**: Captures how the dominant runtime contributors changed between the current and previous comparable run. + +| Field | Type | Description | +|-------|------|-------------| +| `laneId` | string | Governed lane identifier. | +| `familyDeltas` | array | Top family-level deltas with current seconds, previous seconds, and delta values. | +| `fileHotspots` | array | Top file hotspots with current/previous runtime and rank movement. | +| `newEntrants` | array | Families or files newly entering the visible hotspot set. | +| `droppedEntrants` | array | Families or files leaving the visible hotspot set. 
| +| `evidenceAvailability` | enum | `available` or `unavailable`, used when JUnit or attribution evidence is missing. | + +**Validation Rules** + +- Human-readable summaries must cap output at the policy's family/file limits. +- JSON evidence may retain more detail, but must not exceed `slowestEntryRetention`. +- If hotspot evidence is unavailable, the summary must say so explicitly. + +## 6. RecalibrationDecisionRecord + +**Purpose**: Records structured evidence for a proposed, approved, or rejected baseline/budget recalibration. + +| Field | Type | Description | +|-------|------|-------------| +| `laneId` | string | Governed lane identifier. | +| `targetType` | enum | `baseline` or `budget`. | +| `decisionStatus` | enum | `candidate`, `approved`, or `rejected`. | +| `evidenceRunRefs` | array | Comparable runs supporting the decision. | +| `previousValueSeconds` | number | Existing baseline or budget value. | +| `proposedValueSeconds` | number or null | Proposed replacement value. | +| `rationaleCode` | enum | `lane-scope-change`, `infrastructure-shift`, `post-improvement-reset`, `sustained-erosion`, `noise-rejected`, or `manual-hold`. | +| `recordedIn` | string | Active spec path or implementation PR reference where the decision is documented. | +| `notes` | string | Concise reviewer-facing explanation. | + +**Validation Rules** + +- Approved baseline changes require at least one accepted rationale tied to scope or environment truth. +- Approved budget changes require a stronger evidence window than approved baseline changes. +- Rejected decisions must retain the rejection reason. +- The artifact may propose candidates, but approval remains human-controlled. + +## 7. TrendSummaryCycle + +**Purpose**: Represents one generated trend-aware reporting cycle across the relevant lanes. 
+ +| Field | Type | Description | +|-------|------|-------------| +| `cycleId` | string | Reporting-cycle identifier, typically anchored to the current lane run or summary generation timestamp. | +| `generatedAt` | datetime | When the cycle summary was emitted. | +| `laneSummaries` | array | Per-lane summary entries containing `laneId`, current runtime, previous comparable runtime, baseline, budget, and the embedded drift assessment used by the readable summary surface. | +| `laneAssessments` | array | `LaneDriftAssessment` items for all relevant lanes. | +| `hotspotSnapshots` | array | `HotspotTrendSnapshot` items for lanes with available evidence. | +| `recalibrationDecisions` | array | Candidate, approved, or rejected recalibration records emitted for the cycle. | +| `artifactPublicationStatus` | array | Whether required current-run and history artifacts were published successfully. | +| `warnings` | array | Legibility notes such as missing comparable history or unavailable hotspot evidence. | + +**Validation Rules** + +- Every relevant primary lane must have exactly one `laneSummaries` entry and exactly one `LaneDriftAssessment` per cycle. +- Each `laneSummaries` entry must expose the current runtime, previous comparable runtime, baseline, budget, and embedded health assessment needed by the readable summary surface. +- `warnings` must be explicit when any required evidence is unavailable. +- The cycle summary must stay readable without requiring a second dashboard surface. + +## State Transitions + +### LaneDriftAssessment.healthClass + +- `unstable` -> `healthy`: allowed once there are enough comparable samples and the lane is comfortably below budget without sustained worsening. +- `unstable` -> `budget-near`: allowed once there are enough comparable samples and budget headroom falls inside the near-budget window. 
+- `unstable` -> `trending-worse`: allowed once there are enough comparable samples and worsening exceeds the lane variance floor across the bounded window. +- `healthy` <-> `budget-near`: allowed as headroom enters or leaves the near-budget band. +- `healthy` or `budget-near` -> `trending-worse`: allowed when sustained worsening appears without a budget breach. +- `trending-worse` -> `regressed`: allowed when the lane breaches budget or shows a materially worse repeated trend strong enough to stop calling it merely erosion. +- Any state -> `unstable`: allowed when comparability breaks, history is insufficient, or the window is too noisy to classify reliably. + +### RecalibrationDecisionRecord.decisionStatus + +- `candidate` -> `approved`: allowed only by explicit human review with structured evidence. +- `candidate` -> `rejected`: allowed when the evidence is noisy, incomplete, or policy says repository truth should not move. +- `approved` and `rejected`: terminal statuses for the recorded decision. 
\ No newline at end of file diff --git a/specs/211-runtime-trend-recalibration/plan.md b/specs/211-runtime-trend-recalibration/plan.md new file mode 100644 index 00000000..f0fe1018 --- /dev/null +++ b/specs/211-runtime-trend-recalibration/plan.md @@ -0,0 +1,174 @@ +# Implementation Plan: Test Runtime Trend Reporting & Baseline Recalibration + +**Branch**: `211-runtime-trend-recalibration` | **Date**: 2026-04-17 | **Spec**: `specs/211-runtime-trend-recalibration/spec.md` +**Input**: Feature specification from `specs/211-runtime-trend-recalibration/spec.md` + +## Summary + +Implement Spec 211 by extending the existing repo-truth test-governance seams in `TestLaneManifest`, `TestLaneBudget`, `TestLaneReport`, the repo-root reporting wrapper, and the current CI artifact bundles so each governed lane can emit bounded runtime history, current-vs-previous-vs-baseline-vs-budget summaries, lane-first drift states, hotspot deltas, and explicit recalibration recommendations without introducing product database persistence or a second analytics platform. 
+ +## Technical Context + +**Language/Version**: PHP 8.4.15 for repo-truth governance logic, Bash for repo-root wrappers, GitHub-compatible Gitea Actions workflow YAML under `.gitea/workflows/`, plus JSON Schema and logical OpenAPI for repository contracts +**Primary Dependencies**: Laravel 12, Pest v4, PHPUnit 12, Filament v5, Livewire v4, Laravel Sail, Gitea Actions backed by `act_runner`, uploaded artifact bundles, and the existing `Tests\Support\TestLaneManifest`, `TestLaneBudget`, and `TestLaneReport` seams +**Storage**: SQLite `:memory:` for lane execution, filesystem artifacts under `apps/platform/storage/logs/test-lanes`, staged CI bundles under `.gitea-artifacts/`, bounded derived trend/history artifacts adjacent to current lane artifacts, and no new product database persistence +**Testing**: Existing Pest lane and workflow guard suites, new repo-level trend/history/recalibration guard coverage, and representative local plus Gitea artifact sequences for primary lanes +**Validation Lanes**: `fast-feedback` and `confidence` for the narrowest proving path, with representative `heavy-governance`, `browser`, `junit`, and `profiling` evidence used only where hotspot attribution or cross-lane trend behavior needs proof +**Target Platform**: TenantAtlas monorepo on Gitea Actions with `act_runner`, Docker-isolated Sail jobs, repo-root lane/report wrappers, and local developer validation from the repository root +**Project Type**: Monorepo with a Laravel platform app and separate Astro website; this feature is scoped to repository/platform test governance only +**Performance Goals**: Produce lane summaries that remain understandable in under two minutes, classify drift from at least three comparable samples without duplicating full lane reruns, and keep hotspot trend visibility bounded to the dominant contributors rather than exhaustive historical detail +**Constraints**: Repo truth first; no product routes, panels, assets, or dependencies; no new product DB tables; 
lane-first reporting remains primary; baselines and budgets stay separate; recalibration is explicit; history stays bounded and lightweight; cross-run comparison must work from existing artifact bundles or explicit local inputs rather than assuming unlimited shared storage +**Scale/Scope**: Four primary governed lanes plus two support lanes, at least three comparable samples required for meaningful status, rolling bounded history per lane, and top hotspot visibility based on existing family/classification attribution and slowest-entry reporting + +### Filament v5 Implementation Notes + +- **Livewire v4.0+ compliance**: Preserved. This feature governs repository test-runtime reporting only and does not alter the Filament or Livewire runtime stack. +- **Provider registration location**: Unchanged. Existing panel providers remain registered in `bootstrap/providers.php`. +- **Global search rule**: No globally searchable resources are added or modified. +- **Destructive actions**: No runtime destructive actions are introduced. Existing confirmation and authorization behavior remain unchanged. +- **Asset strategy**: No panel or shared assets are added. Existing `filament:assets` deployment behavior remains unchanged. +- **Testing plan**: Add or update Pest guards for trend-history contracts, bundle discovery and hydration semantics, JSON schema plus logical OpenAPI contract sync validation, drift classification, recalibration evidence, hotspot delta output, wrapper/report integration, artifact staging/export behavior, timed review-speed acceptance, and representative multi-run evidence for the primary lanes. + +## Test Governance Check + +- **Affected validation lanes**: `fast-feedback` and `confidence` are the narrowest proving lanes; `heavy-governance`, `browser`, `junit`, and `profiling` remain evidence inputs only when the trend layer needs hotspot or cross-lane proof. 
+- **Narrowest proving command(s)**: `./scripts/platform-test-lane fast-feedback`, `./scripts/platform-test-report fast-feedback`, `./scripts/platform-test-lane confidence`, and `./scripts/platform-test-report confidence`. +- **Fixture / helper cost risks**: Low and bounded to repo-level report/history fixtures, manifest metadata, and guard helpers. The implementation must not add shared product fixtures, broaden default setup, or widen lane membership. +- **Heavy-family additions or promotions**: None. The feature consumes existing heavy/browser lanes as evidence sources and must not promote new coverage into them by accident. +- **Budget / baseline / trend follow-up**: Drift thresholds, bounded-history size, and any approved baseline or budget recalibration notes must be recorded in the active spec or implementation PR, with quickstart serving only as supplemental reproduction guidance rather than the delivery record. +- **Why no dedicated follow-up spec is needed**: Spec 211 is itself the structural trend-governance feature. After rollout, ordinary threshold upkeep should return to the normal feature-spec workflow unless recurring pain or another lane-model change appears. + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +- Inventory-first: PASS. No inventory, backup, or snapshot product truth changes. +- Read/write separation: PASS. This is repository-only reporting and governance work with no end-user mutations. +- Graph contract path: PASS. No Microsoft Graph calls or contract-registry changes. +- Deterministic capabilities: PASS. No capability resolver, role mapping, or authorization registry changes. +- RBAC-UX, workspace isolation, tenant isolation: PASS. No runtime routes, policies, or tenant/workspace access behavior changes. +- Run observability and Ops-UX: PASS. Trend artifacts remain filesystem- and bundle-based and do not introduce `OperationRun` changes. +- Data minimization: PASS. 
Trend history must remain derived from summary/report/budget outputs and must not store secrets, tenant payloads, or raw environment detail. +- Test governance (TEST-GOV-001): PASS WITH WORK. The feature must keep the narrowest proving lane explicit, avoid widening heavy lanes, and document any threshold or recalibration follow-up as part of the active delivery artifact. +- Proportionality and bloat control: PASS WITH LIMITS. The new history artifact, drift states, and recalibration rules are justified because per-run evidence alone cannot support trend-based governance. The implementation must stay inside the existing lane/report seams and avoid turning trend logic into a generalized analytics framework. +- TEST-TRUTH-001: PASS WITH WORK. Trend output must remain derived from real lane artifacts and comparable evidence windows, not optimistic labels or hand-maintained spreadsheets. +- Filament/UI constitutions: PASS / NOT APPLICABLE. No operator-facing runtime UI, action surfaces, badges, or panels are changed. + +**Phase 0 Gate Result**: PASS + +- The feature stays bounded to repository test-governance artifacts, history windows, trend evaluation, and documentation. +- No new product database truth, Graph seams, runtime routes, or authorization planes are introduced. +- The implementation extends existing lane/report structures rather than inventing a separate monitoring subsystem. 
+ +## Project Structure + +### Documentation (this feature) + +```text +specs/211-runtime-trend-recalibration/ +├── plan.md +├── research.md +├── data-model.md +├── quickstart.md +├── contracts/ +│ ├── test-runtime-trend-history.schema.json +│ └── test-runtime-trend.logical.openapi.yaml +└── tasks.md +``` + +### Source Code (repository root) + +```text +.gitea/ +├── workflows/ +│ ├── test-pr-fast-feedback.yml +│ ├── test-main-confidence.yml +│ ├── test-heavy-governance.yml +│ └── test-browser.yml +apps/ +├── platform/ +│ ├── tests/ +│ │ ├── Feature/Guards/ +│ │ └── Support/ +│ │ ├── TestLaneManifest.php +│ │ ├── TestLaneBudget.php +│ │ └── TestLaneReport.php +│ └── storage/logs/test-lanes/ +scripts/ +├── platform-test-lane +├── platform-test-report +└── platform-test-artifacts +README.md +``` + +**Structure Decision**: Keep trend truth in the existing `TestLaneManifest` / `TestLaneBudget` / `TestLaneReport` seams, extend the repo-root reporting flow rather than adding a second execution surface, and keep historical evidence adjacent to the existing lane artifact root and CI bundles so no new database or generic analytics layer is introduced. 
+ +## Complexity Tracking + +| Violation | Why Needed | Simpler Alternative Rejected Because | +|-----------|------------|-------------------------------------| +| Repo-level trend history artifact | Multi-run drift and recalibration cannot be justified from one run plus README prose | Comparing only current vs previous or current vs baseline cannot distinguish sustained erosion, noise, and scope-change boundaries | +| Repo-level drift health states | Reviewers need consistent intermediate states between healthy and hard failure | A binary green/red view hides budget-near erosion and treats one-off spikes like structural regression | + +## Proportionality Review + +- **Current operator problem**: Maintainers can enforce budgets per run but cannot yet see whether runtime is eroding, whether a hotspot is becoming dominant, or whether baseline/budget recalibration is justified. +- **Existing structure is insufficient because**: Current lane reports describe one execution at a time and only include limited baseline comparison for narrow historical cases; they do not retain a bounded comparable window or policy-driven drift classification. +- **Narrowest correct implementation**: Extend the existing lane/report contract with bounded history, derived trend evaluation, and explicit recalibration guidance using the same lane artifact root and CI bundles. +- **Ownership cost created**: The repo must maintain history-window policy, drift thresholds, hotspot-delta output, recalibration guidance, and a small set of guard tests validating those semantics. +- **Alternative intentionally rejected**: A new database table, a git-tracked history file committed on every run, or a generalized analytics dashboard, because each would import more persistence or framework weight than the repository currently needs. +- **Release truth**: Current-release repository truth needed to make Specs 206 through 210 durable over time. 
+ +## Phase 0 — Research (complete) + +- Output: [research.md](./research.md) +- Resolved key decisions: + - Keep trend history and summaries adjacent to the existing lane artifact contract instead of creating a second storage system. + - Treat uploaded Gitea artifact bundles as the shared CI history source, with explicit local artifact input as the fallback for local validation and reproducible examples. + - Use a bounded rolling window per lane with a minimum comparable sample count before declaring stable health states. + - Reuse existing family/classification attribution and slowest-entry output for hotspot trends instead of archiving exhaustive per-test history. + - Separate lane health classification from recalibration recommendation so budgets and baselines do not collapse into a single status. + - Extend the existing summary/report artifacts with trend-specific outputs and sections instead of creating a dashboard or parallel reporting surface. + - Keep recalibration explicit and policy-driven, with different acceptable triggers for baseline changes and budget changes. + +## Phase 1 — Design & Contracts (complete) + +- Output: [data-model.md](./data-model.md) formalizes lane trend policy, trend records, comparison windows, drift assessments, hotspot trend snapshots, recalibration decisions, and cycle summaries. +- Output: [contracts/test-runtime-trend-history.schema.json](./contracts/test-runtime-trend-history.schema.json) defines the repository contract for bounded lane history, trend evaluation, hotspot deltas, and recalibration evidence. +- Output: [contracts/test-runtime-trend.logical.openapi.yaml](./contracts/test-runtime-trend.logical.openapi.yaml) captures the logical contract for updating one lane history window, evaluating one lane trend, evaluating recalibration, and emitting a cycle summary. +- Output: [quickstart.md](./quickstart.md) provides the implementation order, validation commands, and representative multi-run evidence checklist. 
+ +### Post-design Constitution Re-check + +- PASS: No runtime routes, panels, Graph seams, or authorization planes are introduced. +- PASS: Trend history remains repository-owned and derived from existing lane artifacts rather than new product persistence. +- PASS: The design stays lane-first and keeps hotspot reporting supportive rather than dominant. +- PASS WITH WORK: The bounded history window and Gitea artifact hydration must remain lightweight and optional enough for local validation without assuming unlimited external retention. +- PASS WITH WORK: Baseline and budget updates must remain explicit manifest/spec changes backed by evidence, not runtime self-mutation. + +## Phase 2 — Implementation Planning + +`tasks.md` should cover: + +- Auditing `TestLaneManifest`, `TestLaneBudget`, `TestLaneReport`, `scripts/platform-test-report`, and `scripts/platform-test-artifacts` as the only valid seams for trend history, drift policy, and artifact export. +- Extending `TestLaneManifest` with lane trend policy metadata, bounded-retention rules, comparability requirements, hotspot limits, and recalibration guidance anchors while keeping budgets and baselines distinct. +- Extending `TestLaneReport` so it can read current lane outputs plus a bounded prior artifact window, emit lane trend records, evaluate drift status, compute hotspot deltas, and write trend-aware summary/report/budget payloads. +- Extending `TestLaneBudget` with explicit recalibration recommendation helpers that assess baseline and budget policy separately from current budget outcome. +- Extending `scripts/platform-test-report` so it can discover, select, and hydrate the latest comparable prior history window from uploaded artifact bundles or explicit local artifact directories, then refresh trend-aware outputs without re-running a second full lane. 
+- Extending `scripts/platform-test-artifacts` and the existing workflow artifact contracts so trend-specific files are staged and uploaded alongside the current summary/report/budget/JUnit bundle. +- Updating `.gitea/workflows/test-pr-fast-feedback.yml`, `.gitea/workflows/test-main-confidence.yml`, `.gitea/workflows/test-heavy-governance.yml`, and `.gitea/workflows/test-browser.yml` only as needed to pass history-source context and export the new trend files, without widening lane execution. +- Adding or updating Pest guards for bounded history contracts, comparability breaks, latest-comparable-bundle hydration, drift-state classification, hotspot delta legibility, recalibration recommendation rules, JSON schema and logical OpenAPI contract sync, and no accidental heavy/browser promotion. +- Updating `README.md` with concise contributor guidance for reading trend summaries, understanding `healthy` / `budget-near` / `trending-worse` / `regressed` / `unstable`, and knowing when recalibration discussion is appropriate. +- Recording at least three sequential comparable samples for each primary lane, one support-lane example from `junit` or `profiling`, at least one healthy case, one budget-near case, one repeated worsening or regressed case, one unstable/noisy case, one justified plus one rejected recalibration case, and one timed reviewer read proving the summary remains decidable within two minutes. + +### Contract Implementation Note + +- The JSON schema is repository-tooling-oriented and defines the bounded history/trend contract even if the first implementation stores most of that truth in PHP arrays and generated JSON artifacts. +- The OpenAPI file is logical rather than transport-prescriptive. It documents how wrappers, support classes, and CI artifact inputs must interact, not a public HTTP API. +- The design intentionally reuses current lane report/budget artifacts as the canonical current-run evidence and layers bounded history on top. 
+ +### Deployment Sequencing Note + +- No database migration is planned. +- No asset publish step changes. +- Recommended rollout order: add trend policy metadata and contracts, extend report generation to build trend outputs from explicit local inputs, extend artifact staging and workflow export, validate with local multi-run sequences for `fast-feedback` and `confidence`, then capture representative Gitea bundle sequences for the remaining primary lanes and document any approved recalibration evidence. \ No newline at end of file diff --git a/specs/211-runtime-trend-recalibration/quickstart.md b/specs/211-runtime-trend-recalibration/quickstart.md new file mode 100644 index 00000000..8505a625 --- /dev/null +++ b/specs/211-runtime-trend-recalibration/quickstart.md @@ -0,0 +1,133 @@ +# Quickstart: Test Runtime Trend Reporting & Baseline Recalibration + +## Preconditions + +- Specs 206 through 210 are already implemented and remain the governing baseline for lane selection, budgets, CI workflow routing, and artifact publication. +- Local validation runs from the repository root and uses Sail-backed commands for PHP and test execution. +- At least one prior comparable artifact bundle or prior lane `*-latest.trend-history.json` file is available when validating a non-`unstable` history window locally. +- No database migration, product route, Filament panel, or frontend asset step is required for this feature. + +## Planned Artifact Additions + +- Extend the existing lane artifact set with `apps/platform/storage/logs/test-lanes/<lane>-latest.trend-history.json`. +- Extend the existing `summary.md`, `report.json`, and `budget.json` outputs with trend-aware sections and fields rather than creating a parallel human-readable artifact surface. +- Stage the new history artifact into the existing `.gitea-artifacts/` upload bundle for the owning lane. + +## Recommended Implementation Order + +1. 
Extend `TestLaneManifest` with the lane trend policy, bounded retention limits, comparison-fingerprint inputs, and recalibration guidance anchors. +2. Extend `TestLaneReport` so it can read a prior `*-latest.trend-history.json`, append the current `LaneTrendRecord`, trim to the lane retention limit, compute the trend window, emit drift status, and surface hotspot deltas. +3. Extend `TestLaneBudget` with recalibration recommendation helpers that stay separate from current budget outcome. +4. Extend `scripts/platform-test-report` so it refreshes trend-aware outputs after a prior history file has been hydrated into `apps/platform/storage/logs/test-lanes`. +5. Extend `scripts/platform-test-artifacts` and the checked-in artifact contracts so the trend history file is staged and uploaded with the existing lane bundle. +6. Update only the necessary Gitea workflow steps so each lane can hydrate the previous matching history artifact before report generation without widening lane execution. +7. Add or update Pest guard coverage for trend history, drift classes, hotspot deltas, recalibration rules, and workflow/artifact publication contracts. +8. Update `README.md` with reviewer guidance and capture representative validation evidence for the main trend cases. + +## Local Validation Flow + +### 1. Generate current lane artifacts + +```bash +./scripts/platform-test-lane fast-feedback +./scripts/platform-test-lane confidence +./scripts/platform-test-report fast-feedback --skip-latest-history +./scripts/platform-test-report confidence --skip-latest-history +``` + +### 2. Hydrate prior comparable history for a stable-window validation + +Use the wrapper flags instead of manual artifact copying so local runs exercise the same hydration contract as CI. 
+ +```bash +./scripts/platform-test-report fast-feedback --history-file=/absolute/path/to/fast-feedback-latest.trend-history.json +./scripts/platform-test-report confidence --history-bundle=/absolute/path/to/comparable-bundle-or-zip +``` + +### 3. Rebuild workflow-shaped evidence without widening lane execution + +```bash +./scripts/platform-test-report fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request --fetch-latest-history +./scripts/platform-test-report confidence --workflow-id=main-confidence --trigger-class=mainline-push --fetch-latest-history +./scripts/platform-test-report heavy-governance --workflow-id=heavy-governance --trigger-class=manual --skip-latest-history +./scripts/platform-test-report browser --workflow-id=browser-manual --trigger-class=manual --skip-latest-history +./scripts/platform-test-report profiling --skip-latest-history +./scripts/platform-test-report junit --skip-latest-history +``` + +### 4. Stage artifact bundles exactly as CI will publish them + +```bash +./scripts/platform-test-artifacts fast-feedback .gitea-artifacts/pr-fast-feedback --workflow-id=pr-fast-feedback --trigger-class=pull-request +./scripts/platform-test-artifacts confidence .gitea-artifacts/main-confidence --workflow-id=main-confidence --trigger-class=mainline-push +``` + +### 5. Run focused guard coverage and formatting + +```bash +cd apps/platform && ./vendor/bin/sail artisan test --compact tests/Feature/Guards +cd apps/platform && ./vendor/bin/sail bin pint --dirty --format agent +``` + +### 6. Time-box one reviewer summary check + +Use the generated summary only, set a two-minute timer, and verify that the reviewer can name the health class for each primary lane plus whether recalibration discussion is warranted before opening raw lane outputs. + +## Health Class Cheat Sheet + +- `healthy`: the lane has enough comparable history, remains comfortably under budget, and recent variance stays below the lane noise floor. 
+- `budget-near`: the lane is still passing, but its headroom is inside the lane's warning band. +- `trending-worse`: multiple comparable samples are worsening above the documented variance floor. +- `regressed`: the lane is over budget or repeatedly worsening enough that the report should stop calling it normal erosion. +- `unstable`: the report is intentionally refusing a stronger label because the window is too short, too noisy, or no longer comparable. + +Recalibration is separate from health. The report can emit `candidate`, `approved`, or `rejected` baseline or budget decisions, but it never mutates repository truth automatically. + +## Recorded Evidence Snapshot (2026-04-17) + +| Scenario | Lane | Runtime Window | Outcome | +|----------|------|----------------|---------| +| Live cold-start wrapper run | `fast-feedback` | current `120.29s`, previous `120.29s`, baseline `176.74s`, budget `200s` | `unstable`, hotspot evidence unavailable, budget recalibration rejected (`manual-hold`) because only two comparable samples existed | +| Stable healthy window | `fast-feedback` | current `176.10s`, previous `175.60s`, baseline `176.74s`, budget `200s` | `healthy`, no recalibration recommended | +| Stable budget-near window | `confidence` | current `433.00s`, previous `430.00s`, baseline `394.38s`, budget `450s` | `budget-near`, investigate before the lane becomes a repeated blocker | +| Noisy window | `fast-feedback` | current `170.00s`, previous `195.00s`, baseline `176.74s`, budget `200s` | `unstable` with `windowStatus=noisy`, so the spike is treated as noise instead of structural regression | +| Hotspot-stable example | `confidence` | current `394.38s`, previous `401.12s`, baseline `394.38s`, budget `450s` | `healthy`; dominant families stayed flat and the top files remained the baseline compare matrix pair plus onboarding-wizard enforcement | +| Approved baseline recalibration | `fast-feedback` | current `176.30s`, previous `176.00s`, baseline reset from 
`176.74s` to `182.00s`, budget `200s` | baseline recalibration recorded as `approved` with rationale `post-improvement-reset` after the lane stabilized | +| Rejected budget recalibration | `fast-feedback` | current `193.00s`, previous `176.00s`, baseline `176.74s`, budget `200s` | `budget-near`, but budget recalibration stayed `rejected` with rationale `noise-rejected` | +| Candidate budget review | `confidence` | current `460.00s`, previous `420.00s`, baseline `394.38s`, budget `450s` | `regressed`, budget review emitted as a `candidate` only after a five-run evidence window | +| Primary-lane cold starts | `browser`, `heavy-governance` | `109.67s/150s` and `228.34s/300s` | both reported `unstable` on first refresh, which is the intended cold-start behavior | +| Support-lane path | `profiling`, `junit` | `2701.51s/3000s` and `380.14s/450s` | both wrappers now emit bounded `trend-history.json`; `junit` support-lane report refresh was repaired so the documented command actually works | + +## Representative Evidence Set + +Capture at least one example for each of the following before calling the feature complete: + +1. Three sequential comparable samples for each primary lane: `fast-feedback`, `confidence`, `heavy-governance`, and `browser`. +2. `healthy`: current runtime comfortably below budget with stable or improving recent comparable history. +3. `budget-near`: current runtime remains under budget but inside the lane's near-budget headroom band. +4. `trending-worse`: a bounded comparable window shows repeated worsening that is larger than the lane noise floor. +5. `regressed`: a budget breach or materially repeated worsening is clearly visible. +6. `unstable`: insufficient comparable history, fingerprint mismatch, or noisy evidence makes a stable label unsafe. +7. Approved recalibration case: explicit evidence shows why repository truth should change. +8. Rejected recalibration case: explicit evidence shows why repository truth should stay unchanged. +9. 
One support-lane example from `junit` or `profiling` when it materially improves hotspot or comparison evidence. + +Each recorded example should name the lane, current runtime, previous runtime, baseline, budget, health class, hotspot summary, and the recalibration conclusion when relevant. + +Material runtime drift, bundle-hydration caveats, and approved or rejected recalibration follow-up must be recorded in `specs/211-runtime-trend-recalibration/spec.md` or the active implementation PR. This quickstart may mirror the same evidence, but it does not replace the delivery record. + +## CI Rollout Notes + +- CI should hydrate the previous matching `*-latest.trend-history.json` from the most recent comparable uploaded artifact bundle before the report refresh step. +- The uploaded bundle for each governed workflow must include the refreshed `*-latest.trend-history.json` so the next run only needs one prior bundle. +- The workflow-owned refresh steps now pass `--fetch-latest-history` together with `TENANTATLAS_GITEA_TOKEN` and top-level `actions: read` plus `contents: read` permissions so bundle discovery stays explicit. +- Pull request and `dev` push validation remain the narrowest proving paths; heavy/browser/manual/scheduled lanes provide representative cross-lane evidence and must not be widened. + +## Final Review Checklist + +- Trend policy lives in repository truth, not workflow prose. +- `summary.md`, `report.json`, `budget.json`, and `*-latest.trend-history.json` agree on lane runtime and health class. +- Baseline and budget recalibration remain explicit, reviewable, and separate. +- Hotspot summaries stay readable and bounded. +- A timed reviewer dry run confirms the generated summary remains decidable within two minutes. +- The implementation does not add product persistence, routes, assets, or a second analytics surface. 
diff --git a/specs/211-runtime-trend-recalibration/research.md b/specs/211-runtime-trend-recalibration/research.md new file mode 100644 index 00000000..7c92c75a --- /dev/null +++ b/specs/211-runtime-trend-recalibration/research.md @@ -0,0 +1,73 @@ +# Research: Test Runtime Trend Reporting & Baseline Recalibration + +## Decision 1: Persist bounded lane history as an artifact beside the existing lane report outputs + +- **Decision**: Add one bounded `trend-history.json` artifact per governed lane under the existing lane artifact root and stage that same file into the existing CI upload bundle for the lane's workflow profile. +- **Rationale**: The repo already treats `summary.md`, `report.json`, `budget.json`, and `junit.xml` as the canonical lane outputs. A bounded history file beside those artifacts preserves repository truth, avoids product persistence, and gives the next CI run a portable history window without inventing a database, cache, or commit-on-every-run workflow. +- **Alternatives considered**: + - Store history in a new product database table: rejected because the feature is repository governance, not application runtime truth. + - Commit history files back into the repository on every run: rejected because runtime-generated governance evidence should not create noisy git churn. + - Reconstruct history from many previous artifact bundles every time: rejected because it depends on broader artifact retention and more CI/API complexity than necessary. + +## Decision 2: Use the latest matching uploaded artifact bundle as the shared CI history source + +- **Decision**: Hydrate the next lane history window from the latest matching uploaded bundle for the same lane/workflow profile when CI credentials are available, and allow an explicit local artifact directory or prior `trend-history.json` file as the fallback source for local validation. 
+- **Rationale**: Once each bundle already contains the full bounded history window, the next run only needs the most recent comparable bundle rather than a multi-run artifact crawl. This stays lightweight and lets local development validate the exact same contract using checked-out or copied artifacts. +- **Alternatives considered**: + - Depend on an external metrics store or dashboard backend: rejected because it would import a second analytics system. + - Assume shared workspace persistence across CI runs: rejected because Gitea runners should be treated as ephemeral. + - Require local developers to manually build history state for every validation: rejected because the workflow would be too fragile and easy to bypass. + +## Decision 3: Keep trend policy in `TestLaneManifest` and trend evaluation inside the existing reporting seams + +- **Decision**: Extend `TestLaneManifest` with lane trend metadata and keep history/trend generation inside `TestLaneReport`, with `TestLaneBudget` providing recalibration and tolerance-aware recommendation helpers. +- **Rationale**: Budgets, workflow bindings, artifact contracts, and existing comparison rules already live in these seams. Trend reporting is a governance extension of the same truth, not a separate subsystem. Keeping policy and evaluation together prevents duplication between wrappers, tests, and CI configuration. +- **Alternatives considered**: + - Introduce a new generalized analytics service layer: rejected because there is only one real consumer and one real domain. + - Push all trend logic into shell scripts: rejected because the classification rules and JSON contracts belong in versioned PHP support code with guard tests. + - Scatter thresholds across workflow YAML and README prose: rejected because repository truth would become inconsistent. 
+ +## Decision 4: Use a bounded comparable window with explicit retention and comparison fingerprints + +- **Decision**: Retain the latest 20 records for primary lanes (`fast-feedback`, `confidence`, `heavy-governance`, `browser`) and the latest 10 records for support lanes (`junit`, `profiling`); evaluate drift from the latest 5 comparable records and require at least 3 comparable samples before assigning any health class other than `unstable`. Each history record carries a comparison fingerprint built from lane ID, workflow ID, trigger class, contract version, baseline source, and lane-scope signature. +- **Rationale**: Twenty primary-lane records preserve enough runway to separate short-term noise from structural erosion while staying small enough for artifact bundles. Five recent comparable records are enough to show worsening or stabilization trends without overfitting old runs. The comparison fingerprint prevents silent apples-to-oranges comparisons when lane membership, workflow class, or contract shape changes. +- **Alternatives considered**: + - Retain every historical record forever: rejected because the feature explicitly calls for bounded lightweight history. + - Compare only the immediately previous run: rejected because it cannot reliably distinguish streaks, noise, and recalibration boundaries. + - Compare by lane ID alone: rejected because workflow class and lane-scope changes would produce misleading trends. + +## Decision 5: Derive health classes from existing variance tolerances plus a trend policy, not from raw runtime deltas alone + +- **Decision**: Classify lane health with the fixed vocabulary `healthy`, `budget-near`, `trending-worse`, `regressed`, and `unstable`. 
Use the existing lane-specific variance allowances from the current enforcement profiles as the minimum noise floor, combine them with a near-budget headroom rule, and reserve `unstable` for insufficient comparable history, comparison-fingerprint breaks, or high variance/noisy windows. +- **Rationale**: The repo already documents lane-specific tolerance in budget enforcement. Reusing that allowance as the floor for trend significance keeps the new model aligned with current governance truth and avoids inventing unrelated threshold systems. +- **Alternatives considered**: + - Binary healthy/regressed classification: rejected because it hides erosion before a lane breaches budget. + - Pure percentage-only thresholds: rejected because current lane budgets and tolerances already vary meaningfully in absolute seconds. + - Automatically downgrade every spike to `regressed`: rejected because one-off noise should remain visible without looking structural. + +## Decision 6: Keep hotspot trend visibility family-first and summary-friendly + +- **Decision**: Reuse `TestLaneReport`'s existing classification totals, family totals, hotspot files, and slowest-entry output; show the top 5 family deltas and top 3 file hotspots in human-readable summaries, while retaining up to the current top 10 slowest entries in JSON evidence. +- **Rationale**: The existing report already derives the expensive attribution data. Trend reporting only needs to answer which dominant contributors worsened or stabilized, not preserve exhaustive per-test history. +- **Alternatives considered**: + - Store and diff every test case over time: rejected because the storage and readability cost is not justified. + - Show only lane-level runtime without hotspot context: rejected because recalibration and regression review would remain too opaque. + - Make hotspot output file-first only: rejected because family-level attribution is the more stable governance lens already used by the repo. 
+ +## Decision 7: Separate recalibration recommendation from health status and keep recalibration explicitly human-approved + +- **Decision**: Emit recalibration recommendations separately from the lane health class and record explicit evidence for approved or rejected recalibration decisions. Baseline recalibration is only justified by documented lane-scope change, lasting infrastructure change, or deliberate post-improvement reset. Budget recalibration requires a stronger sustained evidence window and must never happen automatically because of a single regression or a noisy streak. +- **Rationale**: Health status answers "what is happening now". Recalibration answers "should repository truth change". Keeping those separate prevents a degraded lane from appearing self-healing just because the tool auto-adjusted the benchmark. +- **Alternatives considered**: + - Auto-adjust baselines or budgets from rolling averages: rejected because it would erase regression history. + - Treat recalibration as free-form README guidance only: rejected because reviewers need a structured evidence record. + - Merge recalibration directly into the health-class vocabulary: rejected because review semantics and current-state semantics are different concerns. + +## Decision 8: Extend the existing summary/report surfaces instead of introducing a new dashboard surface + +- **Decision**: Add a trend section to the existing lane `summary.md`, add a trend block to the current JSON report payloads, and use `trend-history.json` as the dedicated bounded-history artifact. +- **Rationale**: Maintainers already read the current summary and JSON artifacts. Extending those surfaces makes trend output immediately usable in local runs, CI logs, and uploaded bundles without inventing a parallel UI or artifact family. +- **Alternatives considered**: + - Create a new UI page or dashboard: rejected because this feature is repository-governance-only. 
+ - Emit a second human-readable markdown file for trend alone: rejected because it would split the operator reading surface unnecessarily. + - Keep trend data only inside JSON: rejected because reviewers need readable summaries during ordinary PR and CI triage. \ No newline at end of file diff --git a/specs/211-runtime-trend-recalibration/spec.md b/specs/211-runtime-trend-recalibration/spec.md new file mode 100644 index 00000000..62042ba9 --- /dev/null +++ b/specs/211-runtime-trend-recalibration/spec.md @@ -0,0 +1,351 @@ +# Feature Specification: Test Runtime Trend Reporting & Baseline Recalibration + +**Feature Branch**: `211-runtime-trend-recalibration` +**Created**: 2026-04-17 +**Status**: Implemented (local validation complete) +**Input**: User description: "Spec 211 - Test Runtime Trend Reporting & Baseline Recalibration" + +## Spec Candidate Check *(mandatory — SPEC-GATE-001)* + +- **Problem**: TenantPilot's test-suite governance is now enforceable per run, but maintainers still lack a shared time-series view of how lane runtime, hotspot cost, and budget headroom evolve over time. +- **Today's failure**: A lane can erode gradually without obvious alarm, a noisy outlier can be mistaken for structural regression, and baseline or budget changes can happen without consistent evidence or policy. +- **User-visible improvement**: Contributors and reviewers get readable lane trend summaries that show health, deterioration, hotspot drift, and whether recalibration is justified before a lane becomes a repeated blocker. +- **Smallest enterprise-capable version**: Reuse the existing governed lane artifacts to retain bounded runtime history, compare current versus previous versus baseline versus budget, classify drift states, surface dominant hotspots, and document explicit baseline and budget recalibration rules. 
+- **Explicit non-goals**: No new lane taxonomy, no new general-purpose analytics platform, no automatic budget inflation, no mandate to optimize every slow file inside this spec, and no unlimited raw-history retention. +- **Permanent complexity imported**: Runtime trend data contract, bounded history artifacts, drift classification vocabulary, hotspot comparison rules, recalibration policy, summary semantics, and contributor guidance. +- **Why now**: Specs 206 through 210 established lane execution, fixture cost reduction, heavy-lane separation, and CI enforcement; without a trend layer the team can only react after drift already starts blocking shared flow. +- **Why not local**: Private spreadsheets or ad hoc comparisons cannot produce shared, reviewable evidence or a consistent recalibration process that survives reviewer and maintainer turnover. +- **Approval class**: Cleanup +- **Red flags triggered**: New historical artifact retention, new drift-status vocabulary, and new recalibration policy. Defense: the feature stays repo-scoped, derives from existing lane outputs, and intentionally avoids becoming a second analytics system. +- **Score**: Nutzen: 2 | Dringlichkeit: 2 | Scope: 2 | Komplexität: 1 | Produktnähe: 1 | Wiederverwendung: 2 | **Gesamt: 10/12** +- **Decision**: approve + +## Spec Scope Fields *(mandatory)* + +- **Scope**: workspace +- **Primary Routes**: No end-user HTTP routes change. The affected surfaces are repository-owned lane reports, trend summaries, recalibration guidance, and CI/runtime artifacts. +- **Data Ownership**: Workspace-owned runtime history artifacts, trend summaries, budget and baseline policy, and contributor guidance. No tenant-owned records or product runtime tables are introduced. +- **RBAC**: No end-user authorization behavior changes. The actors are contributors, reviewers, maintainers, and CI runners consuming the shared test-governance contract. 
+ +## Proportionality Review *(mandatory when structural complexity is introduced)* + +- **New source of truth?**: no +- **New persisted entity/table/artifact?**: yes, but only repository-owned historical runtime and trend artifacts derived from existing governed lane outputs +- **New abstraction?**: yes, but limited to a repo-level trend model, drift classification, and recalibration policy +- **New enum/state/reason family?**: yes, but only repository-level lane health states such as `healthy`, `budget-near`, `trending-worse`, `regressed`, and `unstable` +- **New cross-domain UI framework/taxonomy?**: no +- **Current operator problem**: Maintainers can enforce budgets per run, but they cannot yet see whether a lane is drifting, whether a hotspot is growing, or whether recalibration is evidence-based instead of reactive. +- **Existing structure is insufficient because**: Current CI evidence is mostly run-by-run and cannot reliably distinguish sustained erosion, legitimate suite growth, or runner noise without manual reconstruction. +- **Narrowest correct implementation**: Add bounded history and derived trend summaries on top of existing governed lane artifacts instead of inventing new lanes, new product persistence, or a broader analytics stack. +- **Ownership cost**: The team must maintain trend retention rules, drift thresholds, recalibration guidance, and representative example evidence as runner behavior and suite composition evolve. +- **Alternative intentionally rejected**: Ad hoc manual comparisons, one-off spreadsheets, or allowing budgets to silently move upward with each overrun. +- **Release truth**: Current-release repository truth required to make Specs 206 through 210 durable over time. + +## Problem Statement + +Specs 206 through 210 moved TenantPilot's test suite into a governed operating model: + +- Lanes and budgets exist. +- Shared fixture cost has been reduced. +- Heavy Filament or Livewire families have been segmented. 
+- Heavy-governance cost is treated honestly. +- CI runs the governed lanes and evaluates budgets. + +What is still missing is the time dimension. The repository can usually tell whether one run is green or red, but it still cannot answer the more strategic questions: + +- Is a lane slowly getting worse even though it still passes? +- Is a budget warning noise, early erosion, or a genuine regression? +- Did the suite legitimately grow, or did the budget simply drift upward by habit? +- Are the dominant hotspots stable, worsening, or newly emerging? + +Without historical observability and explicit recalibration policy, test governance remains operational rather than strategic. + +## Dependencies + +- Depends on Spec 206 - Test Suite Governance & Performance Foundation for lane vocabulary, budgets, and checked-in reporting entry points. +- Depends on Spec 207 - Shared Test Fixture Slimming for more credible lane cost signals. +- Depends on Spec 208 - Filament/Livewire Heavy Suite Segmentation for honest separation of expensive families. +- Depends on Spec 209 - Heavy Governance Lane Cost Reduction for a more stable heavy-lane baseline. +- Depends on Spec 210 - CI Test Matrix & Runtime Budget Enforcement for governed CI artifacts, budget evidence, and per-run enforcement semantics. +- Recommended after stable CI lanes, reproducible lane artifacts, and functioning budget enforcement are already available. +- Blocks durable long-horizon budget stewardship and trend-based test governance. +- Does not block normal feature delivery or daily CI execution. + +## Goals + +- Make runtime evolution visible for each primary lane over time. +- Compare current values against both baselines and budgets. +- Detect budget erosion before hard gates fail repeatedly. +- Define explicit policy for baseline recalibration. +- Define explicit policy for budget recalibration. +- Track hotspot and family cost shifts over time. +- Distinguish runner noise from true regression. 
+ +## Non-Goals + +- Optimizing every individual slow test file within this spec. +- Creating another lane-segmentation feature. +- Replacing CI budget enforcement rather than complementing it. +- Building a general analytics platform for every CI metric. +- Turning trend reporting into a broad dashboard project unrelated to test runtime governance. +- Requiring unlimited historical retention of raw CI outputs. + +## Assumptions + +- The lane wrappers and artifact contracts created by Specs 206 through 210 remain the authoritative inputs for any trend layer. +- Representative run references, timestamps, or commit identifiers are available for governed lane outputs. +- History retention can be bounded without losing enough evidence to justify recalibration decisions. +- CI noise is real and should be treated as ordinary variance rather than proof of regression by default. + +## Key Decisions + +- **Budgets and baselines are different**: A budget is a governance limit, while a baseline is a reference point. They must not drift together automatically. +- **Trend visibility complements hard enforcement**: The existing red or green contract stays in place; trend reporting adds foresight rather than replacing gates. +- **Recalibration must be explicit**: Baseline or budget changes require documented evidence and reasoning. +- **Noise-aware governance matters**: Single noisy runs should not dominate decisions. +- **Lane-first governance remains primary**: File and family hotspots inform the decision, but the lane stays the main governance unit. +- **Historical observability must stay lightweight**: The first slice should aid decisions without becoming a second BI system. + +## Test Governance Impact *(mandatory — TEST-GOV-001)* + +- **Validation lane(s)**: `fast-feedback`, `confidence`, `heavy-governance`, `browser`, plus `junit` and `profiling` when they supply hotspot or comparison evidence. 
+- **Why these lanes are sufficient**: They cover the full governed cost classes already recognized by the repository, including both primary operational lanes and the support evidence used to explain hotspots and compare scope. +- **New or expanded test families**: No new product-facing test family is required. The feature may add lightweight repo-level guard coverage for trend parsing, drift classification, recalibration reasoning, and summary generation. +- **Fixture / helper cost impact**: Low and bounded. The feature MUST stay inside repo-level reporting, artifact retention, and documentation. It MUST NOT add shared product fixtures, broaden default setup, or widen heavy suite membership. +- **Heavy coverage justification**: None beyond consuming the existing `heavy-governance` and `browser` lanes as evidence sources. The feature introduces no new heavy-governance or browser scenarios. +- **Budget / baseline / trend impact**: This feature formalizes trend headroom, drift states, and recalibration criteria. Any threshold tuning, material runtime drift, or recalibration follow-up discovered during rollout MUST be documented in this spec or the active implementation PR rather than silently absorbed into budgets or left only in quickstart notes. +- **Planned validation commands**: `./scripts/platform-test-lane fast-feedback`, `./scripts/platform-test-lane confidence`, `./scripts/platform-test-report fast-feedback`, and `./scripts/platform-test-report confidence` for routine reviewer validation. Representative `heavy-governance`, `browser`, `junit`, and `profiling` evidence should come from the same checked-in lane/report entry points rather than ad hoc commands. 
+ +## Trend Reporting Minimum Surface + +### Lane Runtime Trend Model + +For each relevant lane, the trend surface must show at least: + +- current runtime +- previous comparable runtime +- baseline runtime +- budget target +- delta to previous runtime +- delta to baseline runtime +- current health classification +- recent history window sufficient to show direction rather than a single point + +### Runtime History Contract + +Each retained trend record must remain reproducible enough to justify later decisions and must preserve at least: + +- run, commit, or timestamp reference +- lane name +- measured runtime +- budget outcome or headroom state +- baseline reference used for comparison +- hotspot or family summary when available +- enough provenance to explain whether the record is directly comparable to adjacent runs + +### Drift Detection Outcomes + +Trend reporting must distinguish at least these lane states: + +- `healthy` +- `budget-near` +- `trending-worse` +- `regressed` +- `unstable` + +The model must be able to show intermediate deterioration without collapsing every non-healthy case into a single hard failure signal. + +### Hotspot Trend Visibility + +Trend reporting must expose the dominant cost drivers for each primary lane in a way that shows: + +- top cost drivers for the current reporting window +- change against the reference window +- newly dominant families or files +- persistent known hotspots that continue to dominate cost + +### Readable Summary Surface + +Each reporting cycle must publish a concise summary that makes it immediately clear: + +- which lanes are healthy +- which lanes are near budget +- which lanes are worsening or regressed +- whether recalibration should be discussed +- which hotspots dominate the lanes that need attention + +## Required Validation Evidence Set + +- One recent sequence of at least three comparable run samples for each primary lane: `fast-feedback`, `confidence`, `heavy-governance`, and `browser`. 
+- One support-lane example from `junit` or `profiling` when it materially improves hotspot or comparison evidence. +- One example each for `healthy`, `budget-near`, `trending-worse` or `regressed`, and `unstable` outcomes. +- One example where legitimate lane-scope change justifies baseline recalibration. +- One example where an overrun does not justify either baseline or budget recalibration. +- Material runtime drift, bundle-hydration caveats, and approved or rejected recalibration follow-up must be recorded in this spec or the active implementation PR; quickstart may mirror the same evidence but does not replace the delivery record. +- Each evidence record must identify the run reference, lane, current runtime, previous runtime, baseline, budget, health class, and hotspot summary or an explicit note that hotspot evidence is unavailable. + +## Recorded Validation Evidence (2026-04-17) + +| Evidence | Lane | Current / Previous / Baseline / Budget | Health | Hotspots | Recalibration | +|----------|------|-----------------------------------------|--------|----------|---------------| +| Live cold-start wrapper refresh via `./scripts/platform-test-report fast-feedback --skip-latest-history` | `fast-feedback` | `120.29s / 120.29s / 176.74s / 200s` | `unstable` with `windowStatus=insufficient-history` | unavailable | budget `rejected` with `manual-hold` because the comparable window was still too short | +| Representative stable window from generated trend-summary fixtures | `fast-feedback` | `176.73s / 178.91s / 176.74s / 200s` | `healthy` | unavailable | none | +| Representative near-budget window from generated trend-classification fixtures | `confidence` | `433.00s / 430.00s / 394.38s / 450s` | `budget-near` | not the focus of this case | investigate only; no automatic repository-truth change | +| Representative noisy window from generated trend-classification fixtures | `fast-feedback` | `170.00s / 195.00s / 176.74s / 200s` | `unstable` with `windowStatus=noisy` | 
unavailable | none; the report explicitly treats the spike as noise instead of a structural regression | +| Representative hotspot-stable window from generated trend-summary and trend-hotspots fixtures | `confidence` | `394.38s / 401.12s / 394.38s / 450s` | `healthy` | available; `baseline-compare-matrix-workflow` and `onboarding-wizard-enforcement` stayed flat, with the compare-matrix pair remaining the top file hotspots | none | +| Approved baseline reset from generated recalibration fixtures | `fast-feedback` | `176.30s / 176.00s / 182.00s / 200s` | `healthy` | unavailable | baseline `approved` with `post-improvement-reset` after the lane stabilized | +| Rejected budget movement from generated recalibration fixtures | `fast-feedback` | `193.00s / 176.00s / 176.74s / 200s` | `budget-near` | unavailable | budget `rejected` with `noise-rejected`; repository truth stayed unchanged | +| Candidate budget review from generated recalibration fixtures | `confidence` | `460.00s / 420.00s / 394.38s / 450s` | `regressed` | not the focus of this case | budget `candidate` only after a five-run evidence window, proposed `505s`, still requiring human approval | +| Live primary-lane cold-start refresh via repo-root wrappers | `browser` and `heavy-governance` | `109.67s / n/a / n/a / 150s` and `228.34s / n/a / n/a / 300s` | both `unstable` on first refresh | unavailable until a comparable prior window exists | budget `rejected` with `manual-hold` on both first-pass reports | +| Live support-lane refresh via repo-root wrappers | `profiling` and `junit` | `2701.51s / n/a / n/a / 3000s` and `380.14s / n/a / n/a / 450s` | both `unstable` on first refresh | unavailable on cold start | budget `rejected` with `manual-hold`; the `junit` report wrapper path was repaired during this implementation so the documented command now executes | + +- Reviewer dry run: the generated markdown summaries remained decidable from the `## Lane trend` section alone within the intended two-minute review 
window, without opening the raw JSON payloads. +- Bundle hydration note: workflow-owned report refresh now relies on `--fetch-latest-history` plus `TENANTATLAS_GITEA_TOKEN` and explicit `actions: read` plus `contents: read` permissions to pull the newest comparable artifact bundle before regenerating `trend-history.json`. +- Runtime follow-up note: no baseline or budget changed automatically in repository truth during implementation. All recalibration output stayed advisory unless a fixture or spec entry explicitly marked it approved. + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - See Lane Drift Before It Becomes A Repeated Gate (Priority: P1) + +As a maintainer reviewing governed test runs, I want lane summaries to compare the current runtime against the previous run, the baseline, and the budget so I can spot erosion before a lane becomes a recurring blocker. + +**Why this priority**: Early drift detection is the core value of the feature. Without it, governance remains reactive and only responds after breakage is already frequent. + +**Independent Test**: Review a representative run sequence for `fast-feedback` and `confidence`, confirm that the summary shows current, previous, baseline, and budget values, and verify that healthy, near-budget, and worsening cases are distinguishable without manual arithmetic. + +**Acceptance Scenarios**: + +1. **Given** a lane stays near its baseline with comfortable headroom, **When** the trend summary is generated, **Then** the lane is shown as healthy with current, previous, baseline, and budget values visible. +2. **Given** a lane moves closer to its budget across multiple comparable runs, **When** the trend summary is generated, **Then** the lane is shown as budget-near or trending-worse before repeated hard failures begin. +3. 
**Given** a single run spikes but adjacent runs remain normal, **When** the trend summary is generated, **Then** the lane is treated as unstable or noisy rather than immediately treated as baseline regression. + +--- + +### User Story 2 - Decide Recalibration With Evidence Instead Of Habit (Priority: P1) + +As a maintainer responsible for budgets, I want explicit recalibration rules and supporting trend evidence so I can distinguish legitimate suite growth, lane reshaping, infrastructure change, and true regression. + +**Why this priority**: Without explicit policy, every slowdown invites arbitrary budget inflation or blanket refusal to recalibrate, and both outcomes weaken governance. + +**Independent Test**: Review one representative justified recalibration case and one rejected recalibration case, and confirm that the report plus policy make the outcome understandable without relying on private notes. + +**Acceptance Scenarios**: + +1. **Given** a lane slows because approved coverage legitimately expands its scope, **When** maintainers review the trend evidence, **Then** baseline recalibration is presented as discussable rather than automatic. +2. **Given** a lane slows because of a regression without approved scope change, **When** maintainers review the trend evidence, **Then** baseline and budget remain unchanged and follow-up performance work is indicated instead. +3. **Given** only runner noise is present, **When** the trend evidence is reviewed, **Then** no immediate baseline or budget recalibration is recommended. + +--- + +### User Story 3 - Track Dominant Hotspots Over Time (Priority: P2) + +As a contributor investigating suite slowdown, I want hotspot trend summaries per lane so I can target the dominant family or file based on persistent evidence rather than a single anecdotal slow run. + +**Why this priority**: Lane-level health points maintainers toward trouble, but hotspot trend visibility makes follow-up work actionable. 
+ +**Independent Test**: Review representative hotspot summaries for each primary lane across multiple runs and confirm that persistent, worsening, newly dominant, and unavailable hotspot states are visible. + +**Acceptance Scenarios**: + +1. **Given** the dominant hotspot families change between reporting windows, **When** the summary is generated, **Then** newly dominant families are visible without reading raw per-test output. +2. **Given** a known expensive family remains the major cost driver across several runs, **When** the summary is reviewed, **Then** its persistence is clear enough to support targeted follow-up work. +3. **Given** hotspot detail is unavailable for one reporting cycle, **When** the summary is generated, **Then** the report states that the hotspot evidence is incomplete instead of silently omitting context. + +### Edge Cases + +- The first rollout window has too little history for a given lane; the summary must clearly mark the comparison as insufficient rather than pretending a stable trend exists. +- Lane membership or scope changes make old and new runs only partially comparable; the report must flag that boundary before trend conclusions are drawn. +- A budget exists but the prior baseline is outdated or missing; the report must surface the mismatch rather than hiding it. +- Several lanes move at once after an infrastructure or runner change; the recalibration policy must prevent accidental budget inflation across the board. +- Hotspot evidence is only partially available for one lane; the lane health summary must remain readable while clearly disclosing the missing hotspot context. + +## Requirements *(mandatory)* + +**Constitution alignment (required):** This feature is repository-only test-governance work. It introduces no Microsoft Graph calls, no product write behavior, no `OperationRun`, and no end-user authorization changes. 
+ +**Constitution alignment (PROP-001 / ABSTR-001 / PERSIST-001 / STATE-001 / BLOAT-001):** This feature introduces repository-owned historical artifacts, drift states, and recalibration policy only because per-run enforcement alone is insufficient to govern long-horizon suite behavior. The Proportionality Review above explains why bounded derived history is the narrowest correct implementation. + +**Constitution alignment (TEST-GOV-001):** The feature covers the affected validation lanes, keeps heavy and browser scope unchanged, avoids new shared fixture cost, documents expected baseline and budget follow-up, and records the minimal reviewer commands above. + +### Functional Requirements + +- **FR-001 History Coverage**: The repository MUST retain or derive comparable runtime history for each primary governed lane: Fast Feedback, Confidence, Heavy Governance, and Browser. Support lanes such as JUnit or Profiling MUST be included when they materially improve hotspot or comparison evidence. +- **FR-002 Trend Record Contract**: Each retained trend record MUST include a lane identifier, a run or commit reference, measured runtime, baseline reference, budget context, and enough provenance to compare the record with the immediately preceding relevant record. +- **FR-003 Lane Summary Contract**: Each reporting cycle MUST expose, for every relevant lane, the current runtime, previous runtime, baseline, budget, delta to previous run, delta to baseline, and current lane health classification. +- **FR-004 Drift Health States**: The reporting model MUST distinguish at least the states `healthy`, `budget-near`, `trending-worse`, `regressed`, and `unstable`. +- **FR-005 Noise Handling**: A single anomalous run MUST NOT by itself force a lane into the same treatment as repeated deterioration; the trend model MUST differentiate one-off spikes from sustained erosion. 
+- **FR-006 Baseline Recalibration Policy**: The repository MUST document when a baseline may be reset, when it must remain unchanged, what evidence window is required, and who is expected to justify the decision. +- **FR-007 Budget Recalibration Policy**: The repository MUST document when a budget may change, when it must not change, and which reasons are considered valid, including deliberate lane-scope change, infrastructure shift, or post-improvement tightening. +- **FR-008 Explicit Recalibration Evidence**: Any approved baseline or budget recalibration MUST be tied to documented evidence showing the before-and-after rationale rather than silently adopting the latest run as the new truth. +- **FR-009 Hotspot Trend Visibility**: Each primary lane trend report MUST expose dominant cost drivers and indicate whether a hotspot is stable, worsening, or newly dominant compared with the reference window. +- **FR-010 Readable Summary**: Each reporting cycle MUST publish a concise summary that lets a reviewer tell which lanes are healthy, near budget, worsening, regressed, or candidates for recalibration without opening raw lane outputs first. +- **FR-011 Contributor Guidance**: Repository guidance MUST explain how to read the trend summary, when authors should react to budget-near or worsening status, when recalibration discussion is appropriate, and when a follow-up performance pass is the correct response instead. +- **FR-012 Bounded Retention**: The history model MUST remain lightweight by using bounded retained evidence sufficient for governance decisions rather than requiring unlimited archival of raw run outputs. +- **FR-013 Validation Examples**: Completion of this feature MUST include representative examples covering at least one healthy lane, one budget-near lane, one repeated worsening or regressed lane, one unstable case, and one justified recalibration case. 
+- **FR-014 Lane-First Governance**: Trend reporting MUST remain lane-first; hotspot detail may inform the decision, but it MUST NOT replace lane-level status as the primary governance unit. + +### Non-Functional Requirements + +- **NFR-001 Decision Speed**: A reviewer must be able to determine the health class of each governed lane from the summary in under two minutes for a normal reporting cycle. +- **NFR-002 Noise Resilience**: The trend model must reduce false regression calls caused by normal CI variance so that a single noisy run remains an exception rather than the default explanation. +- **NFR-003 Operational Weight**: The trend layer must reuse existing governed lane outputs and must not require duplicate full reruns of every primary lane solely to produce routine reporting. + +## Risks + +- **Overreacting to CI noise**: If the thresholds are too sensitive, normal runner variability could look like a structural regression. +- **Baseline inflation**: If recalibration is too easy, baseline history loses its value as a reference point. +- **Budget normalization drift**: If every overrun becomes a budget update, the budget model stops functioning as governance. +- **Over-complex reporting**: Too many metrics can make the summary harder to use instead of easier. +- **False precision**: Historical numbers can look more exact than the runner environment really allows. +- **Hotspot overload**: Too much hotspot detail can crowd out the lane-first decision that the report is supposed to support. + +## Rollout Guidance + +- Define the minimal trend data contract before adding new summary states. +- Introduce per-lane summaries showing current, previous, baseline, and budget values first. +- Add drift classification only after the comparison window is clear. +- Document baseline and budget recalibration policy before tuning thresholds. +- Add hotspot trend visibility for the highest-value lanes after the lane summary is readable. 
+- Validate the output with real or representative run sequences and adjust thresholds only when the examples show misleading outcomes. +- Keep the first slice minimal and decision-oriented rather than exhaustive. + +## Design Rules + +- **Budgets are policy, baselines are reference**. +- **Trend output must aid decisions**. +- **No silent recalibration**. +- **Noise-aware, not noise-blind**. +- **Lane-first observability**. +- **Hotspots support, not dominate, governance**. +- **Readable over exhaustive**. + +## Deliverables + +- A trend-capable runtime history contract or artifact for governed lanes. +- A per-lane trend summary showing current, previous, baseline, budget, and health state. +- A drift-classification model for lane health. +- Documented baseline recalibration policy. +- Documented budget recalibration policy. +- A hotspot trend view for relevant lanes. +- Contributor and reviewer guidance. +- Validation evidence from real or representative governed runs. + +### Key Entities *(include if feature involves data)* + +- **Lane Trend Record**: A retained runtime snapshot for one governed lane at one reporting point, including runtime, comparison context, and health state. +- **Baseline Reference**: The agreed reference value used to compare later lane runs without acting as the budget itself. +- **Budget Policy**: The governance limit and enforcement posture applied to a lane, distinct from the baseline reference. +- **Drift Status**: The named lane-health classification that distinguishes healthy behavior from near-budget, worsening, regressed, or unstable patterns. +- **Hotspot Trend Snapshot**: A ranked summary of the dominant cost drivers for a lane together with their change relative to the comparison window. +- **Recalibration Decision**: A documented decision that keeps, adjusts, or tightens a baseline or budget based on explicit trend evidence. 
+ +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: The trend summary covers 100% of primary governed lanes with current runtime, previous runtime, baseline, budget, and health classification visible in the validation evidence. +- **SC-002**: At least three sequential comparable samples are available for each primary governed lane in the validation evidence without requiring manual reconstruction outside repository-owned artifacts or summaries. +- **SC-003**: In the documented validation examples, single noisy outliers are classified differently from repeated deterioration in 100% of cases. +- **SC-004**: The validation evidence includes at least one justified recalibration case and at least one rejected recalibration case, each explainable from retained trend evidence without relying on private notes. +- **SC-005**: For each primary governed lane, the trend output identifies at least the top three dominant cost drivers or explicitly states that hotspot evidence is unavailable. +- **SC-006**: Reviewers can determine within two minutes whether a lane is healthy, budget-near, worsening, regressed, or recalibration-worthy from the generated summary. diff --git a/specs/211-runtime-trend-recalibration/tasks.md b/specs/211-runtime-trend-recalibration/tasks.md new file mode 100644 index 00000000..a63d215c --- /dev/null +++ b/specs/211-runtime-trend-recalibration/tasks.md @@ -0,0 +1,202 @@ +# Tasks: Test Runtime Trend Reporting & Baseline Recalibration + +**Input**: Design documents from `specs/211-runtime-trend-recalibration/` +**Prerequisites**: `plan.md` (required), `spec.md` (required), `research.md`, `data-model.md`, `contracts/`, `quickstart.md` + +**Tests**: Required. This feature changes repository test-governance runtime behavior, so each user story includes Pest guard coverage plus focused lane and wrapper validation through Sail and the repo-root test-governance scripts. 
+ +**Organization**: Tasks are grouped by user story so each story can be implemented and validated independently where possible. + +## Phase 1: Setup (Shared Context) + +**Purpose**: Freeze the real repo-truth seams and artifact boundaries before implementation begins. + +- [X] T001 [P] Audit `apps/platform/tests/Support/TestLaneManifest.php`, `apps/platform/tests/Support/TestLaneBudget.php`, `apps/platform/tests/Support/TestLaneReport.php`, `scripts/platform-test-report`, `scripts/platform-test-artifacts`, and `.gitea/workflows/*.yml` as the only valid trend-history and runtime-governance seams before implementation + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Extend the shared manifest, artifact, and wrapper seams that every story depends on. + +**Critical**: No user story work should begin until this phase is complete. + +- [X] T002 Extend `apps/platform/tests/Support/TestLaneManifest.php` with lane trend policy metadata, retention and comparison-window defaults, comparison-fingerprint inputs, hotspot limits, and `trend-history.json` artifact contracts aligned to `specs/211-runtime-trend-recalibration/data-model.md` +- [X] T003 [P] Extend `apps/platform/tests/Support/TestLaneReport.php` artifact path, read or write, and staging helpers so `apps/platform/storage/logs/test-lanes/<lane>-latest.trend-history.json` can be published alongside the existing summary, budget, report, and JUnit artifacts +- [X] T004 [P] Update `scripts/platform-test-report` and `scripts/platform-test-artifacts` to discover, select, and hydrate the latest comparable prior bundle or explicit local history input, then export the canonical `trend-history.json` artifact through the existing repo-root wrappers +- [X] T005 [P] Add or update shared guard coverage in `apps/platform/tests/Feature/Guards/TestLaneManifestTest.php`, `apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php`, 
`apps/platform/tests/Feature/Guards/TestLaneHistoryHydrationContractTest.php`, `apps/platform/tests/Feature/Guards/TestLaneTrendContractSchemaTest.php`, and `apps/platform/tests/Feature/Guards/TestLaneTrendLogicalContractTest.php` to lock lane trend policy metadata, latest-comparable-bundle hydration semantics, JSON schema sync against `specs/211-runtime-trend-recalibration/contracts/test-runtime-trend-history.schema.json`, logical contract sync against `specs/211-runtime-trend-recalibration/contracts/test-runtime-trend.logical.openapi.yaml`, and staged bundle completeness for `trend-history.json` + +**Checkpoint**: The shared trend-governance seams are ready for story-specific summary, recalibration, and hotspot work. + +--- + +## Phase 3: User Story 1 - See Lane Drift Before It Becomes A Repeated Gate (Priority: P1) 🎯 MVP + +**Goal**: Publish lane-first trend summaries that show current, previous, baseline, budget, and health status before a lane becomes a recurring blocker. + +**Independent Test**: Review representative three-sample run sequences for `fast-feedback` and `confidence`, confirm the summary shows current, previous, baseline, and budget values, and verify that healthy, near-budget, worsening, and noisy cases are distinguishable without manual arithmetic. 
+ +### Tests for User Story 1 + +- [X] T006 [P] [US1] Add `apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php` and update `apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php` to assert bounded history windows and current, previous, baseline, and budget fields for `fast-feedback` and `confidence` +- [X] T007 [P] [US1] Add `apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php` to cover `healthy`, `budget-near`, `trending-worse`, `regressed`, and `unstable` outcomes, including one-off noisy spike handling + +### Implementation for User Story 1 + +- [X] T008 [US1] Extend `apps/platform/tests/Support/TestLaneReport.php` with `LaneTrendRecord` generation, comparison-window evaluation, comparison fingerprints, and trend-aware `summary.md` plus `report.json` output for `fast-feedback` and `confidence` +- [X] T009 [US1] Update `apps/platform/tests/Support/TestLaneManifest.php`, `.gitea/workflows/test-pr-fast-feedback.yml`, and `.gitea/workflows/test-main-confidence.yml` so pull-request and mainline bundles discover and hydrate the latest comparable history bundle, then republish the refreshed `trend-history.json` artifact without widening lane execution +- [X] T010 [US1] Update `README.md` and `specs/211-runtime-trend-recalibration/quickstart.md` with reviewer guidance and local validation steps for reading lane health summaries across `fast-feedback` and `confidence` +- [X] T011 [US1] Run the narrowest proving path with `./scripts/platform-test-lane fast-feedback`, `./scripts/platform-test-report fast-feedback`, `./scripts/platform-test-lane confidence`, and `./scripts/platform-test-report confidence`, then record representative three-sample `healthy`, `budget-near`, and `unstable` evidence in `specs/211-runtime-trend-recalibration/spec.md` and `specs/211-runtime-trend-recalibration/quickstart.md` + +**Checkpoint**: At this point, lane drift visibility for the main contributor lanes should be independently functional 
and reviewable. + +--- + +## Phase 4: User Story 2 - Decide Recalibration With Evidence Instead Of Habit (Priority: P1) + +**Goal**: Separate baseline and budget recalibration from ordinary health status and make every recalibration decision evidence-backed. + +**Independent Test**: Review one justified recalibration case and one rejected recalibration case, and confirm the report plus policy make the outcome understandable without private notes. + +### Tests for User Story 2 + +- [X] T012 [P] [US2] Add `apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php` to assert baseline-vs-budget separation, evidence-window requirements, and approved versus rejected rationale handling +- [X] T013 [P] [US2] Add `apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php` to assert candidate, approved, and rejected recalibration records together with explicit summary disclosure for recalibration outcomes + +### Implementation for User Story 2 + +- [X] T014 [US2] Extend `apps/platform/tests/Support/TestLaneBudget.php` with recalibration recommendation helpers, lane-specific tolerance reuse, and explicit baseline plus budget review rules aligned to `specs/211-runtime-trend-recalibration/data-model.md` +- [X] T015 [US2] Extend `apps/platform/tests/Support/TestLaneManifest.php` and `apps/platform/tests/Support/TestLaneReport.php` to emit structured recalibration policy metadata, decision records, evidence run references, and `recordedIn` guidance pointing to `specs/211-runtime-trend-recalibration/spec.md` or the implementation PR without mutating manifest truth automatically +- [X] T016 [US2] Update `README.md` and `specs/211-runtime-trend-recalibration/quickstart.md` with the approved and rejected recalibration policy, required evidence windows, and reviewer follow-up rules +- [X] T017 [US2] Run recalibration validation with `./scripts/platform-test-report fast-feedback` and `./scripts/platform-test-report confidence` against seeded prior 
histories, then record one approved and one rejected recalibration example in `specs/211-runtime-trend-recalibration/spec.md` and `specs/211-runtime-trend-recalibration/quickstart.md` + +**Checkpoint**: At this point, recalibration guidance should be independently testable and clearly separated from ordinary lane health. + +--- + +## Phase 5: User Story 3 - Track Dominant Hotspots Over Time (Priority: P2) + +**Goal**: Surface persistent, worsening, and newly dominant hotspots so follow-up optimization work targets the real cost drivers. + +**Independent Test**: Review representative hotspot summaries for each primary lane across multiple runs and confirm that persistent, worsening, newly dominant, and unavailable hotspot states are visible. + +### Tests for User Story 3 + +- [X] T018 [P] [US3] Add `apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php` to assert top family and file delta output, new or dropped hotspot detection, and explicit unavailable-hotspot disclosure +- [X] T019 [P] [US3] Update `apps/platform/tests/Feature/Guards/ProfileLaneContractTest.php`, `apps/platform/tests/Feature/Guards/FastFeedbackLaneContractTest.php`, `apps/platform/tests/Feature/Guards/ConfidenceLaneContractTest.php`, `apps/platform/tests/Feature/Guards/HeavyGovernanceLaneContractTest.php`, `apps/platform/tests/Feature/Guards/BrowserLaneIsolationTest.php`, and `apps/platform/tests/Feature/Guards/CiHeavyBrowserWorkflowContractTest.php` to assert support-lane hotspot evidence and hotspot visibility for all primary lanes plus the chosen `junit` or `profiling` support example + +### Implementation for User Story 3 + +- [X] T020 [US3] Extend `apps/platform/tests/Support/TestLaneReport.php` with hotspot delta computation from `classificationTotals`, `familyTotals`, `hotspotFiles`, and `slowestEntries`, capping readable output to the policy limits defined in `apps/platform/tests/Support/TestLaneManifest.php` +- [X] T021 [US3] Update 
`apps/platform/tests/Support/TestLaneManifest.php`, `.gitea/workflows/test-heavy-governance.yml`, and `.gitea/workflows/test-browser.yml` so heavy and browser bundles retain hotspot-supporting history context and surface missing hotspot evidence explicitly +- [X] T022 [US3] Update `README.md` and `specs/211-runtime-trend-recalibration/quickstart.md` with hotspot investigation guidance, `profiling` and `junit` support-lane usage, and examples of persistent versus newly dominant hotspots +- [X] T023 [US3] Run representative hotspot validation with `./scripts/platform-test-report fast-feedback`, `./scripts/platform-test-report confidence`, `./scripts/platform-test-lane heavy-governance`, `./scripts/platform-test-report heavy-governance`, `./scripts/platform-test-lane browser`, `./scripts/platform-test-report browser`, and one support-lane report path from `./scripts/platform-test-report profiling` or `./scripts/platform-test-report junit`, then record persistent, worsening, newly dominant, and unavailable hotspot evidence for each primary lane in `specs/211-runtime-trend-recalibration/spec.md` and `specs/211-runtime-trend-recalibration/quickstart.md` + +**Checkpoint**: At this point, hotspot trend visibility should be independently functional without depending on recalibration rollout evidence. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Validate the full trend-governance slice, record evidence, and finish formatting. 
+ +- [X] T024 Run focused Pest coverage for `apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php`, `apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php`, `apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php`, `apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php`, `apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php`, `apps/platform/tests/Feature/Guards/TestLaneHistoryHydrationContractTest.php`, `apps/platform/tests/Feature/Guards/TestLaneTrendContractSchemaTest.php`, `apps/platform/tests/Feature/Guards/TestLaneTrendLogicalContractTest.php`, `apps/platform/tests/Feature/Guards/TestLaneManifestTest.php`, `apps/platform/tests/Feature/Guards/TestLaneArtifactsContractTest.php`, `apps/platform/tests/Feature/Guards/FastFeedbackLaneContractTest.php`, `apps/platform/tests/Feature/Guards/ConfidenceLaneContractTest.php`, `apps/platform/tests/Feature/Guards/ProfileLaneContractTest.php`, `apps/platform/tests/Feature/Guards/HeavyGovernanceLaneContractTest.php`, `apps/platform/tests/Feature/Guards/BrowserLaneIsolationTest.php`, and `apps/platform/tests/Feature/Guards/CiHeavyBrowserWorkflowContractTest.php` with `cd apps/platform && ./vendor/bin/sail artisan test --compact ...` +- [X] T025 [P] Execute the representative local and Gitea evidence set across `.gitea/workflows/test-pr-fast-feedback.yml`, `.gitea/workflows/test-main-confidence.yml`, `.gitea/workflows/test-heavy-governance.yml`, and `.gitea/workflows/test-browser.yml`, capture at least three sequential comparable samples for each primary lane, include one support-lane example from `junit` or `profiling`, time-box a reviewer dry run to confirm the summary remains decidable within two minutes, and record lane, health class, hotspot availability, recalibration outcome, and any material runtime drift follow-up in `specs/211-runtime-trend-recalibration/spec.md` and `specs/211-runtime-trend-recalibration/quickstart.md` 
+- [X] T026 Run `cd apps/platform && ./vendor/bin/sail bin pint --dirty --format agent` for changes in `apps/platform/tests/Support/TestLaneManifest.php`, `apps/platform/tests/Support/TestLaneBudget.php`, `apps/platform/tests/Support/TestLaneReport.php`, and the new or updated guard tests under `apps/platform/tests/Feature/Guards/` + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies and can start immediately. +- **Foundational (Phase 2)**: Depends on Phase 1 and blocks all user story work. +- **User Story 1 (Phase 3)**: Depends on Phase 2 only and is the MVP slice. +- **User Story 2 (Phase 4)**: Depends on Phase 2 and benefits from the trend-history infrastructure completed for User Story 1. +- **User Story 3 (Phase 5)**: Depends on Phase 2 and should follow User Story 1 because hotspot deltas reuse the same history and assessment outputs. +- **Polish (Phase 6)**: Depends on all desired user stories being complete. + +### User Story Dependencies + +- **User Story 1 (P1)**: Can begin immediately after Foundational and delivers the first usable runtime-trend surface. +- **User Story 2 (P1)**: Requires the same history contract as User Story 1 but remains independently valuable once that contract exists. +- **User Story 3 (P2)**: Reuses the bounded history from User Story 1 and the policy limits from Foundational, but does not need User Story 2 to be useful. + +### Within Each User Story + +- Story-specific guard tests should be written and fail before implementation. +- Manifest and wrapper contract changes should be in place before finalizing report output, schema validation, and comparable-bundle hydration steps. +- README and quickstart guidance should land after the corresponding runtime behavior exists. +- Lane validation and evidence capture should complete before closing a story. + +### Parallel Opportunities + +- T003, T004, and T005 can proceed in parallel once T002 fixes the shared manifest shape. 
+- In User Story 1, T006 and T007 can run in parallel because they cover separate guard surfaces. +- In User Story 2, T012 and T013 can run in parallel because policy rules and evidence-record assertions are independent tests. +- In User Story 3, T018 and T019 can run in parallel because they touch separate guard suites. +- T025 can run in parallel with final formatting once all implementation and guard work is stable. + +--- + +## Parallel Example: User Story 1 + +```bash +# After T002-T005 establish the shared history contract, these can proceed in parallel: +Task: "Add apps/platform/tests/Feature/Guards/TestLaneTrendSummaryContractTest.php and update TestLaneArtifactsContractTest.php" +Task: "Add apps/platform/tests/Feature/Guards/TestLaneTrendClassificationTest.php" +``` + +--- + +## Parallel Example: User Story 2 + +```bash +# After User Story 1 exposes comparable history, these can proceed in parallel: +Task: "Add apps/platform/tests/Feature/Guards/TestLaneRecalibrationPolicyTest.php" +Task: "Add apps/platform/tests/Feature/Guards/TestLaneRecalibrationEvidenceContractTest.php" +``` + +--- + +## Parallel Example: User Story 3 + +```bash +# After the shared hotspot-ready report shape exists, these can proceed in parallel: +Task: "Add apps/platform/tests/Feature/Guards/TestLaneHotspotTrendContractTest.php" +Task: "Update apps/platform/tests/Feature/Guards/ProfileLaneContractTest.php and apps/platform/tests/Feature/Guards/HeavyGovernanceLaneContractTest.php" +``` + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Setup. +2. Complete Phase 2: Foundational. +3. Complete Phase 3: User Story 1. +4. Validate `fast-feedback` and `confidence` trend summaries independently before continuing. + +### Incremental Delivery + +1. Deliver bounded history and lane health summaries first. +2. Add explicit recalibration policy and evidence records next. +3. 
Add hotspot delta visibility for heavy, browser, and support-lane-assisted investigations last. +4. Finish with focused guard validation, real evidence capture, and formatting. + +### Parallel Team Strategy + +1. One contributor can extend `apps/platform/tests/Support/TestLaneManifest.php` and wrapper scripts while another prepares the new guard suites. +2. After Foundational completes, User Story 1 test work and workflow hydration changes can be split across contributors. +3. User Story 2 recalibration logic and User Story 3 hotspot logic can proceed separately once the history contract is stable. + +--- + +## Notes + +- `[P]` tasks operate on different files or independent guard suites and can run in parallel once dependencies are satisfied. +- `[US1]`, `[US2]`, and `[US3]` map tasks directly to the user stories in `spec.md`. +- This feature changes runtime-governance behavior, so the narrowest relevant lane reruns and evidence capture remain part of the definition of done. +- Live Gitea validation remains required because local wrapper tests alone cannot prove cross-run artifact hydration and uploaded bundle behavior.