spec: add spec 113 artifacts

2026-02-26 02:18:19 +01:00 · 2026-02-26 02:18:19 +01:00 · a069085814
commit a069085814
parent 32c3a64147
8 changed files with 891 additions and 0 deletions
--- a/specs/113-platform-ops-runbooks/checklists/requirements.md
+++ b/specs/113-platform-ops-runbooks/checklists/requirements.md
@ -0,0 +1,35 @@
+# Specification Quality Checklist: Platform Ops Runbooks (Spec 113)
+
+**Purpose**: Validate specification completeness and quality before proceeding to planning  
+**Created**: 2026-02-26  
+**Feature**: specs/113-platform-ops-runbooks/spec.md
+
+## Content Quality
+
+- [x] No implementation details (languages, frameworks, APIs)
+- [x] Focused on user value and business needs
+- [x] Written for non-technical stakeholders
+- [x] All mandatory sections completed
+
+## Requirement Completeness
+
+- [x] No [NEEDS CLARIFICATION] markers remain
+- [x] Requirements are testable and unambiguous
+- [x] Success criteria are measurable
+- [x] Success criteria are technology-agnostic (no implementation details)
+- [x] All acceptance scenarios are defined
+- [x] Edge cases are identified
+- [x] Scope is clearly bounded
+- [x] Dependencies and assumptions identified
+
+## Feature Readiness
+
+- [x] All functional requirements have clear acceptance criteria
+- [x] User scenarios cover primary flows
+- [x] Feature meets measurable outcomes defined in Success Criteria
+- [x] No implementation details leak into specification
+
+## Notes
+
+- Spec intentionally uses concrete routes (`/system/*`, `/admin/*`) and capability identifiers to keep RBAC and plane separation testable.
+- Run tracking/audit/lock semantics are expressed as outcomes and constraints, not as specific classes or framework APIs.
--- a/specs/113-platform-ops-runbooks/contracts/system-ops-runbooks.openapi.yaml
+++ b/specs/113-platform-ops-runbooks/contracts/system-ops-runbooks.openapi.yaml
@ -0,0 +1,168 @@
+openapi: 3.0.3
+info:
+  title: System Ops Runbooks (Spec 113)
+  version: 0.1.0
+  description: |
+    Conceptual contract for the operator control plane under /system.
+
+    Note: The implementation is a Filament (Livewire) UI. These endpoints
+    represent the stable user-facing routes + the logical actions (preflight/run)
+    and their request/response shapes.
+
+servers:
+  - url: /
+
+paths:
+  /system/ops/runbooks:
+    get:
+      summary: Runbook catalog page
+      responses:
+        '200':
+          description: HTML page
+          content:
+            text/html:
+              schema:
+                type: string
+
+  /system/ops/runbooks/findings-lifecycle-backfill/preflight:
+    post:
+      summary: Preflight findings lifecycle backfill
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RunbookPreflightRequest'
+      responses:
+        '200':
+          description: Preflight result
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/RunbookPreflightResponse'
+        '403':
+          description: Platform user lacks capability
+        '404':
+          description: Wrong plane / not platform-authenticated
+
+  /system/ops/runbooks/findings-lifecycle-backfill/runs:
+    post:
+      summary: Start findings lifecycle backfill
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RunbookStartRequest'
+      responses:
+        '201':
+          description: Run accepted/queued
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/RunbookStartResponse'
+        '409':
+          description: Already queued / lock busy
+        '422':
+          description: Validation error (missing reason, missing typed confirmation, etc.)
+
+  /system/ops/runs:
+    get:
+      summary: Operation runs list page
+      responses:
+        '200':
+          description: HTML page
+          content:
+            text/html:
+              schema:
+                type: string
+
+  /system/ops/runs/{runId}:
+    get:
+      summary: Operation run detail page
+      parameters:
+        - in: path
+          name: runId
+          required: true
+          schema:
+            type: integer
+      responses:
+        '200':
+          description: HTML page
+          content:
+            text/html:
+              schema:
+                type: string
+        '404':
+          description: Not found
+
+components:
+  schemas:
+    RunbookScope:
+      type: object
+      required: [mode]
+      properties:
+        mode:
+          type: string
+          enum: [all_tenants, single_tenant]
+        tenant_id:
+          type: integer
+          nullable: true
+
+    RunbookPreflightRequest:
+      type: object
+      required: [scope]
+      properties:
+        scope:
+          $ref: '#/components/schemas/RunbookScope'
+
+    RunbookPreflightResponse:
+      type: object
+      required: [affected_count, total_count]
+      properties:
+        affected_count:
+          type: integer
+          minimum: 0
+        total_count:
+          type: integer
+          minimum: 0
+
+    RunbookReason:
+      type: object
+      required: [reason_code, reason_text]
+      properties:
+        reason_code:
+          type: string
+          enum: [DATA_REPAIR, INCIDENT, SUPPORT, SECURITY]
+        reason_text:
+          type: string
+          maxLength: 500
+
+    RunbookStartRequest:
+      type: object
+      required: [scope, preflight]
+      properties:
+        scope:
+          $ref: '#/components/schemas/RunbookScope'
+        preflight:
+          type: object
+          required: [affected_count]
+          properties:
+            affected_count:
+              type: integer
+              minimum: 0
+        typed_confirmation:
+          type: string
+          nullable: true
+          description: Required for all_tenants (must equal BACKFILL)
+        reason:
+          $ref: '#/components/schemas/RunbookReason'
+
+    RunbookStartResponse:
+      type: object
+      required: [operation_run_id, view_run_url]
+      properties:
+        operation_run_id:
+          type: integer
+        view_run_url:
+          type: string
--- a/specs/113-platform-ops-runbooks/data-model.md
+++ b/specs/113-platform-ops-runbooks/data-model.md
@ -0,0 +1,99 @@
+# Data Model — Spec 113: Platform Ops Runbooks
+
+This design describes the data we will read/write to implement the `/system` operator runbooks, grounded in the existing schema.
+
+## Core persisted entities
+
+### OperationRun (existing)
+- Table: `operation_runs`
+- Ownership:
+  - Workspace-owned (always has `workspace_id`)
+  - Tenant association is optional (`tenant_id` nullable) to support workspace/canonical runs
+- Fields (existing):
+  - `id`
+  - `workspace_id` (FK, NOT NULL)
+  - `tenant_id` (FK, nullable)
+  - `user_id` (FK to `users`, nullable)
+  - `initiator_name` (string)
+  - `type` (string; for this feature: `findings.lifecycle.backfill`)
+  - `status` (`queued|running|completed`)
+  - `outcome` (`pending|succeeded|failed|blocked|...`)
+  - `run_identity_hash` (string; active-run idempotency)
+  - `summary_counts` (json)
+  - `failure_summary` (json)
+  - `context` (json)
+  - `started_at`, `completed_at`
+
+#### Summary counts contract
+- Must only use keys from `App\Support\OpsUx\OperationSummaryKeys::all()`.
+- v1 keys for this runbook:
+  - `total` (findings scanned)
+  - `processed` (findings processed)
+  - `updated` (findings updated + duplicate consolidations)
+  - `skipped` (findings unchanged)
+  - `failed` (per-tenant job failures)
+  - `tenants` (for all-tenants orchestrator: tenants targeted)
+
+#### Context shape (for this feature)
+Store these values in `operation_runs.context`:
+
+- `runbook`:
+  - `key`: `findings.lifecycle.backfill`
+  - `scope`: `all_tenants` | `single_tenant`
+  - `target_tenant_id`: int|null
+  - `source`: `system_ui` | `cli` | `deploy_hook`
+- `preflight`:
+  - `affected_count`: int (findings that would change)
+  - `total_count`: int (findings scanned)
+  - `estimated_tenants`: int|null (for all tenants)
+- `reason` (required for all-tenants and break-glass):
+  - `reason_code`: `DATA_REPAIR|INCIDENT|SUPPORT|SECURITY`
+  - `reason_text`: string
+- `platform_initiator` (when started from `/system`):
+  - `platform_user_id`: int
+  - `email`: string
+  - `name`: string
+  - `is_break_glass`: bool
+
+Notes:
+- We intentionally do not store secrets/PII beyond operator email/name already used in auditing.
+- `failure_summary` should store sanitized messages + stable reason codes, as already done by `RunFailureSanitizer`.
+
+#### All-tenants run modeling (v1)
+- All-tenants executes as a single **workspace-scoped** run (`tenant_id = null`).
+- Implementation fans out to multiple tenant jobs, but they all update the same workspace run via:
+  - `OperationRunService::incrementSummaryCounts()`
+  - `OperationRunService::appendFailures()`
+  - `OperationRunService::maybeCompleteBulkRun()`
+- Per-tenant `OperationRun` rows are not required for v1 (avoids parent/child coordination).
+
+### Audit log (existing infrastructure)
+- Existing: `App\Services\Intune\AuditLogger` is already used for System login auditing.
+- New audit actions (stable action IDs):
+  - `platform.ops.runbooks.preflight`
+  - `platform.ops.runbooks.start`
+  - `platform.ops.runbooks.completed`
+  - `platform.ops.runbooks.failed`
+- Audit context should include:
+  - runbook key, scope, affected_count, operation_run_id, platform_user_id/email, ip/user_agent.
+
+### Alerts (existing infrastructure)
+- Use `AlertDispatchService` to create `alert_deliveries` for operators.
+- New alert event:
+  - `event_type`: `operations.run.failed`
+  - `tenant_id`: platform tenant id (to route via workspace rules)
+  - `metadata`: run id, run type, scope, view-run URL
+
+## Derived / non-persisted
+
+### Runbook catalog
+- Implemented as a PHP catalog (no DB table) with:
+  - key, label, description, capability required, estimated duration (can reuse `OperationCatalog`).
+
+## State transitions
+- `OperationRun.status/outcome` transitions are owned by `OperationRunService`.
+- Expected transitions (per run):
+  - `queued` → `running` → `completed(succeeded|failed|blocked)`
+- Locks:
+  - Tenant runs: already implemented via `Cache::lock('tenantpilot:findings:lifecycle_backfill:tenant:{id}', 900)`
+  - All-tenants orchestration: add a scope-level lock to prevent duplicate fan-out.
--- a/specs/113-platform-ops-runbooks/plan.md
+++ b/specs/113-platform-ops-runbooks/plan.md
@ -0,0 +1,128 @@
+# Implementation Plan: Platform Ops Runbooks (Spec 113)
+
+**Branch**: `[113-platform-ops-runbooks]` | **Date**: 2026-02-26  
+**Spec**: `specs/113-platform-ops-runbooks/spec.md`  
+**Input**: Feature specification + design artifacts in `specs/113-platform-ops-runbooks/`
+
+**Note**: This file is generated/maintained via Spec Kit (`/speckit.plan`). Keep it concise and free of placeholders/duplicates.
+
+## Summary
+
+Introduce a `/system` operator control plane for safe backfills/data repair.
+
+v1 delivers one runbook: **Rebuild Findings Lifecycle**. It must:
+- preflight (read-only)
+- require explicit confirmation (typed confirmation for all-tenants) + reason capture
+- execute as a tracked `OperationRun` with audit events + locking + idempotency
+- be **never exposed** in the customer `/admin` plane
+- reuse one shared code path across System UI + CLI + deploy hook
+
+## Technical Context
+
+- **Language/Runtime**: PHP 8.4, Laravel 12
+- **Admin UI**: Filament v5 (Livewire v4)
+- **Storage**: PostgreSQL
+- **Testing**: Pest v4 (required for runtime behavior changes)
+- **Ops primitives**: `OperationRun` + `OperationRunService` (service owns status/outcome transitions)
+
+## Non-negotiables (Constitution / Spec constraints)
+
+- Cross-plane access (`/admin` → `/system`) must be deny-as-not-found (**404**).
+- Platform user missing a required capability must be **403**.
+- `/system` session cookie must be isolated (distinct cookie name) and applied **before** `StartSession`.
+- `/system/login` throttling: **10/min** per **IP + username** key; failed login attempts are audited.
+- Any destructive-like action uses Filament `->action(...)` and `->requiresConfirmation()`.
+- Ops-UX contract: toast intent-only; progress in run detail; terminal DB notification is `OperationRunCompleted` (initiator-only); no queued/running DB notifications.
+- Audit writes are fail-safe (audit failure must not crash the runbook).
+
+## Scope decisions (v1)
+
+- **Canonical run viewing** for this spec is the **System panel**:
+  - Runbooks: `/system/ops/runbooks`
+  - Runs: `/system/ops/runs`
+- **Allowed tenant universe (v1)**: all non-platform tenants present in the database (`tenants.external_id != 'platform'`). The System UI must not allow selecting or targeting the platform tenant.
+
+## Project Structure
+
+### Documentation
+
+```text
+specs/113-platform-ops-runbooks/
+├── spec.md
+├── plan.md
+├── research.md
+├── data-model.md
+├── quickstart.md
+├── tasks.md
+└── contracts/
+    └── system-ops-runbooks.openapi.yaml
+```
+
+### Source code (planned touch points)
+
+```text
+app/
+├── Console/Commands/
+│   ├── TenantpilotBackfillFindingLifecycle.php
+│   └── TenantpilotRunDeployRunbooks.php
+├── Filament/System/Pages/
+│   └── Ops/
+│       ├── Runbooks.php
+│       ├── Runs.php
+│       └── ViewRun.php
+├── Http/Middleware/
+│   ├── EnsureCorrectGuard.php
+│   ├── EnsurePlatformCapability.php
+│   └── UseSystemSessionCookie.php
+├── Jobs/
+│   ├── BackfillFindingLifecycleJob.php
+│   ├── BackfillFindingLifecycleWorkspaceJob.php
+│   └── BackfillFindingLifecycleTenantIntoWorkspaceRunJob.php
+├── Providers/Filament/
+│   └── SystemPanelProvider.php
+├── Services/
+│   ├── Alerts/AlertDispatchService.php
+│   ├── OperationRunService.php
+│   └── Runbooks/FindingsLifecycleBackfillRunbookService.php
+└── Support/Auth/
+    └── PlatformCapabilities.php
+
+resources/views/filament/system/pages/ops/
+├── runbooks.blade.php
+├── runs.blade.php
+└── view-run.blade.php
+
+tests/Feature/System/
+├── Spec113/
+└── OpsRunbooks/
+```
+
+## Implementation Phases
+
+1) **Foundational security hardening**
+   - Capability registry additions.
+   - 404 vs 403 semantics correctness.
+   - System session cookie isolation.
+   - System login throttling.
+
+2) **Runbook core service (single source of truth)**
+   - `preflight(scope)` + `start(scope, initiator, reason, source)`.
+   - Audit events (fail-safe).
+   - Locking + idempotency.
+
+3) **Execution pipeline**
+   - All-tenants orchestration as a workspace-scoped bulk run.
+   - Fan-out tenant jobs update shared run counts and completion.
+
+4) **System UI surfaces**
+   - `/system/ops/runbooks` (preflight + confirm + start).
+   - `/system/ops/runs` list + `/system/ops/runs/{run}` detail.
+
+5) **Remove customer-plane exposure**
+   - Remove/disable `/admin` maintenance trigger (feature flag default-off) + regression test.
+
+6) **Shared entry points**
+   - Refactor existing CLI command to call the shared service.
+   - Add deploy hook command that calls the same service.
+
+7) **Verification**
+   - Run focused tests + formatting (`vendor/bin/sail artisan test --compact` + `vendor/bin/sail bin pint --dirty`).
--- a/specs/113-platform-ops-runbooks/quickstart.md
+++ b/specs/113-platform-ops-runbooks/quickstart.md
@ -0,0 +1,35 @@
+# Quickstart — Spec 113 (Operator Runbooks)
+
+## Prereqs
+- Docker + Laravel Sail
+
+## Boot the app
+- `vendor/bin/sail up -d`
+- `vendor/bin/sail composer install`
+- `vendor/bin/sail artisan migrate`
+
+## Seed a platform operator
+- `vendor/bin/sail artisan db:seed --class=PlatformUserSeeder`
+
+This creates:
+- Workspace: `default`
+- Tenant: `platform` (used for platform-plane audit context)
+- PlatformUser: `operator@tenantpilot.io` / password `password`
+
+## Open the System panel
+- Visit `/system` and login as the platform operator.
+
+## Run the findings lifecycle backfill
+1. Go to `/system/ops/runbooks`
+2. Select scope (All tenants or Single tenant)
+3. Run preflight
+4. Confirm and start
+5. Use “View run” to monitor progress
+
+## CLI (existing)
+- Tenant-scoped backfill (existing behavior):
+  - `vendor/bin/sail artisan tenantpilot:findings:backfill-lifecycle --tenant={tenant_id|external_id}`
+
+## Notes
+- In production-like environments, `/admin` must not expose maintenance/backfill actions.
+- If UI changes don’t show up, run `vendor/bin/sail npm run dev`.
--- a/specs/113-platform-ops-runbooks/research.md
+++ b/specs/113-platform-ops-runbooks/research.md
@ -0,0 +1,82 @@
+# Research — Spec 113: Platform Ops Runbooks
+
+This file resolves the design unknowns required to produce an implementation plan that fits the existing TenantAtlas codebase.
+
+## Decisions
+
+### 1) Reuse existing backfill pipeline (Command + Job) via a single service
+- **Decision**: Extract a single “runbook service” that is called from:
+  - `/system` runbook UI (preflight + start)
+  - CLI command (`tenantpilot:findings:backfill-lifecycle`)
+  - deploy-time hook
+- **Rationale**: The repo already contains a correct tenant-scoped implementation:
+  - Command: `app/Console/Commands/TenantpilotBackfillFindingLifecycle.php`
+  - Job: `app/Jobs/BackfillFindingLifecycleJob.php`
+  - It uses `OperationRunService` for lifecycle transitions and idempotency, and a cache lock per tenant.
+- **Alternatives considered**:
+  - Build a new pipeline from scratch → rejected as it duplicates proven behavior and increases drift risk.
+
+### 2) “All tenants” scope uses a single workspace run updated by many tenant jobs
+- **Decision**: Implement All-tenants as:
+  1) one **workspace-scoped** `OperationRun` (tenant_id = null) created with `OperationRunService::ensureWorkspaceRunWithIdentity()`
+  2) fan-out to many queued tenant jobs that all **increment the same workspace run’s** `summary_counts` and contribute failures
+  3) completion via `OperationRunService::maybeCompleteBulkRun()` when `processed >= total` (same pattern as workspace backfills)
+- **Rationale**:
+  - This matches an existing proven pattern in the repo (`tenantpilot:backfill-workspace-ids` + `BackfillWorkspaceIdsJob`).
+  - It yields a single “View run” target with meaningful progress, without needing parent/child run stitching.
+  - Tenant isolation remains intact because each job still operates tenant-scoped and holds the existing per-tenant lock.
+- **Alternatives considered**:
+  - Separate per-tenant `OperationRun` records + an umbrella run → rejected for v1 due to added coordination complexity.
+
+### 3) Workspace scope for /system runbooks (v1)
+- **Decision**: v1 targets the **default workspace** (same workspace that owns the `platform` Tenant created by `PlatformUserSeeder`).
+- **Rationale**:
+  - Platform identity currently has no explicit workspace selector in the System panel.
+  - Existing seeder creates `Workspace(slug=default)` and a `Tenant(external_id=platform)` inside it.
+- **Alternatives considered**:
+  - Multi-workspace operator selection in `/system` → deferred (not in spec, requires new UX + entitlement model).
+
+### 4) Remove/disable `/admin` maintenance action (FR-001)
+- **Decision**: Remove or feature-flag off the existing `/admin` header action “Backfill findings lifecycle” currently present in `app/Filament/Resources/FindingResource/Pages/ListFindings.php`.
+- **Rationale**: Spec explicitly forbids customer-plane exposure in production-like environments.
+- **Alternatives considered**:
+  - Keep the action but hide visually → rejected; it still exists as an affordance and is easy to re-enable by accident.
+
+### 5) Session isolation for `/system` (SR-004)
+- **Decision**: Add a System-panel-only middleware that sets a dedicated session cookie name for `/system/*` **before** `StartSession` runs.
+- **Rationale**:
+  - SystemPanelProvider defines its own middleware list; we can insert a middleware at the top.
+  - Changing `config(['session.cookie' => ...])` per request is sufficient for cookie separation without introducing a new domain.
+- **Alternatives considered**:
+  - Separate subdomain → deferred (explicitly “later”).
+
+### 6) `/system/login` rate limiting (SR-003)
+- **Decision**: Implement rate limiting inside `app/Filament/System/Pages/Auth/Login.php` (override `authenticate()`) using a combined key: `ip + normalized(email)` at 10/min.
+- **Rationale**:
+  - The System login already overrides `authenticate()` to add auditing.
+  - Implementing rate limiting here keeps the policy tightly scoped to the System login surface.
+- **Alternatives considered**:
+  - Global route middleware throttle → possible, but harder to scope precisely to this Filament auth page.
+
+### 7) 404 vs 403 semantics for platform capability checks (SR-002)
+- **Decision**: Keep cross-plane denial as **404** (existing `EnsureCorrectGuard`), but missing platform capability should return **403**.
+- **Rationale**:
+  - Spec requires: wrong plane → 404; platform lacking capability → 403.
+  - The current `EnsurePlatformCapability` middleware calls `abort(404)`, which conflicts with the spec.
+- **Alternatives considered**:
+  - Return 404 for missing platform capability → rejected because it contradicts the agreed spec.
+
+### 8) Failure notifications (FR-009)
+- **Decision**: On run failure, emit:
+  1) the canonical terminal DB notification (`OperationRunCompleted`) to the initiating platform operator (in-app)
+  2) an Alerts event (Teams / Email) **if alert routing is configured**
+- **Rationale**:
+  - Alerts system already exists (`AlertDispatchService` + queued deliveries). It can route to Teams webhook / Email.
+  - `OperationRunCompleted` already formats the correct persistent DB notification payload via `OperationUxPresenter`.
+- **Alternatives considered**:
+  - Send Teams webhook directly from job → rejected; bypasses alert rules/cooldowns/quiet hours.
+
+## Notes for implementation
+- Platform capabilities must be defined in the registry (`app/Support/Auth/PlatformCapabilities.php`) and referenced via constants.
+- The System panel currently does not call `->databaseNotifications()`. If we want in-app notifications for platform operators, add it.
+- `OperationRun.user_id` cannot point to `platform_users`; use `context` fields to record platform initiator metadata.
--- a/specs/113-platform-ops-runbooks/spec.md
+++ b/specs/113-platform-ops-runbooks/spec.md
@ -0,0 +1,167 @@
+# Feature Specification: Platform Ops Runbooks (Operator Control Plane) for Backfills & Data Repair
+
+**Feature Branch**: `[113-platform-ops-runbooks]`  
+**Created**: 2026-02-26  
+**Status**: Draft  
+**Input**: Operator control plane runbooks for safe backfills and data repair; deploy-time automatic execution; operator re-run via `/system`; never exposed in customer UI.
+
+## Clarifications
+
+### Session 2026-02-26
+
+- Q: `/system` Session Isolation Strategy (v1) → A: B — Use a distinct session cookie name/config for `/system`.
+- Q: `OperationRun.type` for the findings lifecycle backfill runbook → A: Use `findings.lifecycle.backfill` (consistent with the operation catalog). Runbook trigger is exclusive to `/system`; any `/admin` trigger is removed / feature-flagged off.
+- Q: v1 scope selector for running the runbook → A: All tenants (default) + Single tenant (picker).
+- Q: Failure notification delivery (v1) → A: Deliver via existing alert destinations (Teams webhook / Email) when configured, and always notify the initiating platform operator in-app.
+- Q: `/system/login` rate limiting policy (v1) → A: 10/min per IP + username (combined key).
+- Q: Platform “allowed tenant universe” (v1) → A: All non-platform tenants present in the database (`tenants.external_id != 'platform'`). The System UI must not allow selecting or targeting the platform tenant.
+
+## Spec Scope Fields *(mandatory)*
+
+- **Scope**: canonical-view (platform control plane)
+- **Primary Routes**:
+  - `/system/ops/runbooks` (runbook catalog + preflight + run)
+  - `/system/ops/runs` (run history + run details)
+  - `/admin/*` (explicitly remove any maintenance/backfill affordances)
+- **Data Ownership**:
+  - Tenant-owned customer data that may be modified by runbooks (e.g., “findings” lifecycle/workflow fields)
+  - Platform-owned operational records (operation runs, audit events, operator notifications)
+- **RBAC**:
+  - Platform identity only (separate from tenant users)
+  - Capabilities (v1 minimum): `platform.ops.view`, `platform.runbooks.view`, `platform.runbooks.run`
+  - Optional granular capability for this runbook: `platform.runbooks.findings.lifecycle_backfill`
+
+For canonical-view specs, the spec MUST define:
+
+- **Default filter behavior when tenant-context is active**: the runbook defaults to **All tenants** scope; if a tenant is explicitly selected, all counts/changes MUST be limited to that tenant only.
+- **Explicit entitlement checks preventing cross-tenant leakage**: a tenant-context user MUST NOT be able to access `/system/*` (deny-as-not-found). Platform operators MUST only be able to target tenants within the platform’s allowed tenant universe.
+
+## User Scenarios & Testing *(mandatory)*
+
+### User Story 1 - Operator runs a runbook safely (Priority: P1)
+
+As a platform operator, I can run a predefined “Rebuild Findings Lifecycle” runbook from `/system` with a clear preflight, explicit confirmation, and an audited, trackable run record.
+
+**Why this priority**: This is the primary operator workflow that eliminates the need for SSH/manual scripts and reduces risk for customer-impacting data changes.
+
+**Independent Test**: Fully testable by visiting `/system/ops/runbooks`, running preflight, starting a run, and verifying the run record + audit events exist.
+
+**Acceptance Scenarios**:
+
+1. **Given** an authorized platform operator, **When** they open `/system/ops/runbooks`, **Then** they see the runbook catalog including “Rebuild Findings Lifecycle” and an operator warning that actions may modify customer data.
+2. **Given** preflight reports `affected_count > 0`, **When** the operator confirms the run, **Then** a new operation run is created and the UI links to “View run”.
+3. **Given** preflight reports `affected_count = 0`, **When** the operator attempts to run, **Then** the run action is disabled with a clear “Nothing to do” explanation.
+4. **Given** the operator chooses “All tenants”, **When** they confirm, **Then** typed confirmation is required (e.g., entering `BACKFILL`) and a reason is required.
+
+---
+
+### User Story 2 - Customers never see maintenance actions (Priority: P1)
+
+As a tenant (customer) user, I never see backfill/repair buttons and cannot access the operator control plane.
+
+**Why this priority**: Exposing maintenance controls in customer UI is an enterprise anti-pattern and undermines product trust.
+
+**Independent Test**: Fully testable by checking `/admin` UI surfaces and attempting direct navigation to `/system/*` as a tenant user.
+
+**Acceptance Scenarios**:
+
+1. **Given** a tenant user session, **When** the user requests `/system/ops/runbooks`, **Then** the response is **404** (deny-as-not-found).
+2. **Given** production-like configuration, **When** a tenant user views relevant `/admin` screens, **Then** there is no maintenance/backfill/repair UI.
+
+---
+
+### User Story 3 - Same logic for deploy-time and operator re-run (Priority: P2)
+
+As a platform operator or a deploy pipeline, I can execute the same runbook logic consistently, so that deploy-time backfills are automatic and manual re-runs remain available and safe.
+
+**Why this priority**: A single execution path reduces drift between “what deploy does” and “what operators can re-run”, and improves reliability.
+
+**Independent Test**: Fully testable by running the operation twice and verifying idempotency and consistent preflight/run results for the same scope.
+
+**Acceptance Scenarios**:
+
+1. **Given** the runbook was executed once successfully, **When** it is executed again with the same scope, **Then** the second run reports `updated_count = 0` (idempotent behavior).
+
+### Edge Cases
+
+- Lock already held: another run is in-progress for the same scope (All tenants or the same tenant).
+- Large dataset: preflight must remain fast enough for operator use; writes must be chunked to avoid long locks.
+- Partial failure: some tenants/records fail while others succeed; run outcome and audit still record what happened.
+- Missing reason: an All-tenants or break-glass run cannot start without a reason.
+
+## Requirements *(mandatory)*
+
+### Constitution alignment notes
+
+- **No customer-plane maintenance**: Any maintenance/backfill/repair affordance in `/admin` is explicitly out of scope for customer UX.
+- **Run observability**: Customer-impacting writes MUST be executed as a tracked operation run with clear status/outcome and operator-facing surfaces.
+- **Safety gates**: Preflight → explicit confirmation → audited execution is mandatory.
+
+### Functional Requirements
+
+- **FR-001 (Remove Customer Exposure)**: The system MUST not expose any backfill/repair controls in `/admin` in production-like environments. Any legacy `/admin` trigger for the findings lifecycle backfill MUST be removed or disabled (feature-flag off by default).
+- **FR-002 (Runbook Catalog)**: The system MUST provide a `/system/ops/runbooks` catalog listing predefined runbooks and their descriptions.
+- **FR-003 (Runbook: Rebuild Findings Lifecycle)**: The system MUST provide a runbook that supports:
+  - Preflight (read-only) showing at least `affected_count`.
+  - Run (write) that starts a tracked operation run and links to “View run”.
+  - Scope selection: All tenants (default) and Single tenant (picker).
+  - Safe confirmation: includes scope + preflight count + “modifies customer data” warning.
+  - Typed confirmation for All-tenants scope (e.g., `BACKFILL`).
+  - Run disabled when preflight indicates nothing to do.
+- **FR-004 (Single Source of Truth)**: The system MUST implement the runbook logic once and reuse it across:
+  - deploy-time execution (automation)
+  - operator UI execution in `/system`
+  The two paths MUST produce consistent results for the same scope.
+- **FR-005 (Operation Run Tracking)**: Each run MUST create a run record including:
+  - run type identifier: `findings.lifecycle.backfill`
+  - scope (all tenants vs single tenant)
+  - actor (platform user, including break-glass marker when applicable)
+  - outcome/status transitions owned by the service layer
+  - numeric summary counts using a centralized allow-list of keys
+  - run context containing: `preflight.affected_count`, `updated_count`, `skipped_count`, `error_count`, and duration
+- **FR-006 (Audit Events)**: The system MUST write audit events for start, completion, and failure. Audit writing MUST be fail-safe (audit failures do not crash the operation run).
+- **FR-007 (Reasons for Sensitive Runs)**: All-tenants runs and break-glass runs MUST require a reason:
+  - `reason_code`: one of `DATA_REPAIR`, `INCIDENT`, `SUPPORT`, `SECURITY`
+  - `reason_text`: free text (max 500 characters)
+- **FR-008 (Locking & Idempotency)**: The system MUST prevent concurrent runs for the same scope via locking and MUST be idempotent (a second execution does not re-write already-correct data).
+- **FR-009 (Operator Notification on Failure)**: A failed run MUST notify operator targets with run type + scope + a link to “View run”. v1 delivery:
+  - If alert destinations are configured, deliver via existing destinations (Teams webhook / Email).
+  - Always notify the initiating platform operator in-app.
+  Success notifications are optional and SHOULD be off by default.
+
+### Security & Non-Functional Requirements
+
+- **SR-001 (Control Plane Isolation)**: `/system` MUST be isolated to platform identity and MUST deny tenant-plane access as **404** (anti-enumeration).
+- **SR-002 (404 vs 403 Semantics)**:
+  - Non-platform users or wrong plane → **404**
+  - Platform user lacking required capability → **403**
+- **SR-003 (Login Throttling)**: The `/system/login` surface MUST be rate limited at **10/min per IP + username (combined key)** and failed login attempts MUST be audited.
+- **SR-004 (Session Isolation Strategy)**: v1 MUST isolate control plane sessions from tenant sessions by using a distinct session cookie name/config for `/system` (same domain). A dedicated subdomain with separate cookie scope may be introduced later.
+- **SR-005 (Break-glass Visibility & Enforcement)**: Break-glass mode MUST be visually obvious and MUST require a reason; break-glass usage MUST be recorded on the run and in audit.
+- **NFR-001 (Performance & Safety)**:
+  - Preflight MUST be read-only and cheap enough for interactive use.
+  - Writes MUST be chunked and resilient to partial failures.
+
+## UI Action Matrix *(mandatory when Filament is changed)*
+
+| Surface | Location | Header Actions | Inspect Affordance (List/Table) | Row Actions (max 2 visible) | Bulk Actions (grouped) | Empty-State CTA(s) | View Header Actions | Create/Edit Save+Cancel | Audit log? | Notes / Exemptions |
+|---|---|---|---|---|---|---|---|---|---|---|
+| Runbooks | `/system/ops/runbooks` | `Preflight` (read-only), `Run…` (write, confirm) | N/A | `View run` (after start) | None | None | N/A | N/A | Yes | `Run…` requires confirmation; typed confirm + reason required for All tenants. |
+| Operation Runs | `/system/ops/runs` | N/A | List links to run detail (“View run”) | `View` | None | None | N/A | N/A | Yes | Run detail includes scope, actor, counts, outcome/status. |
+
+### Key Entities *(include if feature involves data)*
+
+- **Runbook**: A predefined operator action with preflight and run behavior.
+- **Operation Run**: A tracked execution record storing scope, actor, status/outcome, and summary counts.
+- **Audit Event**: An immutable security/ops log entry for the preflight/run lifecycle.
+- **Operator Notification**: A delivery record/target for failure alerts.
+- **Finding**: Tenant-owned record whose lifecycle/workflow fields may be backfilled.
+
+## Success Criteria *(mandatory)*
+
+### Measurable Outcomes
+
+- **SC-001**: In production-like environments, customers have **zero** UI affordances to trigger backfills/repairs in `/admin`.
+- **SC-002**: A platform operator can start a runbook without SSH and reach “View run” in **≤ 3 user interactions** from `/system/ops/runbooks`.
+- **SC-003**: 100% of run attempts result in an operation run record and start/completion/failure audit events (with failure still recorded even if notifications fail).
+- **SC-004**: Re-running the same runbook on the same scope after completion results in `updated_count = 0` (idempotency).
--- a/specs/113-platform-ops-runbooks/tasks.md
+++ b/specs/113-platform-ops-runbooks/tasks.md
@ -0,0 +1,177 @@
+---
+
+description: "Task list for Spec 113 implementation"
+---
+
+# Tasks: Platform Ops Runbooks (Operator Control Plane)
+
+**Input**: Design documents from `specs/113-platform-ops-runbooks/`
+**Prerequisites**: `specs/113-platform-ops-runbooks/plan.md`, `specs/113-platform-ops-runbooks/spec.md`, plus `specs/113-platform-ops-runbooks/research.md`, `specs/113-platform-ops-runbooks/data-model.md`, `specs/113-platform-ops-runbooks/contracts/system-ops-runbooks.openapi.yaml`, `specs/113-platform-ops-runbooks/quickstart.md`.
+
+**Tests**: REQUIRED (Pest) for all runtime behavior changes.
+
+---
+
+## Phase 1: Setup (Shared Infrastructure)
+
+**Purpose**: Confirm touch points and keep spec artifacts aligned.
+
+- [ ] T001 Confirm spec UI Action Matrix is complete in specs/113-platform-ops-runbooks/spec.md
+- [ ] T002 Confirm System panel provider registration in bootstrap/providers.php (Laravel 11+/12 provider registration)
+- [ ] T003 [P] Capture current legacy /admin trigger location in app/Filament/Resources/FindingResource/Pages/ListFindings.php ("Backfill findings lifecycle" header action)
+- [ ] T004 [P] Review existing single-tenant backfill pipeline entry points in app/Console/Commands/TenantpilotBackfillFindingLifecycle.php and app/Jobs/BackfillFindingLifecycleJob.php
+
+---
+
+## Phase 2: Foundational (Blocking Prerequisites)
+
+**Purpose**: Security semantics, session isolation, and auth hardening that block all user stories.
+
+- [ ] T005 Add platform runbook capability constants to app/Support/Auth/PlatformCapabilities.php (e.g., platform.ops.view, platform.runbooks.view, platform.runbooks.run, platform.runbooks.findings.lifecycle_backfill)
+- [ ] T006 Update System panel access control to use capability registry constants in app/Providers/Filament/SystemPanelProvider.php (keep ACCESS_SYSTEM_PANEL gate, add per-page capability checks)
+- [ ] T007 Change platform capability denial semantics to 403 (member-but-missing-capability) in app/Http/Middleware/EnsurePlatformCapability.php (keep wrong-plane 404 handled by ensure-correct-guard)
+- [ ] T008 [P] Add SR-002 regression tests for 404 vs 403 semantics in tests/Feature/System/Spec113/AuthorizationSemanticsTest.php (tenant user -> 404 on /system/*, platform user without capability -> 403, platform user with capability -> 200)
+
+- [ ] T009 Define and enforce the “allowed tenant universe” for System runbooks in app/Services/System/AllowedTenantUniverse.php (v1: exclude platform tenant; provide tenant query for pickers and runtime guard)
+- [ ] T010 [P] Add allowed tenant universe tests in tests/Feature/System/Spec113/AllowedTenantUniverseTest.php (picker excludes platform tenant; attempts to target excluded tenant are rejected; no OperationRun created)
+
+- [ ] T011 Create System session cookie isolation middleware in app/Http/Middleware/UseSystemSessionCookie.php (set dedicated session cookie name before StartSession)
+- [ ] T012 Wire System session cookie middleware before StartSession in app/Providers/Filament/SystemPanelProvider.php (SR-004)
+- [ ] T013 [P] Add System session isolation test in tests/Feature/System/Spec113/SystemSessionIsolationTest.php (assert response sets the System session cookie name for /system)
+
+- [ ] T014 Implement /system/login throttling (10/min per IP + username key) in app/Filament/System/Pages/Auth/Login.php (SR-003; use RateLimiter and clear on success)
+- [ ] T015 [P] Add /system/login throttling tests in tests/Feature/System/Spec113/SystemLoginThrottleTest.php (assert further attempts are throttled once the 10/min limit per IP + username key is exceeded; ensure failures still emit audit via AuditLogger)
+
+---
+
+## Phase 3: User Story 1 — Operator runs a runbook safely (Priority: P1) 🎯 MVP
+
+**Goal**: `/system/ops/runbooks` supports preflight + explicit confirmation + reason capture + typed confirmation for all-tenants; starts a tracked `OperationRun` and links to “View run”.
+
+**Independent Test**: Visit `/system/ops/runbooks`, run preflight, start run, follow “View run” to `/system/ops/runs/{id}`, and confirm audit/run records exist.
+
+### Tests for User Story 1
+
+- [ ] T016 [P] [US1] Add runbook preflight tests in tests/Feature/System/OpsRunbooks/FindingsLifecycleBackfillPreflightTest.php (single tenant + all tenants preflight returns affected_count)
+- [ ] T017 [P] [US1] Add runbook start/confirmation tests in tests/Feature/System/OpsRunbooks/FindingsLifecycleBackfillStartTest.php (typed confirmation + reason required for all_tenants; Run action is disabled when affected_count=0)
+- [ ] T018 [P] [US1] Add break-glass reason enforcement + recording tests in tests/Feature/System/OpsRunbooks/FindingsLifecycleBackfillBreakGlassTest.php (reason required when break-glass active; break-glass marker and reason recorded on run + audit)
+- [ ] T019 [P] [US1] Add Ops-UX feedback contract test for start surface in tests/Feature/System/OpsRunbooks/OpsUxStartSurfaceContractTest.php (toast intent-only + “View run” link; no DB queued/running notifications)
+- [ ] T020 [P] [US1] Add audit fail-safe test in tests/Feature/System/OpsRunbooks/FindingsLifecycleBackfillAuditFailSafeTest.php (audit logger failure does not crash run; run still records failure outcome)
+
+### Implementation for User Story 1
+
+- [ ] T021 [US1] Create runbook service app/Services/Runbooks/FindingsLifecycleBackfillRunbookService.php with methods preflight(scope) and start(scope, initiator, reason, source)
+- [ ] T022 [P] [US1] Create runbook scope/value objects in app/Services/Runbooks/FindingsLifecycleBackfillScope.php and app/Services/Runbooks/RunbookReason.php (validate reason_code and reason_text max 500 chars; include break-glass reason requirements)
+- [ ] T023 [US1] Add audit events for preflight/start/completed/failed using AuditLogger in app/Services/Runbooks/FindingsLifecycleBackfillRunbookService.php (action IDs per specs/113-platform-ops-runbooks/data-model.md; must be fail-safe)
+- [ ] T024 [US1] Record break-glass marker + reason on OperationRun context and audit in app/Services/Runbooks/FindingsLifecycleBackfillRunbookService.php (SR-005)
+
+- [ ] T025 [US1] Implement all-tenants orchestration job in app/Jobs/BackfillFindingLifecycleWorkspaceJob.php (create/lock workspace-scoped OperationRun; dispatch tenant fan-out; set summary_counts[tenants/total/processed])
+- [ ] T026 [US1] Implement tenant worker job that updates the shared workspace run in app/Jobs/BackfillFindingLifecycleTenantIntoWorkspaceRunJob.php (chunk writes; increment summary_counts keys from OperationSummaryKeys::all(); append failures; call maybeCompleteBulkRun())
+- [ ] T027 [US1] Ensure scope-level lock prevents concurrent all-tenants runs in app/Services/Runbooks/FindingsLifecycleBackfillRunbookService.php (lock key includes workspace + scope)
+
+- [ ] T028 [US1] Enable platform in-app notifications for run completion/failure by turning on database notifications in app/Providers/Filament/SystemPanelProvider.php (ensure terminal notification is OperationRunCompleted, initiator-only)
+- [ ] T029 [P] [US1] Add System “View run” URL helper in app/Support/System/SystemOperationRunLinks.php and use it for UI + alerts/notifications (avoid admin-plane links)
+- [ ] T030 [US1] Dispatch Alerts event on failure using app/Services/Alerts/AlertDispatchService.php from app/Services/Runbooks/FindingsLifecycleBackfillRunbookService.php (event_type operations.run.failed; include System “View run” URL)
+
+- [ ] T031 [US1] Create System runbooks page class app/Filament/System/Pages/Ops/Runbooks.php (capability-gated; scope selector uses AllowedTenantUniverse; Preflight action; Run action with confirmation + typed confirm + reason)
+- [ ] T032 [P] [US1] Create System runbooks page view resources/views/filament/system/pages/ops/runbooks.blade.php (operator warning; show preflight results + disable Run when nothing to do)
+
+- [ ] T033 [US1] Create System runs list page class app/Filament/System/Pages/Ops/Runs.php (table listing operation runs for runbook types; default sort newest)
+- [ ] T034 [P] [US1] Create System runs list view resources/views/filament/system/pages/ops/runs.blade.php (record inspection affordance: clickable row -> run detail)
+
+- [ ] T035 [US1] Create System run detail page class app/Filament/System/Pages/Ops/ViewRun.php (infolist rendering of OperationRun; show scope/actor/counts/failures)
+- [ ] T036 [P] [US1] Create System run detail view resources/views/filament/system/pages/ops/view-run.blade.php
+
+---
+
+## Phase 4: User Story 2 — Customers never see maintenance actions (Priority: P1)
+
+**Goal**: No `/admin` maintenance/backfill affordances by default; tenant users cannot access `/system/*` (404).
+
+**Independent Test**: As a tenant user, `/system/*` returns 404; in the `/admin` Findings list there is no backfill action when the feature flag is left at its default (off).
+
+### Tests for User Story 2
+
+- [ ] T037 [P] [US2] Add regression test asserting /admin Findings list has no backfill action by default in tests/Feature/Filament/Spec113/AdminFindingsNoMaintenanceActionsTest.php (targets app/Filament/Resources/FindingResource/Pages/ListFindings.php)
+- [ ] T038 [P] [US2] Add tenant-plane 404 test for /system/ops/runbooks in tests/Feature/System/Spec113/TenantPlaneCannotAccessSystemTest.php
+
+### Implementation for User Story 2
+
+- [ ] T039 [US2] Remove or feature-flag off the legacy header action in app/Filament/Resources/FindingResource/Pages/ListFindings.php (FR-001; default off in production-like envs)
+- [ ] T040 [US2] Add a config-backed feature flag defaulting to false in config/tenantpilot.php (e.g., allow_admin_maintenance_actions) and wire it in app/Filament/Resources/FindingResource/Pages/ListFindings.php
+
+---
+
+## Phase 5: User Story 3 — Same logic for deploy-time and operator re-run (Priority: P2)
+
+**Goal**: One implementation path for preflight/start that is reused by System UI, CLI, and deploy-time automation.
+
+**Independent Test**: Run the runbook twice with the same scope; second run produces updated_count=0; deploy-time entry point calls the same service.
+
+### Tests for User Story 3
+
+- [ ] T041 [P] [US3] Add idempotency test in tests/Feature/System/OpsRunbooks/FindingsLifecycleBackfillIdempotencyTest.php (second run reports updated_count=0 and/or preflight reports affected_count=0)
+- [ ] T042 [P] [US3] Add deploy-time entry point test in tests/Feature/Console/Spec113/DeployRunbooksCommandTest.php (command delegates to FindingsLifecycleBackfillRunbookService)
+
+### Implementation for User Story 3
+
+- [ ] T043 [US3] Refactor CLI command to call shared runbook service in app/Console/Commands/TenantpilotBackfillFindingLifecycle.php (single-tenant scope, source=cli)
+- [ ] T044 [US3] Add deploy-time runbooks command in app/Console/Commands/TenantpilotRunDeployRunbooks.php (source=deploy_hook; initiator null; uses FindingsLifecycleBackfillRunbookService)
+- [ ] T045 [US3] Ensure System UI uses the same runbook service start() call path in app/Filament/System/Pages/Ops/Runbooks.php (source=system_ui)
+- [ ] T046 [US3] Ensure initiator-null runs do not emit terminal DB notification in app/Services/OperationRunService.php (system-run behavior; audit/alerts still apply)
+
+---
+
+## Phase 6: Polish & Cross-Cutting Concerns
+
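+- [ ] T047 [P] Run new Spec 113 tests via vendor/bin/sail artisan test --compact tests/Feature/System/Spec113/ (ensure all new tests pass; also run tests/Feature/Filament/Spec113/ and tests/Feature/Console/Spec113/, which hold the T037 and T042 tests)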
+- [ ] T048 [P] Run Ops Runbooks tests via vendor/bin/sail artisan test --compact tests/Feature/System/OpsRunbooks/ (ensure US1/US3 tests pass)
+- [ ] T049 [P] Run formatting on touched files via vendor/bin/sail bin pint --dirty --format agent (targets app/Http/Middleware/, app/Filament/System/Pages/, app/Services/Runbooks/, tests/Feature/System/)
+
+---
+
+## Dependencies & Execution Order
+
+### Phase Dependencies
+
+- **Setup (Phase 1)**: no dependencies
+- **Foundational (Phase 2)**: depends on Setup; BLOCKS all story work
+- **US1 (Phase 3)**: depends on Foundational
+- **US2 (Phase 4)**: depends on Foundational
+- **US3 (Phase 5)**: depends on US1 shared runbook service (T021) + Foundational
+- **Polish (Phase 6)**: depends on desired stories being complete
+
+### User Story Dependencies
+
+- **US1 (P1)**: foundational security + session isolation + login throttle must be in place first
+- **US2 (P1)**: can be implemented after Foundational; independent of US1 UI
+- **US3 (P2)**: depends on the shared runbook service created in US1
+
+---
+
+## Parallel Execution Examples
+
+### US1 parallelizable tasks
+
+- T016, T017, T018, T019, T020 can be drafted in parallel (tests in separate files under tests/Feature/System/OpsRunbooks/)
+- T031/T032, T033/T034, and T035/T036 can be built in parallel (separate System page classes/views)
+- T025 and T026 can be built in parallel once the service contract (T021) is agreed
+
+### US2 parallelizable tasks
+
+- T037 and T038 can run in parallel (tests)
+- T039 depends on the feature flag introduced by T040: once the flag key from T040 is agreed, T039 and T040 can proceed in parallel; otherwise land T040 first and keep them sequential
+
+### US3 parallelizable tasks
+
+- T041 and T042 can run in parallel (tests)
+- T043 and T044 can be implemented in parallel once T021 exists
+
+---
+
+## Implementation Strategy (MVP First)
+
+1) Complete Phase 1 (setup) and Phase 2 (security semantics + session isolation + login throttle)
+2) Deliver US1 (System runbooks page + OperationRun tracking + System runs list and run detail)
+3) Deliver US2 (remove/disable /admin maintenance UI)
+4) Deliver US3 (shared logic reused by CLI + deploy-time automation)