spec: add 094 assignment ops observability hardening
This commit is contained in:
parent
92a36ab89e
commit
c905f211a5
@ -0,0 +1,35 @@
|
||||
# Specification Quality Checklist: 094 Assignment Ops Observability Hardening
|
||||
|
||||
**Purpose**: Validate specification completeness and quality before proceeding to planning
|
||||
**Created**: 2026-02-14
|
||||
**Feature**: [specs/094-assignment-ops-observability-hardening/spec.md](specs/094-assignment-ops-observability-hardening/spec.md)
|
||||
|
||||
## Content Quality
|
||||
|
||||
- [x] No implementation details (languages, frameworks, APIs)
|
||||
- [x] Focused on user value and business needs
|
||||
- [x] Written for non-technical stakeholders
|
||||
- [x] All mandatory sections completed
|
||||
|
||||
## Requirement Completeness
|
||||
|
||||
- [x] No [NEEDS CLARIFICATION] markers remain
|
||||
- [x] Requirements are testable and unambiguous
|
||||
- [x] Success criteria are measurable
|
||||
- [x] Success criteria are technology-agnostic (no implementation details)
|
||||
- [x] All acceptance scenarios are defined
|
||||
- [x] Edge cases are identified
|
||||
- [x] Scope is clearly bounded
|
||||
- [x] Dependencies and assumptions identified
|
||||
|
||||
## Feature Readiness
|
||||
|
||||
- [x] All functional requirements have clear acceptance criteria
|
||||
- [x] User scenarios cover primary flows
|
||||
- [x] Feature meets measurable outcomes defined in Success Criteria
|
||||
- [x] No implementation details leak into specification
|
||||
|
||||
## Notes
|
||||
|
||||
- Spec is intentionally ship-safety hardening only; no new domain features.
|
||||
- Spec was rewritten to follow the official template structure and removed code/file/class references.
|
||||
@ -0,0 +1,127 @@
|
||||
openapi: 3.0.3
|
||||
info:
|
||||
title: TenantPilot - Assignment Operations (Internal)
|
||||
version: "1.0"
|
||||
description: |
|
||||
Internal contract describing the user-triggered operation start surfaces and Monitoring read surfaces
|
||||
relevant to assignment fetch/restore observability.
|
||||
|
||||
servers:
|
||||
- url: /
|
||||
|
||||
paths:
|
||||
/admin/t/{tenant}/backup-items/{backupItem}/assignments/fetch:
|
||||
post:
|
||||
summary: Start assignment fetch/enrichment
|
||||
parameters:
|
||||
- in: path
|
||||
name: tenant
|
||||
required: true
|
||||
schema: { type: string }
|
||||
- in: path
|
||||
name: backupItem
|
||||
required: true
|
||||
schema: { type: integer }
|
||||
responses:
|
||||
"202":
|
||||
description: Accepted; operation run created/reused and job enqueued
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/OperationRunStartResponse"
|
||||
"403": { description: Forbidden (member missing capability) }
|
||||
"404": { description: Not found (non-member / wrong plane) }
|
||||
|
||||
/admin/t/{tenant}/restore-runs/{restoreRun}/assignments/restore:
|
||||
post:
|
||||
summary: Start assignment restore
|
||||
parameters:
|
||||
- in: path
|
||||
name: tenant
|
||||
required: true
|
||||
schema: { type: string }
|
||||
- in: path
|
||||
name: restoreRun
|
||||
required: true
|
||||
schema: { type: integer }
|
||||
responses:
|
||||
"202":
|
||||
description: Accepted; operation run created/reused and job enqueued
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/OperationRunStartResponse"
|
||||
"403": { description: Forbidden (member missing capability) }
|
||||
"404": { description: Not found (non-member / wrong plane) }
|
||||
|
||||
/admin/monitoring/operations:
|
||||
get:
|
||||
summary: List operation runs (Monitoring)
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/OperationRunSummary"
|
||||
|
||||
/admin/monitoring/operations/{operationRun}:
|
||||
get:
|
||||
summary: Get operation run detail (Monitoring)
|
||||
parameters:
|
||||
- in: path
|
||||
name: operationRun
|
||||
required: true
|
||||
schema: { type: integer }
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/OperationRunDetail"
|
||||
"404": { description: Not found (non-member / no entitlement) }
|
||||
|
||||
components:
|
||||
schemas:
|
||||
OperationRunStartResponse:
|
||||
type: object
|
||||
required: [operationRunId]
|
||||
properties:
|
||||
operationRunId:
|
||||
type: integer
|
||||
|
||||
OperationRunSummary:
|
||||
type: object
|
||||
required: [id, type, status, outcome, createdAt]
|
||||
properties:
|
||||
id: { type: integer }
|
||||
type: { type: string }
|
||||
status: { type: string }
|
||||
outcome: { type: string }
|
||||
createdAt: { type: string, format: date-time }
|
||||
|
||||
OperationRunDetail:
|
||||
allOf:
|
||||
- $ref: "#/components/schemas/OperationRunSummary"
|
||||
- type: object
|
||||
properties:
|
||||
startedAt: { type: string, format: date-time, nullable: true }
|
||||
completedAt: { type: string, format: date-time, nullable: true }
|
||||
context:
|
||||
type: object
|
||||
additionalProperties: true
|
||||
failures:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/FailureItem"
|
||||
|
||||
FailureItem:
|
||||
type: object
|
||||
required: [code, reasonCode, message]
|
||||
properties:
|
||||
code: { type: string }
|
||||
reasonCode: { type: string }
|
||||
message: { type: string }
|
||||
@ -0,0 +1,50 @@
|
||||
# Data Model — 094 Assignment Operations Observability Hardening
|
||||
|
||||
This feature introduces **no new entities**. It strengthens how existing operational artifacts are created, deduplicated, and displayed.
|
||||
|
||||
## Entities (existing)
|
||||
|
||||
### OperationRun
|
||||
|
||||
Represents a single observable operation for Monitoring.
|
||||
|
||||
- Ownership:
|
||||
- `workspace_id` required.
|
||||
- `tenant_id` present for tenant-bound runs.
|
||||
- Key fields:
|
||||
- `type`: operation type identifier (string).
|
||||
- `status`: queued/running/completed.
|
||||
- `outcome`: pending/succeeded/failed/etc.
|
||||
- `run_identity_hash`: stable identity used for active-run dedupe.
|
||||
- `context`: structured details for Monitoring (inputs, progress, counters).
|
||||
- `failures`: array of failure items with stable `code`, normalized `reason_code`, and sanitized `message`.
|
||||
- Counter-like fields may be stored in `context` depending on existing conventions.
|
||||
|
||||
### AuditLog
|
||||
|
||||
Immutable audit record for security/ops-relevant mutations.
|
||||
|
||||
- Scope invariant: if `tenant_id` is present, `workspace_id` must also be present.
|
||||
- This spec requires exactly one audit entry per assignment restore execution.
|
||||
|
||||
### RestoreRun
|
||||
|
||||
Represents a restore execution request and its lifecycle. Assignment restores are performed under an existing restore run.
|
||||
|
||||
### BackupItem
|
||||
|
||||
Represents an item within a backup set; assignment fetch/enrichment is performed for a specific backup item.
|
||||
|
||||
## Relationships (conceptual)
|
||||
|
||||
- One tenant has many operation runs.
|
||||
- One restore run should be associated with one or more operation runs depending on orchestration; this spec requires that assignment restore execution is observable via at least one operation run.
|
||||
- One backup item can have a related fetch/enrichment operation run.
|
||||
|
||||
## State transitions
|
||||
|
||||
### OperationRun lifecycle
|
||||
|
||||
- `queued` → `running` → `completed`
|
||||
- Outcome set on completion to indicate success or failure.
|
||||
|
||||
174
specs/094-assignment-ops-observability-hardening/plan.md
Normal file
174
specs/094-assignment-ops-observability-hardening/plan.md
Normal file
@ -0,0 +1,174 @@
|
||||
# Implementation Plan: 094 Assignment Operations Observability Hardening
|
||||
|
||||
**Branch**: `094-assignment-ops-observability-hardening` | **Date**: 2026-02-15 | **Spec**: [spec.md](spec.md)
|
||||
**Input**: Feature specification from [spec.md](spec.md)
|
||||
|
||||
## Summary
|
||||
|
||||
This work hardens ship-safety by making assignment fetch/restore executions fully observable in Monitoring (via `OperationRun`), improving Graph testability by using the Graph client interface, and closing a small set of authorization inconsistencies (cross-plane guard leak, policy bypass, and 404/403 ordering).
|
||||
|
||||
## Technical Context
|
||||
|
||||
**Language/Version**: PHP 8.4 (Laravel 12)
|
||||
**Primary Dependencies**: Filament v5, Livewire v4, Laravel Sail, Microsoft Graph integration
|
||||
**Storage**: PostgreSQL (Sail)
|
||||
**Testing**: Pest v4 (`./vendor/bin/sail artisan test`)
|
||||
**Target Platform**: Linux container runtime (Sail/Dokploy)
|
||||
**Project Type**: Laravel web application (admin UI via Filament)
|
||||
**Performance Goals**: Monitoring pages must remain DB-only and render quickly (no external calls during render).
|
||||
**Constraints**: No secrets/tokens in logs or run failures; RBAC-UX semantics must match constitution (404 vs 403).
|
||||
**Scale/Scope**: Small hardening change set; no new domain entities; focused on two jobs + a few UI/policy surfaces.
|
||||
|
||||
## Constitution Check
|
||||
|
||||
*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.*
|
||||
|
||||
- Inventory-first: PASS (no inventory semantics changed).
|
||||
- Read/write separation: PASS (restore remains a user-triggered operation; change is observability/auditability).
|
||||
- Graph contract path: PASS-BY-DESIGN (plan includes removing concrete client injections in assignment-related services).
|
||||
- Deterministic capabilities: PASS (no new capability model; micro-fixes ensure enforcement uses canonical patterns).
|
||||
- RBAC-UX planes: PASS (plan includes closing remaining cross-plane guard leak; non-member 404, member missing capability 403).
|
||||
- Workspace isolation: PASS (operations remain tenant-bound; Monitoring remains DB-only).
|
||||
- Destructive confirmations: PASS (no new destructive actions added; enforcement fixes must preserve confirmations).
|
||||
- Global search safety: N/A (no global search changes).
|
||||
- Tenant isolation: PASS (no cross-tenant views added; monitoring surfaces already entitlement-checked).
|
||||
- Run observability: PASS-BY-DESIGN (this spec exists to bring remaining assignment jobs under `OperationRun`).
|
||||
- Automation: PASS (dedupe identity clarified; existing partial unique index patterns used).
|
||||
- Data minimization: PASS (failures must be sanitized; no raw payloads stored).
|
||||
- Badge semantics: N/A (no badge mapping changes).
|
||||
- Filament UI Action Surface Contract: PASS (no new resources; micro-fixes must not remove inspect affordances).
|
||||
|
||||
## Project Structure
|
||||
|
||||
### Documentation (this feature)
|
||||
|
||||
```text
|
||||
specs/094-assignment-ops-observability-hardening/
|
||||
├── plan.md
|
||||
├── spec.md
|
||||
├── tasks.md
|
||||
├── research.md
|
||||
├── data-model.md
|
||||
├── quickstart.md
|
||||
├── contracts/
|
||||
│ └── assignment-ops.openapi.yaml
|
||||
└── checklists/
|
||||
└── requirements.md
|
||||
```
|
||||
|
||||
### Source Code (repository root)
|
||||
|
||||
```text
|
||||
app/
|
||||
├── Jobs/
|
||||
│ ├── FetchAssignmentsJob.php
|
||||
│ ├── RestoreAssignmentsJob.php
|
||||
│ └── Middleware/TrackOperationRun.php
|
||||
├── Services/
|
||||
│ ├── OperationRunService.php
|
||||
│ ├── AssignmentBackupService.php
|
||||
│ ├── AssignmentRestoreService.php
|
||||
│ └── Graph/
|
||||
│ ├── GraphClientInterface.php
|
||||
│ ├── MicrosoftGraphClient.php
|
||||
│ ├── NullGraphClient.php
|
||||
│ ├── AssignmentFetcher.php
|
||||
│ ├── AssignmentFilterResolver.php
|
||||
│ └── GroupResolver.php
|
||||
├── Filament/
|
||||
│ └── Resources/...
|
||||
└── Http/
|
||||
└── Middleware/EnsureCorrectGuard.php
|
||||
|
||||
routes/
|
||||
└── web.php
|
||||
|
||||
tests/
|
||||
├── Feature/
|
||||
└── Unit/
|
||||
```
|
||||
|
||||
**Structure Decision**: Use the existing Laravel structure; changes are limited to the job layer, Graph service DI, a few Filament surfaces, and targeted Pest tests.
|
||||
|
||||
## Phase 0 — Outline & Research
|
||||
|
||||
Artifacts:
|
||||
|
||||
- [research.md](research.md)
|
||||
|
||||
Research outcomes (resolved decisions):
|
||||
|
||||
- Operation run identity & dedupe: tenant + type + target/scope.
|
||||
- Counters semantics: total attempted, processed succeeded, failed failed.
|
||||
- Failure convention: operation-specific `code` + normalized `reason_code`.
|
||||
- Audit log granularity: exactly one entry per restore execution.
|
||||
|
||||
## Phase 1 — Design & Contracts
|
||||
|
||||
Artifacts:
|
||||
|
||||
- [data-model.md](data-model.md)
|
||||
- [contracts/assignment-ops.openapi.yaml](contracts/assignment-ops.openapi.yaml)
|
||||
- [quickstart.md](quickstart.md)
|
||||
|
||||
Design notes:
|
||||
|
||||
- No new persistent entities.
|
||||
- Operation run tracking must use existing dedupe/index patterns.
|
||||
- Monitoring surfaces remain DB-only.
|
||||
|
||||
## Phase 1 — Agent Context Update
|
||||
|
||||
Run:
|
||||
|
||||
- `.specify/scripts/bash/update-agent-context.sh copilot`
|
||||
|
||||
## Phase 1 — Constitution Check (re-evaluation)
|
||||
|
||||
Re-check result: PASS expected once implementation removes concrete Graph client injections and adds OperationRun tracking for the two remaining assignment jobs.
|
||||
|
||||
## Phase 2 — Implementation Plan
|
||||
|
||||
### Step 1 — OperationRun tracking for assignment fetch/restore
|
||||
|
||||
- Locate job dispatch/start surfaces.
|
||||
- Ensure each execution creates/reuses an `OperationRun`:
|
||||
- Dedupe identity:
|
||||
- Fetch: tenant + type + backup item (or equivalent policy-version identifier).
|
||||
- Restore: tenant + type + restore run identifier.
|
||||
- Ensure Tracking middleware is applied and the job exposes a run handle per existing conventions.
|
||||
- Ensure failure details:
|
||||
- `code`: operation-specific namespace.
|
||||
- `reason_code`: normalized cause.
|
||||
- message: sanitized.
|
||||
- Ensure counters match the agreed semantics.
|
||||
|
||||
### Step 2 — Audit log for assignment restore execution
|
||||
|
||||
- Ensure exactly one audit log entry is written per assignment restore execution.
|
||||
|
||||
### Step 3 — Graph client interface enforcement
|
||||
|
||||
- Update assignment-related services to accept the Graph client interface.
|
||||
|
||||
### Step 4 — Authorization micro-fixes
|
||||
|
||||
- Close cross-plane guard leak on workspace-scoped admin routes.
|
||||
- Remove any policy bypasses on Provider Connections list surfaces.
|
||||
- Fix membership (404) vs capability (403) ordering for backup item surfaces.
|
||||
- Ensure legacy UI enforcement helpers are not used where the canonical helper exists.
|
||||
|
||||
### Step 5 — Tests + formatting
|
||||
|
||||
- Add targeted Pest regression coverage for:
|
||||
- operation run tracking (success/failure)
|
||||
- guard leak
|
||||
- policy enforcement
|
||||
- 404/403 ordering
|
||||
- Graph interface mockability
|
||||
- Run `./vendor/bin/sail bin pint --dirty`.
|
||||
- Run targeted tests then widen as needed.
|
||||
|
||||
## Complexity Tracking
|
||||
|
||||
No constitution violations are required for this feature.
|
||||
@ -0,0 +1,33 @@
|
||||
# Quickstart — 094 Assignment Operations Observability Hardening
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Docker + Laravel Sail
|
||||
- PHP/Composer dependencies installed (via Sail)
|
||||
|
||||
## Setup
|
||||
|
||||
- Start services: `./vendor/bin/sail up -d`
|
||||
- Install PHP deps (if needed): `./vendor/bin/sail composer install`
|
||||
- Run migrations: `./vendor/bin/sail artisan migrate`
|
||||
|
||||
## Running tests (targeted)
|
||||
|
||||
Run the smallest set first, then widen:
|
||||
|
||||
- Feature tests added for this spec (once implemented):
|
||||
- `./vendor/bin/sail artisan test --compact tests/Feature/Operations`
|
||||
- `./vendor/bin/sail artisan test --compact tests/Feature/Auth`
|
||||
- `./vendor/bin/sail artisan test --compact tests/Feature/Rbac`
|
||||
|
||||
## Formatting
|
||||
|
||||
- Run Pint on touched files: `./vendor/bin/sail bin pint --dirty`
|
||||
|
||||
## Manual verification (admin)
|
||||
|
||||
- Trigger an assignment fetch and confirm a Monitoring → Operations entry appears.
|
||||
- Trigger an assignment restore and confirm:
|
||||
- Monitoring shows a run with failure codes (if any) and counters.
|
||||
- Exactly one audit log entry is created for the restore execution.
|
||||
|
||||
59
specs/094-assignment-ops-observability-hardening/research.md
Normal file
59
specs/094-assignment-ops-observability-hardening/research.md
Normal file
@ -0,0 +1,59 @@
|
||||
# Research — 094 Assignment Operations Observability Hardening
|
||||
|
||||
This document resolves implementation-relevant questions raised by the spec and records the key decisions.
|
||||
|
||||
## Decision 1 — Operation run identity + dedupe
|
||||
|
||||
- Decision: Use **dedupe per tenant + operation type + operation target/scope**.
|
||||
- Fetch target/scope: backup item identifier (or equivalent policy-version identifier).
|
||||
- Restore target/scope: restore run identifier.
|
||||
- Rationale: Prevents operator confusion while allowing independent operations to proceed in parallel.
|
||||
- Alternatives considered:
|
||||
- Dedupe per tenant only → collapses unrelated runs and obscures what is actually running.
|
||||
- No dedupe → increases confusion and makes tests non-deterministic.
|
||||
|
||||
## Decision 2 — How jobs become OperationRun-tracked
|
||||
|
||||
- Decision: Prefer passing an existing operation run identifier into queued jobs when a user-triggered start surface exists; otherwise the job must create/reuse a canonical run using the standard service.
|
||||
- Rationale: Start surfaces should remain enqueue-only while ensuring a single umbrella run record exists for Monitoring.
|
||||
- Alternatives considered:
|
||||
- Create runs only inside jobs → makes it harder to link UI initiation to the run and can lead to duplicate runs.
|
||||
|
||||
## Decision 3 — Counters semantics
|
||||
|
||||
- Decision: `total` = items attempted, `processed` = succeeded, `failed` = failed.
|
||||
- Rationale: Works for both read-only fetch and destructive restore, and is easy to interpret in Monitoring.
|
||||
- Alternatives considered:
|
||||
- “total discovered” semantics → ambiguous when discovery and execution are separate steps.
|
||||
- Different semantics per operation type → harder to reason about and to test.
|
||||
|
||||
## Decision 4 — Failure structure: stable code + normalized reason_code
|
||||
|
||||
- Decision: Use operation-specific failure `code` namespaces (e.g., `assignments.fetch_failed`, `assignments.restore_failed`) and store the underlying cause as a normalized `reason_code`.
|
||||
- Rationale: Operators can identify what failed (operation) and why (normalized cause) consistently.
|
||||
- Alternatives considered:
|
||||
- Generic failure codes only → loses context; Monitoring becomes less actionable.
|
||||
- Reusing unrelated restore codes for assignment operations → conflates domains and increases ambiguity.
|
||||
|
||||
## Decision 5 — Audit log granularity for assignment restores
|
||||
|
||||
- Decision: Write **exactly one audit log entry per assignment restore execution** (per restore run).
|
||||
- Rationale: Satisfies auditability while keeping log volume predictable and reviewable.
|
||||
- Alternatives considered:
|
||||
- Per-item audit logs → noisy for large restores.
|
||||
- Both summary + per-item → overkill for this hardening scope.
|
||||
|
||||
## Decision 6 — Graph client abstraction
|
||||
|
||||
- Decision: Ensure assignment-related services depend on the **Graph client interface**, not a concrete implementation.
|
||||
- Rationale: Enables deterministic non-production tests and allows safe stub/null implementations.
|
||||
- Alternatives considered:
|
||||
- Concrete client type-hints → blocks mocking and makes tests fragile.
|
||||
|
||||
## Decision 7 — Authorization semantics & cross-plane safety
|
||||
|
||||
- Decision: Enforce deny-as-not-found (404) for non-members/cross-plane access and forbidden (403) for members lacking capability.
|
||||
- Rationale: Matches constitution RBAC-UX rules and prevents route/resource enumeration.
|
||||
- Alternatives considered:
|
||||
- Using 403 for non-members → leaks existence and violates deny-as-not-found standard.
|
||||
|
||||
137
specs/094-assignment-ops-observability-hardening/spec.md
Normal file
137
specs/094-assignment-ops-observability-hardening/spec.md
Normal file
@ -0,0 +1,137 @@
|
||||
# Feature Specification: Assignment Operations Observability Hardening
|
||||
|
||||
**Feature Branch**: `094-assignment-ops-observability-hardening`
|
||||
**Created**: 2026-02-14
|
||||
**Status**: Draft
|
||||
**Input**: User description: "Harden assignment operation observability and close remaining authorization inconsistencies so operations are fully traceable, diagnosable, and correctly access-controlled."
|
||||
|
||||
## Spec Scope Fields *(mandatory)*
|
||||
|
||||
- **Scope**: canonical-view
|
||||
- **Primary Routes**:
|
||||
- Admin Monitoring → Operations (list + detail views)
|
||||
- Provider Connections list
|
||||
- Backup Sets list
|
||||
- Restore Runs list
|
||||
- Backup Items relationship view under a backup set
|
||||
- **Data Ownership**: operational activity records must be scoped to the correct workspace and tenant context where applicable; no new domain entities are introduced.
|
||||
- **RBAC**: all affected admin surfaces require workspace membership; actions that change configuration or trigger restores require explicit permissions as defined in the central capability registry.
|
||||
|
||||
### Canonical-view Constraints
|
||||
|
||||
- **Default behavior when tenant-context is active**: Monitoring → Operations defaults to showing operational records for the currently selected tenant context (if one is selected).
|
||||
- **Entitlement checks**: Monitoring → Operations MUST not reveal tenant-owned operational records unless the actor is entitled to that tenant scope within the active workspace context.
|
||||
|
||||
## Clarifications
|
||||
|
||||
### Session 2026-02-14
|
||||
|
||||
- Q: For FR-005 / SC-004, what should “identity scope” mean for deduping “active” runs? → A: Dedupe per tenant and operation target/scope.
|
||||
- Q: For FR-003 (“summary counters”) what exact semantics should total / processed / failed follow? → A: total = items attempted; processed = succeeded; failed = failed.
|
||||
- Q: For FR-004 (“stable failure code”), which convention should we standardize on for assignment fetch/restore runs? → A: Operation-specific code namespaces + normalized reason_code for the cause.
|
||||
- Q: For FR-006 (identity scope = tenant + operation type + target/scope), which identifiers should define “target/scope” for these two operations? → A: Fetch targets the backup item (or policy version) identifier; restore targets the restore run identifier.
|
||||
- Q: For US1 / FR-002 (“restore is observable/auditable”), what audit log granularity do you want for assignment restore? → A: One audit log entry per assignment restore execution (per restore run).
|
||||
|
||||
## User Scenarios & Testing *(mandatory)*
|
||||
|
||||
### User Story 1 — Observe assignment operations end-to-end (Priority: P1)
|
||||
|
||||
Workspace administrators need to see assignment-related operations (both read-only fetch and destructive restore) in Monitoring so they can confirm what ran, what changed, and why something failed, without relying on server logs.
|
||||
|
||||
**Why this priority**: Assignment restore is high-risk; missing visibility creates operational and audit gaps.
|
||||
|
||||
**Independent Test**: Trigger both an assignment fetch and an assignment restore; verify each produces a monitoring-visible run record with correct lifecycle and failure details.
|
||||
|
||||
**Acceptance Scenarios**:
|
||||
|
||||
1. **Given** an administrator triggers an assignment fetch, **When** the operation starts and completes, **Then** Monitoring shows a run record with start/end timestamps, final outcome, and summary counters.
|
||||
2. **Given** an administrator triggers an assignment restore, **When** the operation starts and completes, **Then** Monitoring shows a run record including a clear indication that it was a change-making operation.
|
||||
- And exactly one audit log entry is written for the restore execution.
|
||||
3. **Given** an assignment fetch or restore fails due to an external dependency error, **When** the run completes, **Then** Monitoring shows a stable failure code and a sanitized, user-readable message.
|
||||
4. **Given** the same assignment operation is triggered multiple times concurrently for the same tenant and scope, **When** the system creates tracking records, **Then** the admin sees a single “active” run per identity (or an equivalent deduped representation).
|
||||
|
||||
---
|
||||
|
||||
### User Story 2 — Enforce correct access control semantics on affected admin surfaces (Priority: P2)
|
||||
|
||||
Workspace administrators and platform operators must not be able to cross authentication “planes” accidentally, and the admin UI must not expose bypasses that let users initiate sensitive actions without authorization.
|
||||
|
||||
**Why this priority**: Prevents cross-plane leakage and closes known authorization inconsistencies.
|
||||
|
||||
**Independent Test**: Attempt access with the wrong authentication plane and with insufficient permissions; verify outcomes are deny-as-not-found (404) or forbidden (403) per policy.
|
||||
|
||||
**Acceptance Scenarios**:
|
||||
|
||||
1. **Given** a user is authenticated in a different auth plane, **When** they attempt to access workspace-scoped admin routes, **Then** the response is deny-as-not-found (404).
|
||||
2. **Given** a user is not a member of the workspace, **When** they attempt to view backup items under a backup set, **Then** the response is deny-as-not-found (404) and does not reveal record existence.
|
||||
3. **Given** a user is a workspace member but lacks the required permission, **When** they attempt a protected action (such as managing provider connections), **Then** the response is forbidden (403).
|
||||
|
||||
---
|
||||
|
||||
### User Story 3 — Validate assignment operations safely in non-production contexts (Priority: P3)
|
||||
|
||||
Platform engineers need to validate that assignment operations behave correctly without requiring live external dependencies, so regressions can be caught early.
|
||||
|
||||
**Why this priority**: Improves reliability and reduces the risk of shipping changes that only fail when external services are slow/unavailable.
|
||||
|
||||
**Independent Test**: Run automated tests that simulate both successful and failing external interactions; verify monitoring records and authorization behaviors.
|
||||
|
||||
**Acceptance Scenarios**:
|
||||
|
||||
1. **Given** external interactions are simulated as “successful”, **When** the operation runs, **Then** the run is marked successful and includes expected summary counters.
|
||||
2. **Given** external interactions are simulated as “failing”, **When** the operation runs, **Then** the run is marked failed with a stable failure code and sanitized message.
|
||||
|
||||
### Edge Cases
|
||||
|
||||
- External dependency timeouts, throttling, or transient failures.
|
||||
- Retries and duplicate dispatches (ensure tracking remains coherent and non-spammy).
|
||||
- Missing or inconsistent tenant/workspace context (must fail safely, not leak).
|
||||
- Partial completion (some items processed, some failed): counters and failure details must remain interpretable.
|
||||
|
||||
## Requirements *(mandatory)*
|
||||
|
||||
### Functional Requirements
|
||||
|
||||
- **FR-001**: The system MUST create and maintain an operational tracking record (OperationRun) for every assignment fetch operation execution.
|
||||
- **FR-002**: The system MUST create and maintain an operational tracking record (OperationRun) for every assignment restore operation execution.
|
||||
- **FR-003**: Tracking records MUST include lifecycle state (queued/running/completed), timestamps, outcome, and summary counters sufficient to understand progress and results.
|
||||
- Counter semantics: `total` = items attempted, `processed` = succeeded, `failed` = failed.
|
||||
- **FR-004**: Failed runs MUST include a stable failure code and a sanitized, user-readable failure message.
|
||||
- Failure convention: use operation-specific `code` namespaces for the run, and store the underlying cause as a normalized `reason_code`.
|
||||
- **FR-005**: The system MUST prevent duplicate “active” runs for the same tenant and identity scope (tenant + operation type + operation target/scope), or otherwise present a deduped representation that avoids operator confusion.
|
||||
- **FR-006**: Identity scope MUST be defined as:
|
||||
- For assignment fetch operations: tenant + operation type + backup item identifier (or equivalent policy-version identifier).
|
||||
- For assignment restore operations: tenant + operation type + restore run identifier.
|
||||
- **FR-007**: Monitoring pages MUST render using persisted operational data only (no outbound calls during page render).
|
||||
- **FR-008**: Cross-plane access MUST be deny-as-not-found (404) on affected routes.
|
||||
- **FR-009**: Authorization MUST follow consistent semantics:
|
||||
- non-member / not entitled → 404 (deny-as-not-found)
|
||||
- member without required permission → 403
|
||||
- **FR-010**: Any action that can change configuration or trigger a restore MUST be server-authorized; UI visibility MUST NOT be treated as authorization.
|
||||
- **FR-011**: Sensitive or destructive-like actions MUST require explicit confirmation.
|
||||
- **FR-012**: Each assignment restore execution MUST write exactly one audit log entry for the restore run.
|
||||
|
||||
### Dependencies & Assumptions
|
||||
|
||||
- Assignment fetch and assignment restore operations can be triggered by administrators or scheduled/queued execution paths.
|
||||
- Monitoring users have access to the Operations area only when they have appropriate workspace membership and permissions.
|
||||
- External dependency failures may occur and must be represented consistently (stable failure codes + sanitized messages).
|
||||
|
||||
## UI Action Matrix *(mandatory when the admin UI is changed)*
|
||||
|
||||
| Surface | Location | Header Actions | Inspect Affordance (List/Table) | Row Actions (max 2 visible) | Bulk Actions (grouped) | Empty-State CTA(s) | View Header Actions | Create/Edit Save+Cancel | Audit log? | Notes / Exemptions |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| Provider Connections | Admin → Provider Connections | Create connection (permission-gated) | View connection details | Edit / Delete (permission-gated; destructive confirms) | None | Create connection | N/A | Save + Cancel | Yes | Ensures no authorization bypass exists on header/empty-state CTAs. |
|
||||
| Backup Sets | Admin → Backups | None (unchanged) | View backup set | Actions unchanged | None | None | N/A | N/A | N/A | Only enforcement helper consistency is affected. |
|
||||
| Restore Runs | Admin → Restores | None (unchanged) | View restore run | Actions unchanged | None | None | N/A | N/A | Yes | Restore operations must be observable via Monitoring. |
|
||||
| Backup Items | Under a backup set | None | View backup item details | Actions unchanged | None | None | N/A | N/A | N/A | Membership/404 checks must occur before capability/403 checks. |
|
||||
| Monitoring → Operations | Admin → Monitoring | None | View operation run details | None | None | None | N/A | N/A | Yes | Read-only view; must not call external services during render. |
|
||||
|
||||
## Success Criteria *(mandatory)*
|
||||
|
||||
### Measurable Outcomes
|
||||
|
||||
- **SC-001**: 100% of assignment fetch and assignment restore executions appear in Monitoring with a completed outcome (success or failure) and timestamps.
|
||||
- **SC-002**: For failed runs, operators can identify a stable failure code and a readable failure message from Monitoring within 60 seconds, without checking server logs.
|
||||
- **SC-003**: Automated tests verify deny-as-not-found (404) vs forbidden (403) semantics for the affected surfaces.
|
||||
- **SC-004**: Duplicate active-run confusion is eliminated: repeated triggers produce a single active run per identity scope (tenant + operation type + operation target/scope) or equivalent deduped visibility.
|
||||
172
specs/094-assignment-ops-observability-hardening/tasks.md
Normal file
172
specs/094-assignment-ops-observability-hardening/tasks.md
Normal file
@ -0,0 +1,172 @@
|
||||
# Tasks: 094 Assignment Operations Observability Hardening
|
||||
|
||||
**Input**: Design documents from `specs/094-assignment-ops-observability-hardening/`
|
||||
**Prerequisites**: `specs/094-assignment-ops-observability-hardening/plan.md` (required), `specs/094-assignment-ops-observability-hardening/spec.md` (required), `specs/094-assignment-ops-observability-hardening/research.md`, `specs/094-assignment-ops-observability-hardening/data-model.md`, `specs/094-assignment-ops-observability-hardening/contracts/assignment-ops.openapi.yaml`
|
||||
|
||||
**Tests**: Required (Pest) for runtime behavior changes.
|
||||
|
||||
---
|
||||
|
||||
## Phase 1: Setup (Shared Infrastructure)
|
||||
|
||||
**Purpose**: Ensure the implementation has the correct local workflow context and that the existing conventions to extend are understood.
|
||||
|
||||
- [X] T001 Run SpecKit prerequisites check via `.specify/scripts/bash/check-prerequisites.sh --json` and record `FEATURE_DIR` + `AVAILABLE_DOCS` for this feature
|
||||
- [X] T002 Review existing OperationRun tracking conventions in `app/Jobs/Middleware/TrackOperationRun.php` and `app/Services/OperationRunService.php`
|
||||
- [X] T003 Review existing OperationRun test patterns in `tests/Feature/TrackOperationRunMiddlewareTest.php` and `tests/Feature/OperationRunServiceTest.php`
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Foundational (Blocking Prerequisites)
|
||||
|
||||
**Purpose**: Confirm the persistence + dedupe primitives exist and match the spec’s requirements before adding new run types and audits.
|
||||
|
||||
**⚠️ CRITICAL**: Complete this phase before implementing any user story.
|
||||
|
||||
- [X] T004 Verify `operation_runs` schema supports `run_identity_hash`, `summary_counts`, and `failure_summary` per `database/migrations/2026_01_16_180642_create_operation_runs_table.php`
|
||||
- [X] T005 Verify active-run dedupe constraints exist and align with `OperationRunService::ensureRunWithIdentity()` in `app/Services/OperationRunService.php` and `database/migrations/2026_02_10_004939_add_unique_index_for_backup_schedule_scheduled_operation_runs.php`
|
||||
- [X] T006 [P] Identify existing restore audit logging expectations to preserve by reviewing `tests/Feature/RestoreAuditLoggingTest.php` and `app/Services/Intune/AuditLogger.php`
|
||||
- [X] T034 [P] Verify active-run dedupe constraints cover the new assignment run identity shapes; if not, add a DB-enforced partial unique index/migration + regression test for dedupe behavior
|
||||
|
||||
**Checkpoint**: Foundation ready — user story work can begin.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: User Story 1 — Observe assignment operations end-to-end (Priority: P1) 🎯 MVP
|
||||
|
||||
**Goal**: Assignment fetch (during backup capture when assignments are included) and assignment restore (during restore execution) are observable via `OperationRun` with stable failure codes, normalized `reason_code`, correct counters, and exactly one audit log entry per assignment restore execution.
|
||||
|
||||
**Independent Test**: Trigger (a) backup creation with assignments included and (b) a restore that applies assignments; verify `operation_runs` rows exist and `audit_logs` contains exactly one assignment-restore summary entry.
|
||||
|
||||
### Tests for User Story 1 (write first)
|
||||
|
||||
- [X] T007 [P] [US1] Add test for assignment restore OperationRun lifecycle + counters in `tests/Feature/Operations/AssignmentRestoreOperationRunTest.php`
|
||||
- [X] T008 [P] [US1] Add test for stable failure `code` + normalized `reason_code` on assignment restore failure in `tests/Feature/Operations/AssignmentRestoreOperationRunFailureTest.php`
|
||||
- [X] T009 [P] [US1] Add test that assignment restore writes exactly one audit log entry per restore execution in `tests/Feature/Audit/AssignmentRestoreAuditSummaryTest.php`
|
||||
- [X] T010 [P] [US1] Add test that backup capture with assignments included records an assignment-fetch OperationRun entry for the resulting backup item in `tests/Feature/Operations/AssignmentFetchOperationRunTest.php`
|
||||
- [X] T035 [P] [US1] Add regression test for Monitoring pages being DB-only at render time (no outbound calls) when viewing Operations list + detail in `tests/Feature/Monitoring/OperationsDbOnlyRenderTest.php`
|
||||
|
||||
### Implementation for User Story 1
|
||||
|
||||
- [X] T011 [US1] Add assignment-restore OperationRun creation + dedupe by `restore_run_id` in `app/Services/Intune/RestoreService.php` (use `OperationRunService::ensureRunWithIdentity()`; identity inputs include `restore_run_id`)
|
||||
- [X] T012 [US1] Update assignment-restore OperationRun summary counts (`total` attempted, `processed` succeeded, `failed` failed) in `app/Services/Intune/RestoreService.php` and `app/Services/OperationRunService.php`
|
||||
- [X] T013 [US1] Record stable failure `code` namespaces + normalized `reason_code` + sanitized messages for assignment restore failures in `app/Services/Intune/RestoreService.php` (leverage `OperationRunService` failure sanitization)
|
||||
- [X] T014 [US1] Remove per-assignment audit log emission from `app/Services/AssignmentRestoreService.php` (replace per-item `auditLogger->log(...)` calls with in-memory aggregation only)
|
||||
- [X] T015 [US1] Add exactly one audit log entry per assignment restore execution in `app/Services/Intune/RestoreService.php` using `app/Services/Intune/AuditLogger.php` (resourceType `restore_run`, resourceId = restore run id)
|
||||
- [X] T016 [US1] Add assignment-fetch OperationRun creation + dedupe keyed by `backup_item_id` in `app/Services/AssignmentBackupService.php` (wrap the Graph fetch inside `enrichWithAssignments()` using `OperationRunService::ensureRunWithIdentity()`)
|
||||
- [X] T017 [US1] Ensure assignment-related job classes can be OperationRun-tracked if they are used in the future by adding `public ?OperationRun $operationRun` + `middleware()` returning `TrackOperationRun` in `app/Jobs/FetchAssignmentsJob.php` and `app/Jobs/RestoreAssignmentsJob.php`
|
||||
- [X] T036 [US1] Identify and document all start surfaces that can trigger assignment fetch/restore, and ensure each path creates/reuses the same run identity (avoid “tracked in one path, untracked in another” gaps)
|
||||
|
||||
**Checkpoint**: US1 delivers Monitoring visibility + auditability for assignment restore and assignment fetch during backup capture.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: User Story 2 — Enforce correct access control semantics on affected admin surfaces (Priority: P2)
|
||||
|
||||
**Goal**: Cross-plane access returns 404, non-member access returns 404, and member-without-capability returns 403. Remove any authorization bypasses.
|
||||
|
||||
**Independent Test**: Access the affected routes/surfaces with wrong guard and with insufficient permissions; assert 404 vs 403 semantics.
|
||||
|
||||
### Tests for User Story 2 (write first)
|
||||
|
||||
- [X] T018 [P] [US2] Add regression test for `/admin/w/{workspace}` guard enforcement (cross-plane 404) in `tests/Feature/Guards/AdminWorkspaceRoutesGuardTest.php`
|
||||
- [X] T019 [P] [US2] Add regression test ensuring Provider Connection create CTA does not bypass authorization in `tests/Feature/ProviderConnections/ProviderConnectionListAuthorizationTest.php`
|
||||
- [X] T020 [P] [US2] Add regression test for membership (404) before capability (403) in backup items relation manager in `tests/Feature/Rbac/BackupItemsRelationManagerSemanticsTest.php`
|
||||
|
||||
### Implementation for User Story 2
|
||||
|
||||
- [X] T021 [US2] Add missing `ensure-correct-guard:web` middleware to the `/admin/w/{workspace}` route group in `routes/web.php`
|
||||
- [X] T022 [US2] Remove the `->authorize(fn (): bool => true)` bypass from header and empty-state create actions in `app/Filament/Resources/ProviderConnectionResource/Pages/ListProviderConnections.php`
|
||||
- [X] T023 [US2] Fix membership (404) vs capability (403) ordering for backup items under a backup set in `app/Filament/Resources/BackupSetResource/RelationManagers/BackupItemsRelationManager.php`
|
||||
- [X] T024 [US2] Swap legacy enforcement helper imports to canonical RBAC helper in `app/Filament/Resources/BackupSetResource/Pages/ListBackupSets.php` and `app/Filament/Resources/RestoreRunResource/Pages/ListRestoreRuns.php`
|
||||
- [X] T037 [US2] Verify any destructive-like Provider Connections actions still require explicit confirmation (no regressions), and that server authorization remains enforced for both header and empty-state CTAs
|
||||
|
||||
**Checkpoint**: US2 closes cross-plane leak + removes bypasses + restores correct 404/403 semantics.
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: User Story 3 — Validate assignment operations safely in non-production contexts (Priority: P3)
|
||||
|
||||
**Goal**: Assignment-related Graph services depend only on `GraphClientInterface`, enabling tests to run with `NullGraphClient` or fakes without concrete client coupling.
|
||||
|
||||
**Independent Test**: Resolve the assignment Graph services from the container with Graph disabled and run a minimal flow that exercises the interface contract.
|
||||
|
||||
### Tests for User Story 3 (write first)
|
||||
|
||||
- [X] T025 [P] [US3] Add unit test asserting assignment Graph services resolve with `GraphClientInterface` binding in `tests/Feature/Graph/AssignmentGraphServiceResolutionTest.php`
|
||||
- [X] T026 [P] [US3] Add test using a fake `GraphClientInterface` to simulate assignment fetch failures without real HTTP in `tests/Feature/Operations/AssignmentFetchOperationRunFailureTest.php`
|
||||
|
||||
### Implementation for User Story 3
|
||||
|
||||
- [X] T027 [P] [US3] Replace `MicrosoftGraphClient` constructor type-hint with `GraphClientInterface` in `app/Services/Graph/AssignmentFetcher.php`
|
||||
- [X] T028 [P] [US3] Replace `MicrosoftGraphClient` constructor type-hint with `GraphClientInterface` in `app/Services/Graph/GroupResolver.php`
|
||||
- [X] T029 [P] [US3] Replace `MicrosoftGraphClient` constructor type-hint with `GraphClientInterface` in `app/Services/Graph/AssignmentFilterResolver.php`
|
||||
- [X] T030 [US3] Confirm container bindings remain canonical and no concrete client is injected directly by reviewing `app/Providers/AppServiceProvider.php`
|
||||
|
||||
**Checkpoint**: US3 enables deterministic tests and non-prod validation for assignment operations.
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: Polish & Cross-Cutting Concerns
|
||||
|
||||
**Purpose**: Formatting + targeted verification commands + confidence checks.
|
||||
|
||||
- [X] T031 [P] Run formatting for touched files via `./vendor/bin/sail bin pint --dirty` (see `specs/094-assignment-ops-observability-hardening/quickstart.md`)
|
||||
- [X] T032 Run targeted tests via `./vendor/bin/sail artisan test --compact tests/Feature/Operations tests/Feature/Rbac tests/Feature/Guards tests/Feature/Audit` (see `specs/094-assignment-ops-observability-hardening/quickstart.md`)
|
||||
- [X] T033 Run the full test suite via `./vendor/bin/sail artisan test --compact` (see `specs/094-assignment-ops-observability-hardening/quickstart.md`)
|
||||
- [X] T038 Update any requirement references in tasks/spec if FR numbering changes (keep traceability from FR-001..FR-012 to the task IDs)
|
||||
|
||||
---
|
||||
|
||||
## Dependencies & Execution Order
|
||||
|
||||
### Phase Dependencies
|
||||
|
||||
- **Setup (Phase 1)**: No dependencies.
|
||||
- **Foundational (Phase 2)**: Depends on Phase 1.
|
||||
- **US1 (Phase 3)**: Depends on Phase 2. No dependency on US2/US3.
|
||||
- **US2 (Phase 4)**: Depends on Phase 2. Independent from US1/US3.
|
||||
- **US3 (Phase 5)**: Depends on Phase 2. Independent from US1/US2.
|
||||
- **Polish (Phase 6)**: Depends on completing the desired stories.
|
||||
|
||||
### User Story Dependencies
|
||||
|
||||
- **US1 (P1)**: Standalone MVP.
|
||||
- **US2 (P2)**: Standalone hardening.
|
||||
- **US3 (P3)**: Standalone testability hardening.
|
||||
|
||||
---
|
||||
|
||||
## Parallel Example: User Story 1
|
||||
|
||||
Parallelizable test tasks: T007, T008, T009, T010 (different files under `tests/Feature/...`).
|
||||
|
||||
---
|
||||
|
||||
## Parallel Example: User Story 2
|
||||
|
||||
Parallelizable test tasks: T018, T019, T020 (different files under `tests/Feature/...`).
|
||||
|
||||
Parallelizable implementation tasks (after tests land): T021 + T022 (different files under `routes/` vs `app/Filament/...`).
|
||||
|
||||
---
|
||||
|
||||
## Parallel Example: User Story 3
|
||||
|
||||
Parallelizable implementation tasks: T027, T028, T029 (different files under `app/Services/Graph/...`).
|
||||
|
||||
---
|
||||
|
||||
## Implementation Strategy
|
||||
|
||||
### MVP First (User Story 1 Only)
|
||||
|
||||
1. Complete Phase 1 (Setup) + Phase 2 (Foundational)
|
||||
2. Complete Phase 3 (US1) including tests
|
||||
3. Validate Monitoring visibility + audit log semantics
|
||||
|
||||
### Incremental Delivery
|
||||
|
||||
1. US1 → deploy/demo
|
||||
2. US2 → deploy/demo
|
||||
3. US3 → deploy/demo
|
||||
Loading…
Reference in New Issue
Block a user