diff --git a/app/Filament/Pages/DriftLanding.php b/app/Filament/Pages/DriftLanding.php index 3b7c1d6..d69c67c 100644 --- a/app/Filament/Pages/DriftLanding.php +++ b/app/Filament/Pages/DriftLanding.php @@ -11,6 +11,7 @@ use App\Models\InventorySyncRun; use App\Models\Tenant; use App\Models\User; +use App\Notifications\RunStatusChangedNotification; use App\Services\BulkOperationService; use App\Services\Drift\DriftRunSelector; use App\Support\RunIdempotency; @@ -176,13 +177,24 @@ public function mount(): void return; } + if (! $user->canSyncTenant($tenant)) { + $this->state = 'blocked'; + $this->message = 'You can view existing drift findings and run history, but you do not have permission to generate drift.'; + + return; + } + $bulkOperationService = app(BulkOperationService::class); $run = $bulkOperationService->createRun( tenant: $tenant, user: $user, resource: 'drift', action: 'generate', - itemIds: [$scopeKey], + itemIds: [ + 'scope_key' => $scopeKey, + 'baseline_run_id' => (int) $baseline->getKey(), + 'current_run_id' => (int) $current->getKey(), + ], totalItems: 1, ); @@ -199,6 +211,20 @@ public function mount(): void scopeKey: $scopeKey, bulkOperationRunId: (int) $run->getKey(), ); + + $user->notify(new RunStatusChangedNotification([ + 'tenant_id' => (int) $tenant->getKey(), + 'run_type' => 'bulk_operation', + 'run_id' => (int) $run->getKey(), + 'status' => 'queued', + 'counts' => [ + 'total' => (int) $run->total_items, + 'processed' => (int) $run->processed_items, + 'succeeded' => (int) $run->succeeded, + 'failed' => (int) $run->failed, + 'skipped' => (int) $run->skipped, + ], + ])); } public function getFindingsUrl(): string diff --git a/app/Filament/Resources/BulkOperationRunResource.php b/app/Filament/Resources/BulkOperationRunResource.php index cfdde9b..ca9677e 100644 --- a/app/Filament/Resources/BulkOperationRunResource.php +++ b/app/Filament/Resources/BulkOperationRunResource.php @@ -7,6 +7,7 @@ use App\Models\Tenant; use BackedEnum; use Filament\Actions; +use Filament\Forms\Components\DatePicker; use Filament\Infolists\Components\TextEntry; use Filament\Infolists\Components\ViewEntry; use Filament\Resources\Resource; @@ -25,7 +26,9 @@ class BulkOperationRunResource extends Resource protected static string|BackedEnum|null $navigationIcon = 'heroicon-o-clock'; - protected static string|UnitEnum|null $navigationGroup = 'Backups & Restore'; + protected static string|UnitEnum|null $navigationGroup = 'Monitoring'; + + protected static ?string $navigationLabel = 'Operations'; public static function form(Schema $schema): Schema { @@ -44,8 +47,10 @@ public static function infolist(Schema $schema): Schema TextEntry::make('resource')->badge(), TextEntry::make('action')->badge(), TextEntry::make('status') + ->label('Outcome') ->badge() - ->color(fn (BulkOperationRun $record): string => static::statusColor($record->status)), + ->state(fn (BulkOperationRun $record): string => $record->statusBucket()) + ->color(fn (BulkOperationRun $record): string => static::statusBucketColor($record->statusBucket())), TextEntry::make('total_items')->label('Total')->numeric(), TextEntry::make('processed_items')->label('Processed')->numeric(), TextEntry::make('succeeded')->numeric(), @@ -58,6 +63,86 @@ public static function infolist(Schema $schema): Schema ->columns(2) ->columnSpanFull(), + Section::make('Related') + ->schema([ + TextEntry::make('related_backup_set') + ->label('Backup set') + ->state(function (BulkOperationRun $record): ?string { + $backupSetId = static::backupSetIdFromItemIds($record); + + if (! $backupSetId) { + return null; + } + + return "#{$backupSetId}"; + }) + ->url(function (BulkOperationRun $record): ?string { + $backupSetId = static::backupSetIdFromItemIds($record); + + if (! $backupSetId) { + return null; + } + + return BackupSetResource::getUrl('view', ['record' => $backupSetId], tenant: Tenant::current()); + }) + ->visible(fn (BulkOperationRun $record): bool => static::backupSetIdFromItemIds($record) !== null) + ->placeholder('—') + ->columnSpanFull(), + TextEntry::make('related_drift_findings') + ->label('Drift findings') + ->state('View') + ->url(function (BulkOperationRun $record): ?string { + if ($record->runType() !== 'drift.generate') { + return null; + } + + $payload = $record->item_ids ?? []; + if (! is_array($payload)) { + return FindingResource::getUrl('index', tenant: Tenant::current()); + } + + $scopeKey = null; + $baselineRunId = null; + $currentRunId = null; + + if (array_is_list($payload) && isset($payload[0]) && is_string($payload[0])) { + $scopeKey = $payload[0]; + } else { + $scopeKey = is_string($payload['scope_key'] ?? null) ? $payload['scope_key'] : null; + + if (is_numeric($payload['baseline_run_id'] ?? null)) { + $baselineRunId = (int) $payload['baseline_run_id']; + } + + if (is_numeric($payload['current_run_id'] ?? null)) { + $currentRunId = (int) $payload['current_run_id']; + } + } + + $tableFilters = []; + + if (is_string($scopeKey) && $scopeKey !== '') { + $tableFilters['scope_key'] = ['scope_key' => $scopeKey]; + } + + if (is_int($baselineRunId) || is_int($currentRunId)) { + $tableFilters['run_ids'] = [ + 'baseline_run_id' => $baselineRunId, + 'current_run_id' => $currentRunId, + ]; + } + + $parameters = $tableFilters !== [] ? ['tableFilters' => $tableFilters] : []; + + return FindingResource::getUrl('index', $parameters, tenant: Tenant::current()); + }) + ->visible(fn (BulkOperationRun $record): bool => $record->runType() === 'drift.generate') + ->placeholder('—') + ->columnSpanFull(), + ]) + ->visible(fn (BulkOperationRun $record): bool => in_array($record->runType(), ['backup_set.add_policies', 'drift.generate'], true)) + ->columnSpanFull(), + Section::make('Items') ->schema([ ViewEntry::make('item_ids') @@ -97,13 +182,112 @@ public static function table(Table $table): Table Tables\Columns\TextColumn::make('resource')->badge(), Tables\Columns\TextColumn::make('action')->badge(), Tables\Columns\TextColumn::make('status') + ->label('Outcome') ->badge() - ->color(fn (BulkOperationRun $record): string => static::statusColor($record->status)), + ->formatStateUsing(fn (BulkOperationRun $record): string => $record->statusBucket()) + ->color(fn (BulkOperationRun $record): string => static::statusBucketColor($record->statusBucket())), Tables\Columns\TextColumn::make('created_at')->since(), Tables\Columns\TextColumn::make('total_items')->label('Total')->numeric(), Tables\Columns\TextColumn::make('processed_items')->label('Processed')->numeric(), Tables\Columns\TextColumn::make('failed')->numeric(), ]) + ->filters([ + Tables\Filters\SelectFilter::make('run_type') + ->label('Run type') + ->options(fn (): array => static::runTypeOptions()) + ->query(function (Builder $query, array $data): Builder { + $value = $data['value'] ?? null; + + if (! is_string($value) || $value === '' || ! str_contains($value, '.')) { + return $query; + } + + [$resource, $action] = explode('.', $value, 2); + + if ($resource === '' || $action === '') { + return $query; + } + + return $query + ->where('resource', $resource) + ->where('action', $action); + }), + Tables\Filters\SelectFilter::make('status_bucket') + ->label('Status') + ->options([ + 'queued' => 'Queued', + 'running' => 'Running', + 'succeeded' => 'Succeeded', + 'partially succeeded' => 'Partially succeeded', + 'failed' => 'Failed', + ]) + ->query(function (Builder $query, array $data): Builder { + $value = $data['value'] ?? null; + + if (! is_string($value) || $value === '') { + return $query; + } + + $nonSkippedFailureSql = "EXISTS (SELECT 1 FROM jsonb_array_elements(COALESCE(failures, '[]'::jsonb)) AS elem WHERE (elem->>'type' IS NULL OR elem->>'type' <> 'skipped'))"; + + return match ($value) { + 'queued' => $query->where('status', 'pending'), + 'running' => $query->where('status', 'running'), + 'succeeded' => $query + ->whereIn('status', ['completed', 'completed_with_errors']) + ->where('failed', 0) + ->whereRaw("NOT {$nonSkippedFailureSql}"), + 'partially succeeded' => $query + ->whereNotIn('status', ['pending', 'running']) + ->where('succeeded', '>', 0) + ->where(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->where('failed', '>', 0)->orWhereRaw($nonSkippedFailureSql); + }), + 'failed' => $query + ->whereNotIn('status', ['pending', 'running']) + ->where(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->where(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->where('succeeded', 0) + ->where(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->where('failed', '>', 0)->orWhereRaw($nonSkippedFailureSql); + }); + })->orWhere(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->whereIn('status', ['failed', 'aborted']) + ->whereNot(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->where('succeeded', '>', 0) + ->where(function (Builder $q) use ($nonSkippedFailureSql): void { + $q->where('failed', '>', 0)->orWhereRaw($nonSkippedFailureSql); + }); + }); + }); + }), + default => $query, + }; + }), + Tables\Filters\Filter::make('created_at') + ->label('Created') + ->form([ + DatePicker::make('created_from') + ->label('From') + ->default(fn () => now()->subDays(30)), + DatePicker::make('created_until') + ->label('Until') + ->default(fn () => now()), + ]) + ->query(function (Builder $query, array $data): Builder { + $from = $data['created_from'] ?? null; + if ($from) { + $query->whereDate('created_at', '>=', $from); + } + + $until = $data['created_until'] ?? null; + if ($until) { + $query->whereDate('created_at', '<=', $until); + } + + return $query; + }), + ]) ->actions([ Actions\ViewAction::make(), ]) @@ -125,14 +309,65 @@ public static function getPages(): array ]; } - private static function statusColor(?string $status): string + /** + * @return array + */ + private static function runTypeOptions(): array { - return match ($status) { - 'completed' => 'success', - 'completed_with_errors' => 'warning', + $tenantId = Tenant::current()->getKey(); + + $knownTypes = [ + 'drift.generate' => 'drift.generate', + 'backup_set.add_policies' => 'backup_set.add_policies', + ]; + + $storedTypes = BulkOperationRun::query() + ->where('tenant_id', $tenantId) + ->select(['resource', 'action']) + ->distinct() + ->orderBy('resource') + ->orderBy('action') + ->get() + ->mapWithKeys(function (BulkOperationRun $run): array { + $type = "{$run->resource}.{$run->action}"; + + return [$type => $type]; + }) + ->all(); + + return array_replace($storedTypes, $knownTypes); + } + + private static function statusBucketColor(string $statusBucket): string + { + return match ($statusBucket) { + 'succeeded' => 'success', + 'partially succeeded' => 'warning', 'failed' => 'danger', 'running' => 'info', + 'queued' => 'gray', default => 'gray', }; } + + private static function backupSetIdFromItemIds(BulkOperationRun $record): ?int + { + if ($record->runType() !== 'backup_set.add_policies') { + return null; + } + + $payload = $record->item_ids ?? []; + if (! is_array($payload)) { + return null; + } + + $backupSetId = $payload['backup_set_id'] ?? null; + if (! is_numeric($backupSetId)) { + return null; + } + + $backupSetId = (int) $backupSetId; + + return $backupSetId > 0 ? $backupSetId : null; + } } diff --git a/app/Jobs/GenerateDriftFindingsJob.php b/app/Jobs/GenerateDriftFindingsJob.php index 353cb77..f5c75ec 100644 --- a/app/Jobs/GenerateDriftFindingsJob.php +++ b/app/Jobs/GenerateDriftFindingsJob.php @@ -5,6 +5,7 @@ use App\Models\BulkOperationRun; use App\Models\InventorySyncRun; use App\Models\Tenant; +use App\Notifications\RunStatusChangedNotification; use App\Services\BulkOperationService; use App\Services\Drift\DriftFindingGenerator; use Illuminate\Bus\Queueable; @@ -86,6 +87,8 @@ public function handle(DriftFindingGenerator $generator, BulkOperationService $b $bulkOperationService->recordSuccess($run); $bulkOperationService->complete($run); + + $this->notifyStatus($run->refresh()); } catch (Throwable $e) { Log::error('GenerateDriftFindingsJob: failed', [ 'tenant_id' => $this->tenantId, @@ -96,9 +99,86 @@ public function handle(DriftFindingGenerator $generator, BulkOperationService $b 'error' => $e->getMessage(), ]); + $bulkOperationService->recordFailure( + run: $run, + itemId: $this->scopeKey, + reason: $e->getMessage(), + reasonCode: 'unknown', + ); + $bulkOperationService->fail($run, $e->getMessage()); + $this->notifyStatus($run->refresh()); throw $e; } } + + private function notifyStatus(BulkOperationRun $run): void + { + try { + if (! $run->relationLoaded('user')) { + $run->loadMissing('user'); + } + + if (! $run->user) { + return; + } + + $status = 'failed'; + + try { + $status = $run->statusBucket(); + } catch (Throwable) { + $failureEntries = $run->failures ?? []; + $hasNonSkippedFailure = false; + foreach ($failureEntries as $entry) { + if (! is_array($entry)) { + continue; + } + + if (($entry['type'] ?? 'failed') !== 'skipped') { + $hasNonSkippedFailure = true; + break; + } + } + + $failedCount = (int) ($run->failed ?? 0); + $succeededCount = (int) ($run->succeeded ?? 0); + $hasFailures = $failedCount > 0 || $hasNonSkippedFailure; + + if ($succeededCount > 0 && $hasFailures) { + $status = 'partially succeeded'; + } elseif ($succeededCount === 0 && $hasFailures) { + $status = 'failed'; + } else { + $status = match ($run->status) { + 'pending' => 'queued', + 'running' => 'running', + 'completed', 'completed_with_errors' => 'succeeded', + default => 'failed', + }; + } + } + + $run->user->notify(new RunStatusChangedNotification([ + 'tenant_id' => (int) $run->tenant_id, + 'run_type' => 'bulk_operation', + 'run_id' => (int) $run->getKey(), + 'status' => $status, + 'counts' => [ + 'total' => (int) $run->total_items, + 'processed' => (int) $run->processed_items, + 'succeeded' => (int) $run->succeeded, + 'failed' => (int) $run->failed, + 'skipped' => (int) $run->skipped, + ], + ])); + } catch (Throwable $e) { + Log::warning('GenerateDriftFindingsJob: status notification failed', [ + 'tenant_id' => (int) $run->tenant_id, + 'bulk_operation_run_id' => (int) $run->getKey(), + 'error' => $e->getMessage(), + ]); + } + } } diff --git a/app/Models/BulkOperationRun.php b/app/Models/BulkOperationRun.php index 5880dba..5e5d135 100644 --- a/app/Models/BulkOperationRun.php +++ b/app/Models/BulkOperationRun.php @@ -51,4 +51,54 @@ public function auditLog(): BelongsTo { return $this->belongsTo(AuditLog::class); } + + public function runType(): string + { + return "{$this->resource}.{$this->action}"; + } + + public function statusBucket(): string + { + $status = $this->status; + + if ($status === 'pending') { + return 'queued'; + } + + if ($status === 'running') { + return 'running'; + } + + $succeededCount = (int) ($this->succeeded ?? 0); + $failedCount = (int) ($this->failed ?? 0); + $failureEntries = $this->failures ?? []; + $hasNonSkippedFailure = false; + + foreach ($failureEntries as $entry) { + if (! is_array($entry)) { + continue; + } + + if (($entry['type'] ?? 'failed') !== 'skipped') { + $hasNonSkippedFailure = true; + break; + } + } + + $hasFailures = $failedCount > 0 || $hasNonSkippedFailure; + + if ($succeededCount > 0 && $hasFailures) { + return 'partially succeeded'; + } + + if ($succeededCount === 0 && $hasFailures) { + return 'failed'; + } + + return match ($status) { + 'completed', 'completed_with_errors' => 'succeeded', + 'failed', 'aborted' => 'failed', + default => 'failed', + }; + } } diff --git a/app/Notifications/RunStatusChangedNotification.php b/app/Notifications/RunStatusChangedNotification.php index 0a0f6d0..37ae65b 100644 --- a/app/Notifications/RunStatusChangedNotification.php +++ b/app/Notifications/RunStatusChangedNotification.php @@ -44,7 +44,7 @@ public function toDatabase(object $notifiable): array 'queued' => 'Run queued', 'running' => 'Run started', 'completed', 'succeeded' => 'Run completed', - 'partial', 'completed_with_errors' => 'Run completed (partial)', + 'partial', 'partially succeeded', 'completed_with_errors' => 'Run completed (partial)', 'failed' => 'Run failed', default => 'Run updated', }; @@ -54,7 +54,7 @@ public function toDatabase(object $notifiable): array $color = match ($status) { 'queued', 'running' => 'gray', 'completed', 'succeeded' => 'success', - 'partial', 'completed_with_errors' => 'warning', + 'partial', 'partially succeeded', 'completed_with_errors' => 'warning', 'failed' => 'danger', default => 'gray', }; diff --git a/specs/053-unify-runs-monitoring/checklists/requirements.md b/specs/053-unify-runs-monitoring/checklists/requirements.md new file mode 100644 index 0000000..6f68a31 --- /dev/null +++ b/specs/053-unify-runs-monitoring/checklists/requirements.md @@ -0,0 +1,35 @@ +# Specification Quality Checklist: Unified Operations Runs + Monitoring Hub (053) + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-01-15 +**Feature**: [spec.md](../spec.md) + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- Items marked incomplete require spec updates before `/speckit.clarify` or `/speckit.plan` +- Validation run on 2026-01-15; no issues found. diff --git a/specs/053-unify-runs-monitoring/checklists/writing.md b/specs/053-unify-runs-monitoring/checklists/writing.md new file mode 100644 index 0000000..b4008f4 --- /dev/null +++ b/specs/053-unify-runs-monitoring/checklists/writing.md @@ -0,0 +1,78 @@ +# Requirements Writing Checklist: Unified Operations Runs + Monitoring Hub (053) + +**Purpose**: “Unit tests for English” to validate the clarity, completeness, and internal consistency of `spec.md` before implementation planning +**Created**: 2026-01-16 +**Feature**: [spec.md](../spec.md) + +**Note**: These items validate what the requirements *say* (or don’t say). They are not implementation/QA tests. + +## Requirement Completeness + +- [x] CHK001 Are all required run attributes explicitly enumerated (type, status, timestamps, initiator, label)? [Completeness, Spec §FR-001] — Evidence: Spec §FR-001 “...run type, scope/target (when applicable), status, timestamps (created/started/finished), initiator (user or system), and a human-readable label...” +- [x] CHK002 Are list requirements complete for Monitoring/Operations (default sort, default time window, and filterable fields/values)? [Completeness, Spec §FR-002] — Evidence: Spec §FR-002 “...sorted most-recent-first by default, defaulting to... (last 30 days), and supporting filtering by run type, status, and time range...” +- [x] CHK003 Are run detail requirements complete, including which summary counts are required and when they apply (total/succeeded/failed/skipped)? [Completeness, Spec §FR-003] — Evidence: Spec §FR-003 “For itemized operations... counts MUST include `total`, `succeeded`, `failed`, and `skipped` (if applicable).” +- [x] CHK004 Are Phase 1 included operations explicitly listed, and are all other candidate operations explicitly deferred? [Completeness, Spec §Scope & Assumptions] — Evidence: Spec §Scope & Assumptions “Phase 1 supported operations are: Drift generation; Backup Set “Add Policies”. All other candidate operations are explicitly deferred to Phase 2+...” +- [x] CHK005 Are “related artifact” link requirements defined per Phase 1 operation (drift findings link; backup set context link)? [Completeness, Spec §FR-012, Spec §FR-013] — Evidence: Spec §FR-012 “...run detail view MUST provide a link to that artifact.” + Spec §FR-013 “...link back to the related backup set context.” +- [x] CHK006 Are notification requirements complete across lifecycle events, including who receives them (initiator only vs tenant operators)? [Completeness, Spec §FR-011] — Evidence: Spec §FR-011 “Phase 1 notifications MUST be delivered to the initiating user...”; Spec §User Story 2 scenarios 4–5 define queued + completion notifications. + +## Requirement Clarity + +- [x] CHK007 Is “supported long-running operation” bounded and unambiguous for Phase 1 (what is included/excluded)? [Clarity, Spec §Scope & Assumptions] — Evidence: Spec §Scope & Assumptions “Phase 1 supported operations are: Drift generation; Backup Set “Add Policies”...” +- [x] CHK008 Is the run status set closed or open, and are optional statuses (e.g., canceled/aborted) explicitly addressed as included or excluded? [Clarity, Spec §FR-004] — Evidence: Spec §FR-004 “Phase 1 status set: `queued`, `running`, `succeeded`, `partially succeeded`, `failed`... Cancellation/abort outcomes are deferred to Phase 2.” +- [x] CHK009 Is the dedupe definition of “identical run” unambiguous (scope components, time window, and whether initiator matters)? [Clarity, Spec §FR-006] — Evidence: Spec §FR-006 “Identical means... same tenant... same run type... same scope/target... same effective inputs... The initiator MUST NOT be part of the identity...” +- [x] CHK010 Are “sanitized” and “minimized” failure detail requirements defined with explicit redaction expectations and allowed/forbidden content? [Clarity, Spec §FR-010] — Evidence: Spec §FR-010 “MUST NOT include secrets, credentials, tokens, PII, or full external payload dumps...” and defines allowed per-item references. +- [x] CHK011 Is “human-readable label” defined (format, stability, and whether localization is required) or explicitly deferred? [Clarity, Spec §FR-001] — Evidence: Spec §FR-001 “label MUST be a stable operator-facing description combining run type and scope/target (English-only in Phase 1; localization deferred to Phase 2).” +- [x] CHK012 Is the “View run” link requirement specific about its destination and the surfaces where it must appear (immediate confirmation vs lifecycle notifications)? [Clarity, Spec §FR-005, Spec §FR-011] — Evidence: Spec §FR-005 “...‘View run’ link that opens the run detail view...” + Spec §FR-011 “Notifications MUST include a ‘View run’ link that opens the run detail view.” + +## Requirement Consistency + +- [x] CHK013 Is status vocabulary consistent across the spec (e.g., “completed” vs “succeeded/partially succeeded/failed”)? [Consistency, Spec §FR-004, Spec §SC-001] — Evidence: Spec §FR-004 and Spec §SC-001 use the same status set: `queued` / `running` / `succeeded` / `partially succeeded` / `failed`. +- [x] CHK014 Are view-only constraints consistent across Clarifications, acceptance scenarios, and FR-014 (no manage controls in hub)? [Consistency, Spec §Clarifications, Spec §User Story 1, Spec §FR-014] — Evidence: Spec §Clarifications “Monitoring/Operations is view-only in Phase 1...” + Spec §User Story 1 scenario 5 “...view-only...” + Spec §FR-014 “...MUST be view-only...” +- [x] CHK015 Are permissions consistent across scenarios and FR-009 (who can view vs start), including `Readonly` restrictions? [Consistency, Spec §Scope & Assumptions, Spec §User Story 2, Spec §FR-009] — Evidence: Spec §Scope & Assumptions roles bullet defines `Readonly` view-only; Spec §User Story 2 scenario 2 denies start; Spec §FR-009 forbids `Readonly` start/manage. +- [x] CHK016 Are dedupe semantics consistent between user story scenarios, FR-006, and SC-003? [Consistency, Spec §User Story 3, Spec §FR-006, Spec §SC-003] — Evidence: Spec §User Story 3 scenario 2 (reuses existing run) + Spec §FR-006 (reuse queued/running) + Spec §SC-003 (no more than one active run). + +## Acceptance Criteria Quality + +- [x] CHK017 Does each user story have acceptance scenarios specific enough to validate without unstated assumptions (filters, links, permissions)? [Measurability, Spec §User Scenarios & Testing] — Evidence: Spec §User Story 1 scenarios specify default sort/window + filters + cross-tenant denial; Spec §User Story 2 scenarios specify notifications + background-unavailable behavior; Spec §User Story 3 scenarios specify findings link + failure summary. +- [x] CHK018 Are success criteria measurable with defined measurement methods/proxies (e.g., how “under 30 seconds” is assessed)? [Measurability, Spec §Success Criteria] — Evidence: Spec §SC-001 “measured via timed operator walkthroughs...” + Spec §SC-002/SC-003/SC-004 include measurement notes. +- [x] CHK019 Is “within 2 seconds under normal conditions” defined with explicit conditions/examples so it can be measured consistently? [Clarity, Spec §SC-002] — Evidence: Spec §SC-002 defines “normal conditions” (no active service degradation; typical tenant dataset sizes; excludes maintenance/outage windows). +- [x] CHK020 Are “99% of repeated-start attempts” and “95% of cases” scoped precisely (population, timeframe, and measurement approach)? [Clarity, Spec §SC-003, Spec §SC-004] — Evidence: Spec §SC-003 and §SC-004 specify Phase 1 population, rolling 30-day window, and measurement approaches. + +## Scenario Coverage + +- [x] CHK021 Are requirements/scenarios present for system-initiated runs (no interactive initiator) and how they appear in Monitoring/Operations? [Coverage, Spec §Scope & Assumptions, Spec §User Story 1] — Evidence: Spec §Scope & Assumptions “System-initiated runs may exist... initiator shown as ‘System’.” + Spec §User Story 1 scenario 9. +- [x] CHK022 Are scenarios present for cross-tenant access attempts for both list and detail views (and expected denial behavior)? [Coverage, Spec §User Story 1, Spec §FR-008] — Evidence: Spec §User Story 1 scenario 7 “...access is denied and no run data is disclosed.” + Spec §FR-008 “...MUST NOT disclose run existence or details.” +- [x] CHK023 Do scenarios cover related-artifact behavior across outcomes (results link on success; safe failure summary on failure)? [Coverage, Spec §User Story 3, Spec §FR-012] — Evidence: Spec §User Story 3 scenario 3 (findings link on success) + scenario 4 (safe failure summary on failure) + Spec §FR-012. +- [x] CHK024 Is the “permissions changed mid-run” scenario defined with explicit expected outcomes (viewability and notifications)? [Completeness, Spec §Edge Cases] — Evidence: Spec §Edge Cases “visibility is evaluated at time of access; and completion notifications are delivered only if the recipient remains authorized...” + +## Edge Case Coverage + +- [x] CHK025 Are Edge Cases written as explicit expected behaviors (not only open questions), or explicitly deferred with ownership/timing? [Completeness, Spec §Edge Cases] — Evidence: Spec §Edge Cases bullets are written as “If... MUST...” statements (no open placeholders). +- [x] CHK026 Is drift eligibility defined (minimum data needed) and the user-facing outcome when eligibility is not met? [Completeness, Spec §User Story 3, Spec §Edge Cases] — Evidence: Spec §User Story 3 scenario 5 “...not enough eligible data...” + Spec §Edge Cases references failing/refusing with reason code (e.g., `insufficient_data`) and actionable message. +- [x] CHK027 Are “very large scope” partial completion expectations defined (counts semantics; when partial vs failed applies)? [Clarity, Spec §User Story 1, Spec §FR-004] — Evidence: Spec §User Story 1 scenario 8 defines partial vs failed; Spec §FR-004 defines status semantics; Spec §Edge Cases addresses large scopes. +- [x] CHK028 Are failure-display requirements defined for sensitive underlying errors (what is shown vs redacted)? [Clarity, Spec §Edge Cases, Spec §FR-010] — Evidence: Spec §Edge Cases “...only stable reason codes and short sanitized messages are shown...” + Spec §FR-010 “MUST NOT include secrets... tokens, PII, or full external payload dumps.” + +## Non-Functional Requirements + +- [x] CHK029 Are confidentiality constraints for stored/displayed failures explicit and framed as hard requirements (no secrets/tokens/full payload dumps)? [Non-Functional, Security, Spec §FR-010] — Evidence: Spec §FR-010 “MUST NOT include secrets, credentials, tokens, PII, or full external payload dumps.” +- [x] CHK030 Are reliability expectations defined for background execution and notification delivery, including user-visible behavior when background execution is unavailable? [Completeness, Spec §Scope & Assumptions, Spec §FR-005, Spec §FR-011] — Evidence: Spec §FR-005 defines behavior when background execution is unavailable; Spec §FR-011 “If a notification cannot be delivered, Monitoring/Operations remains the source of truth...” +- [x] CHK031 Are scalability expectations defined for Monitoring/Operations usage (expected run volume, retention horizon, and list size constraints) or explicitly deferred? [Completeness, Spec §Scope & Assumptions] — Evidence: Spec §Scope & Assumptions “Run retention horizon and scale targets... are deferred to Phase 2 (Owner: Product)...” +- [x] CHK032 Are auditability requirements explicit beyond “initiator metadata” (what constitutes the audit trail for start/outcome/view)? [Completeness, Spec §Scope & Assumptions] — Evidence: Spec §Scope & Assumptions “Auditability for Phase 1 is achieved via the run record... and lifecycle notifications. Auditing ‘who viewed which run’ is deferred to Phase 2...” + +## Dependencies & Assumptions + +- [x] CHK033 Are Phase 1 adoption assumptions (which operations adopt the unified run model now vs later) stable, complete, and traceable to requirements? [Assumption, Spec §Scope & Assumptions, Spec §FR-002, Spec §FR-007, Spec §FR-013] — Evidence: Spec §Scope & Assumptions lists Phase 1 supported ops; Spec §FR-002 requires filtering for them; Spec §FR-007 and §FR-013 require run tracking for them. +- [x] CHK034 Are environment dependencies (background execution) mapped to requirement-level behavior when unmet (operator guidance, degraded mode)? [Completeness, Spec §Scope & Assumptions, Spec §FR-005] — Evidence: Spec §Scope & Assumptions describes dependency; Spec §FR-005 defines clear error + no misleading “queued” confirmation when unavailable. +- [x] CHK035 Is tenant isolation defined beyond “forbidden cross-tenant” with enough specificity to guide acceptance and review (e.g., scoping expectations)? [Clarity, Spec §FR-008, Spec §User Story 1] — Evidence: Spec §FR-008 “Tenant scoping MUST be applied before any filtering... MUST NOT disclose run existence or details.” + Spec §User Story 1 scenario 7. + +## Ambiguities & Conflicts + +- [x] CHK036 Are the roles `Owner`/`Manager`/`Operator`/`Readonly` defined or referenced so permission requirements are interpretable to reviewers? [Clarity, Spec §Scope & Assumptions] — Evidence: Spec §Scope & Assumptions roles bullet defines view/start capabilities per role. +- [x] CHK037 Is “Monitoring/Operations” named consistently and distinguished from per-feature start surfaces (avoid competing synonyms)? [Consistency, Spec §User Story 1, Spec §Scope & Assumptions, Spec §FR-014] — Evidence: Spec consistently uses “Monitoring/Operations” for the hub and states start controls remain in feature areas (Spec §Scope & Assumptions; Spec §FR-014). +- [x] CHK038 Is “run type” terminology consistent between Key Entities and filtering/permissions language (avoid conflicting synonyms like type/resource/action)? [Consistency, Spec §Key Entities, Spec §FR-002, Spec §FR-009] — Evidence: Spec §Key Entities defines “Run Type”; Spec §FR-002 and §FR-009 use “run type” for filtering/permissions. +- [x] CHK039 Is traceability complete: does every “MUST” requirement have at least one corresponding scenario and/or measurable success criterion (or an explicit rationale if not)? [Traceability, Spec §Functional Requirements, Spec §User Scenarios & Testing, Spec §Success Criteria] — Evidence: US1 scenarios cover FR-002/003/004/008/014; US2 scenarios cover FR-005/011/013; US3 scenarios cover FR-006/007/012; Success Criteria (SC-001–SC-005) provide measurable outcomes for the overall feature. + +## Notes + +- Mark items as completed: `[x]` +- Capture findings inline under the relevant item(s) (add links/quotes from `spec.md` as needed) diff --git a/specs/053-unify-runs-monitoring/contracts/admin-pages.openapi.yaml b/specs/053-unify-runs-monitoring/contracts/admin-pages.openapi.yaml new file mode 100644 index 0000000..ae3886f --- /dev/null +++ b/specs/053-unify-runs-monitoring/contracts/admin-pages.openapi.yaml @@ -0,0 +1,54 @@ +openapi: 3.0.3 +info: + title: TenantPilot Admin Monitoring Contracts (Feature 053) + version: 0.1.0 + description: | + Minimal page-render contracts for the Monitoring/Operations hub. + + These pages must render from the database only (no external tenant calls) + and display only sanitized failure detail (no secrets/tokens/raw payload dumps). + +servers: + - url: / + +paths: + /admin/t/{tenantExternalId}/bulk-operation-runs: + get: + operationId: monitoringOperationsIndex + summary: Monitoring → Operations (tenant-scoped) + parameters: + - name: tenantExternalId + in: path + required: true + schema: + type: string + responses: + '200': + description: Page renders successfully. + '302': + description: Redirect to login when unauthenticated. + + /admin/t/{tenantExternalId}/bulk-operation-runs/{bulkOperationRunId}: + get: + operationId: monitoringOperationsView + summary: Operation run detail (tenant-scoped) + parameters: + - name: tenantExternalId + in: path + required: true + schema: + type: string + - name: bulkOperationRunId + in: path + required: true + schema: + type: integer + responses: + '200': + description: Page renders successfully. + '302': + description: Redirect to login when unauthenticated. + '403': + description: Forbidden when attempting cross-tenant access. + +components: {} diff --git a/specs/053-unify-runs-monitoring/data-model.md b/specs/053-unify-runs-monitoring/data-model.md new file mode 100644 index 0000000..32f3527 --- /dev/null +++ b/specs/053-unify-runs-monitoring/data-model.md @@ -0,0 +1,107 @@ +# Data Model: Unified Operations Runs + Monitoring Hub (053) + +This feature primarily standardizes and surfaces existing run records for long-running operations, and links operators to the underlying business artifacts (e.g., drift findings). + +## Entities + +## 1) BulkOperationRun (`bulk_operation_runs`) + +**Purpose:** Canonical tenant-scoped run record for long-running operations (Phase 1). + +**Model:** `App\Models\BulkOperationRun` + +**Key fields (existing):** +- `id` (PK) +- `tenant_id` (FK → tenants) +- `user_id` (FK → users) +- `resource` (string) — e.g. `drift`, `backup_set` +- `action` (string) — e.g. `generate`, `add_policies` +- `idempotency_key` (string|null) +- `status` (string) — `pending`, `running`, `completed`, `completed_with_errors`, `failed`, `aborted` +- counters: `total_items`, `processed_items`, `succeeded`, `failed`, `skipped` +- `item_ids` (jsonb) — stable identifiers for the items/scope of the run + - Example (`drift.generate`): `{ scope_key, baseline_run_id, current_run_id }` + - Example (`backup_set.add_policies`): `{ backup_set_id, policy_ids, options }` +- `failures` (jsonb|null) — sanitized failure details (including per-item failures for itemized operations) +- `audit_log_id` (FK → audit_logs|null) +- `created_at`, `updated_at` + +**Relationships:** +- `BulkOperationRun belongsTo Tenant` +- `BulkOperationRun belongsTo User` +- `BulkOperationRun belongsTo AuditLog` (nullable) + +**Uniqueness / idempotency:** +- Active-run uniqueness enforced via a partial unique index on `(tenant_id, idempotency_key)` for active statuses. +- Idempotency keys are deterministic and stable per tenant + operation type + scope. + +**State transitions (storage):** +- `pending → running → completed | completed_with_errors | failed | aborted` + +**Status mapping (operator UI semantics):** +- `pending` → `queued` +- `running` → `running` +- `completed` → `succeeded` +- `completed_with_errors` → `partially succeeded` +- `failed`/`aborted` → `failed` + +**Failure entry shape (sanitized):** +- `reason_code` (string, stable) + `reason` (short sanitized message) +- for itemized runs: `item_id` per failure entry (and optional `type=skipped` for non-failure outcomes) + +--- + +## 2) Finding (`findings`) — Drift results + +**Purpose:** Persisted analytic findings; drift findings are the primary “related artifact” for Drift generation runs. + +**Model:** `App\Models\Finding` + +**Key fields (existing, drift-related):** +- `id` (PK) +- `tenant_id` (FK → tenants) +- `finding_type` (`drift`) +- `scope_key` (string) +- `baseline_run_id` (FK → inventory_sync_runs|null) +- `current_run_id` (FK → inventory_sync_runs|null) +- `fingerprint` (string; deterministic; unique per tenant) +- `subject_type`, `subject_external_id` +- `status` (`new|acknowledged`) +- `evidence_jsonb` (jsonb; sanitized allowlist) +- `created_at`, `updated_at` + +**Relationships:** +- `Finding belongsTo Tenant` +- `Finding belongsTo InventorySyncRun` via `baseline_run_id` and `current_run_id` (nullable) + +**Notes:** +- Phase 1 can link operators from the drift run to findings through scope/baseline/current identifiers without introducing a new DB foreign key. +- If later needed, introduce an explicit link (e.g., `findings.bulk_operation_run_id`) to make navigation and reporting easier. + +--- + +## 3) InventorySyncRun (`inventory_sync_runs`) — Drift inputs + +**Purpose:** “Last observed” inventory run records used as baseline/current inputs for drift comparisons. + +**Model:** `App\Models\InventorySyncRun` + +**Relevant fields (existing):** +- `tenant_id` +- `status` +- `selection_hash` (used as `scope_key`) +- `finished_at` + +--- + +## 4) Notification Event (DB notifications) + +**Purpose:** Persist run lifecycle notifications (queued/completed) linking operators to the run detail page. + +**Storage:** Laravel Notifications (DB channel). + +**Payload (target):** +- tenant identifier +- run identifier + type (`bulk_operation_run`) +- status bucket (queued/running/succeeded/partial/failed) +- summary counts and a safe error summary (when applicable) diff --git a/specs/053-unify-runs-monitoring/plan.md b/specs/053-unify-runs-monitoring/plan.md new file mode 100644 index 0000000..7c2794b --- /dev/null +++ b/specs/053-unify-runs-monitoring/plan.md @@ -0,0 +1,150 @@ +# Implementation Plan: Unified Operations Runs + Monitoring Hub (053) + +**Branch**: `feat/053-unify-runs-monitoring` | **Date**: 2026-01-16 | **Spec**: `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/spec.md` ([spec.md](spec.md)) +**Input**: Feature specification from `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/spec.md` + +**Note**: This plan is filled in by the `/speckit.plan` command. See `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/.specify/scripts/` for helper scripts. + +## Summary + +Unify how long-running operations are tracked and monitored by using the existing tenant-scoped run record (`BulkOperationRun`) as the canonical “operation run”, surfacing it in a single Monitoring/Operations hub (view-only, tenant-scoped, role-aware), and standardizing status semantics, notifications, failure detail minimization, and idempotent de-duplication. + +Phase 1 adoption scope (per clarifications): Drift generation + Backup Set “Add Policies”. + +## Technical Context + +**Language/Version**: PHP 8.4.15 (Laravel 12) +**Primary Dependencies**: Filament v4, Livewire v3 +**Storage**: PostgreSQL (JSONB for run `item_ids` and `failures`) +**Testing**: Pest v4 (PHPUnit v12) +**Target Platform**: Web application (Sail-first locally, Dokploy-first deploy) +**Project Type**: web +**Performance Goals**: Monitoring/Operations index renders in <1s for the most recent ~250 runs; start actions confirm and provide a “View run” link within 2 seconds (aligns with SC-002). +**Constraints**: Monitoring pages are DB-only and view-only; strict tenant isolation; no secrets/tokens stored; run failures use stable reason codes + short sanitized messages; itemized runs store per-item failures (sanitized). +**Scale/Scope**: Tenant-scoped run history across multiple modules; Phase 1 covers drift generation + backup set “add policies”, with more run producers added in later phases. + +## Constitution Check + +*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.* + +- Inventory-first: PASS. Monitoring uses persisted run records; drift generation is based on inventory sync “last observed” state and stores findings (not raw snapshots). +- Read/write separation: PASS. Monitoring/Operations is view-only (no start/rerun/cancel/delete). Write actions remain in their feature UIs and already use queued jobs + auditability. +- Graph contract path: PASS (no new Graph calls introduced by Monitoring). Existing Graph calls remain behind existing abstractions and must not occur during Monitoring page render. +- Deterministic capabilities: N/A for this feature (no new capability resolver). Existing idempotency key builder remains deterministic. +- Tenant isolation: PASS. Run list/view/start remain tenant-scoped; cross-tenant access is forbidden (403). +- Automation: PASS. Active-run de-duplication uses deterministic idempotency keys + partial unique indexes; runs remain observable with status + counts + safe errors. +- Data minimization: PASS. Failures are sanitized/minimized; no secrets/tokens/raw external payload dumps stored or displayed. + +**Gate status (pre-Phase 0)**: PASS (no violations). + +**Gate status (post-Phase 1)**: PASS (design artifacts present: research.md, data-model.md, contracts/*, quickstart.md). + +## Project Structure + +### Documentation (this feature) + +```text +/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/ +├── plan.md # This file (/speckit.plan command output) +├── spec.md # Feature specification (input) +├── checklists/ +│ └── requirements.md # Spec quality checklist +├── research.md # Phase 0 output +├── data-model.md # Phase 1 output +├── quickstart.md # Phase 1 output +├── contracts/ # Phase 1 output +└── tasks.md # Phase 2 output (/speckit.tasks command - NOT created by /speckit.plan) +``` + +### Source Code (repository root) +```text +app/ +├── Filament/ +│ ├── Pages/ +│ └── Resources/ +├── Jobs/ +├── Models/ +├── Policies/ +├── Services/ +└── Support/ + +config/ + +database/ +└── migrations/ + +routes/ + +tests/ +├── Feature/ +└── Unit/ +``` + +**Structure Decision**: Laravel web application. Implement Monitoring/Operations primarily via Filament (Resources/Pages) and reuse existing run-record primitives (`bulk_operation_runs`) with tenant-scoped policies and Pest tests. + +## Key Implementation Decisions (Pinned) + +- **Phase 1 scope**: Monitoring/Operations hub + Drift generation + Backup Set “Add Policies”. +- **Monitoring permissions**: `Owner`, `Manager`, `Operator`, `Readonly` can view. `Readonly` is strictly view-only. +- **Monitoring guardrail**: Monitoring/Operations is view-only in Phase 1 (no start/rerun/cancel/delete). +- **Status semantics**: UI-level statuses are consistent and testable: + - `partially succeeded` = at least one success and at least one failure + - `failed` = zero successes (or the run could not proceed) +- **Failure detail**: Stable reason codes + short sanitized messages; itemized operations include per-item failures (sanitized). + +## Execution Model + +### Run record primitive + +- Canonical run record: `App\Models\BulkOperationRun` (tenant-scoped) for Phase 1. +- Producers in Phase 1: + - Drift generation: `resource=drift`, `action=generate` + - Backup Set “Add Policies”: `resource=backup_set`, `action=add_policies` (or existing canonical action naming) + +### Status mapping (storage ↔ UI semantics) + +The UI MUST present consistent meanings while allowing storage to keep existing vocabulary: + +- `pending` → `queued` +- `running` → `running` +- `completed` → `succeeded` +- `completed_with_errors` → `partially succeeded` +- `failed` / `aborted` → `failed` + +### Idempotency & de-duplication + +- Deterministic idempotency key per tenant + operation type + scope via `App\Support\RunIdempotency`. +- Active-run reuse: if an identical run is `pending` or `running`, reuse it (return the existing run and link to it). +- Race reduction: rely on the existing partial unique index for active runs and handle collisions by “find existing and reuse”. + +### Notifications + +- Use DB notifications for “queued” and “completed” lifecycle events, linking to the run detail page. +- Notifications and persisted run failures must remain sanitized (no secrets/tokens/raw payloads). + +### Monitoring/Operations hub + +- Central list + filters for the active tenant: + - filter by `resource`/`action`, status bucket (queued/running/succeeded/partial/failed), and time range + - drill-down to run detail (status + counts + sanitized failures + item identifiers) +- View-only: no hub actions to start, rerun, cancel, or delete runs. + +## Definition of Done (ends at Phase 2 planning) + +### Phase 2 (MVP implementation readiness) + +- Monitoring/Operations navigation exists and lists tenant runs with the required filters and drill-down. +- Role guardrail enforced: `Readonly` can view list + detail but has no action controls. +- Status bucket semantics are consistent and testable (including partial vs failed). +- Drift generation and Backup Set “Add Policies” runs are visible and linkable from their feature entry points and from Monitoring/Operations. +- Design artifacts exist and are referenced by this plan: + - `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/research.md` + - `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/data-model.md` + - `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/contracts/` + - `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/quickstart.md` + +## Complexity Tracking + +> **Fill ONLY if Constitution Check has violations that must be justified** + +N/A (no constitution violations anticipated) diff --git a/specs/053-unify-runs-monitoring/quickstart.md b/specs/053-unify-runs-monitoring/quickstart.md new file mode 100644 index 0000000..6c0a46b --- /dev/null +++ b/specs/053-unify-runs-monitoring/quickstart.md @@ -0,0 +1,27 @@ +# Quickstart: Unified Operations Runs + Monitoring Hub (053) + +## Goal + +Provide a single Monitoring/Operations area (view-only) to observe tenant-scoped long-running runs with consistent status semantics, safe failure visibility, and links to related artifacts. Phase 1 scope includes Drift generation and Backup Set “Add Policies”. + +## Local development + +- Bring Sail up: `./vendor/bin/sail up -d` +- Run migrations: `./vendor/bin/sail artisan migrate` +- Run a queue worker (separate terminal): `./vendor/bin/sail artisan queue:work` + +## Testing + +Run the most relevant tests first: + +- Tenant scoping for runs: `./vendor/bin/sail artisan test tests/Feature/RunAuthorizationTenantIsolationTest.php` +- Drift generation run dispatch: `./vendor/bin/sail artisan test tests/Feature/Drift/DriftGenerationDispatchTest.php` +- Drift job notifications + failure details: `./vendor/bin/sail artisan test tests/Feature/Drift/GenerateDriftFindingsJobNotificationTest.php` +- Backup Set “Add Policies” job orchestration: `./vendor/bin/sail artisan test tests/Feature/BackupSets/AddPoliciesToBackupSetJobTest.php` +- Status bucket semantics: `./vendor/bin/sail artisan test tests/Unit/BulkOperationRunStatusBucketTest.php` + +## Operational notes + +- Monitoring pages must remain DB-only (no external tenant calls on render). +- Run failures must stay sanitized and must not contain secrets/tokens. +- Ensure queue workers are running in staging/production so runs complete outside interactive sessions. diff --git a/specs/053-unify-runs-monitoring/research.md b/specs/053-unify-runs-monitoring/research.md new file mode 100644 index 0000000..1bc4cce --- /dev/null +++ b/specs/053-unify-runs-monitoring/research.md @@ -0,0 +1,106 @@ +# Research: Unified Operations Runs + Monitoring Hub (053) + +This document resolves Phase 0 open questions and records design choices for Feature 053. + +## Decisions + +### 1) Canonical run record (Phase 1) + +**Decision:** Reuse the existing `bulk_operation_runs` / `App\Models\BulkOperationRun` as the canonical “operation run” record for Phase 1. + +**Rationale:** +- The codebase already uses `BulkOperationRun` for long-running background work (including Drift generation and Backup Set “Add Policies”). +- It already supports tenant scoping, initiator attribution, counts, and safe failure persistence. +- Avoids a high-risk cross-feature migration before we have proven consistent semantics across modules. + +**Alternatives considered:** +- **Create a new generic `operation_runs` (+ optional `operation_run_items`) model** and migrate all producers to it. + - Rejected (Phase 1): higher schema + refactor cost, higher coordination risk, and would slow down delivering the Monitoring hub. + +### 2) Monitoring/Operations hub surface + +**Decision:** Implement the Monitoring/Operations hub by evolving the existing Filament `BulkOperationRunResource` (navigation group/label + filters), rather than creating a new custom monitoring page in Phase 1. + +**Rationale:** +- The resource already provides a tenant-scoped list and a run detail view. +- Small changes deliver high value quickly and reduce risk. + +**Alternatives considered:** +- **New “Monitoring → Operations” Filament Page + bespoke table/detail.** + - Rejected (Phase 1): duplicates existing capabilities and increases maintenance. + +### 3) View-only guardrail and viewer roles + +**Decision:** Monitoring/Operations is view-only in Phase 1 and is visible to tenant roles `Owner`, `Manager`, `Operator`, and `Readonly`. Start/re-run controls remain in the respective feature UIs. + +**Rationale:** +- Adding run management actions implies introducing cancellation semantics, locks, permission matrices, and race handling across producers. +- View-only delivers the primary value (transparency + auditability) without expanding scope. + +**Alternatives considered:** +- **Add `Rerun` / `Cancel` actions in the hub.** + - Rejected (Phase 1): scope expansion into “run management”. +- **Restrict viewing to non-Readonly roles.** + - Rejected: increases “what happened?” support loops; viewing is safe when sanitized. + +### 4) Status semantics and mapping + +**Decision:** Standardize UI-level status semantics as `queued → running → (succeeded | partially succeeded | failed)` while allowing underlying storage to keep its current status vocabulary. + +- `partially succeeded` = at least one success and at least one failure. +- `failed` = zero successes (or the run could not proceed). +- `BulkOperationRun.status` mapping: `pending→queued`, `running→running`, `completed→succeeded`, `completed_with_errors→partially succeeded`, `failed/aborted→failed`. + +**Rationale:** +- Keeps the operator-facing meaning consistent and testable without forcing a broad “rename statuses everywhere” refactor. + +**Alternatives considered:** +- **Normalize all run status values across all run tables immediately.** + - Rejected (Phase 1): broad blast radius across many features and tests. + +### 5) Failure detail storage + +**Decision:** Persist stable reason codes and short sanitized messages for failures; itemized operations also store a sanitized per-item failures list. + +**Rationale:** +- Operators and support should understand failures without reading server logs. +- Per-item failures avoid rerunning large operations just to identify the affected item. + +**Alternatives considered:** +- **Summary-only failure storage.** + - Rejected: loses actionable “which item failed?” detail for itemized runs. +- **Logs-only (no persisted failure detail).** + - Rejected: weaker observability and not aligned with “safe, actionable failures”. + +### 6) Idempotency & de-duplication + +**Decision:** Use deterministic idempotency keys and active-run reuse as the primary dedupe mechanism: +- Key builder: `App\Support\RunIdempotency::buildKey(...)` with stable, sorted context. +- Active-run lookup: reuse when status is active (`pending`/`running`). +- Race reduction: rely on the existing partial unique index for active runs and handle collisions by finding and reusing the existing run. + +**Rationale:** +- Aligns with the constitution (“Automation must be Idempotent & Observable”). +- Durable across restarts and observable in the database. + +**Alternatives considered:** +- **Cache-only locks without persisted keys.** + - Rejected: less observable and easier to break across deploys/restarts. + +### 7) Phase 1 producer scope + +**Decision:** Phase 1 adopts the unified monitoring semantics for: +- Drift generation (`drift.generate`) +- Backup Set “Add Policies” (`backup_set.add_policies`) + +**Rationale:** +- Both are already using `BulkOperationRun` and provide immediate value in the Monitoring hub. +- Keeps Phase 1 bounded while proving the pattern across two modules. + +**Alternatives considered:** +- **Include every long-running producer in one pass.** + - Rejected (Phase 1): larger blast radius and higher coordination cost. + +## Notes + +- Retention/purge policy for run history should follow existing platform retention controls (defer to planning if changes are required). diff --git a/specs/053-unify-runs-monitoring/spec.md b/specs/053-unify-runs-monitoring/spec.md new file mode 100644 index 0000000..9bc2f81 --- /dev/null +++ b/specs/053-unify-runs-monitoring/spec.md @@ -0,0 +1,138 @@ +# Feature Specification: Unified Operations Runs + Monitoring Hub (053) + +**Feature Branch**: `feat/053-unify-runs-monitoring` +**Created**: 2026-01-15 +**Status**: Draft +**Input**: User description: "Unify long-running operations into a consistent, observable run model and provide a single Monitoring/Operations hub with tenant-scoped authorization, consistent status/count semantics, safe failure visibility, duplicate prevention, and Drift generation as a first-class tracked operation." + +## Clarifications + +### Session 2026-01-15 + +- Q: Who can view runs in Monitoring/Operations? → A: `Owner`, `Manager`, `Operator`, and `Readonly` (view-only; `Readonly` sees sanitized details but no action controls). +- Q: Which operations are included in Phase 1? → A: Monitoring/Operations hub + Drift generation + Backup Set “Add Policies”. +- Q: Should Monitoring/Operations include run management actions (rerun/cancel/delete)? → A: No; Monitoring/Operations is view-only in Phase 1 (start/re-run remain in feature UIs). +- Q: How is “partially succeeded” determined vs “failed”? → A: `partially succeeded` means both successes and failures occurred; `failed` means nothing succeeded (or the run could not proceed). +- Q: What failure detail should be stored and shown for runs? → A: Stable reason codes + short sanitized messages; itemized operations also include per-item failures (sanitized). + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - Monitor operations in one place (Priority: P1) + +As an operator, I want a single Monitoring/Operations area where I can see what operations are queued/running/succeeded/partially succeeded/failed for my tenant and drill into details, so I can quickly answer “what is happening” without jumping between modules. + +**Why this priority**: Provides immediate operational visibility, reduces time-to-diagnose failures, and prevents “run sprawl” (different screens/status meanings per module). + +**Independent Test**: Seed multiple runs with different types/statuses and verify the hub lists them tenant-scoped, supports filtering, and a run detail view is available with status, timing, summary counts, and safe error summaries. + +**Acceptance Scenarios**: + +1. **Given** I am signed into tenant A, **When** I open Monitoring → Operations, **Then** I see runs for tenant A sorted most-recent-first and the list defaults to the last 30 days. +2. **Given** the list contains multiple run types and statuses, **When** I filter by run type and status, **Then** I only see matching runs and can open a run detail page for any result. +3. **Given** I need an older or narrower timeframe, **When** I set a custom time range, **Then** the list updates to match the selected range. +4. **Given** I am a `Readonly` user in tenant A, **When** I open a run detail page, **Then** I can view status and sanitized failure information but I do not see controls to start, rerun, cancel, or delete runs. +5. **Given** I can view Monitoring/Operations, **When** I use the hub, **Then** it is view-only (no controls to start, rerun, cancel, or delete runs). +6. **Given** an itemized run has failures, **When** I open the run detail page, **Then** I can see which items failed with stable reason codes and short sanitized messages. +7. **Given** I attempt to access a run from another tenant (via list or direct link), **When** I request it, **Then** access is denied and no run data is disclosed. +8. **Given** a run has both successes and failures, **When** I view it, **Then** its outcome is “partially succeeded”; **And given** it has zero successes (or cannot proceed), **Then** its outcome is “failed”. +9. **Given** a run was initiated by the system, **When** I view it, **Then** the initiator is shown as “System”. + +--- + +### User Story 2 - Start long-running actions without waiting (Priority: P2) + +As an operator, when I trigger a long-running action, I want an immediate confirmation with a “View run” link so I can keep working while the operation completes in the background. + +**Why this priority**: Prevents timeouts and long waits, improves reliability under retries/double-clicks, and standardizes the start experience across modules. + +**Independent Test**: Start a supported operation (e.g., drift generation) and verify an operation run is created/reused, the UI returns quickly with a “View run” link, and the run progresses to a terminal status. + +**Acceptance Scenarios**: + +1. **Given** I have permission to start a supported operation in tenant A, **When** I start it, **Then** I receive immediate confirmation with a “View run” link and the run shows as queued or running. +2. **Given** I am a `Readonly` user in tenant A, **When** I attempt to start a supported operation, **Then** the system denies the request and no new run is created. +3. **Given** I start Backup Set “Add Policies”, **When** I submit the selection, **Then** I receive immediate confirmation with a “View run” link and the run is visible in Monitoring/Operations. +4. **Given** I start a supported operation successfully, **When** it is queued, **Then** I receive a “Queued” notification with a “View run” link to the run detail view. +5. **Given** my run reaches a terminal outcome, **When** that occurs, **Then** I receive a completion notification stating `succeeded`, `partially succeeded`, or `failed`, including summary counts when applicable. +6. **Given** background execution is unavailable, **When** I attempt to start a supported operation, **Then** I receive a clear message that the operation cannot be queued and I do not receive a misleading “queued” confirmation. + +--- + +### User Story 3 - Drift generation is observable like other operations (Priority: P3) + +As an operator, I want drift generation to behave like any other operation: it creates a run, shows progress and outcomes, and links to results so drift is easy to monitor and troubleshoot. + +**Why this priority**: Drift is an operator-facing workflow; making it observable avoids confusion and reduces support burden when it fails or takes time. + +**Independent Test**: Trigger drift generation for a defined scope, verify it appears in the monitoring hub, reaches a terminal status, and provides either a results link or a safe failure summary. + +**Acceptance Scenarios**: + +1. **Given** drift generation is available for a selected scope, **When** I trigger “Generate drift now”, **Then** I see a drift run in Monitoring and can open its details. +2. **Given** drift generation is already queued/running for the same scope, **When** I trigger it again, **Then** the system reuses the existing run and does not start a duplicate. +3. **Given** drift generation succeeds for a scope, **When** I open the run detail view, **Then** I see a link to the drift findings produced for that scope. +4. **Given** drift generation fails, **When** I open the run detail view, **Then** I see a safe failure summary (reason code + short sanitized message) and no sensitive data. +5. **Given** drift generation is requested but there is not enough eligible data to compare, **When** I trigger it, **Then** the system produces no findings and communicates a clear, actionable reason (for example, “insufficient data”). + +--- + +### Edge Cases + +- If drift generation is requested for a scope without enough eligible data to compare, the system MUST refuse to start the operation or MUST complete the run as `failed` with a stable reason code (for example, `insufficient_data`) and a short, actionable message. +- If repeated start attempts occur for the same tenant + run type + scope while an existing run is `queued` or `running`, the system MUST reuse the existing run and MUST NOT start duplicate background work. +- If the initiator’s permissions change while a run is in progress, the run continues; visibility is evaluated at time of access; and completion notifications are delivered only if the recipient remains authorized to view the run. +- Failure details MUST remain sanitized even when underlying errors contain sensitive data: only stable reason codes and short sanitized messages are shown; secrets/tokens/raw payload dumps are never shown. +- For very large scopes, the system MAY summarize or truncate per-item failure listings in the UI, but it MUST preserve accurate summary counts and MUST indicate when failure details are truncated. + +## Requirements *(mandatory)* + +**Constitution alignment (required):** If this feature introduces any new external tenant reads/writes or any write/change behavior, +the spec MUST describe safety gates (preview/confirmation/audit), tenant isolation, auditability, and tests. This feature is primarily about observability and standardization; monitoring views MUST be read-only and MUST NOT trigger external data collection. + +### Scope & Assumptions + +- This feature standardizes tracking and monitoring for long-running operations that already exist in the product. +- **Phase 1 supported operations** are: Drift generation; Backup Set “Add Policies”. All other candidate operations are explicitly deferred to Phase 2+ adoption (for example: inventory sync, directory group sync, snapshot/backup capture, restore execution, cross-tenant comparison/promotion). +- System-initiated runs may exist (for example, scheduled operations); they appear in Monitoring/Operations with initiator shown as “System”. Notification routing for system-initiated runs is deferred to Phase 2 (Owner: Product) to avoid unintended noise. +- **Roles in this spec**: `Owner`, `Manager`, and `Operator` can view Monitoring/Operations and can start Phase 1 supported operations; `Readonly` can view Monitoring/Operations but cannot start or manage runs and must not see those controls. +- Cross-tenant monitoring aggregation is out of scope; monitoring is tenant-scoped. +- Advanced dashboards (charts, badges, progress widgets) are out of scope; focus is on consistent run tracking, filtering, and drill-down. +- Run retention horizon and scale targets (volume per tenant, archiving/export) are deferred to Phase 2 (Owner: Product) to align with real usage data and storage constraints. Phase 1 focuses on operational clarity and defaults the list view to a recent time window. +- This feature assumes background execution is enabled in each environment so long-running operations can complete outside of interactive user sessions; when it is unavailable, the system must communicate clearly and must not mislead operators into thinking work is queued. +- Monitoring/Operations is view-only in Phase 1; run start/re-run controls remain in their respective feature areas. +- Auditability for Phase 1 is achieved via the run record (initiator, timestamps, outcome, counts, safe failure summary) and lifecycle notifications. Auditing “who viewed which run” is deferred to Phase 2 (Owner: Product). + +### Functional Requirements + +- **FR-001**: System MUST track each supported long-running operation as a tenant-scoped run with: run type, scope/target (when applicable), status, timestamps (created/started/finished), initiator (user or system), and a human-readable label. The label MUST be a stable operator-facing description combining run type and scope/target (English-only in Phase 1; localization deferred to Phase 2). +- **FR-002**: System MUST provide a Monitoring/Operations area that lists runs for the current tenant, sorted most-recent-first by default, defaulting to a recent time window (last 30 days), and supporting filtering by run type, status, and time range. Status filter values MUST include: `queued`, `running`, `succeeded`, `partially succeeded`, `failed`. Run type filtering MUST include the Phase 1 supported operations (Drift generation; Backup Set “Add Policies”). +- **FR-003**: System MUST provide a run detail view that shows status/outcome, timing, summary counts, and safe failure summaries. For itemized operations (operations that process a set of items), counts MUST include `total`, `succeeded`, `failed`, and `skipped` (if applicable). +- **FR-004**: System MUST use consistent run status semantics across run types using the Phase 1 status set: `queued`, `running`, `succeeded`, `partially succeeded`, `failed`. Status meanings MUST be unambiguous: `partially succeeded` indicates at least one success and at least one failure; `failed` indicates zero successes (or the run could not proceed). Cancellation/abort outcomes are deferred to Phase 2. +- **FR-005**: When an operator starts a supported long-running operation, the system MUST provide immediate confirmation and a “View run” link that opens the run detail view without blocking on completion. If background execution is unavailable, the system MUST provide a clear error and MUST NOT present a misleading “queued” confirmation. +- **FR-006**: System MUST avoid duplicate runs for the same tenant + run type + scope when an identical run is already `queued` or `running` by reusing the existing run. “Identical” means the same tenant, the same run type, the same scope/target, and the same effective inputs (for example: the same drift scope selection; the same backup set and selected policies). The initiator MUST NOT be part of the identity for duplicate prevention. +- **FR-007**: Drift generation MUST be tracked as a run and MUST surface completion status and either a link to produced findings or an actionable, safe failure summary. +- **FR-008**: Run list, run view, and run start actions MUST be tenant-scoped and forbidden cross-tenant. Tenant scoping MUST be applied before any filtering or lookup to prevent cross-tenant data leakage, and cross-tenant access attempts MUST NOT disclose run existence or details. +- **FR-009**: Run visibility and run start actions MUST be permission-gated by run type (least privilege). By default, `Owner`, `Manager`, `Operator`, and `Readonly` can view runs, but `Readonly` MUST NOT be able to start or manage runs (and must not see those controls). +- **FR-010**: Failure information stored and displayed for runs MUST be sanitized and minimized; it MUST NOT include secrets, credentials, tokens, PII, or full external payload dumps. Runs MUST store stable reason codes with short sanitized messages. Itemized operations MUST additionally store a sanitized per-item failures list that identifies the affected item using a non-sensitive reference (for example, an item name and/or ID that is safe to display) plus reason code and short message. +- **FR-011**: System MUST provide consistent user notifications for run lifecycle events: when queued and when reaching a terminal outcome (`succeeded` / `partially succeeded` / `failed`). Notifications MUST include a “View run” link that opens the run detail view. Phase 1 notifications MUST be delivered to the initiating user; notification routing for system-initiated runs is deferred to Phase 2. If a notification cannot be delivered, Monitoring/Operations remains the source of truth for run status. +- **FR-012**: Where a run produces or relates to a separate business artifact (for example, drift findings), the run detail view MUST provide a link to that artifact. +- **FR-013**: Backup Set “Add Policies” MUST be tracked as a run and MUST surface terminal outcome status, summary counts, and a link back to the related backup set context. +- **FR-014**: Monitoring/Operations MUST be view-only in Phase 1 and MUST NOT offer controls to start, rerun, cancel, or delete runs. + +### Key Entities *(include if feature involves data)* + +- **Run**: A tenant-scoped record representing the execution of a long-running operation, including status, timestamps, summary counts, and safe failure information. +- **Failure Detail**: For a run, a stable reason code plus a short sanitized message; for itemized operations, a sanitized list of per-item failures to identify affected items without exposing sensitive data. +- **Run Type**: A classification of what the run does (e.g., inventory sync, backup capture, restore, drift generation) used for filtering, permissions, and consistent status/count semantics. +- **Run Scope/Target**: The object or scope the run applies to (e.g., a backup set, a policy, a drift scope), used for duplicate prevention and navigation. +- **Related Artifact**: A separate business record produced by an operation (e.g., restore details, drift findings) that can be linked from the run. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: An operator can determine the current state (`queued` / `running` / `succeeded` / `partially succeeded` / `failed`) of any recent operation in under 30 seconds using Monitoring/Operations (measured via timed operator walkthroughs using Monitoring/Operations only). +- **SC-002**: Starting a Phase 1 supported long-running operation provides user-visible confirmation and a “View run” link within 2 seconds under normal conditions (normal conditions: no active service degradation and typical tenant dataset sizes; excludes maintenance/outage windows; measured via timed start flows during operator walkthroughs). +- **SC-003**: For identical Phase 1 operation requests (same tenant + run type + scope + effective inputs), the system creates no more than one active run at a time (`queued` or `running`) in at least 99% of repeated-start attempts over a rolling 30-day window (measured by sampling repeated-start attempts and counting resulting active runs). +- **SC-004**: For Phase 1 terminal runs, operators can identify a clear outcome (`succeeded` / `partially succeeded` / `failed`) and a short, non-sensitive failure reason (when applicable) using Monitoring/Operations without inspecting server logs in at least 95% of cases over a rolling 30-day window (measured via support/operator triage review of run records). +- **SC-005**: Operator- or support-reported incidents caused by “unknown/stuck status” long-running operations decrease by at least 50% within one release cycle after Phase 1 adoption (measured via support ticket tagging/categorization). diff --git a/specs/053-unify-runs-monitoring/tasks.md b/specs/053-unify-runs-monitoring/tasks.md new file mode 100644 index 0000000..f7377d8 --- /dev/null +++ b/specs/053-unify-runs-monitoring/tasks.md @@ -0,0 +1,174 @@ +--- + +description: "Task list for implementing Unified Operations Runs + Monitoring Hub (053)" +--- + +# Tasks: Unified Operations Runs + Monitoring Hub (053) + +**Input**: Design documents from `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/053-unify-runs-monitoring/` +**Prerequisites**: plan.md (required), spec.md (required), research.md, data-model.md, contracts/, quickstart.md + +**Tests**: Not explicitly requested in spec.md. Add/adjust Pest tests as needed during implementation; validate with the existing test suite. + +**Organization**: Tasks are grouped by user story so each story can be implemented and validated independently. + +## Format: `- [ ] T### [P?] [US#?] Description with file path` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[US#]**: User story mapping (US1/US2/US3). Setup/Foundational/Polish tasks have no story label. + +## Path Conventions (Laravel) + +- App code: `app/` +- Filament admin: `app/Filament/` +- Livewire: `app/Livewire/` +- Jobs: `app/Jobs/` +- DB: `database/migrations/` +- Views: `resources/views/` +- Tests (Pest): `tests/Feature/`, `tests/Unit/` + +--- + +## Phase 1: Setup (Shared Infrastructure) + +**Purpose**: Confirm baseline assumptions and align documentation artifacts with the codebase. + +- [x] T001 [P] Confirm “Monitoring/Operations hub = evolve BulkOperationRunResource” decision remains correct and update notes if needed in specs/053-unify-runs-monitoring/research.md +- [x] T002 [P] Verify Filament URLs match contracts (index/view) and update specs/053-unify-runs-monitoring/contracts/admin-pages.openapi.yaml if paths differ + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Shared building blocks required by all user stories. + +**⚠️ CRITICAL**: No user story work should begin until this phase is complete. + +- [x] T003 Add `runType()` and `statusBucket()` accessors (queued/running/succeeded/partial/failed) to app/Models/BulkOperationRun.php +- [x] T004 [P] Confirm `Readonly` users can view run list/detail tenant-scoped (and only view) by reviewing/updating app/Policies/BulkOperationRunPolicy.php + +**Checkpoint**: Foundation ready — Monitoring UI and run producers can reuse consistent status semantics. + +--- + +## Phase 3: User Story 1 - Monitor operations in one place (Priority: P1) 🎯 MVP + +**Goal**: Provide a single Monitoring/Operations area to list and drill into tenant runs with consistent status semantics and safe failure visibility. + +**Independent Test**: Visit Monitoring → Operations for a tenant with runs; filter by type/status; open a run and confirm counts + sanitized failures are visible; verify `Readonly` sees view-only UI. + +### Implementation + +- [x] T005 [US1] Move Operations runs into “Monitoring” navigation and label it “Operations” in app/Filament/Resources/BulkOperationRunResource.php +- [x] T006 [US1] Render status badges using `statusBucket()` (not raw status) in app/Filament/Resources/BulkOperationRunResource.php +- [x] T007 [US1] Add filters for run type (`resource.action`) and status bucket in app/Filament/Resources/BulkOperationRunResource.php +- [x] T008 [US1] Add time range filter (created_at from/to) in app/Filament/Resources/BulkOperationRunResource.php +- [x] T009 [US1] Add a “Related” section on the run detail view linking to the relevant feature context (e.g., Backup Set for `backup_set.add_policies`) in app/Filament/Resources/BulkOperationRunResource.php + +**Checkpoint**: US1 complete — operators can monitor and drill into runs in one place. + +--- + +## Phase 4: User Story 2 - Start long-running actions without waiting (Priority: P2) + +**Goal**: Starting a supported long-running operation is non-blocking and provides immediate confirmation + “View run” link; unauthorized users cannot start. + +**Independent Test**: Trigger Drift generation and Backup Set “Add Policies”; confirm immediate feedback with “View run” link; confirm `Readonly` cannot start drift generation and no run is created. + +### Implementation + +- [x] T010 [US2] Prevent drift generation from being started by `Readonly` users (blocked state + message) in app/Filament/Pages/DriftLanding.php +- [x] T011 [US2] Emit a queued DB notification with “View run” link when Drift generation is queued in app/Filament/Pages/DriftLanding.php +- [x] T012 [P] [US2] Emit Drift completion and failure DB notifications with “View run” link in app/Jobs/GenerateDriftFindingsJob.php + +**Checkpoint**: US2 complete — start UX is consistent and permission-gated. + +--- + +## Phase 5: User Story 3 - Drift generation is observable like other operations (Priority: P3) + +**Goal**: Drift generation creates/reuses a run, surfaces safe failure details, and links operators to results. + +**Independent Test**: Trigger Drift generation; observe it in Monitoring → Operations; open the run and follow a link to Drift findings; simulate failure and confirm safe failure reason is visible on the run. + +### Implementation + +- [x] T013 [US3] Store Drift context (scope_key, baseline_run_id, current_run_id) inside the run payload so Monitoring can link to results in app/Filament/Pages/DriftLanding.php +- [x] T014 [P] [US3] Record a sanitized failure entry (reason_code + short message) into `BulkOperationRun.failures` when Drift generation fails in app/Jobs/GenerateDriftFindingsJob.php +- [x] T015 [US3] Add a “Drift findings” link for `drift.generate` runs in the run detail “Related” section in app/Filament/Resources/BulkOperationRunResource.php + +**Checkpoint**: US3 complete — drift runs are actionable and consistent with other operations. + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Final alignment, validation, and guardrails. + +- [x] T016 [P] Update operator-facing notes and validation commands in specs/053-unify-runs-monitoring/quickstart.md (only if implementation changes) +- [x] T017 [P] Update docs to match implementation if needed: specs/053-unify-runs-monitoring/spec.md and specs/053-unify-runs-monitoring/data-model.md +- [x] T018 Run formatting on changed PHP files with `./vendor/bin/pint --dirty` (reference: specs/053-unify-runs-monitoring/quickstart.md) +- [x] T019 Run targeted validation commands from specs/053-unify-runs-monitoring/quickstart.md (queue worker optional; run relevant Pest tests) +- [x] T020 [P] Re-verify contracts match real URLs and access behavior in specs/053-unify-runs-monitoring/contracts/admin-pages.openapi.yaml + +--- + +## Dependencies & Execution Order + +### Dependency Graph (User Stories) + +```text +Phase 1 (Setup) ─┬─> Phase 2 (Foundational) ─┬─> US1 (P1) ─┬─> Polish + │ ├─> US2 (P2) │ + │ └─> US3 (P3) ┘ + └──────────────────────────────────────────── +``` + +### User Story Dependencies + +- **US1** depends on Phase 2 (Foundational); independent of US2/US3. +- **US2** depends on Phase 2 (Foundational); independent of US1/US3. +- **US3** depends on Phase 2 (Foundational) and benefits from US1 (Monitoring visibility) but can be implemented independently. + +--- + +## Parallel Execution Examples + +### US1 (Monitoring UI) + +```text +After Phase 2 is complete, one developer can focus on: +- app/Filament/Resources/BulkOperationRunResource.php (T005–T009) +``` + +### US2 (Start UX / Notifications) + +```text +These can be done in parallel after Phase 2: +- app/Filament/Pages/DriftLanding.php (T010–T011) +- app/Jobs/GenerateDriftFindingsJob.php (T012) +``` + +### US3 (Drift observability) + +```text +These can be done in parallel after Phase 2: +- app/Filament/Pages/DriftLanding.php (T013) +- app/Jobs/GenerateDriftFindingsJob.php (T014) +``` + +--- + +## Implementation Strategy + +### MVP First (US1 Only) + +1. Complete Phase 1 + Phase 2 +2. Complete US1 (Phase 3) and validate Monitoring/Operations end-to-end +3. Ship/demonstrate Monitoring value before expanding run producer behavior + +### Incremental Delivery + +1. US1 (Monitoring hub) → validates visibility/auditability +2. US2 (start guardrails + notifications) → standardizes operator feedback +3. US3 (drift linking + safe failure detail) → makes drift runs fully actionable diff --git a/tests/Feature/Drift/DriftGenerationDispatchTest.php b/tests/Feature/Drift/DriftGenerationDispatchTest.php index 09fc6fa..5807cc4 100644 --- a/tests/Feature/Drift/DriftGenerationDispatchTest.php +++ b/tests/Feature/Drift/DriftGenerationDispatchTest.php @@ -1,9 +1,11 @@ resource)->toBe('drift'); expect($bulkRun->action)->toBe('generate'); expect($bulkRun->status)->toBe('pending'); + expect($bulkRun->item_ids)->toBe([ + 'scope_key' => $scopeKey, + 'baseline_run_id' => (int) $baseline->getKey(), + 'current_run_id' => (int) $current->getKey(), + ]); + + $this->assertDatabaseHas('notifications', [ + 'notifiable_id' => $user->getKey(), + 'notifiable_type' => $user->getMorphClass(), + 'type' => RunStatusChangedNotification::class, + ]); + + $notification = $user->notifications()->latest('id')->first(); + expect($notification)->not->toBeNull(); + expect($notification->data['actions'][0]['url'] ?? null) + ->toBe(BulkOperationRunResource::getUrl('view', ['record' => $bulkRun->getKey()], tenant: $tenant)); Queue::assertPushed(GenerateDriftFindingsJob::class, function (GenerateDriftFindingsJob $job) use ($tenant, $user, $baseline, $current, $scopeKey, $bulkRun): bool { return $job->tenantId === (int) $tenant->getKey() @@ -127,3 +145,30 @@ Queue::assertNothingPushed(); expect(BulkOperationRun::query()->where('tenant_id', $tenant->getKey())->count())->toBe(0); }); + +test('opening Drift does not dispatch generation for readonly users', function () { + Queue::fake(); + + [$user, $tenant] = createUserWithTenant(role: 'readonly'); + $this->actingAs($user); + Filament::setTenant($tenant, true); + + $scopeKey = hash('sha256', 'scope-readonly-blocked'); + + InventorySyncRun::factory()->for($tenant)->create([ + 'selection_hash' => $scopeKey, + 'status' => InventorySyncRun::STATUS_SUCCESS, + 'finished_at' => now()->subDays(2), + ]); + + InventorySyncRun::factory()->for($tenant)->create([ + 'selection_hash' => $scopeKey, + 'status' => InventorySyncRun::STATUS_SUCCESS, + 'finished_at' => now()->subDay(), + ]); + + Livewire::test(DriftLanding::class); + + Queue::assertNothingPushed(); + expect(BulkOperationRun::query()->where('tenant_id', $tenant->getKey())->count())->toBe(0); +}); diff --git a/tests/Feature/Drift/GenerateDriftFindingsJobNotificationTest.php b/tests/Feature/Drift/GenerateDriftFindingsJobNotificationTest.php new file mode 100644 index 0000000..0100470 --- /dev/null +++ b/tests/Feature/Drift/GenerateDriftFindingsJobNotificationTest.php @@ -0,0 +1,143 @@ +for($tenant)->create([ + 'selection_hash' => $scopeKey, + 'status' => InventorySyncRun::STATUS_SUCCESS, + 'finished_at' => now()->subDays(2), + ]); + + $current = InventorySyncRun::factory()->for($tenant)->create([ + 'selection_hash' => $scopeKey, + 'status' => InventorySyncRun::STATUS_SUCCESS, + 'finished_at' => now()->subDay(), + ]); + + $run = BulkOperationRun::factory()->create([ + 'tenant_id' => $tenant->getKey(), + 'user_id' => $user->getKey(), + 'resource' => 'drift', + 'action' => 'generate', + 'status' => 'pending', + 'total_items' => 1, + 'processed_items' => 0, + 'succeeded' => 0, + 'failed' => 0, + 'skipped' => 0, + 'failures' => [], + ]); + + $this->mock(DriftFindingGenerator::class, function (MockInterface $mock) { + $mock->shouldReceive('generate')->once()->andReturn(0); + }); + + $job = new GenerateDriftFindingsJob( + tenantId: (int) $tenant->getKey(), + userId: (int) $user->getKey(), + baselineRunId: (int) $baseline->getKey(), + currentRunId: (int) $current->getKey(), + scopeKey: $scopeKey, + bulkOperationRunId: (int) $run->getKey(), + ); + + $job->handle(app(DriftFindingGenerator::class), app(BulkOperationService::class)); + + expect($run->refresh()->status)->toBe('completed'); + + $this->assertDatabaseHas('notifications', [ + 'notifiable_id' => $user->getKey(), + 'notifiable_type' => $user->getMorphClass(), + 'type' => RunStatusChangedNotification::class, + ]); + + $notification = $user->notifications()->latest('id')->first(); + expect($notification)->not->toBeNull(); + expect($notification->data['actions'][0]['url'] ?? null) + ->toBe(BulkOperationRunResource::getUrl('view', ['record' => $run->getKey()], tenant: $tenant)); +}); + +test('drift generation job sends failure notification with view link', function () { + [$user, $tenant] = createUserWithTenant(role: 'manager'); + + $scopeKey = hash('sha256', 'scope-job-notification-failure'); + + $baseline = InventorySyncRun::factory()->for($tenant)->create([ + 'selection_hash' => $scopeKey, + 'status' => InventorySyncRun::STATUS_SUCCESS, + 'finished_at' => now()->subDays(2), + ]); + + $current = InventorySyncRun::factory()->for($tenant)->create([ + 'selection_hash' => $scopeKey, + 'status' => InventorySyncRun::STATUS_SUCCESS, + 'finished_at' => now()->subDay(), + ]); + + $run = BulkOperationRun::factory()->create([ + 'tenant_id' => $tenant->getKey(), + 'user_id' => $user->getKey(), + 'resource' => 'drift', + 'action' => 'generate', + 'status' => 'pending', + 'total_items' => 1, + 'processed_items' => 0, + 'succeeded' => 0, + 'failed' => 0, + 'skipped' => 0, + 'failures' => [], + ]); + + $this->mock(DriftFindingGenerator::class, function (MockInterface $mock) { + $mock->shouldReceive('generate')->once()->andThrow(new RuntimeException('boom')); + }); + + $job = new GenerateDriftFindingsJob( + tenantId: (int) $tenant->getKey(), + userId: (int) $user->getKey(), + baselineRunId: (int) $baseline->getKey(), + currentRunId: (int) $current->getKey(), + scopeKey: $scopeKey, + bulkOperationRunId: (int) $run->getKey(), + ); + + try { + $job->handle(app(DriftFindingGenerator::class), app(BulkOperationService::class)); + } catch (RuntimeException) { + // Expected. + } + + $run->refresh(); + + expect($run->status)->toBe('failed') + ->and($run->processed_items)->toBe(1) + ->and($run->failed)->toBe(1) + ->and($run->failures)->toBeArray() + ->and($run->failures)->toHaveCount(1) + ->and($run->failures[0]['item_id'] ?? null)->toBe($scopeKey) + ->and($run->failures[0]['reason_code'] ?? null)->toBe('unknown') + ->and($run->failures[0]['reason'] ?? null)->toBe('boom'); + + $this->assertDatabaseHas('notifications', [ + 'notifiable_id' => $user->getKey(), + 'notifiable_type' => $user->getMorphClass(), + 'type' => RunStatusChangedNotification::class, + ]); + + $notification = $user->notifications()->latest('id')->first(); + expect($notification)->not->toBeNull(); + expect($notification->data['actions'][0]['url'] ?? null) + ->toBe(BulkOperationRunResource::getUrl('view', ['record' => $run->getKey()], tenant: $tenant)); +}); diff --git a/tests/Feature/RunAuthorizationTenantIsolationTest.php b/tests/Feature/RunAuthorizationTenantIsolationTest.php index 8553667..2190fb9 100644 --- a/tests/Feature/RunAuthorizationTenantIsolationTest.php +++ b/tests/Feature/RunAuthorizationTenantIsolationTest.php @@ -56,3 +56,31 @@ ->get(BulkOperationRunResource::getUrl('view', ['record' => $runB], tenant: $tenantA)) ->assertForbidden(); }); + +test('readonly users can view bulk operation runs for their tenant', function () { + $tenant = Tenant::factory()->create(); + + $run = BulkOperationRun::factory()->create([ + 'tenant_id' => $tenant->getKey(), + 'resource' => 'drift', + 'action' => 'generate', + ]); + + $user = User::factory()->create(); + $user->tenants()->syncWithoutDetaching([ + $tenant->getKey() => ['role' => 'readonly'], + ]); + + $this->actingAs($user) + ->get(BulkOperationRunResource::getUrl('index', tenant: $tenant)) + ->assertOk() + ->assertSee('drift') + ->assertSee('generate'); + + $this->actingAs($user) + ->get(BulkOperationRunResource::getUrl('view', ['record' => $run], tenant: $tenant)) + ->assertOk() + ->assertSee('drift') + ->assertSee('generate') + ->assertSee('Drift findings'); +}); diff --git a/tests/Unit/BulkOperationRunStatusBucketTest.php b/tests/Unit/BulkOperationRunStatusBucketTest.php new file mode 100644 index 0000000..4ee4e1d --- /dev/null +++ b/tests/Unit/BulkOperationRunStatusBucketTest.php @@ -0,0 +1,70 @@ +create([ + 'resource' => 'drift', + 'action' => 'generate', + ]); + + expect($run->runType())->toBe('drift.generate'); +}); + +test('bulk operation statusBucket maps pending and running', function () { + $pending = BulkOperationRun::factory()->create(['status' => 'pending']); + $running = BulkOperationRun::factory()->create(['status' => 'running']); + + expect($pending->statusBucket())->toBe('queued') + ->and($running->statusBucket())->toBe('running'); +}); + +test('bulk operation statusBucket maps terminal outcomes using counts', function () { + $succeeded = BulkOperationRun::factory()->create([ + 'status' => 'completed', + 'succeeded' => 3, + 'failed' => 0, + ]); + + $partial = BulkOperationRun::factory()->create([ + 'status' => 'completed_with_errors', + 'succeeded' => 2, + 'failed' => 1, + ]); + + $failedWithErrors = BulkOperationRun::factory()->create([ + 'status' => 'completed_with_errors', + 'succeeded' => 0, + 'failed' => 4, + ]); + + $failedAfterProgress = BulkOperationRun::factory()->create([ + 'status' => 'failed', + 'succeeded' => 1, + 'failed' => 1, + ]); + + $partialWithNonCountedFailures = BulkOperationRun::factory()->create([ + 'status' => 'completed_with_errors', + 'succeeded' => 1, + 'failed' => 0, + 'failures' => [ + [ + 'type' => 'foundation', + 'item_id' => 'foundation', + 'reason' => 'Forbidden', + 'reason_code' => 'graph_forbidden', + 'timestamp' => now()->toIso8601String(), + ], + ], + ]); + + expect($succeeded->statusBucket())->toBe('succeeded') + ->and($partial->statusBucket())->toBe('partially succeeded') + ->and($failedWithErrors->statusBucket())->toBe('failed') + ->and($failedAfterProgress->statusBucket())->toBe('partially succeeded') + ->and($partialWithNonCountedFailures->statusBucket())->toBe('partially succeeded'); +});