feat: harden operation lifecycle monitoring
This commit is contained in:
parent
8426741068
commit
59fc90a4db
4
.github/agents/copilot-instructions.md
vendored
4
.github/agents/copilot-instructions.md
vendored
@ -100,6 +100,8 @@ ## Active Technologies
|
||||
- PostgreSQL-backed existing records such as `operation_runs`, tenant governance records, onboarding workflow state, and provider connection state; no new business-domain table is required for the first slice (157-reason-code-translation)
|
||||
- PHP 8.4.15 + Laravel 12, Filament v5, Livewire v4, Pest v4, existing `BadgeCatalog` / `BadgeRenderer` / `OperatorOutcomeTaxonomy`, `ReasonPresenter`, `OperationRunService`, `TenantReviewReadinessGate`, existing baseline/evidence/review/review-pack resources and canonical pages (158-artifact-truth-semantics)
|
||||
- PostgreSQL with existing JSONB-backed `summary`, `summary_jsonb`, and `context` payloads on baseline snapshots, evidence snapshots, tenant reviews, review packs, and operation runs; no new primary storage required for the first slice (158-artifact-truth-semantics)
|
||||
- PHP 8.4.15 + Laravel 12, Filament 5, Livewire 4, Pest 4, Laravel queue workers, existing `OperationRunService`, `TrackOperationRun`, `OperationUxPresenter`, `ReasonPresenter`, `BadgeCatalog` domain badges, and current Operations Monitoring pages (160-operation-lifecycle-guarantees)
|
||||
- PostgreSQL for `operation_runs`, `jobs`, and `failed_jobs`; JSONB-backed `context`, `summary_counts`, and `failure_summary`; configuration in `config/queue.php` and `config/tenantpilot.php` (160-operation-lifecycle-guarantees)
|
||||
|
||||
- PHP 8.4.15 (feat/005-bulk-operations)
|
||||
|
||||
@ -119,8 +121,8 @@ ## Code Style
|
||||
PHP 8.4.15: Follow standard conventions
|
||||
|
||||
## Recent Changes
|
||||
- 160-operation-lifecycle-guarantees: Added PHP 8.4.15 + Laravel 12, Filament 5, Livewire 4, Pest 4, Laravel queue workers, existing `OperationRunService`, `TrackOperationRun`, `OperationUxPresenter`, `ReasonPresenter`, `BadgeCatalog` domain badges, and current Operations Monitoring pages
|
||||
- 159-baseline-snapshot-truth: Added PHP 8.4 + Laravel 12, Filament v5, Livewire v4
|
||||
- 158-artifact-truth-semantics: Added PHP 8.4.15 + Laravel 12, Filament v5, Livewire v4, Pest v4, existing `BadgeCatalog` / `BadgeRenderer` / `OperatorOutcomeTaxonomy`, `ReasonPresenter`, `OperationRunService`, `TenantReviewReadinessGate`, existing baseline/evidence/review/review-pack resources and canonical pages
|
||||
- 157-reason-code-translation: Added PHP 8.4.15 + Laravel 12, Filament v5, Livewire v4, PostgreSQL, Laravel Sail, Pest v4
|
||||
<!-- MANUAL ADDITIONS START -->
|
||||
<!-- MANUAL ADDITIONS END -->
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
use App\Models\OperationRun;
|
||||
use App\Models\Tenant;
|
||||
use App\Services\OperationRunService;
|
||||
use App\Services\Operations\OperationLifecycleReconciler;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use Illuminate\Console\Command;
|
||||
|
||||
@ -18,8 +19,10 @@ class TenantpilotReconcileBackupScheduleOperationRuns extends Command
|
||||
|
||||
protected $description = 'Reconcile stuck backup schedule OperationRuns without legacy run-table lookups.';
|
||||
|
||||
public function handle(OperationRunService $operationRunService): int
|
||||
{
|
||||
public function handle(
|
||||
OperationRunService $operationRunService,
|
||||
OperationLifecycleReconciler $operationLifecycleReconciler,
|
||||
): int {
|
||||
$tenantIdentifiers = array_values(array_filter((array) $this->option('tenant')));
|
||||
$olderThanMinutes = max(0, (int) $this->option('older-than'));
|
||||
$dryRun = (bool) $this->option('dry-run');
|
||||
@ -96,31 +99,9 @@ public function handle(OperationRunService $operationRunService): int
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($operationRun->status === 'queued' && $operationRunService->isStaleQueuedRun($operationRun, max(1, $olderThanMinutes))) {
|
||||
if (! $dryRun) {
|
||||
$operationRunService->failStaleQueuedRun($operationRun, 'Backup schedule run was queued but never started.');
|
||||
}
|
||||
|
||||
$reconciled++;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($operationRun->status === 'running') {
|
||||
if (! $dryRun) {
|
||||
$operationRunService->updateRun(
|
||||
$operationRun,
|
||||
status: 'completed',
|
||||
outcome: OperationRunOutcome::Failed->value,
|
||||
failures: [
|
||||
[
|
||||
'code' => 'backup_schedule.stalled',
|
||||
'message' => 'Backup schedule run exceeded reconciliation timeout and was marked failed.',
|
||||
],
|
||||
],
|
||||
);
|
||||
}
|
||||
$change = $operationLifecycleReconciler->reconcileRun($operationRun, $dryRun);
|
||||
|
||||
if ($change !== null) {
|
||||
$reconciled++;
|
||||
|
||||
continue;
|
||||
|
||||
105
app/Console/Commands/TenantpilotReconcileOperationRuns.php
Normal file
105
app/Console/Commands/TenantpilotReconcileOperationRuns.php
Normal file
@ -0,0 +1,105 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Console\Commands;
|
||||
|
||||
use App\Models\Tenant;
|
||||
use App\Services\Operations\OperationLifecycleReconciler;
|
||||
use App\Support\Operations\OperationLifecyclePolicy;
|
||||
use Illuminate\Console\Command;
|
||||
|
||||
/**
 * Artisan command that hands stale operation runs to the lifecycle reconciler
 * and prints a summary of what was (or, in dry-run mode, would be) changed.
 */
class TenantpilotReconcileOperationRuns extends Command
{
    protected $signature = 'tenantpilot:operation-runs:reconcile
        {--type=* : Limit reconciliation to one or more covered operation types}
        {--tenant=* : Limit reconciliation to tenant_id or tenant external_id}
        {--workspace=* : Limit reconciliation to workspace ids}
        {--limit=100 : Maximum number of active runs to inspect}
        {--dry-run : Report the changes without writing them}';

    protected $description = 'Reconcile stale covered operation runs back to deterministic terminal truth.';

    /**
     * Normalise the CLI options, run the reconciler and report the outcome.
     *
     * Returns FAILURE without reconciling anything when a tenant or workspace
     * filter was supplied but none of its values could be resolved; otherwise
     * returns SUCCESS.
     */
    public function handle(
        OperationLifecycleReconciler $reconciler,
        OperationLifecyclePolicy $policy,
    ): int {
        $types = array_values(array_filter(
            (array) $this->option('type'),
            static fn (mixed $type): bool => is_string($type) && trim($type) !== '',
        ));

        $requestedWorkspaces = (array) $this->option('workspace');
        $workspaceIds = array_values(array_filter(
            array_map(
                static fn (mixed $workspaceId): int => is_numeric($workspaceId) ? (int) $workspaceId : 0,
                $requestedWorkspaces,
            ),
            static fn (int $workspaceId): bool => $workspaceId > 0,
        ));

        $tenantIdentifiers = array_values(array_filter((array) $this->option('tenant')));
        $tenantIds = $this->resolveTenantIds($tenantIdentifiers);
        $dryRun = (bool) $this->option('dry-run');

        // An explicit filter that resolves to nothing must not silently widen the
        // scope: an empty id list is presumably treated as "no filter" by the
        // reconciler (NOTE(review): confirm), which would touch every tenant/workspace.
        if ($tenantIdentifiers !== [] && $tenantIds === []) {
            $this->error('None of the provided --tenant values matched a tenant; nothing was reconciled.');

            return self::FAILURE;
        }

        if ($requestedWorkspaces !== [] && $workspaceIds === []) {
            $this->error('None of the provided --workspace values is a valid workspace id; nothing was reconciled.');

            return self::FAILURE;
        }

        // No explicit --type means "every type the lifecycle policy covers".
        if ($types === []) {
            $types = $policy->coveredTypeNames();
        }

        $result = $reconciler->reconcile([
            'types' => $types,
            'tenant_ids' => $tenantIds,
            'workspace_ids' => $workspaceIds,
            'limit' => max(1, (int) $this->option('limit')),
            'dry_run' => $dryRun,
        ]);

        $rows = collect($result['changes'] ?? [])
            ->map(static function (array $change): array {
                return [
                    'Run' => (string) ($change['operation_run_id'] ?? '—'),
                    'Type' => (string) ($change['type'] ?? '—'),
                    'Reason' => (string) ($change['reason_code'] ?? '—'),
                    'Applied' => (($change['applied'] ?? false) === true) ? 'yes' : 'no',
                ];
            })
            ->values()
            ->all();

        if ($rows !== []) {
            $this->table(['Run', 'Type', 'Reason', 'Applied'], $rows);
        }

        $this->info(sprintf(
            'Inspected %d run(s); reconciled %d; skipped %d.',
            (int) ($result['candidates'] ?? 0),
            (int) ($result['reconciled'] ?? 0),
            (int) ($result['skipped'] ?? 0),
        ));

        if ($dryRun) {
            $this->comment('Dry-run: no changes written.');
        }

        return self::SUCCESS;
    }

    /**
     * Resolve tenant identifiers to unique tenant primary keys.
     *
     * Identifiers that do not match a tenant are reported with a warning and
     * left out of the result.
     *
     * @param  array<int, string>  $tenantIdentifiers
     * @return array<int, int>
     */
    private function resolveTenantIds(array $tenantIdentifiers): array
    {
        if ($tenantIdentifiers === []) {
            return [];
        }

        $tenantIds = [];

        foreach ($tenantIdentifiers as $identifier) {
            $tenant = Tenant::query()->forTenant($identifier)->first();

            if (! $tenant instanceof Tenant) {
                $this->warn(sprintf('Tenant "%s" was not found and is ignored.', (string) $identifier));

                continue;
            }

            $tenantIds[] = (int) $tenant->getKey();
        }

        return array_values(array_unique($tenantIds));
    }
}
|
||||
@ -34,6 +34,7 @@
|
||||
use Filament\Tables\Contracts\HasTable;
|
||||
use Filament\Tables\Filters\SelectFilter;
|
||||
use Filament\Tables\Table;
|
||||
use Illuminate\Contracts\View\View;
|
||||
use Illuminate\Database\Eloquent\Builder;
|
||||
use Symfony\Component\HttpKernel\Exception\NotFoundHttpException;
|
||||
use UnitEnum;
|
||||
@ -42,8 +43,6 @@ class AuditLog extends Page implements HasTable
|
||||
{
|
||||
use InteractsWithTable;
|
||||
|
||||
public ?int $selectedAuditLogId = null;
|
||||
|
||||
protected static bool $isDiscovered = false;
|
||||
|
||||
protected static bool $shouldRegisterNavigation = false;
|
||||
@ -82,14 +81,15 @@ public static function actionSurfaceDeclaration(): ActionSurfaceDeclaration
|
||||
public function mount(): void
|
||||
{
|
||||
$this->authorizePageAccess();
|
||||
$this->selectedAuditLogId = is_numeric(request()->query('event')) ? (int) request()->query('event') : null;
|
||||
$requestedEventId = is_numeric(request()->query('event')) ? (int) request()->query('event') : null;
|
||||
|
||||
app(CanonicalAdminTenantFilterState::class)->sync($this->getTableFiltersSessionKey(), request: request());
|
||||
|
||||
$this->mountInteractsWithTable();
|
||||
|
||||
if ($this->selectedAuditLogId !== null) {
|
||||
$this->selectedAuditLog();
|
||||
if ($requestedEventId !== null) {
|
||||
$this->resolveAuditLog($requestedEventId);
|
||||
$this->mountTableAction('inspect', (string) $requestedEventId);
|
||||
}
|
||||
}
|
||||
|
||||
@ -98,31 +98,10 @@ public function mount(): void
|
||||
*/
|
||||
protected function getHeaderActions(): array
|
||||
{
|
||||
$actions = app(OperateHubShell::class)->headerActions(
|
||||
return app(OperateHubShell::class)->headerActions(
|
||||
scopeActionName: 'operate_hub_scope_audit_log',
|
||||
returnActionName: 'operate_hub_return_audit_log',
|
||||
);
|
||||
|
||||
if ($this->selectedAuditLog() instanceof AuditLogModel) {
|
||||
$actions[] = Action::make('clear_selected_audit_event')
|
||||
->label('Close details')
|
||||
->color('gray')
|
||||
->action(function (): void {
|
||||
$this->clearSelectedAuditLog();
|
||||
});
|
||||
|
||||
$relatedLink = $this->selectedAuditLink();
|
||||
|
||||
if (is_array($relatedLink)) {
|
||||
$actions[] = Action::make('open_selected_audit_target')
|
||||
->label($relatedLink['label'])
|
||||
->icon('heroicon-o-arrow-top-right-on-square')
|
||||
->color('gray')
|
||||
->url($relatedLink['url']);
|
||||
}
|
||||
}
|
||||
|
||||
return $actions;
|
||||
}
|
||||
|
||||
public function table(Table $table): Table
|
||||
@ -195,9 +174,16 @@ public function table(Table $table): Table
|
||||
->label('Inspect event')
|
||||
->icon('heroicon-o-eye')
|
||||
->color('gray')
|
||||
->action(function (AuditLogModel $record): void {
|
||||
$this->selectedAuditLogId = (int) $record->getKey();
|
||||
}),
|
||||
->slideOver()
|
||||
->stickyModalHeader()
|
||||
->modalSubmitAction(false)
|
||||
->modalCancelAction(fn (Action $action): Action => $action->label('Close details'))
|
||||
->modalHeading(fn (AuditLogModel $record): string => $record->summaryText())
|
||||
->modalDescription(fn (AuditLogModel $record): ?string => $record->recorded_at?->toDayDateTimeString())
|
||||
->modalContent(fn (AuditLogModel $record): View => view('filament.pages.monitoring.partials.audit-log-inspect-event', [
|
||||
'selectedAudit' => $record,
|
||||
'selectedAuditLink' => $this->auditTargetLink($record),
|
||||
])),
|
||||
])
|
||||
->bulkActions([])
|
||||
->emptyStateHeading('No audit events match this view')
|
||||
@ -209,48 +195,11 @@ public function table(Table $table): Table
|
||||
->icon('heroicon-o-x-mark')
|
||||
->color('gray')
|
||||
->action(function (): void {
|
||||
$this->selectedAuditLogId = null;
|
||||
$this->resetTable();
|
||||
}),
|
||||
]);
|
||||
}
|
||||
|
||||
public function clearSelectedAuditLog(): void
|
||||
{
|
||||
$this->selectedAuditLogId = null;
|
||||
}
|
||||
|
||||
public function selectedAuditLog(): ?AuditLogModel
|
||||
{
|
||||
if (! is_numeric($this->selectedAuditLogId)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$record = $this->auditBaseQuery()
|
||||
->whereKey((int) $this->selectedAuditLogId)
|
||||
->first();
|
||||
|
||||
if (! $record instanceof AuditLogModel) {
|
||||
throw new NotFoundHttpException;
|
||||
}
|
||||
|
||||
return $record;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array{label: string, url: string}|null
|
||||
*/
|
||||
public function selectedAuditLink(): ?array
|
||||
{
|
||||
$record = $this->selectedAuditLog();
|
||||
|
||||
if (! $record instanceof AuditLogModel) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return app(RelatedNavigationResolver::class)->auditTargetLink($record);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return array<int, Tenant>
|
||||
*/
|
||||
@ -323,6 +272,27 @@ private function auditBaseQuery(): Builder
|
||||
->latestFirst();
|
||||
}
|
||||
|
||||
/**
 * Load one audit log entry through the page's base query.
 *
 * @throws NotFoundHttpException when the base query yields no entry for the given key
 */
private function resolveAuditLog(int $auditLogId): AuditLogModel
{
    $match = $this->auditBaseQuery()->whereKey($auditLogId)->first();

    return $match instanceof AuditLogModel
        ? $match
        : throw new NotFoundHttpException;
}
|
||||
|
||||
/**
 * Ask the related-navigation resolver for the link that belongs to an audit entry.
 *
 * @return array{label: string, url: string}|null
 */
private function auditTargetLink(AuditLogModel $record): ?array
{
    $resolver = app(RelatedNavigationResolver::class);

    return $resolver->auditTargetLink($record);
}
|
||||
|
||||
/**
|
||||
* @return array<string, string>
|
||||
*/
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
use App\Support\OperateHub\OperateHubShell;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Operations\OperationLifecyclePolicy;
|
||||
use App\Support\Workspaces\WorkspaceContext;
|
||||
use BackedEnum;
|
||||
use Filament\Actions\Action;
|
||||
@ -165,6 +166,68 @@ public function table(Table $table): Table
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Count the runs in the current scope that need lifecycle attention.
 *
 * - `reconciled`: runs whose context has a `reconciliation.reconciled_at` value.
 * - `likely_stale`: queued or running runs of a policy-covered type that are
 *   older than the policy's stale threshold for that type and state.
 *
 * Both counts are zero when no scoped query is available, and `likely_stale`
 * is zero when the policy covers no operation types.
 *
 * @return array{likely_stale:int,reconciled:int}
 */
public function lifecycleVisibilitySummary(): array
{
    $baseQuery = $this->scopedSummaryQuery();

    if (! $baseQuery instanceof Builder) {
        return [
            'likely_stale' => 0,
            'reconciled' => 0,
        ];
    }

    $reconciled = (clone $baseQuery)
        ->whereNotNull('context->reconciliation->reconciled_at')
        ->count();

    $policy = app(OperationLifecyclePolicy::class);

    // Resolve every cutoff against a single "now" so all types are compared
    // to the same instant and the policy is consulted once per type.
    $now = now();
    $thresholds = [];

    foreach ($policy->coveredTypeNames() as $type) {
        $thresholds[] = [
            'type' => $type,
            'queued_before' => $now->copy()->subSeconds($policy->queuedStaleAfterSeconds($type)),
            'running_before' => $now->copy()->subSeconds($policy->runningStaleAfterSeconds($type)),
        ];
    }

    // Without covered types the nested where() below would add no constraints,
    // and the query builder drops empty groups, so every active run would be
    // counted as stale.
    if ($thresholds === []) {
        return [
            'likely_stale' => 0,
            'reconciled' => $reconciled,
        ];
    }

    $likelyStale = (clone $baseQuery)
        ->whereIn('status', [
            OperationRunStatus::Queued->value,
            OperationRunStatus::Running->value,
        ])
        ->where(function (Builder $query) use ($thresholds): void {
            foreach ($thresholds as $threshold) {
                $query->orWhere(function (Builder $typeQuery) use ($threshold): void {
                    $typeQuery
                        ->where('type', $threshold['type'])
                        ->where(function (Builder $stateQuery) use ($threshold): void {
                            $stateQuery
                                // Queued and never started for longer than the queued threshold.
                                ->where(function (Builder $queuedQuery) use ($threshold): void {
                                    $queuedQuery
                                        ->where('status', OperationRunStatus::Queued->value)
                                        ->whereNull('started_at')
                                        ->where('created_at', '<=', $threshold['queued_before']);
                                })
                                // Running for longer than the running threshold; falls back
                                // to created_at when started_at was never recorded.
                                ->orWhere(function (Builder $runningQuery) use ($threshold): void {
                                    $runningQuery
                                        ->where('status', OperationRunStatus::Running->value)
                                        ->where(function (Builder $startedAtQuery) use ($threshold): void {
                                            $startedAtQuery
                                                ->where('started_at', '<=', $threshold['running_before'])
                                                ->orWhere(function (Builder $fallbackQuery) use ($threshold): void {
                                                    $fallbackQuery
                                                        ->whereNull('started_at')
                                                        ->where('created_at', '<=', $threshold['running_before']);
                                                });
                                        });
                                });
                        });
                });
            }
        })
        ->count();

    return [
        'likely_stale' => $likelyStale,
        'reconciled' => $reconciled,
    ];
}
|
||||
|
||||
private function applyActiveTab(Builder $query): Builder
|
||||
{
|
||||
return match ($this->activeTab) {
|
||||
@ -187,4 +250,26 @@ private function applyActiveTab(Builder $query): Builder
|
||||
default => $query,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Base OperationRun query limited to the current workspace and, when a numeric
 * tenant filter is set on the table (or stored in the session), to that tenant.
 *
 * Returns null when no current workspace id is available.
 */
private function scopedSummaryQuery(): ?Builder
{
    $workspaceId = app(WorkspaceContext::class)->currentWorkspaceId(request());

    if (! $workspaceId) {
        return null;
    }

    $tenantFilter = data_get($this->tableFilters, 'tenant_id.value');

    // Fall back to the filter state persisted in the session.
    if (! is_numeric($tenantFilter)) {
        $persistedFilters = session()->get($this->getTableFiltersSessionKey(), []);
        $tenantFilter = data_get($persistedFilters, 'tenant_id.value');
    }

    $scoped = OperationRun::query()->where('workspace_id', (int) $workspaceId);

    if (is_numeric($tenantFilter)) {
        $scoped->where('tenant_id', (int) $tenantFilter);
    }

    return $scoped;
}
|
||||
}
|
||||
|
||||
@ -183,6 +183,40 @@ public function blockedExecutionBanner(): ?array
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Banner data for the loaded run when its lifecycle needs attention.
 *
 * Returns null when no run is set, when the presenter reports no lifecycle
 * attention summary, or when the freshness state is neither `likely_stale`
 * nor `reconciled_failed`.
 *
 * @return array{tone: string, title: string, body: string}|null
 */
public function lifecycleBanner(): ?array
{
    if (! isset($this->run)) {
        return null;
    }

    if (OperationUxPresenter::lifecycleAttentionSummary($this->run) === null) {
        return null;
    }

    $detail = OperationUxPresenter::surfaceFailureDetail($this->run) ?? 'Lifecycle truth needs operator review.';
    $guidance = OperationUxPresenter::surfaceGuidance($this->run);
    $body = $guidance === null ? $detail : $detail.' '.$guidance;

    // Freshness state => [tone, title].
    $presentations = [
        'likely_stale' => ['amber', 'Likely stale run'],
        'reconciled_failed' => ['rose', 'Automatically reconciled'],
    ];

    $presentation = $presentations[$this->run->freshnessState()->value] ?? null;

    if ($presentation === null) {
        return null;
    }

    [$tone, $title] = $presentation;

    return [
        'tone' => $tone,
        'title' => $title,
        'body' => $body,
    ];
}
|
||||
|
||||
/**
|
||||
* @return array{tone: string, title: string, body: string}|null
|
||||
*/
|
||||
|
||||
@ -128,10 +128,11 @@ public static function table(Table $table): Table
|
||||
->columns([
|
||||
Tables\Columns\TextColumn::make('status')
|
||||
->badge()
|
||||
->formatStateUsing(BadgeRenderer::label(BadgeDomain::OperationRunStatus))
|
||||
->color(BadgeRenderer::color(BadgeDomain::OperationRunStatus))
|
||||
->icon(BadgeRenderer::icon(BadgeDomain::OperationRunStatus))
|
||||
->iconColor(BadgeRenderer::iconColor(BadgeDomain::OperationRunStatus)),
|
||||
->formatStateUsing(fn (mixed $state, OperationRun $record): string => BadgeRenderer::spec(BadgeDomain::OperationRunStatus, static::statusBadgeState($record))->label)
|
||||
->color(fn (mixed $state, OperationRun $record): string => BadgeRenderer::spec(BadgeDomain::OperationRunStatus, static::statusBadgeState($record))->color)
|
||||
->icon(fn (mixed $state, OperationRun $record): ?string => BadgeRenderer::spec(BadgeDomain::OperationRunStatus, static::statusBadgeState($record))->icon)
|
||||
->iconColor(fn (mixed $state, OperationRun $record): ?string => BadgeRenderer::spec(BadgeDomain::OperationRunStatus, static::statusBadgeState($record))->iconColor)
|
||||
->description(fn (OperationRun $record): ?string => OperationUxPresenter::lifecycleAttentionSummary($record)),
|
||||
Tables\Columns\TextColumn::make('type')
|
||||
->label('Operation')
|
||||
->formatStateUsing(fn (?string $state): string => OperationCatalog::label((string) $state))
|
||||
@ -154,10 +155,10 @@ public static function table(Table $table): Table
|
||||
}),
|
||||
Tables\Columns\TextColumn::make('outcome')
|
||||
->badge()
|
||||
->formatStateUsing(BadgeRenderer::label(BadgeDomain::OperationRunOutcome))
|
||||
->color(BadgeRenderer::color(BadgeDomain::OperationRunOutcome))
|
||||
->icon(BadgeRenderer::icon(BadgeDomain::OperationRunOutcome))
|
||||
->iconColor(BadgeRenderer::iconColor(BadgeDomain::OperationRunOutcome))
|
||||
->formatStateUsing(fn (mixed $state, OperationRun $record): string => BadgeRenderer::spec(BadgeDomain::OperationRunOutcome, static::outcomeBadgeState($record))->label)
|
||||
->color(fn (mixed $state, OperationRun $record): string => BadgeRenderer::spec(BadgeDomain::OperationRunOutcome, static::outcomeBadgeState($record))->color)
|
||||
->icon(fn (mixed $state, OperationRun $record): ?string => BadgeRenderer::spec(BadgeDomain::OperationRunOutcome, static::outcomeBadgeState($record))->icon)
|
||||
->iconColor(fn (mixed $state, OperationRun $record): ?string => BadgeRenderer::spec(BadgeDomain::OperationRunOutcome, static::outcomeBadgeState($record))->iconColor)
|
||||
->description(fn (OperationRun $record): ?string => OperationUxPresenter::surfaceGuidance($record)),
|
||||
])
|
||||
->filters([
|
||||
@ -253,8 +254,8 @@ private static function enterpriseDetailPage(OperationRun $record): \App\Support
|
||||
{
|
||||
$factory = new \App\Support\Ui\EnterpriseDetail\EnterpriseDetailSectionFactory;
|
||||
|
||||
$statusSpec = BadgeRenderer::spec(BadgeDomain::OperationRunStatus, $record->status);
|
||||
$outcomeSpec = BadgeRenderer::spec(BadgeDomain::OperationRunOutcome, $record->outcome);
|
||||
$statusSpec = BadgeRenderer::spec(BadgeDomain::OperationRunStatus, static::statusBadgeState($record));
|
||||
$outcomeSpec = BadgeRenderer::spec(BadgeDomain::OperationRunOutcome, static::outcomeBadgeState($record));
|
||||
$targetScope = static::targetScopeDisplay($record);
|
||||
$summaryLine = \App\Support\OpsUx\SummaryCountsNormalizer::renderSummaryLine(is_array($record->summary_counts) ? $record->summary_counts : []);
|
||||
$referencedTenantLifecycle = $record->tenant instanceof Tenant
|
||||
@ -347,6 +348,18 @@ private static function enterpriseDetailPage(OperationRun $record): \App\Support
|
||||
$referencedTenantLifecycle?->contextNote !== null
|
||||
? $factory->keyFact('Viewer context', $referencedTenantLifecycle->contextNote)
|
||||
: null,
|
||||
static::freshnessLabel($record) !== null
|
||||
? $factory->keyFact('Freshness', (string) static::freshnessLabel($record))
|
||||
: null,
|
||||
static::reconciliationHeadline($record) !== null
|
||||
? $factory->keyFact('Lifecycle truth', (string) static::reconciliationHeadline($record))
|
||||
: null,
|
||||
static::reconciledAtLabel($record) !== null
|
||||
? $factory->keyFact('Reconciled at', (string) static::reconciledAtLabel($record))
|
||||
: null,
|
||||
static::reconciliationSourceLabel($record) !== null
|
||||
? $factory->keyFact('Reconciled by', (string) static::reconciliationSourceLabel($record))
|
||||
: null,
|
||||
$artifactTruth !== null
|
||||
? $factory->keyFact('Artifact next step', $artifactTruth->nextStepText())
|
||||
: null,
|
||||
@ -416,6 +429,19 @@ private static function enterpriseDetailPage(OperationRun $record): \App\Support
|
||||
);
|
||||
}
|
||||
|
||||
if (static::reconciliationPayload($record) !== []) {
|
||||
$builder->addSection(
|
||||
$factory->viewSection(
|
||||
id: 'reconciliation',
|
||||
kind: 'operational_context',
|
||||
title: 'Lifecycle reconciliation',
|
||||
view: 'filament.infolists.entries.snapshot-json',
|
||||
viewData: ['payload' => static::reconciliationPayload($record)],
|
||||
description: 'Lifecycle reconciliation is diagnostic evidence showing when TenantPilot force-resolved the run.',
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
if ((string) $record->type === 'baseline_compare') {
|
||||
$baselineCompareFacts = static::baselineCompareFacts($record, $factory);
|
||||
$baselineCompareEvidence = static::baselineCompareEvidencePayload($record);
|
||||
@ -726,6 +752,82 @@ private static function contextPayload(OperationRun $record): array
|
||||
return $context;
|
||||
}
|
||||
|
||||
/**
 * Badge input for the status column: the run's status together with its freshness state.
 *
 * @return array{status:string,freshness_state:string}
 */
private static function statusBadgeState(OperationRun $record): array
{
    $status = (string) $record->status;
    $freshness = $record->freshnessState()->value;

    return ['status' => $status, 'freshness_state' => $freshness];
}
|
||||
|
||||
/**
 * Badge input for the outcome column: outcome, status and freshness state of the run.
 *
 * @return array{outcome:string,status:string,freshness_state:string}
 */
private static function outcomeBadgeState(OperationRun $record): array
{
    $outcome = (string) $record->outcome;
    $status = (string) $record->status;
    $freshness = $record->freshnessState()->value;

    return ['outcome' => $outcome, 'status' => $status, 'freshness_state' => $freshness];
}
|
||||
|
||||
/**
 * Human-readable label for the run's freshness state, or null for a state without a label.
 */
private static function freshnessLabel(OperationRun $record): ?string
{
    $labels = [
        'fresh_active' => 'Fresh activity',
        'likely_stale' => 'Likely stale',
        'reconciled_failed' => 'Automatically reconciled',
        'terminal_normal' => 'Terminal truth confirmed',
    ];

    return $labels[$record->freshnessState()->value] ?? null;
}
|
||||
|
||||
/**
 * Fixed headline shown for lifecycle-reconciled runs; null for every other run.
 */
private static function reconciliationHeadline(OperationRun $record): ?string
{
    return $record->isLifecycleReconciled()
        ? 'TenantPilot force-resolved this run after normal lifecycle truth was lost.'
        : null;
}
|
||||
|
||||
/**
 * Trimmed `reconciled_at` value from the run's reconciliation data, or null
 * when it is missing, not a string, or blank.
 */
private static function reconciledAtLabel(OperationRun $record): ?string
{
    $value = data_get($record->reconciliation(), 'reconciled_at');

    if (! is_string($value)) {
        return null;
    }

    $value = trim($value);

    return $value === '' ? null : $value;
}
|
||||
|
||||
/**
 * Display label for the `source` recorded in the run's reconciliation data.
 *
 * Known sources map to fixed labels; any other non-blank source is shown with
 * underscores replaced by spaces and the first letter capitalised. Returns null
 * when the source is missing, not a string, or blank.
 */
private static function reconciliationSourceLabel(OperationRun $record): ?string
{
    $raw = data_get($record->reconciliation(), 'source');
    $source = is_string($raw) ? trim($raw) : '';

    if ($source === '') {
        return null;
    }

    $knownSources = [
        'failed_callback' => 'Direct failed() bridge',
        'scheduled_reconciler' => 'Scheduled reconciler',
        'adapter_reconciler' => 'Adapter reconciler',
    ];

    return $knownSources[$source] ?? ucfirst(str_replace('_', ' ', $source));
}
|
||||
|
||||
/**
 * Reconciliation data of the run, as returned by the model, for the diagnostic JSON section.
 *
 * @return array<string, mixed>
 */
private static function reconciliationPayload(OperationRun $record): array
{
    return $record->reconciliation();
}
|
||||
|
||||
private static function formatDetailTimestamp(mixed $value): string
|
||||
{
|
||||
if (! $value instanceof \Illuminate\Support\Carbon) {
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\Operations\BackupSetRestoreWorkerJob;
|
||||
use App\Models\OperationRun;
|
||||
use App\Services\OperationRunService;
|
||||
@ -11,11 +12,18 @@
|
||||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
use RuntimeException;
|
||||
use Throwable;
|
||||
|
||||
class BulkBackupSetRestoreJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
use BridgesFailedOperationRun;
|
||||
use Dispatchable;
|
||||
use InteractsWithQueue;
|
||||
use Queueable;
|
||||
use SerializesModels;
|
||||
|
||||
public int $timeout = 300;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public int $bulkRunId = 0;
|
||||
|
||||
@ -68,32 +76,6 @@ public function handle(OperationRunService $runs): void
|
||||
}
|
||||
}
|
||||
|
||||
public function failed(Throwable $e): void
|
||||
{
|
||||
$run = $this->operationRun;
|
||||
|
||||
if (! $run instanceof OperationRun && $this->bulkRunId > 0) {
|
||||
$run = OperationRun::query()->find($this->bulkRunId);
|
||||
}
|
||||
|
||||
if (! $run instanceof OperationRun) {
|
||||
return;
|
||||
}
|
||||
|
||||
/** @var OperationRunService $runs */
|
||||
$runs = app(OperationRunService::class);
|
||||
|
||||
$runs->updateRun(
|
||||
$run,
|
||||
status: 'completed',
|
||||
outcome: 'failed',
|
||||
failures: [[
|
||||
'code' => 'bulk_job.failed',
|
||||
'message' => $e->getMessage(),
|
||||
]],
|
||||
);
|
||||
}
|
||||
|
||||
private function resolveOperationRun(): OperationRun
|
||||
{
|
||||
if ($this->operationRun instanceof OperationRun) {
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\Middleware\EnsureQueuedExecutionLegitimate;
|
||||
use App\Jobs\Operations\TenantSyncWorkerJob;
|
||||
use App\Models\OperationRun;
|
||||
@ -15,7 +16,15 @@
|
||||
|
||||
class BulkTenantSyncJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
use BridgesFailedOperationRun;
|
||||
use Dispatchable;
|
||||
use InteractsWithQueue;
|
||||
use Queueable;
|
||||
use SerializesModels;
|
||||
|
||||
public int $timeout = 180;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\Middleware\TrackOperationRun;
|
||||
use App\Models\BaselineProfile;
|
||||
use App\Models\BaselineSnapshot;
|
||||
@ -39,10 +40,16 @@
|
||||
|
||||
class CaptureBaselineSnapshotJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
use BridgesFailedOperationRun;
|
||||
use Dispatchable;
|
||||
use InteractsWithQueue;
|
||||
use Queueable;
|
||||
use SerializesModels;
|
||||
|
||||
public int $timeout = 300;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
public function __construct(
|
||||
@ -65,13 +72,13 @@ public function handle(
|
||||
AuditLogger $auditLogger,
|
||||
OperationRunService $operationRunService,
|
||||
?CurrentStateHashResolver $hashResolver = null,
|
||||
?BaselineSnapshotItemNormalizer $snapshotItemNormalizer = null,
|
||||
?BaselineContentCapturePhase $contentCapturePhase = null,
|
||||
?BaselineSnapshotItemNormalizer $snapshotItemNormalizer = null,
|
||||
?BaselineFullContentRolloutGate $rolloutGate = null,
|
||||
): void {
|
||||
$hashResolver ??= app(CurrentStateHashResolver::class);
|
||||
$snapshotItemNormalizer ??= app(BaselineSnapshotItemNormalizer::class);
|
||||
$contentCapturePhase ??= app(BaselineContentCapturePhase::class);
|
||||
$snapshotItemNormalizer ??= app(BaselineSnapshotItemNormalizer::class);
|
||||
$rolloutGate ??= app(BaselineFullContentRolloutGate::class);
|
||||
|
||||
if (! $this->operationRun instanceof OperationRun) {
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\Middleware\TrackOperationRun;
|
||||
use App\Models\BaselineProfile;
|
||||
use App\Models\BaselineSnapshot;
|
||||
@ -56,10 +57,16 @@
|
||||
|
||||
class CompareBaselineToTenantJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
use BridgesFailedOperationRun;
|
||||
use Dispatchable;
|
||||
use InteractsWithQueue;
|
||||
use Queueable;
|
||||
use SerializesModels;
|
||||
|
||||
public int $timeout = 300;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
/**
|
||||
* @var array<int, string>
|
||||
*/
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Models\OperationRun;
|
||||
use App\Models\TenantReview;
|
||||
use App\Services\OperationRunService;
|
||||
@ -17,8 +18,13 @@
|
||||
|
||||
class ComposeTenantReviewJob implements ShouldQueue
|
||||
{
|
||||
use BridgesFailedOperationRun;
|
||||
use Queueable;
|
||||
|
||||
public int $timeout = 240;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public function __construct(
|
||||
public int $tenantReviewId,
|
||||
public int $operationRunId,
|
||||
|
||||
58
app/Jobs/Concerns/BridgesFailedOperationRun.php
Normal file
58
app/Jobs/Concerns/BridgesFailedOperationRun.php
Normal file
@ -0,0 +1,58 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Jobs\Concerns;
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Services\OperationRunService;
|
||||
use Throwable;
|
||||
|
||||
/**
 * Connects a queued job's `failed()` callback to the OperationRun it belongs to.
 *
 * The using job is expected to expose the run through one of the public
 * properties probed by failedBridgeOperationRun(); when none resolves, the
 * callback is a no-op.
 */
trait BridgesFailedOperationRun
{
    /**
     * Queue callback invoked when the job has failed.
     *
     * Resolves the related run and forwards it, with the failure, to
     * OperationRunService::bridgeFailedJobFailure().
     *
     * @param  Throwable|null  $exception  The failure cause. The queue may pass
     *                                     null when a job is failed manually
     *                                     without a throwable; a placeholder
     *                                     exception is substituted so the run
     *                                     is still bridged instead of the
     *                                     callback dying on a TypeError.
     */
    public function failed(?Throwable $exception): void
    {
        $operationRun = $this->failedBridgeOperationRun();

        if (! $operationRun instanceof OperationRun) {
            return;
        }

        app(OperationRunService::class)->bridgeFailedJobFailure(
            $operationRun,
            $exception ?? new \RuntimeException('Job was marked as failed without an exception.'),
        );
    }

    /**
     * Locate the OperationRun associated with the job using this trait.
     *
     * Lookup order:
     *  1. an `operationRun` property already holding an OperationRun,
     *  2. a `run` property already holding an OperationRun,
     *  3. the first of `operationRunId`, `bulkRunId`, `runId` whose positive
     *     numeric value resolves to an existing row.
     *
     * @return OperationRun|null Null when no property yields a run.
     */
    protected function failedBridgeOperationRun(): ?OperationRun
    {
        if (property_exists($this, 'operationRun') && $this->operationRun instanceof OperationRun) {
            return $this->operationRun;
        }

        if (property_exists($this, 'run') && $this->run instanceof OperationRun) {
            return $this->run;
        }

        $candidateIds = [];

        foreach (['operationRunId', 'bulkRunId', 'runId'] as $property) {
            if (! property_exists($this, $property)) {
                continue;
            }

            $value = $this->{$property};

            if (is_numeric($value) && (int) $value > 0) {
                $candidateIds[] = (int) $value;
            }
        }

        // De-duplicate so the same id is not queried twice when several
        // properties carry the same value.
        foreach (array_values(array_unique($candidateIds)) as $candidateId) {
            $operationRun = OperationRun::query()->find($candidateId);

            if ($operationRun instanceof OperationRun) {
                return $operationRun;
            }
        }

        return null;
    }
}
|
||||
@ -20,6 +20,10 @@ class EntraGroupSyncJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
|
||||
public int $timeout = 240;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
public function __construct(
|
||||
|
||||
@ -25,6 +25,10 @@ class ExecuteRestoreRunJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
|
||||
public int $timeout = 420;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
public function __construct(
|
||||
|
||||
@ -19,6 +19,10 @@ class GenerateEvidenceSnapshotJob implements ShouldQueue
|
||||
{
|
||||
use Queueable;
|
||||
|
||||
public int $timeout = 240;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public function __construct(
|
||||
public int $snapshotId,
|
||||
public int $operationRunId,
|
||||
|
||||
@ -28,6 +28,10 @@ class GenerateReviewPackJob implements ShouldQueue
|
||||
{
|
||||
use Queueable;
|
||||
|
||||
public int $timeout = 240;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public function __construct(
|
||||
public int $reviewPackId,
|
||||
public int $operationRunId,
|
||||
|
||||
@ -40,6 +40,10 @@ class RunBackupScheduleJob implements ShouldQueue
|
||||
|
||||
public int $tries = 3;
|
||||
|
||||
public int $timeout = 300;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
/**
|
||||
* Compatibility-only legacy field.
|
||||
*
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\Middleware\EnsureQueuedExecutionLegitimate;
|
||||
use App\Jobs\Middleware\TrackOperationRun;
|
||||
use App\Models\OperationRun;
|
||||
@ -24,7 +25,15 @@
|
||||
|
||||
class RunInventorySyncJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
use BridgesFailedOperationRun;
|
||||
use Dispatchable;
|
||||
use InteractsWithQueue;
|
||||
use Queueable;
|
||||
use SerializesModels;
|
||||
|
||||
public int $timeout = 240;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\Middleware\EnsureQueuedExecutionLegitimate;
|
||||
use App\Jobs\Middleware\TrackOperationRun;
|
||||
use App\Models\OperationRun;
|
||||
@ -21,7 +22,15 @@
|
||||
|
||||
class SyncPoliciesJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
use BridgesFailedOperationRun;
|
||||
use Dispatchable;
|
||||
use InteractsWithQueue;
|
||||
use Queueable;
|
||||
use SerializesModels;
|
||||
|
||||
public int $timeout = 180;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
|
||||
@ -20,6 +20,10 @@ class SyncRoleDefinitionsJob implements ShouldQueue
|
||||
{
|
||||
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
|
||||
|
||||
public int $timeout = 240;
|
||||
|
||||
public bool $failOnTimeout = true;
|
||||
|
||||
public ?OperationRun $operationRun = null;
|
||||
|
||||
/**
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
namespace App\Models;
|
||||
|
||||
use App\Support\OperationCatalog;
|
||||
use App\Support\Operations\OperationRunFreshnessState;
|
||||
use Illuminate\Database\Eloquent\Builder;
|
||||
use Illuminate\Database\Eloquent\Factories\HasFactory;
|
||||
use Illuminate\Database\Eloquent\Model;
|
||||
@ -159,4 +160,32 @@ public function relatedArtifactId(): ?int
|
||||
default => null,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Lifecycle reconciliation metadata stored under the `reconciliation` key
 * of the run context.
 *
 * @return array<string, mixed> Empty when the context is not an array or the
 *                              key does not hold an array.
 */
public function reconciliation(): array
{
    $runContext = $this->context;

    if (! is_array($runContext)) {
        return [];
    }

    $metadata = $runContext['reconciliation'] ?? null;

    if (! is_array($metadata)) {
        return [];
    }

    return $metadata;
}
|
||||
|
||||
/**
 * Whether the run context carries any reconciliation metadata.
 */
public function isLifecycleReconciled(): bool
{
    $metadata = $this->reconciliation();

    return count($metadata) > 0;
}
|
||||
|
||||
/**
 * Trimmed `reason_code` from the reconciliation metadata.
 *
 * @return string|null Null when the value is missing, not a string, or blank.
 */
public function lifecycleReconciliationReasonCode(): ?string
{
    $candidate = $this->reconciliation()['reason_code'] ?? null;

    if (! is_string($candidate)) {
        return null;
    }

    $normalized = trim($candidate);

    return $normalized === '' ? null : $normalized;
}
|
||||
|
||||
/**
 * Freshness state of this run as computed by OperationRunFreshnessState::forRun().
 */
public function freshnessState(): OperationRunFreshnessState
{
    $state = OperationRunFreshnessState::forRun($this);

    return $state;
}
|
||||
}
|
||||
|
||||
@ -61,7 +61,7 @@ public function view(User $user, OperationRun $run): Response|bool
|
||||
}
|
||||
|
||||
$requiredCapability = app(OperationRunCapabilityResolver::class)
|
||||
->requiredCapabilityForType((string) $run->type);
|
||||
->requiredCapabilityForRun($run);
|
||||
|
||||
if (! is_string($requiredCapability) || $requiredCapability === '') {
|
||||
return true;
|
||||
|
||||
@ -28,6 +28,7 @@
|
||||
use App\Services\Auth\WorkspaceCapabilityResolver;
|
||||
use App\Services\Auth\WorkspaceRoleCapabilityMap;
|
||||
use App\Support\Auth\Capabilities;
|
||||
use App\Support\Filament\PanelThemeAsset;
|
||||
use App\Support\Workspaces\WorkspaceContext;
|
||||
use Filament\Http\Middleware\Authenticate;
|
||||
use Filament\Http\Middleware\AuthenticateSession;
|
||||
@ -202,7 +203,7 @@ public function panel(Panel $panel): Panel
|
||||
]);
|
||||
|
||||
if (! app()->runningUnitTests()) {
|
||||
$panel->viteTheme('resources/css/filament/admin/theme.css');
|
||||
$panel->theme(PanelThemeAsset::resolve('resources/css/filament/admin/theme.css'));
|
||||
}
|
||||
|
||||
return $panel;
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
use App\Filament\System\Pages\Dashboard;
|
||||
use App\Http\Middleware\UseSystemSessionCookie;
|
||||
use App\Support\Auth\PlatformCapabilities;
|
||||
use App\Support\Filament\PanelThemeAsset;
|
||||
use Filament\Http\Middleware\Authenticate;
|
||||
use Filament\Http\Middleware\AuthenticateSession;
|
||||
use Filament\Http\Middleware\DisableBladeIconComponents;
|
||||
@ -60,6 +61,6 @@ public function panel(Panel $panel): Panel
|
||||
Authenticate::class,
|
||||
'ensure-platform-capability:'.PlatformCapabilities::ACCESS_SYSTEM_PANEL,
|
||||
])
|
||||
->viteTheme('resources/css/filament/system/theme.css');
|
||||
->theme(PanelThemeAsset::resolve('resources/css/filament/system/theme.css'));
|
||||
}
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
use App\Filament\Pages\TenantDashboard;
|
||||
use App\Filament\Resources\TenantReviewResource;
|
||||
use App\Models\Tenant;
|
||||
use App\Support\Filament\PanelThemeAsset;
|
||||
use App\Support\Middleware\DenyNonMemberTenantAccess;
|
||||
use Filament\Facades\Filament;
|
||||
use Filament\Http\Middleware\Authenticate;
|
||||
@ -112,7 +113,7 @@ public function panel(Panel $panel): Panel
|
||||
]);
|
||||
|
||||
if (! app()->runningUnitTests()) {
|
||||
$panel->viteTheme('resources/css/filament/admin/theme.css');
|
||||
$panel->theme(PanelThemeAsset::resolve('resources/css/filament/admin/theme.css'));
|
||||
}
|
||||
|
||||
return $panel;
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
use App\Models\RestoreRun;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Operations\LifecycleReconciliationReason;
|
||||
use App\Support\RestoreRunStatus;
|
||||
use Carbon\CarbonImmutable;
|
||||
use Illuminate\Database\Eloquent\Builder;
|
||||
@ -151,25 +152,23 @@ private function reconcileOne(OperationRun $run, bool $dryRun): ?array
|
||||
/** @var OperationRunService $runs */
|
||||
$runs = app(OperationRunService::class);
|
||||
|
||||
$runs->updateRun(
|
||||
$run,
|
||||
$runs->updateRunWithReconciliation(
|
||||
run: $run,
|
||||
status: $opStatus,
|
||||
outcome: $opOutcome,
|
||||
summaryCounts: $summaryCounts,
|
||||
failures: $failures,
|
||||
reasonCode: LifecycleReconciliationReason::AdapterOutOfSync->value,
|
||||
reasonMessage: LifecycleReconciliationReason::AdapterOutOfSync->defaultMessage(),
|
||||
source: 'adapter_reconciler',
|
||||
evidence: [
|
||||
'restore_run_id' => (int) $restoreRun->getKey(),
|
||||
'restore_status' => $restoreStatus?->value,
|
||||
],
|
||||
);
|
||||
|
||||
$run->refresh();
|
||||
|
||||
$updatedContext = is_array($run->context) ? $run->context : [];
|
||||
$reconciliation = is_array($updatedContext['reconciliation'] ?? null) ? $updatedContext['reconciliation'] : [];
|
||||
$reconciliation['reconciled_at'] = CarbonImmutable::now()->toIso8601String();
|
||||
$reconciliation['reason'] = 'adapter_out_of_sync';
|
||||
|
||||
$updatedContext['reconciliation'] = $reconciliation;
|
||||
|
||||
$run->context = $updatedContext;
|
||||
|
||||
if ($run->started_at === null && $restoreRun->started_at !== null) {
|
||||
$run->started_at = $restoreRun->started_at;
|
||||
}
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Operations\ExecutionAuthorityMode;
|
||||
use App\Support\Operations\ExecutionDenialReasonCode;
|
||||
use App\Support\Operations\LifecycleReconciliationReason;
|
||||
use App\Support\Operations\OperationRunCapabilityResolver;
|
||||
use App\Support\Operations\QueuedExecutionLegitimacyDecision;
|
||||
use App\Support\OpsUx\BulkRunContext;
|
||||
@ -62,15 +63,45 @@ public function isStaleQueuedRun(OperationRun $run, int $thresholdMinutes = 5):
|
||||
|
||||
public function failStaleQueuedRun(OperationRun $run, string $message = 'Run was queued but never started.'): OperationRun
|
||||
{
|
||||
return $this->updateRun(
|
||||
return $this->forceFailNonTerminalRun(
|
||||
$run,
|
||||
status: OperationRunStatus::Completed->value,
|
||||
outcome: OperationRunOutcome::Failed->value,
|
||||
failures: [
|
||||
[
|
||||
'code' => 'run.stale_queued',
|
||||
'message' => $message,
|
||||
],
|
||||
reasonCode: LifecycleReconciliationReason::StaleQueued->value,
|
||||
message: $message,
|
||||
source: 'scheduled_reconciler',
|
||||
evidence: [
|
||||
'status' => OperationRunStatus::Queued->value,
|
||||
'created_at' => $run->created_at?->toIso8601String(),
|
||||
],
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Whether a run in `running` status started at least $thresholdMinutes ago.
 *
 * The reference time is `started_at`, falling back to `created_at`. The
 * threshold is floored at one minute. Runs in any other status, or without
 * a reference time, are never stale.
 */
public function isStaleRunningRun(OperationRun $run, int $thresholdMinutes = 15): bool
{
    $isRunning = $run->status === OperationRunStatus::Running->value;
    $reference = $isRunning ? ($run->started_at ?? $run->created_at) : null;

    if ($reference === null) {
        return false;
    }

    $cutoff = now()->subMinutes(max(1, $thresholdMinutes));

    return $reference->lte($cutoff);
}
|
||||
|
||||
/**
 * Force-fail a run with the StaleRunning reconciliation reason.
 *
 * Evidence records the `running` status and the start reference
 * (`started_at`, else `created_at`) as an ISO-8601 string.
 */
public function failStaleRunningRun(
    OperationRun $run,
    string $message = 'Run stopped reporting progress and was marked failed.',
): OperationRun {
    $startReference = $run->started_at ?? $run->created_at;

    $evidence = [
        'status' => OperationRunStatus::Running->value,
        'started_at' => $startReference?->toIso8601String(),
    ];

    return $this->forceFailNonTerminalRun(
        run: $run,
        reasonCode: LifecycleReconciliationReason::StaleRunning->value,
        message: $message,
        source: 'scheduled_reconciler',
        evidence: $evidence,
    );
}
|
||||
@ -721,6 +752,136 @@ public function failRun(OperationRun $run, Throwable $e): OperationRun
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Move a run to completed/failed through updateRunWithReconciliation(),
 * recording a single failure entry built from the reason code and message.
 *
 * @param  array<string, mixed>  $evidence       Stored with the reconciliation metadata.
 * @param  array<string, mixed>  $summaryCounts  Passed through unchanged.
 */
public function forceFailNonTerminalRun(
    OperationRun $run,
    string $reasonCode,
    string $message,
    string $source = 'scheduled_reconciler',
    array $evidence = [],
    array $summaryCounts = [],
): OperationRun {
    // The reason code is written to both `code` and `reason_code`.
    $failureEntry = [
        'code' => $reasonCode,
        'reason_code' => $reasonCode,
        'message' => $message,
    ];

    return $this->updateRunWithReconciliation(
        run: $run,
        status: OperationRunStatus::Completed->value,
        outcome: OperationRunOutcome::Failed->value,
        summaryCounts: $summaryCounts,
        failures: [$failureEntry],
        reasonCode: $reasonCode,
        reasonMessage: $message,
        source: $source,
        evidence: $evidence,
    );
}
|
||||
|
||||
/**
 * Force-fail a run on behalf of a job's failure callback.
 *
 * The reason is derived from the exception via bridgeReasonForThrowable().
 * The sanitized exception message is used when non-empty, otherwise the
 * reason's default message. Evidence records the exception class and source.
 */
public function bridgeFailedJobFailure(
    OperationRun $run,
    Throwable $exception,
    string $source = 'failed_callback',
): OperationRun {
    $reason = $this->bridgeReasonForThrowable($exception);
    $fallbackMessage = $reason->defaultMessage();
    $sanitized = $this->sanitizeMessage($exception->getMessage());

    $message = $sanitized !== '' ? $sanitized : $fallbackMessage;

    $evidence = [
        'exception_class' => $exception::class,
        'bridge_source' => $source,
    ];

    return $this->forceFailNonTerminalRun(
        run: $run,
        reasonCode: $reason->value,
        message: $message,
        source: $source,
        evidence: $evidence,
    );
}
|
||||
|
||||
/**
 * Apply a status/outcome change to a run and record why it was reconciled.
 *
 * Inside one transaction the row is re-read with a row lock. If it is
 * already completed it is returned unchanged. Otherwise `reason_code` and a
 * `reconciliation` block are written into the context, the context is
 * passed through withReasonTranslationContext(), persisted, and the
 * status/outcome change is delegated to updateRun().
 *
 * If the row no longer exists, the caller's instance is returned as-is.
 * It is deliberately not refreshed, because refreshing a model whose row
 * is gone throws rather than returning.
 *
 * @param  array<string, mixed>  $summaryCounts
 * @param  array<int, array{code?: mixed, reason_code?: mixed, message?: mixed}>  $failures
 * @param  array<string, mixed>  $evidence
 */
public function updateRunWithReconciliation(
    OperationRun $run,
    string $status,
    string $outcome,
    array $summaryCounts,
    array $failures,
    string $reasonCode,
    string $reasonMessage,
    string $source = 'scheduled_reconciler',
    array $evidence = [],
): OperationRun {
    /** @var OperationRun|null $updated */
    $updated = DB::transaction(function () use (
        $run,
        $status,
        $outcome,
        $summaryCounts,
        $failures,
        $reasonCode,
        $reasonMessage,
        $source,
        $evidence,
    ): ?OperationRun {
        $locked = OperationRun::query()
            ->whereKey($run->getKey())
            ->lockForUpdate()
            ->first();

        if (! $locked instanceof OperationRun) {
            // Row vanished; signal the caller path below.
            return null;
        }

        // A completed run is terminal: never overwrite it.
        if ((string) $locked->status === OperationRunStatus::Completed->value) {
            return $locked;
        }

        $context = is_array($locked->context) ? $locked->context : [];
        $context['reason_code'] = RunFailureSanitizer::normalizeReasonCode($reasonCode);
        $context['reconciliation'] = $this->reconciliationMetadata(
            reasonCode: $reasonCode,
            reasonMessage: $reasonMessage,
            source: $source,
            evidence: $evidence,
        );

        $translatedContext = $this->withReasonTranslationContext(
            run: $locked,
            context: $context,
            failures: $failures,
        );

        $locked->update([
            'context' => $translatedContext ?? $context,
        ]);

        $locked->refresh();

        return $this->updateRun(
            $locked,
            status: $status,
            outcome: $outcome,
            summaryCounts: $summaryCounts,
            failures: $failures,
        );
    });

    if (! $updated instanceof OperationRun) {
        return $run;
    }

    $updated->refresh();

    return $updated;
}
|
||||
|
||||
/**
|
||||
* Finalize a run as blocked with deterministic reason_code + link-only next steps.
|
||||
*
|
||||
@ -1033,16 +1194,49 @@ private function isDirectlyTranslatableReason(string $reasonCode): bool
|
||||
|
||||
return ProviderReasonCodes::isKnown($reasonCode)
|
||||
|| ExecutionDenialReasonCode::tryFrom($reasonCode) instanceof ExecutionDenialReasonCode
|
||||
|| LifecycleReconciliationReason::tryFrom($reasonCode) instanceof LifecycleReconciliationReason
|
||||
|| TenantOperabilityReasonCode::tryFrom($reasonCode) instanceof TenantOperabilityReasonCode
|
||||
|| RbacReason::tryFrom($reasonCode) instanceof RbacReason;
|
||||
}
|
||||
|
||||
/**
 * Build the `reconciliation` block stored in a run's context.
 *
 * The normalized reason code is written under both `reason` and
 * `reason_code`; message and source are sanitized; evidence is stored as given.
 *
 * @param  array<string, mixed>  $evidence
 * @return array<string, mixed>
 */
private function reconciliationMetadata(
    string $reasonCode,
    string $reasonMessage,
    string $source,
    array $evidence,
): array {
    $reconciledAt = now()->toIso8601String();
    $normalizedReason = RunFailureSanitizer::normalizeReasonCode($reasonCode);
    $cleanMessage = $this->sanitizeMessage($reasonMessage);
    $cleanSource = $this->sanitizeFailureCode($source);

    return [
        'reconciled_at' => $reconciledAt,
        'reason' => $normalizedReason,
        'reason_code' => $normalizedReason,
        'reason_message' => $cleanMessage,
        'source' => $cleanSource,
        'evidence' => $evidence,
    ];
}
|
||||
|
||||
/**
 * Pick the reconciliation reason for a job failure from the exception's
 * short class name: names containing "timeout" or "attempts"
 * (case-insensitive) map to InfrastructureTimeoutOrAbandonment, everything
 * else to QueueFailureBridge.
 */
private function bridgeReasonForThrowable(Throwable $exception): LifecycleReconciliationReason
{
    $shortName = strtolower(class_basename($exception));

    foreach (['timeout', 'attempts'] as $marker) {
        if (str_contains($shortName, $marker)) {
            return LifecycleReconciliationReason::InfrastructureTimeoutOrAbandonment;
        }
    }

    return LifecycleReconciliationReason::QueueFailureBridge;
}
|
||||
|
||||
private function writeTerminalAudit(OperationRun $run): void
|
||||
{
|
||||
$tenant = $run->tenant;
|
||||
$workspace = $run->workspace;
|
||||
$context = is_array($run->context) ? $run->context : [];
|
||||
$executionLegitimacy = is_array($context['execution_legitimacy'] ?? null) ? $context['execution_legitimacy'] : [];
|
||||
$reconciliation = is_array($context['reconciliation'] ?? null) ? $context['reconciliation'] : [];
|
||||
$operationLabel = OperationCatalog::label((string) $run->type);
|
||||
|
||||
$action = match ($run->outcome) {
|
||||
@ -1072,6 +1266,7 @@ private function writeTerminalAudit(OperationRun $run): void
|
||||
'authority_mode' => $executionLegitimacy['authority_mode'] ?? ($context['execution_authority_mode'] ?? null),
|
||||
'acting_identity_type' => $executionLegitimacy['initiator']['identity_type'] ?? ($run->user instanceof User ? 'user' : 'system'),
|
||||
'blocked_by' => $context['blocked_by'] ?? null,
|
||||
'reconciliation' => $reconciliation !== [] ? $reconciliation : null,
|
||||
],
|
||||
],
|
||||
workspace: $workspace,
|
||||
|
||||
139
app/Services/Operations/OperationLifecyclePolicyValidator.php
Normal file
139
app/Services/Operations/OperationLifecyclePolicyValidator.php
Normal file
@ -0,0 +1,139 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services\Operations;
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Support\Operations\OperationLifecyclePolicy;
|
||||
use RuntimeException;
|
||||
|
||||
/**
 * Checks that every operation type covered by the lifecycle policy is backed
 * by a job class whose queue settings match what the policy requires.
 */
final class OperationLifecyclePolicyValidator
{
    public function __construct(
        private readonly OperationLifecyclePolicy $policy,
    ) {}

    /**
     * Validate all covered operation types.
     *
     * @return array{
     *   valid:bool,
     *   errors:array<int, string>,
     *   definitions:array<string, array<string, mixed>>
     * }
     */
    public function validate(): array
    {
        $errors = [];
        $definitions = [];

        foreach ($this->policy->coveredTypeNames() as $operationType) {
            $definition = $this->policy->definition($operationType);

            if ($definition === null) {
                $errors[] = sprintf('Missing lifecycle policy definition for [%s].', $operationType);

                continue;
            }

            $definitions[$operationType] = $definition;

            foreach ($this->violationsFor($operationType) as $violation) {
                $errors[] = $violation;
            }
        }

        return [
            'valid' => $errors === [],
            'errors' => $errors,
            'definitions' => $definitions,
        ];
    }

    /**
     * Throw when validate() reports any error; messages are joined with spaces.
     *
     * @throws RuntimeException
     */
    public function assertValid(): void
    {
        $result = $this->validate();
        $isValid = ($result['valid'] ?? false) === true;

        if (! $isValid) {
            throw new RuntimeException(implode(' ', $result['errors'] ?? ['Lifecycle policy validation failed.']));
        }
    }

    /**
     * Default value of the job class's `timeout` property as an int, or null
     * when the job class is missing or the value is not numeric.
     */
    public function jobTimeoutSeconds(string $operationType): ?int
    {
        $jobClass = $this->resolvedJobClass($operationType);

        if ($jobClass === null) {
            return null;
        }

        $timeout = get_class_vars($jobClass)['timeout'] ?? null;

        if (! is_numeric($timeout)) {
            return null;
        }

        return (int) $timeout;
    }

    /**
     * Default value of the job class's `failOnTimeout` property cast to bool;
     * false when the job class is missing or the property is absent.
     */
    public function jobFailsOnTimeout(string $operationType): bool
    {
        $jobClass = $this->resolvedJobClass($operationType);

        if ($jobClass === null) {
            return false;
        }

        return (bool) (get_class_vars($jobClass)['failOnTimeout'] ?? false);
    }

    /**
     * Whether the job class (or a parent/trait of it) uses BridgesFailedOperationRun.
     */
    public function jobUsesDirectFailedBridge(string $operationType): bool
    {
        $jobClass = $this->resolvedJobClass($operationType);

        if ($jobClass === null) {
            return false;
        }

        $traits = class_uses_recursive($jobClass);

        return in_array(BridgesFailedOperationRun::class, $traits, true);
    }

    /**
     * Collect the rule violations for one operation type that has a definition.
     *
     * @return array<int, string>
     */
    private function violationsFor(string $operationType): array
    {
        if ($this->resolvedJobClass($operationType) === null) {
            // Nothing else can be checked without a job class.
            return [sprintf('Lifecycle policy [%s] points to a missing job class.', $operationType)];
        }

        $violations = [];
        $timeout = $this->jobTimeoutSeconds($operationType);

        if (! is_int($timeout) || $timeout <= 0) {
            $violations[] = sprintf('Lifecycle policy [%s] requires an explicit positive job timeout.', $operationType);
        }

        if (! $this->jobFailsOnTimeout($operationType)) {
            $violations[] = sprintf('Lifecycle policy [%s] requires failOnTimeout=true.', $operationType);
        }

        if ($this->policy->requiresDirectFailedBridge($operationType) && ! $this->jobUsesDirectFailedBridge($operationType)) {
            $violations[] = sprintf('Lifecycle policy [%s] requires a direct failed-job bridge.', $operationType);
        }

        $retryAfter = $this->policy->queueRetryAfterSeconds($this->policy->queueConnection($operationType));
        $safetyMargin = $this->policy->retryAfterSafetyMarginSeconds();

        if (is_int($timeout) && is_int($retryAfter) && $timeout >= ($retryAfter - $safetyMargin)) {
            $violations[] = sprintf(
                'Lifecycle policy [%s] has timeout %d which is not safely below retry_after %d (margin %d).',
                $operationType,
                $timeout,
                $retryAfter,
                $safetyMargin,
            );
        }

        $expectedMaxRuntime = $this->policy->expectedMaxRuntimeSeconds($operationType);

        if (is_int($expectedMaxRuntime) && is_int($retryAfter) && $expectedMaxRuntime >= ($retryAfter - $safetyMargin)) {
            $violations[] = sprintf(
                'Lifecycle policy [%s] expected runtime %d is not safely below retry_after %d (margin %d).',
                $operationType,
                $expectedMaxRuntime,
                $retryAfter,
                $safetyMargin,
            );
        }

        return $violations;
    }

    /**
     * Job class configured for the operation type, or null when none is
     * configured or the class does not exist.
     */
    private function resolvedJobClass(string $operationType): ?string
    {
        $jobClass = $this->policy->jobClass($operationType);

        if ($jobClass === null || ! class_exists($jobClass)) {
            return null;
        }

        return $jobClass;
    }
}
|
||||
200
app/Services/Operations/OperationLifecycleReconciler.php
Normal file
200
app/Services/Operations/OperationLifecycleReconciler.php
Normal file
@ -0,0 +1,200 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services\Operations;
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Services\OperationRunService;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Operations\LifecycleReconciliationReason;
|
||||
use App\Support\Operations\OperationLifecyclePolicy;
|
||||
use App\Support\Operations\OperationRunFreshnessState;
|
||||
use Illuminate\Database\Eloquent\Builder;
|
||||
|
||||
/**
 * Finds queued/running operation runs that look stale under the lifecycle
 * policy and force-fails them (or reports what would change in a dry run).
 */
final class OperationLifecycleReconciler
{
    public function __construct(
        private readonly OperationLifecyclePolicy $policy,
        private readonly OperationRunService $operationRunService,
        private readonly QueuedExecutionLegitimacyGate $queuedExecutionLegitimacyGate,
    ) {}

    /**
     * Reconcile a batch of non-terminal runs.
     *
     * Candidates are queued/running runs of the requested types (default:
     * all covered types), optionally restricted to tenants/workspaces,
     * ordered by id and capped at `limit` (clamped to 1..500).
     *
     * Tenant and workspace ids may be ints or digit-only strings. Accepting
     * strings matters because an id that is filtered out would otherwise
     * remove the scope restriction entirely rather than narrow it.
     *
     * @param array{
     *   types?: array<int, string>,
     *   tenant_ids?: array<int, int|string>,
     *   workspace_ids?: array<int, int|string>,
     *   limit?: int,
     *   dry_run?: bool
     * } $options
     * @return array{candidates:int,reconciled:int,skipped:int,changes:array<int, array<string, mixed>>}
     */
    public function reconcile(array $options = []): array
    {
        $types = array_values(array_filter(
            $options['types'] ?? $this->policy->coveredTypeNames(),
            static fn (mixed $type): bool => is_string($type) && trim($type) !== '',
        ));
        $tenantIds = $this->positiveIds($options['tenant_ids'] ?? []);
        $workspaceIds = $this->positiveIds($options['workspace_ids'] ?? []);
        $limit = min(max(1, (int) ($options['limit'] ?? $this->policy->reconciliationBatchLimit())), 500);
        $dryRun = (bool) ($options['dry_run'] ?? false);

        $runs = OperationRun::query()
            ->with(['tenant', 'user'])
            ->whereIn('type', $types)
            ->whereIn('status', [
                OperationRunStatus::Queued->value,
                OperationRunStatus::Running->value,
            ])
            ->when(
                $tenantIds !== [],
                fn (Builder $query): Builder => $query->whereIn('tenant_id', $tenantIds),
            )
            ->when(
                $workspaceIds !== [],
                fn (Builder $query): Builder => $query->whereIn('workspace_id', $workspaceIds),
            )
            ->orderBy('id')
            ->limit($limit)
            ->get();

        $changes = [];
        $reconciled = 0;
        $skipped = 0;

        foreach ($runs as $run) {
            $change = $this->reconcileRun($run, $dryRun);

            if ($change === null) {
                $skipped++;

                continue;
            }

            $changes[] = $change;

            // Dry-run changes are reported but not counted as reconciled.
            if (($change['applied'] ?? false) === true) {
                $reconciled++;
            }
        }

        return [
            'candidates' => $runs->count(),
            'reconciled' => $reconciled,
            'skipped' => $skipped,
            'changes' => $changes,
        ];
    }

    /**
     * Reconcile one run if assessment() says it should be.
     *
     * @return array<string, mixed>|null Change description (with `applied`
     *                                   false for dry runs), or null when the
     *                                   run needs no reconciliation.
     */
    public function reconcileRun(OperationRun $run, bool $dryRun = false): ?array
    {
        $assessment = $this->assessment($run);

        if ($assessment === null || ($assessment['should_reconcile'] ?? false) !== true) {
            return null;
        }

        $before = [
            'status' => (string) $run->status,
            'outcome' => (string) $run->outcome,
            'freshness_state' => OperationRunFreshnessState::forRun($run, $this->policy)->value,
        ];
        $after = [
            'status' => OperationRunStatus::Completed->value,
            'outcome' => OperationRunOutcome::Failed->value,
            'freshness_state' => OperationRunFreshnessState::ReconciledFailed->value,
        ];

        if ($dryRun) {
            return [
                'applied' => false,
                'operation_run_id' => (int) $run->getKey(),
                'type' => (string) $run->type,
                'before' => $before,
                'after' => $after,
                'reason_code' => $assessment['reason_code'],
                'reason_message' => $assessment['reason_message'],
                'evidence' => $assessment['evidence'],
            ];
        }

        $updated = $this->operationRunService->forceFailNonTerminalRun(
            run: $run,
            reasonCode: (string) $assessment['reason_code'],
            message: (string) $assessment['reason_message'],
            source: 'scheduled_reconciler',
            evidence: is_array($assessment['evidence'] ?? null) ? $assessment['evidence'] : [],
        );

        return [
            'applied' => true,
            'operation_run_id' => (int) $updated->getKey(),
            'type' => (string) $updated->type,
            'before' => $before,
            'after' => $after,
            'reason_code' => $assessment['reason_code'],
            'reason_message' => $assessment['reason_message'],
            'evidence' => $assessment['evidence'],
        ];
    }

    /**
     * Decide whether a run should be reconciled and gather the evidence.
     *
     * Returns null for completed runs, for types the policy does not support
     * (or not for scheduled reconciliation), and for runs whose freshness
     * state is not likely-stale.
     *
     * @return array{should_reconcile:bool,reason_code:string,reason_message:string,evidence:array<string, mixed>}|null
     */
    public function assessment(OperationRun $run): ?array
    {
        if ((string) $run->status === OperationRunStatus::Completed->value) {
            return null;
        }

        if (! $this->policy->supports((string) $run->type) || ! $this->policy->supportsScheduledReconciliation((string) $run->type)) {
            return null;
        }

        $freshnessState = OperationRunFreshnessState::forRun($run, $this->policy);

        if (! $freshnessState->isLikelyStale()) {
            return null;
        }

        // Queued runs are measured from creation; any other non-terminal
        // status is treated as running and measured from start.
        $reason = (string) $run->status === OperationRunStatus::Queued->value
            ? LifecycleReconciliationReason::StaleQueued
            : LifecycleReconciliationReason::StaleRunning;
        $referenceTime = (string) $run->status === OperationRunStatus::Queued->value
            ? $run->created_at
            : ($run->started_at ?? $run->created_at);
        $thresholdSeconds = (string) $run->status === OperationRunStatus::Queued->value
            ? $this->policy->queuedStaleAfterSeconds((string) $run->type)
            : $this->policy->runningStaleAfterSeconds((string) $run->type);
        $legitimacy = $this->queuedExecutionLegitimacyGate->evaluate($run)->toArray();

        return [
            'should_reconcile' => true,
            'reason_code' => $reason->value,
            'reason_message' => $reason->defaultMessage(),
            'evidence' => [
                'evaluated_at' => now()->toIso8601String(),
                'freshness_state' => $freshnessState->value,
                'threshold_seconds' => $thresholdSeconds,
                'reference_time' => $referenceTime?->toIso8601String(),
                'status' => (string) $run->status,
                'execution_legitimacy' => $legitimacy,
                'terminal_truth_path' => $this->policy->requiresDirectFailedBridge((string) $run->type)
                    ? 'direct_and_scheduled'
                    : 'scheduled_only',
            ],
        ];
    }

    /**
     * Keep only positive ids, converting digit-only strings to ints.
     *
     * @param  array<int, mixed>  $ids
     * @return array<int, int>
     */
    private function positiveIds(array $ids): array
    {
        $normalized = [];

        foreach ($ids as $id) {
            if (is_string($id) && ctype_digit($id)) {
                $id = (int) $id;
            }

            if (is_int($id) && $id > 0) {
                $normalized[] = $id;
            }
        }

        return $normalized;
    }
}
|
||||
@ -8,12 +8,46 @@
|
||||
use App\Support\Badges\BadgeSpec;
|
||||
use App\Support\Badges\OperatorOutcomeTaxonomy;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Operations\OperationRunFreshnessState;
|
||||
|
||||
final class OperationRunOutcomeBadge implements BadgeMapper
|
||||
{
|
||||
public function spec(mixed $value): BadgeSpec
|
||||
{
|
||||
$state = BadgeCatalog::normalizeState($value);
|
||||
$state = null;
|
||||
|
||||
if (is_array($value)) {
|
||||
$outcome = BadgeCatalog::normalizeState($value['outcome'] ?? null);
|
||||
$status = BadgeCatalog::normalizeState($value['status'] ?? null);
|
||||
$freshnessState = BadgeCatalog::normalizeState($value['freshness_state'] ?? null);
|
||||
|
||||
if ($outcome === null) {
|
||||
if ($freshnessState === OperationRunFreshnessState::ReconciledFailed->value) {
|
||||
$outcome = OperationRunOutcome::Failed->value;
|
||||
} elseif (
|
||||
$freshnessState === OperationRunFreshnessState::LikelyStale->value
|
||||
|| in_array($status, [OperationRunStatus::Queued->value, OperationRunStatus::Running->value], true)
|
||||
) {
|
||||
$outcome = OperationRunOutcome::Pending->value;
|
||||
}
|
||||
}
|
||||
|
||||
if ($outcome === OperationRunOutcome::Failed->value
|
||||
&& $freshnessState === OperationRunFreshnessState::ReconciledFailed->value
|
||||
) {
|
||||
return new BadgeSpec(
|
||||
label: 'Reconciled failed',
|
||||
color: 'danger',
|
||||
icon: 'heroicon-m-arrow-path-rounded-square',
|
||||
iconColor: 'danger',
|
||||
);
|
||||
}
|
||||
|
||||
$state = $outcome;
|
||||
}
|
||||
|
||||
$state ??= BadgeCatalog::normalizeState($value);
|
||||
|
||||
return match ($state) {
|
||||
OperationRunOutcome::Pending->value => OperatorOutcomeTaxonomy::spec(BadgeDomain::OperationRunOutcome, $state, 'heroicon-m-clock'),
|
||||
|
||||
@ -8,12 +8,33 @@
|
||||
use App\Support\Badges\BadgeSpec;
|
||||
use App\Support\Badges\OperatorOutcomeTaxonomy;
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Operations\OperationRunFreshnessState;
|
||||
|
||||
final class OperationRunStatusBadge implements BadgeMapper
|
||||
{
|
||||
public function spec(mixed $value): BadgeSpec
|
||||
{
|
||||
$state = BadgeCatalog::normalizeState($value);
|
||||
$state = null;
|
||||
|
||||
if (is_array($value)) {
|
||||
$status = BadgeCatalog::normalizeState($value['status'] ?? null);
|
||||
$freshnessState = BadgeCatalog::normalizeState($value['freshness_state'] ?? null);
|
||||
|
||||
if (in_array($status, [OperationRunStatus::Queued->value, OperationRunStatus::Running->value], true)
|
||||
&& $freshnessState === OperationRunFreshnessState::LikelyStale->value
|
||||
) {
|
||||
return new BadgeSpec(
|
||||
label: 'Likely stale',
|
||||
color: 'warning',
|
||||
icon: 'heroicon-m-exclamation-triangle',
|
||||
iconColor: 'warning',
|
||||
);
|
||||
}
|
||||
|
||||
$state = $status;
|
||||
}
|
||||
|
||||
$state ??= BadgeCatalog::normalizeState($value);
|
||||
|
||||
return match ($state) {
|
||||
OperationRunStatus::Queued->value => OperatorOutcomeTaxonomy::spec(BadgeDomain::OperationRunStatus, $state, 'heroicon-m-clock'),
|
||||
|
||||
27
app/Support/Filament/PanelThemeAsset.php
Normal file
27
app/Support/Filament/PanelThemeAsset.php
Normal file
@ -0,0 +1,27 @@
|
||||
<?php
|
||||
|
||||
namespace App\Support\Filament;
|
||||
|
||||
use Illuminate\Support\Facades\Vite;
|
||||
|
||||
class PanelThemeAsset
{
    /**
     * Resolve the public URL of a built theme entry.
     *
     * Looks the entry up in `public/build/manifest.json` and, when the manifest lists a
     * non-empty `file` for it, returns `asset('build/<file>')`. In every other case
     * (no manifest, undecodable manifest, unknown entry) resolution is delegated to
     * `Vite::asset()`.
     */
    public static function resolve(string $entry): string
    {
        $manifestPath = public_path('build/manifest.json');

        if (is_file($manifestPath)) {
            /** @var array<string, array{file?: string}>|null $manifest */
            $manifest = json_decode((string) file_get_contents($manifestPath), true);
            $builtFile = $manifest[$entry]['file'] ?? null;

            if (is_string($builtFile) && $builtFile !== '') {
                return asset('build/'.$builtFile);
            }
        }

        return Vite::asset($entry);
    }
}
|
||||
83
app/Support/Operations/LifecycleReconciliationReason.php
Normal file
83
app/Support/Operations/LifecycleReconciliationReason.php
Normal file
@ -0,0 +1,83 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Support\Operations;
|
||||
|
||||
use App\Support\ReasonTranslation\NextStepOption;
|
||||
use App\Support\ReasonTranslation\ReasonResolutionEnvelope;
|
||||
|
||||
/**
 * Reason codes recorded when an operation run's lifecycle is reconciled,
 * together with the operator-facing wording for each code.
 */
enum LifecycleReconciliationReason: string
{
    case StaleQueued = 'run.stale_queued';
    case StaleRunning = 'run.stale_running';
    case InfrastructureTimeoutOrAbandonment = 'run.infrastructure_timeout_or_abandonment';
    case QueueFailureBridge = 'run.queue_failure_bridge';
    case AdapterOutOfSync = 'run.adapter_out_of_sync';

    /**
     * Short headline shown to operators for this reason.
     */
    public function operatorLabel(): string
    {
        return match ($this) {
            self::AdapterOutOfSync => 'Lifecycle was reconciled from related records',
            self::QueueFailureBridge => 'Queue failure was reconciled',
            self::InfrastructureTimeoutOrAbandonment => 'Infrastructure ended the run',
            self::StaleRunning => 'Run stopped reporting progress',
            self::StaleQueued => 'Run never started',
        };
    }

    /**
     * One-sentence explanation of what happened to the run.
     */
    public function shortExplanation(): string
    {
        return match ($this) {
            self::AdapterOutOfSync => 'A related restore record reached terminal truth before the operation run was updated.',
            self::QueueFailureBridge => 'The platform bridged a queue failure back to the owning run and marked it failed.',
            self::InfrastructureTimeoutOrAbandonment => 'Queue infrastructure ended the job before normal completion could update the run.',
            self::StaleRunning => 'The run stayed active past its lifecycle window and was marked failed.',
            self::StaleQueued => 'The run stayed queued past its lifecycle window and was marked failed.',
        };
    }

    /**
     * Actionability token: `non_actionable` for adapter drift, `retryable_transient` for every other reason.
     */
    public function actionability(): string
    {
        if ($this === self::AdapterOutOfSync) {
            return 'non_actionable';
        }

        return 'retryable_transient';
    }

    /**
     * Single-instruction next-step list for this reason.
     *
     * @return array<int, NextStepOption>
     */
    public function nextSteps(): array
    {
        $instruction = $this === self::AdapterOutOfSync
            ? 'Review the related restore record before deciding whether to run the workflow again.'
            : 'Review worker health and logs before retrying this operation.';

        return [
            NextStepOption::instruction($instruction),
        ];
    }

    /**
     * Default message for this reason; identical to the short explanation.
     */
    public function defaultMessage(): string
    {
        return $this->shortExplanation();
    }

    /**
     * Build the translation envelope for this reason.
     *
     * Neither `$surface` nor `$context` influences the result in this implementation.
     *
     * @param  array<string, mixed>  $context
     */
    public function toReasonResolutionEnvelope(string $surface = 'detail', array $context = []): ReasonResolutionEnvelope
    {
        $code = $this->value;

        return new ReasonResolutionEnvelope(
            internalCode: $code,
            operatorLabel: $this->operatorLabel(),
            shortExplanation: $this->shortExplanation(),
            actionability: $this->actionability(),
            nextSteps: $this->nextSteps(),
            showNoActionNeeded: false,
            diagnosticCodeLabel: $code,
        );
    }
}
|
||||
152
app/Support/Operations/OperationLifecyclePolicy.php
Normal file
152
app/Support/Operations/OperationLifecyclePolicy.php
Normal file
@ -0,0 +1,152 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Support\Operations;
|
||||
|
||||
use Illuminate\Support\Arr;
|
||||
|
||||
/**
 * Read-only view over the lifecycle configuration for covered operation types
 * (`tenantpilot.operations.lifecycle.*`) and the related queue settings.
 */
final class OperationLifecyclePolicy
{
    /** Fallback queued-staleness threshold when a covered type does not configure a usable value. */
    private const DEFAULT_QUEUED_STALE_AFTER_SECONDS = 300;

    /** Fallback running-staleness threshold when a covered type does not configure a usable value. */
    private const DEFAULT_RUNNING_STALE_AFTER_SECONDS = 900;

    /**
     * Raw covered-type map as configured; an empty array when the config value is not an array.
     *
     * @return array<string, array{
     *   job_class?: class-string,
     *   queued_stale_after_seconds?: int,
     *   running_stale_after_seconds?: int,
     *   expected_max_runtime_seconds?: int,
     *   direct_failed_bridge?: bool,
     *   scheduled_reconciliation?: bool
     * }>
     */
    public function coveredTypes(): array
    {
        $coveredTypes = config('tenantpilot.operations.lifecycle.covered_types', []);

        return is_array($coveredTypes) ? $coveredTypes : [];
    }

    /**
     * Normalized definition for one operation type, or null when the type is blank or not covered.
     *
     * Every key is always present in the result. Thresholds are clamped to at least one second;
     * a missing or non-numeric threshold falls back to the class default instead of being cast
     * to zero and clamped to one second.
     *
     * @return array{
     *   job_class:?string,
     *   queued_stale_after_seconds:int,
     *   running_stale_after_seconds:int,
     *   expected_max_runtime_seconds:?int,
     *   direct_failed_bridge:bool,
     *   scheduled_reconciliation:bool
     * }|null
     */
    public function definition(string $operationType): ?array
    {
        $operationType = trim($operationType);

        if ($operationType === '') {
            return null;
        }

        $definition = $this->coveredTypes()[$operationType] ?? null;

        if (! is_array($definition)) {
            return null;
        }

        $jobClass = $definition['job_class'] ?? null;
        $expectedMaxRuntime = $definition['expected_max_runtime_seconds'] ?? null;

        return [
            'job_class' => is_string($jobClass) ? $jobClass : null,
            'queued_stale_after_seconds' => $this->positiveSeconds(
                $definition['queued_stale_after_seconds'] ?? null,
                self::DEFAULT_QUEUED_STALE_AFTER_SECONDS,
            ),
            'running_stale_after_seconds' => $this->positiveSeconds(
                $definition['running_stale_after_seconds'] ?? null,
                self::DEFAULT_RUNNING_STALE_AFTER_SECONDS,
            ),
            'expected_max_runtime_seconds' => is_numeric($expectedMaxRuntime)
                ? max(1, (int) $expectedMaxRuntime)
                : null,
            'direct_failed_bridge' => (bool) ($definition['direct_failed_bridge'] ?? false),
            // Scheduled reconciliation is opt-out: covered types get it unless configured otherwise.
            'scheduled_reconciliation' => (bool) ($definition['scheduled_reconciliation'] ?? true),
        ];
    }

    /**
     * Whether the operation type has a lifecycle definition.
     */
    public function supports(string $operationType): bool
    {
        return $this->definition($operationType) !== null;
    }

    /**
     * Names of all configured covered types.
     *
     * @return list<string>
     */
    public function coveredTypeNames(): array
    {
        return array_values(array_keys($this->coveredTypes()));
    }

    /**
     * Seconds a run of this type may stay queued before it counts as stale (default for uncovered types).
     */
    public function queuedStaleAfterSeconds(string $operationType): int
    {
        return (int) ($this->definition($operationType)['queued_stale_after_seconds']
            ?? self::DEFAULT_QUEUED_STALE_AFTER_SECONDS);
    }

    /**
     * Seconds a run of this type may stay running before it counts as stale (default for uncovered types).
     */
    public function runningStaleAfterSeconds(string $operationType): int
    {
        return (int) ($this->definition($operationType)['running_stale_after_seconds']
            ?? self::DEFAULT_RUNNING_STALE_AFTER_SECONDS);
    }

    /**
     * Configured expected maximum runtime in seconds, or null when none is configured.
     */
    public function expectedMaxRuntimeSeconds(string $operationType): ?int
    {
        $expectedMaxRuntimeSeconds = $this->definition($operationType)['expected_max_runtime_seconds'] ?? null;

        return is_int($expectedMaxRuntimeSeconds) ? $expectedMaxRuntimeSeconds : null;
    }

    /**
     * Whether the type declares a direct failed-job bridge; false for uncovered types.
     */
    public function requiresDirectFailedBridge(string $operationType): bool
    {
        return (bool) ($this->definition($operationType)['direct_failed_bridge'] ?? false);
    }

    /**
     * Whether the type participates in scheduled reconciliation; false for uncovered types.
     */
    public function supportsScheduledReconciliation(string $operationType): bool
    {
        return (bool) ($this->definition($operationType)['scheduled_reconciliation'] ?? false);
    }

    /**
     * Configured reconciliation batch limit, at least 1.
     */
    public function reconciliationBatchLimit(): int
    {
        return max(1, (int) config('tenantpilot.operations.lifecycle.reconciliation.batch_limit', 100));
    }

    /**
     * Configured reconciliation schedule interval in minutes, at least 1.
     */
    public function reconciliationScheduleMinutes(): int
    {
        return max(1, (int) config('tenantpilot.operations.lifecycle.reconciliation.schedule_minutes', 5));
    }

    /**
     * Configured safety margin between job runtime and queue `retry_after`, at least 1 second.
     */
    public function retryAfterSafetyMarginSeconds(): int
    {
        return max(1, (int) config('queue.lifecycle_invariants.retry_after_safety_margin', 30));
    }

    /**
     * Queue connection name for the type's job class.
     *
     * Returns null when no job class is configured or the class does not exist. Otherwise
     * returns the class's default `connection` property when it is a non-blank string, and
     * falls back to `queue.default`. A non-string `queue.default` yields null rather than
     * violating the declared return type under strict types.
     */
    public function queueConnection(string $operationType): ?string
    {
        $jobClass = $this->jobClass($operationType);

        if ($jobClass === null || ! class_exists($jobClass)) {
            return null;
        }

        $connection = Arr::get(get_class_vars($jobClass), 'connection');

        if (is_string($connection) && trim($connection) !== '') {
            return trim($connection);
        }

        $defaultConnection = config('queue.default');

        return is_string($defaultConnection) ? $defaultConnection : null;
    }

    /**
     * `retry_after` for the given connection (or the default connection when blank/null).
     *
     * Falls back to the `database` connection's `retry_after` when the chosen connection
     * has no numeric value, and to null when that is not numeric either.
     */
    public function queueRetryAfterSeconds(?string $connection = null): ?int
    {
        $connection = is_string($connection) && trim($connection) !== '' ? trim($connection) : (string) config('queue.default', 'database');
        $retryAfter = config("queue.connections.{$connection}.retry_after");

        if (is_numeric($retryAfter)) {
            return max(1, (int) $retryAfter);
        }

        $databaseRetryAfter = config('queue.connections.database.retry_after');

        return is_numeric($databaseRetryAfter) ? max(1, (int) $databaseRetryAfter) : null;
    }

    /**
     * Configured job class name for the type, or null when absent or empty.
     */
    public function jobClass(string $operationType): ?string
    {
        $jobClass = $this->definition($operationType)['job_class'] ?? null;

        return is_string($jobClass) && $jobClass !== '' ? $jobClass : null;
    }

    /**
     * Normalize a configured duration: numeric values are cast to int, anything else uses
     * the default; the result is never below one second.
     */
    private function positiveSeconds(mixed $value, int $default): int
    {
        return max(1, is_numeric($value) ? (int) $value : $default);
    }
}
|
||||
@ -2,10 +2,16 @@
|
||||
|
||||
namespace App\Support\Operations;
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\Auth\Capabilities;
|
||||
|
||||
final class OperationRunCapabilityResolver
|
||||
{
|
||||
/**
 * Capability required for the given run, resolved from the run's operation type.
 */
public function requiredCapabilityForRun(OperationRun $run): ?string
{
    $operationType = (string) $run->type;

    return $this->requiredCapabilityForType($operationType);
}
|
||||
|
||||
public function requiredCapabilityForType(string $operationType): ?string
|
||||
{
|
||||
$operationType = trim($operationType);
|
||||
|
||||
69
app/Support/Operations/OperationRunFreshnessState.php
Normal file
69
app/Support/Operations/OperationRunFreshnessState.php
Normal file
@ -0,0 +1,69 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Support\Operations;
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\OperationRunStatus;
|
||||
|
||||
/**
 * Freshness classification of an operation run relative to its lifecycle policy.
 */
enum OperationRunFreshnessState: string
{
    case FreshActive = 'fresh_active';
    case LikelyStale = 'likely_stale';
    case ReconciledFailed = 'reconciled_failed';
    case TerminalNormal = 'terminal_normal';
    case Unknown = 'unknown';

    /**
     * Classify a run.
     *
     * Completed runs are `ReconciledFailed` when the run reports lifecycle reconciliation and
     * `TerminalNormal` otherwise. Runs of types the policy does not support are `Unknown`.
     * Queued and running runs are `LikelyStale` once their reference timestamp is at or before
     * "now minus the policy threshold", else `FreshActive`. Anything else is `Unknown`.
     *
     * The policy is resolved from the container when none is passed.
     */
    public static function forRun(OperationRun $run, ?OperationLifecyclePolicy $policy = null): self
    {
        $policy ??= app(OperationLifecyclePolicy::class);
        $status = (string) $run->status;

        if ($status === OperationRunStatus::Completed->value) {
            if ($run->isLifecycleReconciled()) {
                return self::ReconciledFailed;
            }

            return self::TerminalNormal;
        }

        $type = (string) $run->type;

        if (! $policy->supports($type)) {
            return self::Unknown;
        }

        $isQueued = $status === OperationRunStatus::Queued->value;

        if (! $isQueued && $status !== OperationRunStatus::Running->value) {
            return self::Unknown;
        }

        if ($isQueued) {
            // A queued run that already has a start time, or has no creation time, cannot be judged.
            if ($run->started_at !== null || $run->created_at === null) {
                return self::Unknown;
            }

            $reference = $run->created_at;
            $thresholdSeconds = $policy->queuedStaleAfterSeconds($type);
        } else {
            $reference = $run->started_at ?? $run->created_at;

            if ($reference === null) {
                return self::Unknown;
            }

            $thresholdSeconds = $policy->runningStaleAfterSeconds($type);
        }

        $cutoff = now()->subSeconds($thresholdSeconds);

        return $reference->lte($cutoff) ? self::LikelyStale : self::FreshActive;
    }

    /**
     * True only for the `FreshActive` case.
     */
    public function isFreshActive(): bool
    {
        return self::FreshActive === $this;
    }

    /**
     * True only for the `LikelyStale` case.
     */
    public function isLikelyStale(): bool
    {
        return self::LikelyStale === $this;
    }

    /**
     * True only for the `ReconciledFailed` case.
     */
    public function isReconciledFailed(): bool
    {
        return self::ReconciledFailed === $this;
    }
}
|
||||
@ -7,6 +7,7 @@
|
||||
use App\Models\OperationRun;
|
||||
use App\Models\Tenant;
|
||||
use App\Support\OperationCatalog;
|
||||
use App\Support\Operations\OperationRunFreshnessState;
|
||||
use App\Support\ReasonTranslation\ReasonPresenter;
|
||||
use App\Support\RedactionIntegrity;
|
||||
use Filament\Notifications\Notification as FilamentNotification;
|
||||
@ -99,6 +100,15 @@ public static function surfaceGuidance(OperationRun $run): ?string
|
||||
$reasonEnvelope = self::reasonEnvelope($run);
|
||||
$reasonGuidance = app(ReasonPresenter::class)->guidance($reasonEnvelope);
|
||||
$nextStepLabel = self::firstNextStepLabel($run);
|
||||
$freshnessState = self::freshnessState($run);
|
||||
|
||||
if ($freshnessState->isLikelyStale()) {
|
||||
return 'This run is past its lifecycle window. Review worker health and logs before retrying from the start surface.';
|
||||
}
|
||||
|
||||
if ($freshnessState->isReconciledFailed()) {
|
||||
return $reasonGuidance ?? 'TenantPilot reconciled this run after lifecycle truth was lost. Review the recorded evidence before retrying.';
|
||||
}
|
||||
|
||||
if (in_array($uxStatus, ['blocked', 'failed', 'partial'], true) && $reasonGuidance !== null) {
|
||||
return $reasonGuidance;
|
||||
@ -130,11 +140,29 @@ public static function surfaceFailureDetail(OperationRun $run): ?string
|
||||
return $reasonEnvelope->shortExplanation;
|
||||
}
|
||||
|
||||
if (self::freshnessState($run)->isLikelyStale()) {
|
||||
return 'This run is no longer within its normal lifecycle window and may no longer be progressing.';
|
||||
}
|
||||
|
||||
$failureMessage = (string) (($run->failure_summary[0]['message'] ?? '') ?? '');
|
||||
|
||||
return self::sanitizeFailureMessage($failureMessage);
|
||||
}
|
||||
|
||||
/**
 * Freshness state of the run, as reported by the run model itself.
 */
public static function freshnessState(OperationRun $run): OperationRunFreshnessState
{
    $state = $run->freshnessState();

    return $state;
}
|
||||
|
||||
/**
 * Short lifecycle attention label for a run, or null when the run's freshness
 * state is neither likely stale nor reconciled-failed.
 */
public static function lifecycleAttentionSummary(OperationRun $run): ?string
{
    $freshness = self::freshnessState($run);

    if ($freshness === OperationRunFreshnessState::LikelyStale) {
        return 'Likely stale';
    }

    if ($freshness === OperationRunFreshnessState::ReconciledFailed) {
        return 'Automatically reconciled';
    }

    return null;
}
|
||||
|
||||
/**
|
||||
* @return array{titleSuffix: string, body: string, status: string}
|
||||
*/
|
||||
@ -142,6 +170,15 @@ private static function terminalPresentation(OperationRun $run): array
|
||||
{
|
||||
$uxStatus = OperationStatusNormalizer::toUxStatus($run->status, $run->outcome);
|
||||
$reasonEnvelope = self::reasonEnvelope($run);
|
||||
$freshnessState = self::freshnessState($run);
|
||||
|
||||
if ($freshnessState->isReconciledFailed()) {
|
||||
return [
|
||||
'titleSuffix' => 'was automatically reconciled',
|
||||
'body' => $reasonEnvelope?->operatorLabel ?? 'Automatically reconciled after infrastructure failure.',
|
||||
'status' => 'danger',
|
||||
];
|
||||
}
|
||||
|
||||
return match ($uxStatus) {
|
||||
'succeeded' => [
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\OperationCatalog;
|
||||
use App\Support\Operations\OperationRunFreshnessState;
|
||||
use Illuminate\Support\Facades\Cache;
|
||||
|
||||
final class RunDurationInsights
|
||||
@ -118,6 +119,10 @@ public static function expectedHuman(OperationRun $run): ?string
|
||||
|
||||
public static function stuckGuidance(OperationRun $run): ?string
|
||||
{
|
||||
if ($run->freshnessState() === OperationRunFreshnessState::LikelyStale) {
|
||||
return 'Past the lifecycle window. Review worker health and logs before retrying.';
|
||||
}
|
||||
|
||||
$uxStatus = OperationStatusNormalizer::toUxStatus($run->status, $run->outcome);
|
||||
|
||||
if (! in_array($uxStatus, ['queued', 'running'], true)) {
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
use App\Services\Intune\SecretClassificationService;
|
||||
use App\Support\Baselines\BaselineReasonCodes;
|
||||
use App\Support\Operations\ExecutionDenialReasonCode;
|
||||
use App\Support\Operations\LifecycleReconciliationReason;
|
||||
use App\Support\Providers\ProviderReasonCodes;
|
||||
|
||||
final class RunFailureSanitizer
|
||||
@ -131,9 +132,15 @@ public static function isStructuredOperatorReasonCode(string $candidate): bool
|
||||
ExecutionDenialReasonCode::cases(),
|
||||
);
|
||||
|
||||
$lifecycleReasonCodes = array_map(
|
||||
static fn (LifecycleReconciliationReason $reasonCode): string => $reasonCode->value,
|
||||
LifecycleReconciliationReason::cases(),
|
||||
);
|
||||
|
||||
return ProviderReasonCodes::isKnown($candidate)
|
||||
|| BaselineReasonCodes::isKnown($candidate)
|
||||
|| in_array($candidate, $executionDenialReasonCodes, true);
|
||||
|| in_array($candidate, $executionDenialReasonCodes, true)
|
||||
|| in_array($candidate, $lifecycleReasonCodes, true);
|
||||
}
|
||||
|
||||
public static function sanitizeMessage(string $message): string
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
use App\Models\ProviderConnection;
|
||||
use App\Models\Tenant;
|
||||
use App\Support\Operations\ExecutionDenialReasonCode;
|
||||
use App\Support\Operations\LifecycleReconciliationReason;
|
||||
use App\Support\Providers\ProviderReasonCodes;
|
||||
use App\Support\Providers\ProviderReasonTranslator;
|
||||
use App\Support\RbacReason;
|
||||
@ -91,6 +92,7 @@ private function isDirectlyTranslatableOperationReason(string $reasonCode): bool
|
||||
|
||||
return ProviderReasonCodes::isKnown($reasonCode)
|
||||
|| ExecutionDenialReasonCode::tryFrom($reasonCode) instanceof ExecutionDenialReasonCode
|
||||
|| LifecycleReconciliationReason::tryFrom($reasonCode) instanceof LifecycleReconciliationReason
|
||||
|| TenantOperabilityReasonCode::tryFrom($reasonCode) instanceof TenantOperabilityReasonCode
|
||||
|| RbacReason::tryFrom($reasonCode) instanceof RbacReason;
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
|
||||
use App\Support\Baselines\BaselineReasonCodes;
|
||||
use App\Support\Operations\ExecutionDenialReasonCode;
|
||||
use App\Support\Operations\LifecycleReconciliationReason;
|
||||
use App\Support\Providers\ProviderReasonCodes;
|
||||
use App\Support\Providers\ProviderReasonTranslator;
|
||||
use App\Support\RbacReason;
|
||||
@ -48,6 +49,7 @@ public function translate(
|
||||
$artifactKey === null && BaselineReasonCodes::isKnown($reasonCode) => $this->translateBaselineReason($reasonCode),
|
||||
$artifactKey === self::EXECUTION_DENIAL_ARTIFACT,
|
||||
$artifactKey === null && ExecutionDenialReasonCode::tryFrom($reasonCode) instanceof ExecutionDenialReasonCode => ExecutionDenialReasonCode::tryFrom($reasonCode)?->toReasonResolutionEnvelope($surface, $context),
|
||||
$artifactKey === null && LifecycleReconciliationReason::tryFrom($reasonCode) instanceof LifecycleReconciliationReason => LifecycleReconciliationReason::tryFrom($reasonCode)?->toReasonResolutionEnvelope($surface, $context),
|
||||
$artifactKey === self::TENANT_OPERABILITY_ARTIFACT,
|
||||
$artifactKey === null && TenantOperabilityReasonCode::tryFrom($reasonCode) instanceof TenantOperabilityReasonCode => TenantOperabilityReasonCode::tryFrom($reasonCode)?->toReasonResolutionEnvelope($surface, $context),
|
||||
$artifactKey === self::RBAC_ARTIFACT,
|
||||
|
||||
@ -126,4 +126,8 @@
|
||||
'table' => 'failed_jobs',
|
||||
],
|
||||
|
||||
'lifecycle_invariants' => [
|
||||
'retry_after_safety_margin' => (int) env('QUEUE_RETRY_AFTER_SAFETY_MARGIN', 30),
|
||||
],
|
||||
|
||||
];
|
||||
|
||||
@ -13,6 +13,113 @@
|
||||
],
|
||||
],
|
||||
|
||||
'operations' => [
|
||||
'lifecycle' => [
|
||||
'reconciliation' => [
|
||||
'batch_limit' => (int) env('TENANTPILOT_OPERATION_RUN_RECONCILIATION_BATCH_LIMIT', 100),
|
||||
'schedule_minutes' => (int) env('TENANTPILOT_OPERATION_RUN_RECONCILIATION_SCHEDULE_MINUTES', 5),
|
||||
],
|
||||
'covered_types' => [
|
||||
'baseline_capture' => [
|
||||
'job_class' => \App\Jobs\CaptureBaselineSnapshotJob::class,
|
||||
'queued_stale_after_seconds' => 600,
|
||||
'running_stale_after_seconds' => 1800,
|
||||
'expected_max_runtime_seconds' => 300,
|
||||
'direct_failed_bridge' => true,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'baseline_compare' => [
|
||||
'job_class' => \App\Jobs\CompareBaselineToTenantJob::class,
|
||||
'queued_stale_after_seconds' => 600,
|
||||
'running_stale_after_seconds' => 1800,
|
||||
'expected_max_runtime_seconds' => 300,
|
||||
'direct_failed_bridge' => true,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'inventory_sync' => [
|
||||
'job_class' => \App\Jobs\RunInventorySyncJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 1200,
|
||||
'expected_max_runtime_seconds' => 240,
|
||||
'direct_failed_bridge' => true,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'policy.sync' => [
|
||||
'job_class' => \App\Jobs\SyncPoliciesJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 180,
|
||||
'direct_failed_bridge' => true,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'policy.sync_one' => [
|
||||
'job_class' => \App\Jobs\SyncPoliciesJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 180,
|
||||
'direct_failed_bridge' => true,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'entra_group_sync' => [
|
||||
'job_class' => \App\Jobs\EntraGroupSyncJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 240,
|
||||
'direct_failed_bridge' => false,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'directory_role_definitions.sync' => [
|
||||
'job_class' => \App\Jobs\SyncRoleDefinitionsJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 240,
|
||||
'direct_failed_bridge' => false,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'backup_schedule_run' => [
|
||||
'job_class' => \App\Jobs\RunBackupScheduleJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 1200,
|
||||
'expected_max_runtime_seconds' => 300,
|
||||
'direct_failed_bridge' => false,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'restore.execute' => [
|
||||
'job_class' => \App\Jobs\ExecuteRestoreRunJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 1500,
|
||||
'expected_max_runtime_seconds' => 420,
|
||||
'direct_failed_bridge' => false,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'tenant.review_pack.generate' => [
|
||||
'job_class' => \App\Jobs\GenerateReviewPackJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 240,
|
||||
'direct_failed_bridge' => false,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'tenant.review.compose' => [
|
||||
'job_class' => \App\Jobs\ComposeTenantReviewJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 240,
|
||||
'direct_failed_bridge' => true,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
'tenant.evidence.snapshot.generate' => [
|
||||
'job_class' => \App\Jobs\GenerateEvidenceSnapshotJob::class,
|
||||
'queued_stale_after_seconds' => 300,
|
||||
'running_stale_after_seconds' => 900,
|
||||
'expected_max_runtime_seconds' => 240,
|
||||
'direct_failed_bridge' => false,
|
||||
'scheduled_reconciliation' => true,
|
||||
],
|
||||
],
|
||||
],
|
||||
],
|
||||
|
||||
'allow_admin_maintenance_actions' => (bool) env('ALLOW_ADMIN_MAINTENANCE_ACTIONS', false),
|
||||
|
||||
'supported_policy_types' => [
|
||||
|
||||
@ -150,6 +150,8 @@ ### Operations UX
|
||||
- **3-surface feedback**: Toast (immediate) → Progress widget (polling) → DB notification (terminal).
|
||||
- **OperationRun lifecycle**: Service-owned transitions only via `OperationRunService` — no direct status writes.
|
||||
- **Idempotent creation**: Hash-based dedup with partial unique index.
|
||||
- **Lifecycle guarantees (Spec 160)**: Covered queued runs (`baseline_capture`, `baseline_compare`, `inventory_sync`, `policy.sync`, `policy.sync_one`, `entra_group_sync`, `directory_role_definitions.sync`, `backup_schedule_run`, `restore.execute`, `tenant.review_pack.generate`, `tenant.review.compose`, `tenant.evidence.snapshot.generate`) now have a config-backed lifecycle policy, direct failed-job bridges where declared, scheduled stale-run reconciliation, and UI freshness semantics for stale or automatically reconciled runs.
|
||||
- **Queue timing invariant**: Covered job `timeout` values and policy `expected_max_runtime_seconds` must remain safely below queue `retry_after` with the configured safety margin. After changing queue lifecycle settings, restart workers (`php artisan queue:restart` in the target environment) so the new contract takes effect.
|
||||
|
||||
### Filament Standards
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ # Spec Candidates
|
||||
>
|
||||
> **Flow**: Inbox → Qualified → Planned → Spec created → moved to `Promoted to Spec`
|
||||
|
||||
**Last reviewed**: 2026-03-23 (added governance operator outcome compression follow-up; promoted Spec 158 into ledger)
|
||||
**Last reviewed**: 2026-03-23 (added Operator Explanation Layer candidate; added governance operator outcome compression follow-up; promoted Spec 158 into ledger)
|
||||
|
||||
---
|
||||
|
||||
@ -224,20 +224,74 @@ ### Humanized Diagnostic Summaries for Governance Operations
|
||||
- **Strategic sequencing**: Best treated as the run-detail explainability companion to Governance Operator Outcome Compression. Compression improves governance artifact scan surfaces; this candidate makes governance run detail self-explanatory once an operator drills in.
|
||||
- **Priority**: high
|
||||
|
||||
### Operator Explanation Layer for Degraded / Partial / Suppressed Results
|
||||
- **Type**: cross-cutting UX / domain semantics / operator clarity
|
||||
- **Source**: product analysis 2026-03-23; direct follow-up to Spec 156 (`operator-outcome-taxonomy`)
|
||||
- **Vehicle**: new standalone candidate
|
||||
- **Problem**: Spec 156 improves status, outcome, and run vocabulary, but pure outcome taxonomy does not resolve a deeper operator-readability gap: the product stores truth across multiple semantic dimensions (execution, evaluation, reliability, coverage, recommended action) that are currently presented side by side without separation or explanation. Several governance and operational surfaces — Baseline Compare, Baseline Capture, Operation Run detail, Tenant Reviews, evidence-dependent results — show combinations that are technically correct but hard for operators to read, such as `Run finished` + `Counts 0` + `Why no findings: evidence_capture_incomplete`. These force operators to synthesize whether: the run was technically successful, the result is trustworthy, findings were genuinely absent, data was missing, assignments were ambiguous, or follow-up is required. The product preserves truth better than it explains truth. Distinct truth dimensions collapse into shared reading surfaces — count blocks that look like complete results when they show only a subset, `0 findings` that reads as reassurance when evaluation was incomplete, reason codes that serve as primary operator explanation when they are diagnostic material. Enterprise operators need the UI to make clear — without JSON and without implicit product knowledge — what happened, how reliable the result is, and what to do next.
|
||||
- **Why it matters**: This is the missing interpretation layer between the outcome taxonomy foundation (Spec 156) and the operator's actual decision. Without it, the product remains formally correct but interpretively weak on its highest-trust governance surfaces. Concrete consequences: operators read `0 findings` as all-clear when evaluation was constrained by evidence gaps; support and debug cost stays high because run-detail pages require JSON or expert knowledge; reason codes leak into operator surfaces as primary explanation; semantic inconsistencies multiply with each new governance/evidence/review feature; enterprise readiness suffers despite strong internal modelling. The gap is systemic — not a single-surface fix — because the same class of problem (execution succeeded, but result is degraded/partial/suppressed) recurs across baselines, reviews, evidence, monitoring, and future governance domains.
|
||||
- **Proposed direction**:
|
||||
- **R1 — Multi-dimensional outcome model in the UI**: operator surfaces must separate result communication into at least five distinguishable axes: execution status, evaluation result, reliability/confidence/trustworthiness, coverage/completeness, recommended action. Not every page needs all five equally prominent, but the UI model must keep them distinct instead of collapsing them into a single badge or count block.
|
||||
- **R2 — Primary operator explanation over raw reason codes**: every relevant reason code or reason cluster gets an operator-readable title, clear meaning description, impact explanation, and next-step guidance. Technical reason codes remain stored and diagnostically usable but stop being the primary explanation. Example: not `evidence_capture_incomplete` as the headline, but "Vergleich unvollständig bewertet — 16 Evidence-Lücken verhinderten vollständige Bewertung."
|
||||
- **R3 — Semantically unambiguous counts**: count blocks must not imply completeness when they represent only a subset. The spec defines which counts are execution counts, which are evaluation-output counts, which are data-quality/completeness counts, and how suppressed/incomplete/degraded states surface. `0 findings` when evidence gaps exist must not read as reassurance.
|
||||
- **R4 — "Why no findings / no results / no report" explanation patterns**: a shared pattern for explaining absent output, distinguishing between genuinely no issues found, no results producible due to missing inputs, results suppressed due to unreliable evidence, prerequisite failure, viewer/rendering limitation, pending calculation, and intentionally unpublished.
|
||||
- **R5 — Reliability/confidence visibility**: compare-, review-, and evidence-dependent results must show whether the result is trustworthy, limited in trustworthiness, incomplete, diagnostically usable but not decision-grade, or not usable. Terms like fidelity, coverage, partial, meta, full, degraded get a defined reading direction instead of appearing loosely side by side.
|
||||
- **R6 — Semantically derived next-step guidance**: the next-step surface must distinguish between no action needed, observe, re-run, fix prerequisite, update inventory/sync, check evidence gaps, manually validate, and escalate — derived from cause and severity, not a generic fallback phrase.
|
||||
- **R7 — Diagnostics available but not dominant**: technical raw data, run context, and reason codes remain accessible for debugging but the default reading path leads with meaning and action: what happened → how reliable → why it looks this way → what to do → then technical details.
|
||||
- **R8 — Shared explanation patterns, not page-by-page special logic**: define reusable cross-domain explanation patterns: completed but degraded, completed but incomplete, no output because suppressed, no output because insufficient evidence, partial result with fallback, output exists but not publication-ready, viewer limitation vs source limitation, prerequisite missing vs execution failed.
|
||||
- **R9 — Baseline Compare as reference case**: Baseline Compare is the golden path for this spec. The case where compare finishes technically, driftResults = 0, counts = 0, evidence gaps exist, and "why no findings" is currently reason-code-heavy must become immediately understandable without additional product knowledge.
|
||||
- **R10 — No contradiction between top-level state and detail interpretation**: top-level states (run finished, completed with follow-up, succeeded, partially succeeded) must not conflict with the underlying result surfaces. Results that are technically successful but limited in substance must read as a consistent composite.
|
||||
- **Primary adoption surfaces**:
|
||||
- Operation Run detail pages
|
||||
- Baseline Capture
|
||||
- Baseline Compare
|
||||
- Baseline Snapshot / Compare truth surfaces
|
||||
- Tenant Reviews / review generation states
|
||||
- Evidence-dependent result surfaces
|
||||
- Run summaries, state banners, next-step texts, count summaries
|
||||
- Reason code presentation / translation layer
|
||||
- Count semantics for runs with suppressed / incomplete / degraded results
|
||||
- **Optional extension surfaces**: restore/backup operation outcomes, prerequisite/readiness/missing-input explanation surfaces, alerts that rely on degraded semantics
|
||||
- **UX/semantics principles**:
|
||||
- Truth first, but operator-first wording — truth is not softened but expressed in readable operator language
|
||||
- No false reassurance — `0 findings` or `no errors` must not read as all-clear when evaluation was constrained
|
||||
- Separate execution from confidence — a technically successful run can still produce a limited-confidence result
|
||||
- Default path before diagnostics — the normal reading flow answers meaning and action first, then technical detail
|
||||
- Consistent semantics across domains — the same kind of state must not be named or explained differently in baselines, reviews, and evidence
|
||||
- **Scope boundaries**:
|
||||
- **In scope**: explanation dimension definitions, outcome-to-operator-message mapping, shared explanation pattern library, reference implementation for Baseline Compare / Operation Run detail / Baseline Capture, count semantics rules, reliability visibility rules, next-step guidance patterns, reason code presentation hierarchy
|
||||
- **Out of scope**: queue/retry/job-runtime model redesign, large-scale raw data model restructuring (existing run-context data should suffice), purely visual polishing without semantic improvement, reason code elimination (they remain for diagnostics), complete redesign of every product page, compliance/governance domain logic changes
|
||||
- **Phased delivery**:
|
||||
- Phase 1 — Semantics model: define explanation dimensions, outcome vs reliability vs coverage vs action separation, reason-code-to-operator-message mapping, shared explanation pattern library
|
||||
- Phase 2 — Reference implementation: Operation Run detail page, Baseline Compare, Baseline Capture
|
||||
- Phase 3 — Extension: Tenant Reviews, evidence/report surfaces, further degraded/prerequisite-heavy flows
|
||||
- **Success criteria**:
|
||||
- An operator can understand degraded/suppressed results without JSON: what was executed, whether results were produced, how reliable they are, why nothing was generated, and what to do next
|
||||
- Baseline Compare with 0 findings + evidence gaps shows no implicit all-clear
|
||||
- Reason codes remain technically usable but no longer serve as sole primary explanation
|
||||
- Count surfaces are semantically unambiguous, distinguishing output counts from completeness/reliability signals
|
||||
- At least one shared explanation pattern is reused across domains rather than landing as one-off page logic
|
||||
- **Dependencies**: Spec 156 (operator-outcome-taxonomy — shared vocabulary foundation), Spec 157 (reason-code-translation — humanized code labels), Spec 158 (artifact-truth-semantics — governance artifact truth model), Spec 159 (baseline-snapshot-truth — baseline truth surfaces)
|
||||
- **Related specs / candidates**: Governance Operator Outcome Compression (governance-artifact list/scan compression — complementary, narrower scope), Humanized Diagnostic Summaries for Governance Operations (governance run-detail explainability — complementary, narrower scope), Baseline Capture Truthful Outcomes (capture-specific precondition and outcome hardening), OperationRun Humanization & Diagnostics Boundary (run-detail operator-first hierarchy)
|
||||
- **Strategic importance**: This is not UI polish but an enterprise-readability and governance-trust layer. As the roadmap expands toward baseline governance, findings workflow, exceptions/risk acceptance, stored reports, tenant reviews, evidence packs, and MSP portfolio views, the cost of not having a systematic explanation layer increases with every new feature. The longer this is deferred, the more governance surfaces ship with locally invented explanation patterns that diverge instead of converging. This candidate connects directly to the strategic roadmap rather than being a locally reactive fix.
|
||||
- **Priority**: high
|
||||
|
||||
> **Operator Truth Initiative — Sequencing Note**
|
||||
>
|
||||
> The operator-truth work now has two connected lanes: a shared truth-foundation lane and a governance-surface compression lane. Together they address the systemic gap between backend truth richness and operator-facing truth quality without forcing operators to parse raw internal semantics.
|
||||
>
|
||||
> **Recommended order:**
|
||||
> 1. **Operator Outcome Taxonomy and Cross-Domain State Separation** — defines the shared vocabulary, state-axis separation rules, and color-severity conventions that all other operator-facing work references. This is the smallest deliverable (a reference document + restructuring guidelines) but the highest-leverage decision. Without it, the other two candidates will invent local vocabularies that diverge.
|
||||
> 1. **Operator Outcome Taxonomy and Cross-Domain State Separation** — defines the shared vocabulary, state-axis separation rules, and color-severity conventions that all other operator-facing work references. This is the smallest deliverable (a reference document + restructuring guidelines) but the highest-leverage decision. Without it, the other candidates will invent local vocabularies that diverge.
|
||||
> 2. **Operator Reason Code Translation and Humanization Contract** — defines the translation bridge from internal codes to operator-facing labels using the Outcome Taxonomy's vocabulary. Can begin in parallel with the taxonomy using pragmatic interim labels, but final convergence depends on the taxonomy.
|
||||
> 3. **Governance Artifact Truthful Outcomes & Fidelity Semantics** — establishes the full internal truth model for governance artifacts, keeping existence, usability, freshness, completeness, publication readiness, and actionability distinct.
|
||||
> 4. **Governance Operator Outcome Compression** — applies the foundation to governance workflow surfaces so lists and details answer the operator's primary question first, while preserving diagnostics as second-layer detail.
|
||||
> 5. **Provider-Backed Action Preflight and Dispatch Gate Unification** — extends the proven Gen 2 gate pattern to all provider-backed operations and establishes a shared result presenter. Benefits from the shared vocabulary work but remains a parallel hardening lane rather than a governance-surface adoption slice.
|
||||
> 4. **Operator Explanation Layer for Degraded / Partial / Suppressed Results** — defines the cross-cutting interpretation layer that turns internal truth dimensions into operator-readable explanation: multi-dimensional outcome separation (execution, evaluation, reliability, coverage, action), shared explanation patterns for degraded/partial/suppressed states, count semantics rules, "why no findings" patterns, and reliability visibility. Consumes the taxonomy, translation, and artifact-truth foundations; provides the shared explanation pattern library that compression and humanized summaries adopt on their respective surfaces. Baseline Compare is the golden-path reference implementation.
|
||||
> 5. **Governance Operator Outcome Compression** — applies the foundation and explanation layer to governance workflow surfaces so lists and details answer the operator's primary question first, while preserving diagnostics as second-layer detail.
|
||||
> 6. **Humanized Diagnostic Summaries for Governance Operations** — the run-detail explainability companion to compression; makes governance run detail self-explanatory using the explanation patterns established in step 4.
|
||||
> 7. **Provider-Backed Action Preflight and Dispatch Gate Unification** — extends the proven Gen 2 gate pattern to all provider-backed operations and establishes a shared result presenter. Benefits from the shared vocabulary work but remains a parallel hardening lane rather than a governance-surface adoption slice.
|
||||
>
|
||||
> **Why this order rather than the inverse:** The semantic-clarity audit classified the taxonomy problem as P0 (60% of warning badges are false alarms — actively damages operator trust). Reason code translation creates the shared human-facing language. Spec 158 establishes the correct internal truth engine for governance artifacts. The compression follow-up is then what turns that engine into a scanable operator cockpit before more governance features land. Gate unification remains highly valuable, but it is a neighboring hardening lane rather than the immediate follow-up needed to make governance truth semantics feel product-ready.
|
||||
> **Why this order rather than the inverse:** The semantic-clarity audit classified the taxonomy problem as P0 (60% of warning badges are false alarms — actively damages operator trust). Reason code translation creates the shared human-facing language. Spec 158 establishes the correct internal truth engine for governance artifacts. The Operator Explanation Layer then defines the cross-cutting interpretation patterns that all downstream surfaces need — the systemwide rules for how degraded, partial, suppressed, and incomplete results are separated, explained, and acted upon. Compression and humanized summaries are adoption slices that apply those patterns to specific surface families (governance artifact lists and governance run details respectively). Gate unification remains highly valuable but is a neighboring hardening lane.
|
||||
>
|
||||
> **Why these are not one spec:** Each candidate has a different implementation surface, different stakeholders, and different shippability boundary. The taxonomy is a cross-cutting decision document. Reason code translation touches reason-code artifacts and notification builders. Spec 158 defines the richer artifact truth engine. Governance operator outcome compression is a UI-information-architecture adoption slice across governance surfaces. Gate unification touches provider dispatch and notification plumbing across ~20 services. Merging them would create an unshippable monolith. Keeping them sequenced preserves independent delivery while still converging on one operator language.
|
||||
> **Why these are not one spec:** Each candidate has a different implementation surface, different stakeholders, and different shippability boundary. The taxonomy is a cross-cutting decision document. Reason code translation touches reason-code artifacts and notification builders. Spec 158 defines the richer artifact truth engine. The Operator Explanation Layer defines the shared interpretation semantics and explanation patterns. Governance operator outcome compression is a UI-information-architecture adoption slice across governance artifact surfaces. Humanized diagnostic summaries are an adoption slice for governance run-detail pages. Gate unification touches provider dispatch and notification plumbing across ~20 services. Merging them would create an unshippable monolith. Keeping them sequenced preserves independent delivery while still converging on one operator language.
|
||||
|
||||
### Baseline Snapshot Fidelity Semantics
|
||||
- **Type**: hardening
|
||||
|
||||
170
public/js/tenantpilot/ops-ux-progress-widget-poller.js
Normal file
170
public/js/tenantpilot/ops-ux-progress-widget-poller.js
Normal file
@ -0,0 +1,170 @@
|
||||
/**
 * Registers `window.opsUxProgressWidgetPoller`, a factory that returns the
 * polling controller for the Ops UX progress widget.
 *
 * The controller repeatedly calls `this.$wire.refreshRuns()` on a self-adjusting
 * timer. It polls quickly while runs are fresh, backs off as they age, idles at
 * 30 s when nothing is active, and pauses while the tab is hidden, a modal
 * dialog is open, or its root element is detached from the document.
 *
 * NOTE(review): `this.$el` and `this.$wire` are not defined in this file; they
 * are presumably injected by Alpine/Livewire when the object is used as
 * component data — confirm against the Blade view that mounts it.
 */
(() => {
    // Nothing to register outside a browser-like environment.
    if (typeof window === 'undefined') {
        return;
    }

    // Keep the first registration if this script is evaluated more than once.
    if (typeof window.opsUxProgressWidgetPoller === 'function') {
        return;
    }

    window.opsUxProgressWidgetPoller = function opsUxProgressWidgetPoller() {
        return {
            /** Handle of the pending `setTimeout`, or null when idle. */
            timer: null,
            /** Epoch ms at which active runs were first observed, or null. */
            activeSinceMs: null,
            /**
             * Epoch ms until which the 1 s interval is forced. Never assigned
             * in this file; presumably set by external code — TODO confirm.
             */
            fastUntilMs: null,
            /** MutationObserver that tears the poller down once `$el` is detached. */
            teardownObserver: null,

            /**
             * Wires up event listeners and the teardown observer, then
             * schedules an immediate first tick.
             */
            init() {
                // Bind once and keep the bound reference so destroy() can
                // remove exactly the listener that was added.
                this.onVisibilityChange = this.onVisibilityChange.bind(this);
                // `visibilitychange` is fired at the Document; listening there
                // (rather than on window) does not depend on the event
                // bubbling up to the Window object.
                document.addEventListener('visibilitychange', this.onVisibilityChange);

                this.onNavigated = this.onNavigated.bind(this);
                window.addEventListener('livewire:navigated', this.onNavigated);

                // Watch the whole body: as soon as any DOM mutation leaves the
                // root element disconnected, release timers and listeners.
                this.teardownObserver = new MutationObserver(() => {
                    if (!this.$el || this.$el.isConnected !== true) {
                        this.destroy();
                    }
                });

                this.teardownObserver.observe(document.body, { childList: true, subtree: true });

                this.schedule(0);
            },

            /** Cancels the timer and removes every listener/observer added by init(). */
            destroy() {
                this.stop();
                document.removeEventListener('visibilitychange', this.onVisibilityChange);
                window.removeEventListener('livewire:navigated', this.onNavigated);

                if (this.teardownObserver) {
                    this.teardownObserver.disconnect();
                    this.teardownObserver = null;
                }
            },

            /** Cancels the pending tick, if any. */
            stop() {
                if (this.timer) {
                    clearTimeout(this.timer);
                    this.timer = null;
                }
            },

            /** @returns {boolean} true when an ARIA modal dialog is present in the document. */
            isModalOpen() {
                return document.querySelector('[role="dialog"][aria-modal="true"]') !== null;
            },

            /**
             * @returns {boolean} true when polling should be skipped: the tab
             * is hidden, a modal is open, or the root element is detached.
             */
            isPaused() {
                if (document.hidden === true) {
                    return true;
                }

                if (this.isModalOpen()) {
                    return true;
                }

                if (!this.$el || this.$el.isConnected !== true) {
                    return true;
                }

                return false;
            },

            /** Polls immediately when the tab becomes visible again (and nothing else pauses us). */
            onVisibilityChange() {
                if (!this.isPaused()) {
                    this.schedule(0);
                }
            },

            /** Polls immediately after a `livewire:navigated` event unless paused. */
            onNavigated() {
                if (!this.isPaused()) {
                    this.schedule(0);
                }
            },

            /** @returns {number} whole seconds since active runs were first seen, or 0. */
            activeAgeSeconds() {
                if (this.activeSinceMs === null) {
                    return 0;
                }

                return Math.floor((Date.now() - this.activeSinceMs) / 1000);
            },

            /**
             * Chooses the delay before the next poll.
             *
             * @returns {number|null} delay in ms, or null when `$wire.disabled`
             * is true and polling should stop altogether.
             */
            nextIntervalMs() {
                if (this.$wire?.disabled === true) {
                    return null;
                }

                // No active runs: reset the age clock and idle-poll.
                if (this.$wire?.hasActiveRuns !== true) {
                    this.activeSinceMs = null;

                    return 30_000;
                }

                if (this.activeSinceMs === null) {
                    this.activeSinceMs = Date.now();
                }

                const now = Date.now();

                // An explicit fast window overrides the age-based back-off.
                if (this.fastUntilMs && now < this.fastUntilMs) {
                    return 1_000;
                }

                const age = this.activeAgeSeconds();

                if (age < 10) {
                    return 1_000;
                }

                if (age < 60) {
                    return 5_000;
                }

                return 10_000;
            },

            /**
             * Runs one poll cycle: refresh via `$wire.refreshRuns()`, then
             * schedule the next cycle (or stop when disabled).
             */
            async tick() {
                // While paused, re-check every 2 s instead of hitting the server.
                if (this.isPaused()) {
                    this.schedule(2_000);

                    return;
                }

                try {
                    await this.$wire.refreshRuns();
                } catch (error) {
                    // A rejection whose status/body/json/errors are all null is
                    // treated as a cancelled request and not logged.
                    const isCancellation = Boolean(
                        error &&
                            typeof error === 'object' &&
                            error.status === null &&
                            error.body === null &&
                            error.json === null &&
                            error.errors === null,
                    );

                    if (!isCancellation) {
                        console.warn('Ops UX widget refreshRuns failed', error);
                    }
                }

                const next = this.nextIntervalMs();

                if (next === null) {
                    this.stop();

                    return;
                }

                this.schedule(next);
            },

            /**
             * Replaces any pending tick with one that fires after `delayMs`.
             *
             * @param {number} [delayMs] delay in ms; null/undefined become 0
             * and negative values are clamped to 0.
             */
            schedule(delayMs) {
                this.stop();

                const delay = Math.max(0, Number(delayMs ?? 0));

                this.timer = setTimeout(() => {
                    // tick() handles its own errors; swallow anything residual
                    // so a failed cycle never surfaces as an unhandled rejection.
                    this.tick().catch(() => {});
                }, delay);
            },
        };
    };
})();
|
||||
@ -1,5 +1,9 @@
|
||||
@import '../../../../vendor/filament/filament/resources/css/theme.css';
|
||||
|
||||
@theme {
|
||||
--radius-2xl: 1rem;
|
||||
}
|
||||
|
||||
@source '../../../../app/Filament/**/*';
|
||||
@source '../../../../resources/views/filament/**/*.blade.php';
|
||||
@source '../../../../resources/views/livewire/**/*.blade.php';
|
||||
|
||||
@ -1,4 +1,8 @@
|
||||
@import '../../../../vendor/filament/filament/resources/css/theme.css';
|
||||
|
||||
@theme {
|
||||
--radius-2xl: 1rem;
|
||||
}
|
||||
|
||||
@source '../../../../app/Filament/System/**/*';
|
||||
@source '../../../../resources/views/filament/system/**/*.blade.php';
|
||||
|
||||
@ -8,21 +8,12 @@
|
||||
$action = is_array($card['action'] ?? null) ? $card['action'] : null;
|
||||
@endphp
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white/90 p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900/80">
|
||||
<div class="flex items-start justify-between gap-3">
|
||||
<div>
|
||||
<div class="text-sm font-semibold text-gray-950 dark:text-white">
|
||||
{{ $card['title'] ?? 'Supporting detail' }}
|
||||
</div>
|
||||
|
||||
@if (filled($card['description'] ?? null))
|
||||
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
|
||||
{{ $card['description'] }}
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
@if ($action !== null && filled($action['url'] ?? null))
|
||||
<x-filament::section
|
||||
:heading="$card['title'] ?? 'Supporting detail'"
|
||||
:description="$card['description'] ?? null"
|
||||
>
|
||||
@if ($action !== null && filled($action['url'] ?? null))
|
||||
<x-slot name="headerEnd">
|
||||
<a
|
||||
href="{{ $action['url'] }}"
|
||||
@if (($action['openInNewTab'] ?? false) === true) target="_blank" rel="noreferrer noopener" @endif
|
||||
@ -30,10 +21,10 @@ class="text-xs font-medium {{ ($action['destructive'] ?? false) === true ? 'text
|
||||
>
|
||||
{{ $action['label'] }}
|
||||
</a>
|
||||
@endif
|
||||
</div>
|
||||
</x-slot>
|
||||
@endif
|
||||
|
||||
<div class="mt-4">
|
||||
<div>
|
||||
@if ($view !== null)
|
||||
{!! view($view, is_array($card['viewData'] ?? null) ? $card['viewData'] : [])->render() !!}
|
||||
@elseif ($items !== [])
|
||||
@ -42,4 +33,4 @@ class="text-xs font-medium {{ ($action['destructive'] ?? false) === true ? 'text
|
||||
@include('filament.infolists.entries.enterprise-detail.empty-state', ['state' => $emptyState])
|
||||
@endif
|
||||
</div>
|
||||
</div>
|
||||
</x-filament::section>
|
||||
|
||||
@ -16,133 +16,4 @@
|
||||
</x-filament::section>
|
||||
|
||||
{{ $this->table }}
|
||||
|
||||
@php
|
||||
$selectedAudit = $this->selectedAuditLog();
|
||||
$selectedAuditLink = $this->selectedAuditLink();
|
||||
@endphp
|
||||
|
||||
@if ($selectedAudit)
|
||||
<x-filament::section
|
||||
:heading="$selectedAudit->summaryText()"
|
||||
:description="$selectedAudit->recorded_at?->toDayDateTimeString()"
|
||||
>
|
||||
<div class="flex flex-col gap-6">
|
||||
<div class="flex flex-wrap items-center gap-3">
|
||||
<span class="inline-flex items-center rounded-full bg-gray-100 px-3 py-1 text-xs font-medium text-gray-700 dark:bg-gray-800 dark:text-gray-200">
|
||||
{{ \App\Support\Badges\BadgeRenderer::label(\App\Support\Badges\BadgeDomain::AuditOutcome)($selectedAudit->normalizedOutcome()->value) }}
|
||||
</span>
|
||||
<span class="inline-flex items-center rounded-full bg-gray-100 px-3 py-1 text-xs font-medium text-gray-700 dark:bg-gray-800 dark:text-gray-200">
|
||||
{{ \App\Support\Badges\BadgeRenderer::label(\App\Support\Badges\BadgeDomain::AuditActorType)($selectedAudit->actorSnapshot()->type->value) }}
|
||||
</span>
|
||||
|
||||
@if (is_array($selectedAuditLink))
|
||||
<a
|
||||
class="inline-flex items-center rounded-lg border border-gray-300 px-3 py-2 text-sm font-medium text-gray-700 transition hover:bg-gray-50 dark:border-gray-700 dark:text-gray-200 dark:hover:bg-gray-900"
|
||||
href="{{ $selectedAuditLink['url'] }}"
|
||||
>
|
||||
{{ $selectedAuditLink['label'] }}
|
||||
</a>
|
||||
@endif
|
||||
|
||||
<button
|
||||
class="inline-flex items-center rounded-lg border border-transparent px-3 py-2 text-sm font-medium text-gray-500 transition hover:text-gray-700 dark:text-gray-400 dark:hover:text-gray-200"
|
||||
type="button"
|
||||
wire:click="clearSelectedAuditLog"
|
||||
>
|
||||
Close details
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div class="grid gap-4 lg:grid-cols-3">
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
Actor
|
||||
</div>
|
||||
<div class="mt-2 text-sm font-medium text-gray-900 dark:text-gray-100">
|
||||
{{ $selectedAudit->actorDisplayLabel() }}
|
||||
</div>
|
||||
<div class="mt-1 text-sm text-gray-600 dark:text-gray-300">
|
||||
{{ $selectedAudit->actorSnapshot()->type->label() }}
|
||||
</div>
|
||||
@if ($selectedAudit->actorSnapshot()->email)
|
||||
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
|
||||
{{ $selectedAudit->actorSnapshot()->email }}
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
Target
|
||||
</div>
|
||||
<div class="mt-2 text-sm font-medium text-gray-900 dark:text-gray-100">
|
||||
{{ $selectedAudit->targetDisplayLabel() ?? 'No target snapshot' }}
|
||||
</div>
|
||||
<div class="mt-1 text-sm text-gray-600 dark:text-gray-300">
|
||||
{{ $selectedAudit->resource_type ? ucfirst(str_replace('_', ' ', $selectedAudit->resource_type)) : 'Workspace event' }}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
Scope
|
||||
</div>
|
||||
<div class="mt-2 text-sm font-medium text-gray-900 dark:text-gray-100">
|
||||
{{ $selectedAudit->tenant?->name ?? 'Workspace-wide event' }}
|
||||
</div>
|
||||
<div class="mt-1 text-sm text-gray-600 dark:text-gray-300">
|
||||
Workspace #{{ $selectedAudit->workspace_id }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid gap-4 lg:grid-cols-2">
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-sm font-semibold text-gray-900 dark:text-gray-100">
|
||||
Readable context
|
||||
</div>
|
||||
|
||||
@if ($selectedAudit->contextItems() === [])
|
||||
<div class="mt-3 text-sm text-gray-600 dark:text-gray-300">
|
||||
No additional context was recorded for this event.
|
||||
</div>
|
||||
@else
|
||||
<dl class="mt-3 space-y-3">
|
||||
@foreach ($selectedAudit->contextItems() as $item)
|
||||
<div>
|
||||
<dt class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
{{ $item['label'] }}
|
||||
</dt>
|
||||
<dd class="mt-1 text-sm text-gray-900 dark:text-gray-100">
|
||||
{{ is_bool($item['value']) ? ($item['value'] ? 'true' : 'false') : $item['value'] }}
|
||||
</dd>
|
||||
</div>
|
||||
@endforeach
|
||||
</dl>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-sm font-semibold text-gray-900 dark:text-gray-100">
|
||||
Technical metadata
|
||||
</div>
|
||||
|
||||
<dl class="mt-3 space-y-3">
|
||||
@foreach ($selectedAudit->technicalMetadata() as $label => $value)
|
||||
<div>
|
||||
<dt class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
{{ $label }}
|
||||
</dt>
|
||||
<dd class="mt-1 break-all text-sm text-gray-900 dark:text-gray-100">
|
||||
{{ $value }}
|
||||
</dd>
|
||||
</div>
|
||||
@endforeach
|
||||
</dl>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</x-filament::section>
|
||||
@endif
|
||||
</x-filament-panels::page>
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
<x-filament-panels::page>
|
||||
@php($lifecycleSummary = $this->lifecycleVisibilitySummary())
|
||||
|
||||
<x-filament::tabs label="Operations tabs">
|
||||
<x-filament::tabs.item
|
||||
:active="$this->activeTab === 'all'"
|
||||
@ -38,5 +40,12 @@
|
||||
</x-filament::tabs.item>
|
||||
</x-filament::tabs>
|
||||
|
||||
@if (($lifecycleSummary['likely_stale'] ?? 0) > 0 || ($lifecycleSummary['reconciled'] ?? 0) > 0)
|
||||
<div class="mb-4 rounded-lg border border-amber-200 bg-amber-50 px-4 py-3 text-sm text-amber-900 dark:border-amber-500/30 dark:bg-amber-500/10 dark:text-amber-100">
|
||||
{{ ($lifecycleSummary['likely_stale'] ?? 0) }} active run(s) are beyond their lifecycle window.
|
||||
{{ ($lifecycleSummary['reconciled'] ?? 0) }} run(s) have already been automatically reconciled.
|
||||
</div>
|
||||
@endif
|
||||
|
||||
{{ $this->table }}
|
||||
</x-filament-panels::page>
|
||||
|
||||
@ -0,0 +1,115 @@
|
||||
@php
|
||||
$selectedAudit = $selectedAudit ?? null;
|
||||
$selectedAuditLink = $selectedAuditLink ?? null;
|
||||
@endphp
|
||||
|
||||
@if ($selectedAudit)
|
||||
<div class="flex flex-col gap-6">
|
||||
<div class="flex flex-wrap items-center gap-3">
|
||||
<span class="inline-flex items-center rounded-full bg-gray-100 px-3 py-1 text-xs font-medium text-gray-700 dark:bg-gray-800 dark:text-gray-200">
|
||||
{{ \App\Support\Badges\BadgeRenderer::label(\App\Support\Badges\BadgeDomain::AuditOutcome)($selectedAudit->normalizedOutcome()->value) }}
|
||||
</span>
|
||||
<span class="inline-flex items-center rounded-full bg-gray-100 px-3 py-1 text-xs font-medium text-gray-700 dark:bg-gray-800 dark:text-gray-200">
|
||||
{{ \App\Support\Badges\BadgeRenderer::label(\App\Support\Badges\BadgeDomain::AuditActorType)($selectedAudit->actorSnapshot()->type->value) }}
|
||||
</span>
|
||||
|
||||
@if (is_array($selectedAuditLink))
|
||||
<a
|
||||
class="inline-flex items-center rounded-lg border border-gray-300 px-3 py-2 text-sm font-medium text-gray-700 transition hover:bg-gray-50 dark:border-gray-700 dark:text-gray-200 dark:hover:bg-gray-900"
|
||||
href="{{ $selectedAuditLink['url'] }}"
|
||||
>
|
||||
{{ $selectedAuditLink['label'] }}
|
||||
</a>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
<div class="grid gap-4 lg:grid-cols-3">
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
Actor
|
||||
</div>
|
||||
<div class="mt-2 text-sm font-medium text-gray-900 dark:text-gray-100">
|
||||
{{ $selectedAudit->actorDisplayLabel() }}
|
||||
</div>
|
||||
<div class="mt-1 text-sm text-gray-600 dark:text-gray-300">
|
||||
{{ $selectedAudit->actorSnapshot()->type->label() }}
|
||||
</div>
|
||||
@if ($selectedAudit->actorSnapshot()->email)
|
||||
<div class="mt-1 text-xs text-gray-500 dark:text-gray-400">
|
||||
{{ $selectedAudit->actorSnapshot()->email }}
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
Target
|
||||
</div>
|
||||
<div class="mt-2 text-sm font-medium text-gray-900 dark:text-gray-100">
|
||||
{{ $selectedAudit->targetDisplayLabel() ?? 'No target snapshot' }}
|
||||
</div>
|
||||
<div class="mt-1 text-sm text-gray-600 dark:text-gray-300">
|
||||
{{ $selectedAudit->resource_type ? ucfirst(str_replace('_', ' ', $selectedAudit->resource_type)) : 'Workspace event' }}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
Scope
|
||||
</div>
|
||||
<div class="mt-2 text-sm font-medium text-gray-900 dark:text-gray-100">
|
||||
{{ $selectedAudit->tenant?->name ?? 'Workspace-wide event' }}
|
||||
</div>
|
||||
<div class="mt-1 text-sm text-gray-600 dark:text-gray-300">
|
||||
Workspace #{{ $selectedAudit->workspace_id }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid gap-4 lg:grid-cols-2">
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-sm font-semibold text-gray-900 dark:text-gray-100">
|
||||
Readable context
|
||||
</div>
|
||||
|
||||
@if ($selectedAudit->contextItems() === [])
|
||||
<div class="mt-3 text-sm text-gray-600 dark:text-gray-300">
|
||||
No additional context was recorded for this event.
|
||||
</div>
|
||||
@else
|
||||
<dl class="mt-3 space-y-3">
|
||||
@foreach ($selectedAudit->contextItems() as $item)
|
||||
<div>
|
||||
<dt class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
{{ $item['label'] }}
|
||||
</dt>
|
||||
<dd class="mt-1 text-sm text-gray-900 dark:text-gray-100">
|
||||
{{ is_bool($item['value']) ? ($item['value'] ? 'true' : 'false') : $item['value'] }}
|
||||
</dd>
|
||||
</div>
|
||||
@endforeach
|
||||
</dl>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
<div class="rounded-2xl border border-gray-200 bg-white p-4 shadow-sm dark:border-gray-800 dark:bg-gray-900">
|
||||
<div class="text-sm font-semibold text-gray-900 dark:text-gray-100">
|
||||
Technical metadata
|
||||
</div>
|
||||
|
||||
<dl class="mt-3 space-y-3">
|
||||
@foreach ($selectedAudit->technicalMetadata() as $label => $value)
|
||||
<div>
|
||||
<dt class="text-xs font-semibold uppercase tracking-wide text-gray-500 dark:text-gray-400">
|
||||
{{ $label }}
|
||||
</dt>
|
||||
<dd class="mt-1 break-all text-sm text-gray-900 dark:text-gray-100">
|
||||
{{ $value }}
|
||||
</dd>
|
||||
</div>
|
||||
@endforeach
|
||||
</dl>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@endif
|
||||
@ -1,6 +1,7 @@
|
||||
@php
|
||||
$contextBanner = $this->canonicalContextBanner();
|
||||
$blockedBanner = $this->blockedExecutionBanner();
|
||||
$lifecycleBanner = $this->lifecycleBanner();
|
||||
$pollInterval = $this->pollInterval();
|
||||
@endphp
|
||||
|
||||
@ -37,6 +38,20 @@
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@if ($lifecycleBanner !== null)
|
||||
@php
|
||||
$lifecycleBannerClasses = match ($lifecycleBanner['tone']) {
|
||||
'rose' => 'border-rose-200 bg-rose-50 text-rose-900 dark:border-rose-500/30 dark:bg-rose-500/10 dark:text-rose-100',
|
||||
default => 'border-amber-200 bg-amber-50 text-amber-900 dark:border-amber-500/30 dark:bg-amber-500/10 dark:text-amber-100',
|
||||
};
|
||||
@endphp
|
||||
|
||||
<div class="mb-6 rounded-lg border px-4 py-3 text-sm {{ $lifecycleBannerClasses }}">
|
||||
<p class="font-semibold">{{ $lifecycleBanner['title'] }}</p>
|
||||
<p class="mt-1">{{ $lifecycleBanner['body'] }}</p>
|
||||
</div>
|
||||
@endif
|
||||
|
||||
@if ($this->redactionIntegrityNote())
|
||||
<div class="mb-6 rounded-lg border border-amber-200 bg-amber-50 px-4 py-3 text-sm text-amber-900 dark:border-amber-500/30 dark:bg-amber-500/10 dark:text-amber-100">
|
||||
{{ $this->redactionIntegrityNote() }}
|
||||
|
||||
@ -1,2 +1,3 @@
|
||||
<script defer src="{{ asset('js/tenantpilot/livewire-intercept-shim.js') }}"></script>
|
||||
<script defer src="{{ asset('js/tenantpilot/filament-sidebar-store-fallback.js') }}"></script>
|
||||
<script src="{{ asset('js/tenantpilot/livewire-intercept-shim.js') }}"></script>
|
||||
<script src="{{ asset('js/tenantpilot/filament-sidebar-store-fallback.js') }}"></script>
|
||||
<script src="{{ asset('js/tenantpilot/ops-ux-progress-widget-poller.js') }}"></script>
|
||||
|
||||
@ -50,173 +50,3 @@ class="block rounded-lg bg-white/90 dark:bg-gray-800/90 px-4 py-2 text-center te
|
||||
</div>
|
||||
@endif
|
||||
</div>
|
||||
|
||||
<script>
|
||||
/**
 * Alpine component factory for the Ops UX progress widget poller.
 *
 * The returned object repeatedly calls the Livewire action `refreshRuns()` on
 * the owning component, using an adaptive delay:
 *   - no polling at all once `$wire.disabled === true`,
 *   - 30 s "discovery" polling while `$wire.hasActiveRuns !== true`,
 *   - 1 s / 5 s / 10 s backoff while runs are active (by age of the active phase),
 *   - a 2 s re-check while paused (hidden tab, open modal, detached element).
 *
 * Registered once on `window` (`??=`) so repeated renders reuse the same factory.
 */
window.opsUxProgressWidgetPoller ??= function opsUxProgressWidgetPoller() {
    return {
        timer: null,
        activeSinceMs: null,
        fastUntilMs: null,
        teardownObserver: null,

        // Set by destroy(). Guards against an in-flight tick() re-arming the
        // timer after teardown, which would otherwise poll forever at the
        // "paused" cadence because the element is no longer connected.
        destroyed: false,

        /** Attach listeners, start the teardown observer and run the first sync. */
        init() {
            this.destroyed = false;

            // `visibilitychange` is dispatched at the document; listening there
            // (instead of relying on it bubbling to window) works in all browsers.
            this.onVisibilityChange = this.onVisibilityChange.bind(this);
            document.addEventListener('visibilitychange', this.onVisibilityChange);

            this.onNavigated = this.onNavigated.bind(this);
            window.addEventListener('livewire:navigated', this.onNavigated);

            // Ensure we always detach listeners when Livewire/Filament removes this
            // element (for example during modal unmounts or navigation/morph).
            this.teardownObserver = new MutationObserver(() => {
                if (!this.$el || this.$el.isConnected !== true) {
                    this.destroy();
                }
            });

            this.teardownObserver.observe(document.body, { childList: true, subtree: true });

            // First sync immediately.
            this.schedule(0);
        },

        /** Stop polling permanently and release every listener/observer. */
        destroy() {
            this.destroyed = true;

            this.stop();
            document.removeEventListener('visibilitychange', this.onVisibilityChange);
            window.removeEventListener('livewire:navigated', this.onNavigated);

            if (this.teardownObserver) {
                this.teardownObserver.disconnect();
                this.teardownObserver = null;
            }
        },

        /** Cancel the pending tick, if any. */
        stop() {
            if (this.timer) {
                clearTimeout(this.timer);
                this.timer = null;
            }
        },

        /** True while any modal dialog is open in the document. */
        isModalOpen() {
            return document.querySelector('[role="dialog"][aria-modal="true"]') !== null;
        },

        /** True when polling should be skipped: hidden tab, open modal, or detached element. */
        isPaused() {
            if (document.hidden === true) {
                return true;
            }

            if (this.isModalOpen()) {
                return true;
            }

            if (!this.$el || this.$el.isConnected !== true) {
                return true;
            }

            return false;
        },

        /** Resume with an immediate sync when the tab becomes visible again. */
        onVisibilityChange() {
            if (!this.isPaused()) {
                this.schedule(0);
            }
        },

        /** Resume with an immediate sync after a Livewire navigation. */
        onNavigated() {
            if (!this.isPaused()) {
                this.schedule(0);
            }
        },

        /** Whole seconds since active runs were first observed (0 when none are tracked). */
        activeAgeSeconds() {
            if (this.activeSinceMs === null) {
                return 0;
            }

            return Math.floor((Date.now() - this.activeSinceMs) / 1000);
        },

        /**
         * Delay until the next poll in milliseconds, or null to stop polling.
         * Also maintains `activeSinceMs` as a side effect.
         */
        nextIntervalMs() {
            // Stop polling entirely if server says this component is disabled.
            if (this.$wire?.disabled === true) {
                return null;
            }

            // Discovery polling.
            if (this.$wire?.hasActiveRuns !== true) {
                this.activeSinceMs = null;
                return 30_000;
            }

            // Active polling backoff.
            if (this.activeSinceMs === null) {
                this.activeSinceMs = Date.now();
            }

            const now = Date.now();
            if (this.fastUntilMs && now < this.fastUntilMs) {
                return 1_000;
            }

            const age = this.activeAgeSeconds();
            if (age < 10) {
                return 1_000;
            }

            if (age < 60) {
                return 5_000;
            }

            return 10_000;
        },

        /** Run one poll cycle and schedule the next one. */
        async tick() {
            if (this.destroyed) {
                return;
            }

            if (this.isPaused()) {
                // Keep it calm: check again later.
                this.schedule(2_000);
                return;
            }

            try {
                await this.$wire.refreshRuns();
            } catch (error) {
                // Livewire will reject pending action promises when requests are
                // cancelled (for example during `wire:navigate`). That's expected
                // for this background poller, so we swallow cancellation rejections.
                const isCancellation = Boolean(
                    error &&
                        typeof error === 'object' &&
                        error.status === null &&
                        error.body === null &&
                        error.json === null &&
                        error.errors === null,
                );

                if (!isCancellation) {
                    console.warn('Ops UX widget refreshRuns failed', error);
                }
            }

            // The component may have been torn down while the request was in
            // flight; do not restart the timer in that case.
            if (this.destroyed) {
                return;
            }

            const next = this.nextIntervalMs();
            if (next === null) {
                this.stop();
                return;
            }

            this.schedule(next);
        },

        /** Replace any pending tick with one that fires after `delayMs` (clamped to >= 0). */
        schedule(delayMs) {
            this.stop();

            if (this.destroyed) {
                return;
            }

            const delay = Math.max(0, Number(delayMs ?? 0));

            this.timer = setTimeout(() => {
                this.tick().catch(() => {});
            }, delay);
        },
    };
};
|
||||
</script>
|
||||
|
||||
@ -29,6 +29,11 @@
|
||||
->name(ReconcileAdapterRunsJob::class)
|
||||
->withoutOverlapping();
|
||||
|
||||
Schedule::command('tenantpilot:operation-runs:reconcile')
|
||||
->everyFiveMinutes()
|
||||
->name('tenantpilot:operation-runs:reconcile')
|
||||
->withoutOverlapping();
|
||||
|
||||
Schedule::command('stored-reports:prune')
|
||||
->daily()
|
||||
->name('stored-reports:prune')
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
# Specification Quality Checklist: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
**Purpose**: Validate specification completeness and quality before proceeding to planning
|
||||
**Created**: 2026-03-23
|
||||
**Feature**: [spec.md](../spec.md)
|
||||
|
||||
## Content Quality
|
||||
|
||||
- [x] No implementation details (languages, frameworks, APIs)
|
||||
- [x] Focused on user value and business needs
|
||||
- [x] Written for non-technical stakeholders
|
||||
- [x] All mandatory sections completed
|
||||
|
||||
## Requirement Completeness
|
||||
|
||||
- [x] No [NEEDS CLARIFICATION] markers remain
|
||||
- [x] Requirements are testable and unambiguous
|
||||
- [x] Success criteria are measurable
|
||||
- [x] Success criteria are technology-agnostic (no implementation details)
|
||||
- [x] All acceptance scenarios are defined
|
||||
- [x] Edge cases are identified
|
||||
- [x] Scope is clearly bounded
|
||||
- [x] Dependencies and assumptions identified
|
||||
|
||||
## Feature Readiness
|
||||
|
||||
- [x] All functional requirements have clear acceptance criteria
|
||||
- [x] User scenarios cover primary flows
|
||||
- [x] Feature meets measurable outcomes defined in Success Criteria
|
||||
- [x] No implementation details leak into specification
|
||||
|
||||
## Notes
|
||||
|
||||
- Validation pass completed on 2026-03-23.
|
||||
- No open clarification markers remain.
|
||||
- The spec intentionally names domain objects already established in the product, but avoids prescribing implementation structure.
|
||||
@ -0,0 +1,171 @@
|
||||
# Operation Run Lifecycle Monitoring contract (OpenAPI 3.1).
#
# Describes the list and detail read endpoints for operation runs, including
# the derived `freshness_state` and the optional reconciliation record.
# Nullable fields use JSON Schema type arrays (`[<type>, 'null']`) because the
# OpenAPI 3.0 `nullable` keyword does not exist in 3.1.
openapi: 3.1.0
info:
  title: Operation Run Lifecycle Monitoring Contract
  version: 0.1.0
  summary: Canonical Monitoring data contract for lifecycle freshness and reconciliation semantics on covered OperationRun records.
servers:
  - url: https://tenantpilot.local
paths:
  /api/admin/operations:
    get:
      summary: List operation runs with lifecycle freshness semantics
      operationId: listOperationRuns
      parameters:
        - in: query
          name: status
          schema:
            type: string
            enum: [queued, running, completed]
        - in: query
          name: outcome
          schema:
            type: string
            enum: [pending, succeeded, partially_succeeded, blocked, failed]
        - in: query
          name: type
          schema:
            type: string
        - in: query
          name: tenant_id
          schema:
            type: integer
        - in: query
          name: freshness_state
          schema:
            type: string
            enum: [fresh_active, likely_stale, reconciled_failed, terminal_normal]
        - in: query
          name: reconciled
          schema:
            type: boolean
      responses:
        '200':
          description: Paginated collection of operation runs
          content:
            application/json:
              schema:
                type: object
                required: [data]
                properties:
                  data:
                    type: array
                    items:
                      $ref: '#/components/schemas/OperationRunListItem'
  /api/admin/operations/{operationRun}:
    get:
      summary: Get one operation run with lifecycle reconciliation detail
      operationId: getOperationRun
      parameters:
        - in: path
          name: operationRun
          required: true
          schema:
            type: integer
      responses:
        '200':
          description: Operation run detail
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/OperationRunDetail'
        '404':
          description: Run not found or caller not entitled to the run scope
components:
  schemas:
    OperationRunListItem:
      type: object
      required:
        - id
        - type
        - status
        - outcome
        - freshness_state
        - reconciled
        - created_at
      properties:
        id:
          type: integer
        type:
          type: string
        type_label:
          type: string
        status:
          type: string
          enum: [queued, running, completed]
        outcome:
          type: string
          enum: [pending, succeeded, partially_succeeded, blocked, failed]
        freshness_state:
          type: string
          enum: [fresh_active, likely_stale, reconciled_failed, terminal_normal]
        reconciled:
          type: boolean
        tenant_id:
          type: [integer, 'null']
        initiator_name:
          type: [string, 'null']
        operator_message:
          type: string
          description: Operator-safe summary line shown by default on Monitoring surfaces.
        created_at:
          type: string
          format: date-time
        started_at:
          type: [string, 'null']
          format: date-time
        completed_at:
          type: [string, 'null']
          format: date-time
    OperationRunDetail:
      allOf:
        - $ref: '#/components/schemas/OperationRunListItem'
        - type: object
          properties:
            summary_counts:
              type: object
              additionalProperties:
                type: number
            failure_summary:
              type: array
              items:
                $ref: '#/components/schemas/FailureSummaryItem'
            reconciliation:
              $ref: '#/components/schemas/ReconciliationRecord'
            diagnostics:
              type: object
              description: Secondary lifecycle evidence shown only when diagnostics are revealed.
              properties:
                threshold_seconds:
                  type: integer
                evidence:
                  type: object
                  additionalProperties: true
    FailureSummaryItem:
      type: object
      required: [code, message]
      properties:
        code:
          type: string
        message:
          type: string
    ReconciliationRecord:
      # Null when the run has not been reconciled.
      type: [object, 'null']
      properties:
        reconciled_at:
          type: string
          format: date-time
        source:
          type: string
          enum: [failed_callback, scheduled_reconciler, adapter_reconciler]
        reason_code:
          type: string
        reason_message:
          type: string
        evidence:
          type: object
          additionalProperties: true
|
||||
199
specs/160-operation-lifecycle-guarantees/data-model.md
Normal file
199
specs/160-operation-lifecycle-guarantees/data-model.md
Normal file
@ -0,0 +1,199 @@
|
||||
# Phase 1 Data Model: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
## Overview
|
||||
|
||||
This feature does not require a new database table in the first implementation slice. The primary data-model work is the formalization of existing `OperationRun` persistence plus new derived lifecycle-policy and freshness concepts that make queue truth, domain truth, and operator-visible truth converge deterministically.
|
||||
|
||||
## Persistent Domain Entities
|
||||
|
||||
### OperationRun
|
||||
|
||||
**Purpose**: Canonical workspace-scoped operational record for long-running, queued, scheduled, or otherwise operator-visible work.
|
||||
|
||||
**Key fields**:
|
||||
- `id`
|
||||
- `workspace_id`
|
||||
- `tenant_id` nullable
|
||||
- `user_id` nullable
|
||||
- `type`
|
||||
- `status` with canonical values `queued`, `running`, `completed`
|
||||
- `outcome` with canonical values including `pending`, `succeeded`, `partially_succeeded`, `blocked`, `failed`
|
||||
- `run_identity_hash`
|
||||
- `summary_counts` JSONB
|
||||
- `failure_summary` JSONB array
|
||||
- `context` JSONB
|
||||
- `started_at`
|
||||
- `completed_at`
|
||||
- `created_at`
|
||||
- `updated_at`
|
||||
|
||||
**Relationships**:
|
||||
- Belongs to one workspace
|
||||
- Optionally belongs to one tenant
|
||||
- Optionally belongs to one initiating user
|
||||
|
||||
**Validation rules relevant to this feature**:
|
||||
- `status` and `outcome` transitions remain service-owned via `OperationRunService`.
|
||||
- Non-terminal active runs remain constrained by the existing active-run unique index semantics.
|
||||
- `summary_counts` keys remain from the canonical summary key catalog and values remain numeric-only.
|
||||
- Reconciliation metadata must be stored in a standardized structure inside `context` and `failure_summary` without persisting secrets.
|
||||
|
||||
**State transitions relevant to this feature**:
|
||||
- `queued/pending` → `running/pending`
|
||||
- `queued/pending` → `completed/failed` when stale queued reconciliation or direct queue-failure bridging resolves an orphaned queued run
|
||||
- `running/pending` → `completed/succeeded|partially_succeeded|blocked|failed`
|
||||
- `running/pending` → `completed/failed` when stale running reconciliation resolves an orphaned active run
|
||||
- `completed/*` is terminal and must never be mutated by reconciliation
|
||||
|
||||
### Failed Job Record (`failed_jobs`)
|
||||
|
||||
**Purpose**: Infrastructure-level evidence that a queued job exhausted attempts, timed out, or otherwise failed.
|
||||
|
||||
**Key fields used conceptually**:
|
||||
- UUID or failed-job identifier
|
||||
- connection
|
||||
- queue
|
||||
- payload
|
||||
- exception
|
||||
- failed_at
|
||||
|
||||
**Relationships**:
|
||||
- Not directly related through a foreign key to `OperationRun`
|
||||
- Linked back to `OperationRun` through job-owned identity resolution or reconciliation evidence
|
||||
|
||||
**Validation rules relevant to this feature**:
|
||||
- A failed-job record is evidence, not operator-facing truth by itself.
|
||||
- Evidence may inform reconciliation or diagnostics but must not replace the domain transition on `OperationRun`.
|
||||
|
||||
### Queue Job Definition
|
||||
|
||||
**Purpose**: The queued class that owns or advances a covered `OperationRun`.
|
||||
|
||||
**Key lifecycle-relevant properties**:
|
||||
- `operationRun` reference or `getOperationRun()` contract
|
||||
- optional `$timeout`
|
||||
- optional `$failOnTimeout`
|
||||
- optional `$tries` or `retryUntil()`
|
||||
- `middleware()` including `TrackOperationRun` and other queue middleware
|
||||
- optional `failed(Throwable $e)` callback
|
||||
|
||||
**Validation rules relevant to this feature**:
|
||||
- Covered jobs must provide a credible path to terminal truth through direct failure bridging, fallback reconciliation, or both.
|
||||
- Covered long-running jobs must have intentional timeout behavior and must be compatible with queue timing invariants.
|
||||
|
||||
## New Derived Domain Objects
|
||||
|
||||
### OperationLifecyclePolicy
|
||||
|
||||
**Purpose**: Configuration-backed policy describing which operation types are in scope for V1 and how their lifecycle should be evaluated.
|
||||
|
||||
**Fields**:
|
||||
- `operationType`
|
||||
- `covered` boolean
|
||||
- `queuedStaleAfterSeconds`
|
||||
- `runningStaleAfterSeconds`
|
||||
- `expectedMaxRuntimeSeconds`
|
||||
- `requiresDirectFailedBridge` boolean
|
||||
- `supportsReconciliation` boolean
|
||||
|
||||
**Validation rules**:
|
||||
- `queuedStaleAfterSeconds` and `runningStaleAfterSeconds` must be positive integers.
|
||||
- `expectedMaxRuntimeSeconds` must stay below the effective queue `retry_after` value, leaving a safety margin.
|
||||
- Only covered operation types participate in the generic reconciler for V1.
|
||||
|
||||
### OperationRunFreshnessAssessment
|
||||
|
||||
**Purpose**: Derived classification used by reconcilers and Monitoring surfaces to determine whether a non-terminal run is still trustworthy as active.
|
||||
|
||||
**Fields**:
|
||||
- `operationRunId`
|
||||
- `status`
|
||||
- `freshnessState` with canonical values `fresh`, `likely_stale`, `terminal`, `unknown`
|
||||
- `evaluatedAt`
|
||||
- `thresholdSeconds`
|
||||
- `evidence` key-value map
|
||||
|
||||
**Behavior**:
|
||||
- For `queued` runs, freshness is typically derived from `created_at`, absence of `started_at`, and policy threshold.
|
||||
- For `running` runs, freshness is derived from `started_at`, last meaningful update evidence available in persisted state, and policy threshold.
|
||||
- `completed` runs always assess as terminal and are excluded from stale reconciliation.
|
||||
|
||||
### LifecycleReconciliationRecord
|
||||
|
||||
**Purpose**: Structured reconciliation evidence stored in `OperationRun.context` and mirrored in `failure_summary`.
|
||||
|
||||
**Fields**:
|
||||
- `reconciledAt`
|
||||
- `reconciliationKind` such as `stale_queued`, `stale_running`, `queue_failure_bridge`, `adapter_sync`
|
||||
- `reasonCode`
|
||||
- `reasonMessage`
|
||||
- `evidence` key-value map
|
||||
- `source` such as `failed_callback`, `scheduled_reconciler`, `adapter_reconciler`
|
||||
|
||||
**Validation rules**:
|
||||
- Must only be added when the feature force-resolves or directly bridges a run.
|
||||
- Must be idempotent; repeat reconciliation must not append conflicting terminal truth.
|
||||
- Must be operator-safe and sanitized.
|
||||
|
||||
### OperationQueueFailureBridge
|
||||
|
||||
**Purpose**: Derived mapping between a queued job failure and the owning `OperationRun`.
|
||||
|
||||
**Fields**:
|
||||
- `operationRunId`
|
||||
- `jobClass`
|
||||
- `bridgeSource` such as `failed_callback` or `reconciler`
|
||||
- `exceptionClass`
|
||||
- `reasonCode`
|
||||
- `terminalOutcome`
|
||||
|
||||
**Behavior**:
|
||||
- Exists conceptually as a design contract, not necessarily as a standalone stored table.
|
||||
- Bridges queue truth into service-owned `OperationRun` terminal transitions.
|
||||
|
||||
## Supporting Catalogs
|
||||
|
||||
### Reconciliation Reason Codes
|
||||
|
||||
**Purpose**: Stable reason-code catalog for lifecycle healing.
|
||||
|
||||
**Initial values**:
|
||||
- `run.stale_queued`
|
||||
- `run.stale_running`
|
||||
- `run.infrastructure_timeout_or_abandonment`
|
||||
- `run.queue_failure_bridge`
|
||||
- `run.adapter_out_of_sync`
|
||||
|
||||
**Validation rules**:
|
||||
- Operator-facing text must be derived from centralized presenters or reason translation helpers.
|
||||
- Codes remain stable enough for regression assertions and audit review.
|
||||
|
||||
### Monitoring Freshness State
|
||||
|
||||
**Purpose**: Derived presentation state for Operations surfaces.
|
||||
|
||||
**Initial values**:
|
||||
- `fresh_active`
|
||||
- `likely_stale`
|
||||
- `reconciled_failed`
|
||||
- `terminal_normal`
|
||||
|
||||
**Behavior**:
|
||||
- Not stored as a new top-level database enum in V1.
|
||||
- Derived centrally so tables, detail pages, and notifications do not drift.
|
||||
|
||||
## Consumer Mapping
|
||||
|
||||
| Consumer | Primary data it needs |
|
||||
|---|---|
|
||||
| Generic lifecycle reconciler | Covered operation policy, active non-terminal runs, freshness assessment, standardized reconciliation transition |
|
||||
| Covered queued jobs | Owning `OperationRun`, timeout behavior, direct `failed()` bridge path |
|
||||
| Operations index | Current status, outcome, freshness assessment, reconciliation evidence summary |
|
||||
| Operation run detail | Full reconciliation record, translated reason, run timing, summary counts, failure details |
|
||||
| Runtime invariant validation | Queue connection `retry_after`, effective job timeout, covered operation lifecycle policy |
|
||||
|
||||
## Migration Notes
|
||||
|
||||
- No schema migration is required for the first implementation slice.
|
||||
- Existing `context` and `failure_summary` structures should be normalized for reconciliation evidence rather than replaced.
|
||||
- If later observability needs require indexed reconciliation metrics, a follow-up slice can promote reconciliation metadata into first-class columns or projections.
|
||||
212
specs/160-operation-lifecycle-guarantees/plan.md
Normal file
212
specs/160-operation-lifecycle-guarantees/plan.md
Normal file
@ -0,0 +1,212 @@
|
||||
# Implementation Plan: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
**Branch**: `160-operation-lifecycle-guarantees` | **Date**: 2026-03-23 | **Spec**: [specs/160-operation-lifecycle-guarantees/spec.md](./spec.md)
|
||||
**Input**: Feature specification from `/specs/160-operation-lifecycle-guarantees/spec.md`
|
||||
|
||||
**Note**: This template is filled in by the `/speckit.plan` command. See `.specify/scripts/` for helper scripts.
|
||||
|
||||
## Summary
|
||||
|
||||
Guarantee eventual terminal truth for covered queued `OperationRun` executions by hardening three seams that already exist but are incomplete: service-owned lifecycle transitions, queue-failure bridging, and stale-run healing. The implementation will keep the existing `queued` → `running` → `completed` lifecycle model, extend `OperationRunService` with generic stale-running reconciliation and structured reconciliation metadata, introduce a configuration-backed lifecycle policy for covered operation types and timing thresholds, add reusable failed-job bridging for priority queued jobs, and expose freshness or reconciliation semantics on the existing Monitoring surfaces without changing the overall Operations information architecture.
|
||||
|
||||
This is a reliability-hardening feature, not a queue-platform rewrite. The plan therefore avoids a new orchestration subsystem, avoids a second run-state model, and avoids a resumability design. Instead it builds on current repo seams such as `OperationRunService`, `TrackOperationRun`, the existing restore adapter reconciler, the backup-schedule reconcile command, `OperationUxPresenter`, centralized badge rendering, and the canonical Monitoring pages.
|
||||
|
||||
## Technical Context
|
||||
|
||||
**Language/Version**: PHP 8.4.15
|
||||
**Primary Dependencies**: Laravel 12, Filament 5, Livewire 4, Pest 4, Laravel queue workers, existing `OperationRunService`, `TrackOperationRun`, `OperationUxPresenter`, `ReasonPresenter`, `BadgeCatalog` domain badges, and current Operations Monitoring pages
|
||||
**Storage**: PostgreSQL for `operation_runs`, `jobs`, and `failed_jobs`; JSONB-backed `context`, `summary_counts`, and `failure_summary`; configuration in `config/queue.php` and `config/tenantpilot.php`
|
||||
**Testing**: Pest 4 feature, unit, Filament or Livewire-focused tests, and focused queue interaction tests run through Laravel Sail
|
||||
**Target Platform**: Laravel Sail web application serving the Filament admin panel and queued worker processes
|
||||
**Project Type**: Laravel monolith web application
|
||||
**Performance Goals**: Keep Monitoring render DB-only; ensure reconciliation scans active runs without material query explosion; guarantee deterministic convergence to terminal truth within scheduler cadence; avoid regressions in active-run list responsiveness
|
||||
**Constraints**: Preserve service-owned `OperationRun` transitions, existing `queued`/`running`/`completed` statuses, existing outcome enums, and exactly-three-surface Ops-UX feedback; keep `retry_after` greater than job timeouts with safety margin; no new external calls during Monitoring render; no new panel provider or asset pipeline changes; destructive operations remain confirmation-backed under their originating features
|
||||
**Scale/Scope**: V1 covers the exact operator-visible queued run types `baseline_capture`, `baseline_compare`, `inventory_sync`, `policy.sync`, `policy.sync_one`, `entra_group_sync`, `directory_role_definitions.sync`, `backup_schedule_run`, `restore.execute`, `tenant.review_pack.generate`, `tenant.review.compose`, and `tenant.evidence.snapshot.generate`, plus the shared queue-lifecycle infrastructure those types depend on
|
||||
|
||||
## Constitution Check
|
||||
|
||||
*GATE: Must pass before Phase 0 research. Re-check after Phase 1 design.*
|
||||
|
||||
**Pre-Phase 0 Gate: PASS**
|
||||
|
||||
- Inventory-first: PASS. The feature does not alter inventory-versus-snapshot ownership and only hardens operational truth for queued work that already exists.
|
||||
- Read/write separation: PASS. No new remote write workflow is introduced. Existing write-capable operations such as restore keep their preview, confirmation, audit, and authorization requirements from their originating specs.
|
||||
- Graph contract path: PASS. No new Microsoft Graph contract or endpoint is introduced by this feature.
|
||||
- Deterministic capabilities: PASS. Existing operation start permissions and any dangerous follow-up actions remain tied to the canonical capability registry.
|
||||
- RBAC-UX planes: PASS. This feature stays in the admin `/admin` plane. Cross-plane behavior remains unchanged. Canonical Monitoring routes remain tenant-safe.
|
||||
- Workspace isolation: PASS. `OperationRun` remains workspace-scoped with optional tenant linkage, and workspace-level Monitoring continues to authorize from the run and its related workspace.
|
||||
- Tenant isolation: PASS. Tenant-linked runs shown on canonical Monitoring routes still require tenant entitlement and must remain 404 for non-entitled users.
|
||||
- Destructive confirmation: PASS. No new destructive action is added in V1. Existing destructive actions such as restore remain `->action(...)->requiresConfirmation()` and server-authorized.
|
||||
- Global search safety: PASS. This feature does not introduce or alter globally searchable resources. Existing global-search page contracts remain unchanged.
|
||||
- Run observability and Ops-UX: PASS WITH REQUIRED HARDENING. The feature strengthens the existing `OperationRun` contract instead of bypassing it. Start surfaces remain enqueue-only. Monitoring remains DB-only. Terminal truth remains service-owned.
|
||||
- Ops-UX lifecycle ownership: PASS. Research confirms `OperationRunService` is already the canonical transition path and the plan preserves that boundary.
|
||||
- Ops-UX summary counts: PASS. No new summary-count shape is introduced; numeric-only summary rules remain unchanged.
|
||||
- Ops-UX guards: PASS WITH REQUIRED TEST UPDATES. The feature must extend regression guards to cover reconciliation transitions and failed-job bridging without permitting direct status mutation.
|
||||
- Ops-UX system runs: PASS. Initiator-null behavior remains unchanged; reconciled system runs remain auditable via Monitoring without terminal DB notification fan-out.
|
||||
- Automation and idempotency: PASS WITH REQUIRED EXPANSION. Existing active-run dedupe and `WithoutOverlapping` patterns can be reused, but the plan adds generic stale reconciliation and queue-failure bridging for more run types.
|
||||
- Badge semantics (BADGE-001): PASS WITH REQUIRED CENTRALIZATION. Freshness and reconciliation display must be derived through centralized presenters or badges rather than ad hoc table mappings.
|
||||
- UI naming (UI-NAMING-001): PASS. Operator-facing copy will use domain terms such as `stale`, `reconciled`, and `infrastructure failure`; low-level queue exceptions remain diagnostics-only.
|
||||
- Operator surfaces (OPSURF-001): PASS. The Operations index and run detail remain the canonical operator-first surfaces. Diagnostics stay secondary. No new dangerous UI workflow is introduced.
|
||||
- Filament Action Surface Contract: PASS. The feature changes semantics on existing Monitoring surfaces, not their action inventory.
|
||||
- Filament UX-001: PASS. Existing Operations pages remain in place and only gain new status semantics and diagnostics.
|
||||
- Livewire and Filament version safety: PASS. The plan remains Livewire v4 compliant and does not change Filament v5 panel registration in `bootstrap/providers.php`.
|
||||
- Asset strategy: PASS. No new global or panel-specific assets are planned, so deployment steps for `filament:assets` remain unchanged.
|
||||
|
||||
## Project Structure
|
||||
|
||||
### Documentation (this feature)
|
||||
|
||||
```text
|
||||
specs/160-operation-lifecycle-guarantees/
|
||||
├── plan.md
|
||||
├── research.md
|
||||
├── data-model.md
|
||||
├── quickstart.md
|
||||
├── contracts/
|
||||
│ └── operation-run-lifecycle.openapi.yaml
|
||||
└── tasks.md
|
||||
```
|
||||
|
||||
### Source Code (repository root)
|
||||
|
||||
```text
|
||||
app/
|
||||
├── Console/
|
||||
│ └── Commands/
|
||||
├── Filament/
|
||||
│ ├── Pages/
|
||||
│ │ └── Monitoring/
|
||||
│ ├── Resources/
|
||||
│ │ └── OperationRunResource.php
|
||||
│ └── Widgets/
|
||||
│ └── Operations/
|
||||
├── Jobs/
|
||||
│ └── Middleware/
|
||||
├── Models/
|
||||
│ └── OperationRun.php
|
||||
├── Notifications/
|
||||
├── Policies/
|
||||
│ └── OperationRunPolicy.php
|
||||
├── Services/
|
||||
│ ├── Operations/
|
||||
│ ├── Providers/
|
||||
│ ├── SystemConsole/
|
||||
│ └── OperationRunService.php
|
||||
└── Support/
|
||||
├── Badges/
|
||||
├── OpsUx/
|
||||
├── OperationRun*.php
|
||||
└── ReasonTranslation/
|
||||
config/
|
||||
├── queue.php
|
||||
└── tenantpilot.php
|
||||
routes/
|
||||
└── web.php
|
||||
tests/
|
||||
├── Feature/
|
||||
│ ├── Operations/
|
||||
│ ├── Filament/
|
||||
│ └── Rbac/
|
||||
└── Unit/
|
||||
├── Jobs/
|
||||
├── Operations/
|
||||
└── Support/
|
||||
```
|
||||
|
||||
**Structure Decision**: Use the existing Laravel monolith and strengthen the current operational support layer rather than introducing a separate orchestration subsystem. The implementation seam stays centered on `OperationRunService`, queue job classes and middleware under `app/Jobs`, generic reconciliation services or commands under `app/Services` and `app/Console/Commands`, and semantic presentation on the existing Monitoring pages, presenters, and badges.
|
||||
|
||||
## Phase 0 Research Summary
|
||||
|
||||
- `TrackOperationRun` correctly transitions a run to `running`, catches in-handle exceptions, and marks success when the job returns cleanly, but it does not protect against failures that happen before middleware entry or after infrastructure has already declared the job dead.
|
||||
- `OperationRunService` already owns transitions and already has reusable stale-queued detection via `isStaleQueuedRun()` plus `failStaleQueuedRun()`, but there is no generic stale-running detection or stale-running fail helper.
|
||||
- The repo already has two partial reconciliation patterns: `TenantpilotReconcileBackupScheduleOperationRuns` for `backup_schedule_run` and `AdapterRunReconciler` plus `ops:reconcile-adapter-runs` for `restore.execute`. Both prove the repo accepts DB-driven healing, but both are too type-specific for Spec 160.
|
||||
- Research confirmed that only one queued job currently implements a `failed(Throwable $e)` bridge: `BulkBackupSetRestoreJob`. Priority operation jobs such as baseline capture, baseline compare, inventory sync, policy sync, restore execution, and tenant review composition do not yet bridge failed queue truth back to `OperationRun` through `failed()`.
|
||||
- Queue timing defaults currently use `retry_after = 600` seconds for the database, Redis, and Beanstalkd connections, while at least some long-running jobs declare `public int $timeout = 300;`. The repo does not yet centralize or validate the timing invariant per covered operation type.
|
||||
- Laravel queue documentation for version 12 confirms the relevant design rules: `failed()` is invoked when a job exhausts its attempts or times out with `failOnTimeout` enabled (a timeout otherwise consumes an attempt and releases the job for retry), the exception may be `MaxAttemptsExceededException` or `TimeoutExceededException`, attempts can be consumed without `handle()` executing, and job timeout must remain shorter than `retry_after` to avoid duplicate or orphaned processing.
|
||||
- Existing operator-facing Monitoring already has reusable seams for safe UI semantics: `OperationUxPresenter`, `ReasonPresenter`, `OperationRunStatusBadge`, `OperationRunOutcomeBadge`, and canonical Operations pages. These should be extended rather than bypassed.
|
||||
- Existing `OperationRunTriageService` already proves the repo is willing to store triage metadata in `context['triage']` and keep service-owned terminal transitions. The same pattern can be reused for reconciliation metadata without requiring a schema change in the first slice.
|
||||
|
||||
## Phase 1 Design
|
||||
|
||||
### Implementation Approach
|
||||
|
||||
1. Extend the existing `OperationRunService` into a generic lifecycle-healing seam.
|
||||
- Add generic running-freshness assessment parallel to `isStaleQueuedRun()`.
|
||||
- Add a service-owned stale-running failure transition parallel to `failStaleQueuedRun()`.
|
||||
- Normalize reconciliation metadata and reason codes through the same service-owned transition path.
|
||||
|
||||
2. Introduce a configuration-backed lifecycle policy for covered operation types.
|
||||
- Define which run types are in scope for V1.
|
||||
- Define status-specific stale thresholds and expected timeout bounds per type.
|
||||
- Record for each covered type whether terminal truth is guaranteed by a direct failed-job bridge, scheduled reconciliation, or both.
|
||||
- Keep this policy in configuration rather than a new table so rollout remains schema-light.
|
||||
|
||||
3. Use a layered truth-bridge strategy instead of a single mechanism.
|
||||
- First-line bridge: reusable `failed(Throwable $e)` handling for covered jobs that can deterministically resolve their owning `OperationRun`.
|
||||
- Safety-net bridge: scheduled stale-run reconciliation that heals queued or running runs when direct failure handling never executes.
|
||||
- Reject a failed-jobs-parser-first design for V1 because payload parsing is brittle compared with job-owned identity plus stale reconciliation.
|
||||
|
||||
4. Generalize current type-specific reconcile patterns into one lifecycle reconciler.
|
||||
- Keep restore adapter reconciliation and backup schedule reconciliation as precedents.
|
||||
- Introduce a generic active-run reconciliation service or command that scans covered `queued` and `running` runs, evaluates freshness, and force-resolves only when evidence justifies it.
|
||||
- Preserve idempotency so repeated runs do not overwrite already terminal records.
|
||||
|
||||
5. Keep the current status model and enrich semantics through derived freshness state.
|
||||
- Preserve `queued`, `running`, and `completed` plus existing outcomes.
|
||||
- Introduce derived freshness states such as `fresh_active`, `likely_stale`, and `reconciled_failed` at the presenter and contract layer, not as new top-level database statuses.
|
||||
- Centralize that mapping through existing Ops-UX and badge helpers.
|
||||
|
||||
6. Validate runtime timing as part of product correctness.
|
||||
- Capture the invariant that worker timeout and per-job timeout must stay below queue `retry_after` with a safety margin.
|
||||
- Add focused validation or tests that fail when a covered lifecycle policy violates that invariant.
|
||||
- Keep deployment documentation explicit about queue worker restart and stop-wait expectations.
|
||||
|
||||
### Planned Workstreams
|
||||
|
||||
- **Workstream A: Lifecycle policy core**
|
||||
- Add a configuration-backed coverage and threshold registry for V1 operation types.
|
||||
- Extend `OperationRunService` with generic stale-running assessment and reconciliation helpers.
|
||||
|
||||
- **Workstream B: Queue-failure bridges**
|
||||
- Introduce a reusable failed-job bridge pattern for priority covered jobs.
|
||||
- Normalize the existing restore bridge and apply the direct bridge pattern first to baseline capture, baseline compare, inventory sync, policy sync, and tenant review composition paths while the lifecycle policy explicitly marks which covered types rely on scheduled reconciliation.
|
||||
|
||||
- **Workstream C: Generic stale-run reconciliation**
|
||||
- Create a lifecycle reconciler service and scheduled command for covered `queued` and `running` runs.
|
||||
- Fold current type-specific healing logic into reusable primitives where possible.
|
||||
|
||||
- **Workstream D: Monitoring semantics**
|
||||
- Extend existing presenter and badge seams to distinguish fresh active, likely stale, and reconciled-failed semantics.
|
||||
- Update the Operations index and run detail to surface those meanings without changing route authority or action inventory.
|
||||
- Expose minimal aggregate reconciliation visibility through Monitoring filters or queryable summaries instead of a new dashboard.
|
||||
|
||||
- **Workstream E: Runtime invariant enforcement**
|
||||
- Add validation for timeout and `retry_after` alignment and document deployment expectations for workers.
|
||||
- Keep the validation focused on covered operation types rather than every queued class in the repo.
|
||||
|
||||
- **Workstream F: Regression hardening**
|
||||
- Add focused Pest coverage for stale queued reconciliation, stale running reconciliation, idempotency, failed-job bridging, UI truth semantics, and queue timing guards.
|
||||
|
||||
### Testing Strategy
|
||||
|
||||
- Add unit tests for `OperationRunService` stale-running assessment and stale-running failure transitions alongside existing stale-queued behavior.
|
||||
- Add unit or feature tests for the generic lifecycle reconciler covering stale queued runs, stale running runs, fresh queued runs, fresh running runs, already completed runs, and idempotent repeat execution.
|
||||
- Add focused queue-lifecycle tests proving that a covered job with a `failed()` callback transitions the owning run to terminal failed truth via `OperationRunService`.
|
||||
- Add a Run-126-style regression test where a run is left `running`, normal finalization never executes, time advances beyond threshold, and reconciliation marks it terminal failed.
|
||||
- Add Monitoring-focused Filament or feature tests proving the Operations index and run detail distinguish fresh activity from stale or reconciled failure semantics without implying indefinite active progress.
|
||||
- Add focused Monitoring coverage for minimal aggregate reconciliation visibility, such as filters or summaries that show how many runs were reconciled and which covered types are most affected.
|
||||
- Add focused authorization coverage to confirm canonical Monitoring access remains 404 for non-entitled users and capability denial remains 403 where relevant.
|
||||
- Add timing guard tests or assertions that fail when covered lifecycle policy timeouts are not safely below `retry_after`.
|
||||
- Add representative start-surface coverage proving queued intent still uses `OperationUxPresenter`, and all `View run` affordances still resolve to the canonical tenantless Monitoring run detail.
|
||||
- Run the minimum focused Pest suite through Sail; full-suite execution is not required for planning artifacts.
|
||||
|
||||
**Post-Phase 1 Re-check: PASS**
|
||||
|
||||
- The design extends existing service, presenter, and command seams instead of introducing a second orchestration stack.
|
||||
- No new panel provider, no Livewire version change, and no Filament asset change is required; provider registration remains unchanged in `bootstrap/providers.php`.
|
||||
- No new global-search behavior is introduced. Existing globally searchable resources remain governed by their current View or Edit page contracts.
|
||||
- Destructive actions remain confirmation-backed under existing feature flows; this feature only hardens eventual run truth.
|
||||
|
||||
## Complexity Tracking
|
||||
|
||||
No constitution violations or exceptional complexity are planned at this stage.
|
||||
105
specs/160-operation-lifecycle-guarantees/quickstart.md
Normal file
105
specs/160-operation-lifecycle-guarantees/quickstart.md
Normal file
@ -0,0 +1,105 @@
|
||||
# Quickstart: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
## Goal
|
||||
|
||||
Validate that covered queued `OperationRun` executions always converge to trustworthy terminal truth and that Monitoring surfaces no longer imply indefinite normal activity for orphaned runs.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Start Sail.
|
||||
2. Ensure the queue worker is running through Sail.
|
||||
3. Ensure the database contains at least one workspace with operator-visible operation runs for covered types.
|
||||
4. Ensure test fixtures or factories can create `OperationRun` records in `queued`, `running`, and `completed` states.
|
||||
|
||||
## Implementation Validation Order
|
||||
|
||||
### 1. Run focused lifecycle service tests
|
||||
|
||||
```bash
|
||||
vendor/bin/sail artisan test --compact --filter=OperationRunService
|
||||
```
|
||||
|
||||
Expected outcome:
|
||||
- Stale queued reconciliation still works.
|
||||
- Stale running reconciliation is added and service-owned.
|
||||
- Terminal runs are not mutated.
|
||||
|
||||
### 2. Run focused reconciler tests
|
||||
|
||||
```bash
|
||||
vendor/bin/sail artisan test --compact --filter=LifecycleReconciler
|
||||
vendor/bin/sail artisan test --compact --filter=stale
|
||||
```
|
||||
|
||||
Expected outcome:
|
||||
- Stale queued runs are force-resolved to `completed/failed`.
|
||||
- Stale running runs are force-resolved to `completed/failed`.
|
||||
- Fresh runs remain untouched.
|
||||
- Reconciliation is idempotent across repeated execution.
|
||||
|
||||
### 3. Run focused failed-job bridge tests
|
||||
|
||||
```bash
|
||||
vendor/bin/sail artisan test --compact --filter=failed
|
||||
vendor/bin/sail artisan test --compact --filter=MaxAttempts
|
||||
vendor/bin/sail artisan test --compact --filter=TimeoutExceeded
|
||||
```
|
||||
|
||||
Expected outcome:
|
||||
- Covered jobs with direct `failed()` bridges map queue failure truth back to `OperationRun`.
|
||||
- Queue failures that never complete normal middleware finalization still converge through reconciliation.
|
||||
|
||||
### 4. Run the Run-126 regression scenario
|
||||
|
||||
```bash
|
||||
vendor/bin/sail artisan test --compact --filter=Run126
|
||||
vendor/bin/sail artisan test --compact --filter=orphaned
|
||||
```
|
||||
|
||||
Expected outcome:
|
||||
- A run left in `running` without `completeRun()` or `failRun()` is marked terminal failed once the stale threshold is exceeded.
|
||||
- The operator-facing state no longer implies normal active work.
|
||||
|
||||
### 5. Run focused Monitoring UX tests
|
||||
|
||||
```bash
|
||||
vendor/bin/sail artisan test --compact tests/Feature/Operations
|
||||
vendor/bin/sail artisan test --compact --filter=Operations
|
||||
```
|
||||
|
||||
Expected outcome:
|
||||
- The Operations index distinguishes fresh activity from stale or reconciled failure semantics.
|
||||
- The run detail distinguishes normal failure from reconciled lifecycle failure.
|
||||
- Canonical Monitoring authorization semantics remain intact.
|
||||
|
||||
### 6. Run runtime timing guard tests
|
||||
|
||||
```bash
|
||||
vendor/bin/sail artisan test --compact --filter=retry_after
|
||||
vendor/bin/sail artisan test --compact --filter=timeout
|
||||
```
|
||||
|
||||
Expected outcome:
|
||||
- Covered lifecycle policy timeouts stay safely below effective `retry_after`.
|
||||
- Misaligned timing assumptions fail validation instead of remaining implicit.
|
||||
|
||||
### Runtime notes
|
||||
|
||||
- Covered lifecycle jobs now declare explicit `timeout` values and set `failOnTimeout = true`.
|
||||
- The lifecycle validator expects covered job timeouts and expected runtimes to stay below queue `retry_after` with a safety margin.
|
||||
- If queue worker settings change during rollout, run `vendor/bin/sail artisan queue:restart` so workers pick up the new lifecycle contract.
|
||||
- Production and staging stop-wait expectations must stay above the longest covered timeout so workers can exit cleanly instead of orphaning in-flight runs.
|
||||
|
||||
### 7. Manual smoke-check in the browser
|
||||
|
||||
1. Open `/admin/operations` and inspect a fresh active run.
|
||||
2. Inspect a deliberately stale or reconciled run and confirm the list no longer presents it as ordinary in-progress work.
|
||||
3. Open `/admin/operations/{run}` for a reconciled run and confirm the detail page shows operator-safe lifecycle explanation plus secondary diagnostics.
|
||||
4. Confirm existing `View run` navigation remains canonical and no new destructive action is introduced.
|
||||
|
||||
## Non-Goals For This Slice
|
||||
|
||||
- No resumable execution or checkpoint recovery.
|
||||
- No queue backend replacement or Horizon adoption.
|
||||
- No new manual retry or re-drive UI.
|
||||
- No new `OperationRun` status enum.
|
||||
58
specs/160-operation-lifecycle-guarantees/research.md
Normal file
58
specs/160-operation-lifecycle-guarantees/research.md
Normal file
@ -0,0 +1,58 @@
|
||||
# Phase 0 Research: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
## Decision: Extend the existing `OperationRunService` and reconciliation seams instead of creating a new orchestration subsystem
|
||||
|
||||
**Rationale**: The repo already treats `OperationRunService` as the canonical owner of lifecycle transitions and already contains two partial healing patterns: stale queued reconciliation in `OperationRunService`, plus type-specific reconciliation for backup schedule runs and restore adapter-backed runs. Extending these seams keeps lifecycle truth service-owned, avoids a second state machine, and aligns with the constitution rule that `OperationRun.status` and `OperationRun.outcome` transitions remain centralized.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Create a new orchestration or workflow engine for all queued operations: rejected because Spec 160 is a reliability-hardening feature, not a queue-platform redesign.
|
||||
- Let each operation type keep bespoke reconcile code forever: rejected because the current type-by-type pattern already left major gaps, including the Run-126 class of failure.
|
||||
|
||||
## Decision: Use a layered truth-bridge strategy: direct `failed()` callbacks for covered jobs plus scheduled stale-run reconciliation as the safety net
|
||||
|
||||
**Rationale**: Laravel queue documentation confirms that `failed()` is invoked for jobs that exhaust their attempts or that time out with `failOnTimeout` enabled, even when the final exception is `MaxAttemptsExceededException` or `TimeoutExceededException`. That makes job-owned `failed()` bridging the cleanest direct path for covered jobs that can resolve their owning `OperationRun`. However, process death, worker kill, or other infrastructure interruption can still prevent any callback from running. Scheduled stale-run reconciliation is therefore still required as the final guarantee.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Rely only on `TrackOperationRun`: rejected because middleware cannot handle failures that occur before the middleware pipeline is entered or after the infrastructure already declared the job failed.
|
||||
- Rely only on stale-run reconciliation: rejected because direct failure bridging gives faster truth convergence and better structured failure reasons when the queue does provide a terminal failure callback.
|
||||
- Parse `failed_jobs` records as the primary bridge: rejected for V1 because payload parsing and job-class introspection are more brittle than job-owned identity plus domain reconciliation.
|
||||
|
||||
## Decision: Preserve the existing `queued` / `running` / `completed` lifecycle model and add derived freshness semantics instead of new top-level statuses
|
||||
|
||||
**Rationale**: The current domain model already separates lifecycle state from execution outcome. The real gap is not missing statuses but missing convergence and missing freshness interpretation. Preserving the existing model avoids broad downstream breakage while still allowing the UI and contracts to distinguish fresh active work, likely stale work, and reconciled failure through centralized presenters and reason codes.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Add new top-level statuses such as `stale` or `reconciled`: rejected because this would spread persistence and presentation changes across the codebase for limited benefit.
|
||||
- Leave stale interpretation implicit: rejected because operators need explicit liveness truth and tests need stable semantics.
|
||||
|
||||
## Decision: Define V1 coverage and stale thresholds through configuration-backed lifecycle policy, not ad hoc hardcoded checks
|
||||
|
||||
**Rationale**: The repo already has one hardcoded stale-queued default and one type-specific backup-schedule reconcile command. Spec 160 needs explicit V1 coverage, status-specific thresholds, and timing expectations across multiple run types. Configuration-backed lifecycle policy keeps the first slice schema-light, auditable, and easier to validate in tests while preventing logic from being scattered across multiple jobs and commands.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Keep a single global stale threshold for every operation type: rejected because long-running baseline or report jobs and shorter sync jobs have different legitimate runtime envelopes.
|
||||
- Store lifecycle policy in a new database table: rejected for V1 because rollout speed and deterministic config review matter more than runtime mutability.
|
||||
|
||||
## Decision: Store reconciliation evidence in existing `context` and `failure_summary` structures for the first slice
|
||||
|
||||
**Rationale**: `OperationRun` already stores structured JSONB context and failure arrays, and existing triage flows already record structured metadata under `context['triage']`. Reusing these structures keeps the first slice migration-free while still allowing operator-safe explanation, auditability, and later observability extraction. The key requirement is to standardize the reconciliation metadata shape and reason codes.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Add top-level `reconciled_at` and `reconciliation_reason` columns immediately: rejected for V1 because the repo already has a structured metadata pattern that can support the feature without schema churn.
|
||||
- Store only a free-text failure message: rejected because stable reason codes and timestamps are required for auditability and future metrics.
|
||||
|
||||
## Decision: Reuse centralized Ops-UX and badge presentation seams for stale and reconciled semantics
|
||||
|
||||
**Rationale**: The repo already centralizes run-facing language through `OperationUxPresenter`, `ReasonPresenter`, and badge domain helpers for status and outcome. Extending those seams keeps stale or reconciled semantics consistent across the Operations index, run detail, and notifications while honoring BADGE-001 and UI-NAMING-001. It also avoids ad hoc table-only mappings that would drift.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Add custom inline labels only on the Operations table: rejected because run detail, widgets, and notifications would then drift semantically.
|
||||
- Surface low-level queue exceptions directly in primary badges: rejected because operator-facing copy must stay domain-safe and infrastructure details must remain diagnostics-only.
|
||||
|
||||
## Decision: Treat queue timing alignment as product correctness and validate it explicitly
|
||||
|
||||
**Rationale**: Current queue connections use `retry_after = 600`, while at least some covered jobs explicitly set `$timeout = 300`. Laravel documentation is explicit that job timeout must remain shorter than `retry_after`; otherwise, the same job may be retried before the worker times out. Spec 160 is driven by exactly this truth-divergence problem, so timing alignment must become a documented and testable lifecycle invariant rather than an informal deployment note.
|
||||
|
||||
**Alternatives considered**:
|
||||
- Document timing rules only in deployment notes: rejected because silent drift in job or worker settings would recreate the same incident class.
|
||||
- Validate only global worker timeout and ignore per-job timeout: rejected because covered jobs already override timeout values and the invariant has to hold at the job-policy level.
|
||||
200
specs/160-operation-lifecycle-guarantees/spec.md
Normal file
200
specs/160-operation-lifecycle-guarantees/spec.md
Normal file
@ -0,0 +1,200 @@
|
||||
# Feature Specification: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
**Feature Branch**: `160-operation-lifecycle-guarantees`
|
||||
**Created**: 2026-03-23
|
||||
**Status**: Draft
|
||||
**Input**: User description: "Introduce explicit lifecycle guarantees for queued `OperationRun` executions so no operation can remain indefinitely ambiguous, orphaned, or misleadingly active without the platform eventually forcing a deterministic terminal truth."
|
||||
|
||||
## Spec Scope Fields *(mandatory)*
|
||||
|
||||
- **Scope**: workspace
|
||||
- **Primary Routes**:
|
||||
- `/admin/operations`
|
||||
- `/admin/operations/{run}`
|
||||
- Existing operator-facing start surfaces that enqueue covered `OperationRun` work, including baseline capture, baseline compare, inventory sync, policy sync, Entra group sync, directory role-definition sync, backup schedule execution, restore execution, review pack generation, tenant review composition, and evidence snapshot generation flows
|
||||
- **Data Ownership**:
|
||||
- `OperationRun` remains the canonical operational record for queued, operator-visible work.
|
||||
- `OperationRun` remains a workspace-scoped monitoring record with optional tenant linkage for tenant-bound runs.
|
||||
- Queue infrastructure evidence remains operational support data and must be mappable back to the owning `OperationRun` without changing workspace or tenant ownership boundaries.
|
||||
- Reconciliation outcomes, failure reasons, and freshness semantics remain part of the same operational truth model rather than a separate shadow state.
|
||||
- **RBAC**:
|
||||
- Authorization plane involved: admin `/admin` including workspace-level Monitoring and tenant-context entry points that can start or inspect covered operations.
|
||||
- Non-members or actors lacking workspace or tenant entitlement for a run receive deny-as-not-found (`404`) behavior.
|
||||
- Members who are allowed to view a run but lack the capability to start or re-trigger a dangerous operation receive forbidden (`403`) behavior.
|
||||
- Existing capability registry remains canonical for operation start permissions and any dangerous follow-up actions.
|
||||
- This feature does not broaden `/system` access and does not weaken tenant-safe run visibility.
|
||||
|
||||
For canonical-view specs, the spec MUST define:
|
||||
|
||||
- **Default filter behavior when tenant-context is active**: `/admin/operations` may continue to prefilter to the selected tenant as a convenience, but lifecycle legitimacy, stale detection, and terminal truth must not depend on selected tenant context. `/admin/operations/{run}` remains record-authoritative even when tenant context is mismatched, stale, or empty.
|
||||
- **Explicit entitlement checks preventing cross-tenant leakage**: The canonical run viewer must authorize from the resolved `OperationRun`, its workspace relationship, and tenant entitlement for any referenced tenant. Lifecycle reconciliation or stale labeling must not reveal tenant-linked run details to non-members or non-entitled actors.
|
||||
|
||||
## Operator Surface Contract *(mandatory when operator-facing surfaces are changed)*
|
||||
|
||||
| Surface | Primary Persona | Surface Type | Primary Operator Question | Default-visible Information | Diagnostics-only Information | Status Dimensions Used | Mutation Scope | Primary Actions | Dangerous Actions |
|
||||
|---|---|---|---|---|---|---|---|---|---|
|
||||
| Operations index | Workspace operator | List | Which operations are truly active, which are stale, and which require attention? | Run type, initiator, tenant or workspace scope, current lifecycle state, freshness signal, terminal outcome when available | Failure reason codes, reconciliation timestamps, queue-origin evidence, stale-threshold rationale | lifecycle, execution outcome, freshness, tenant scope | TenantPilot only | View run, filter active or stale runs | None on the index in V1 |
|
||||
| Operation run detail | Workspace operator | Detail | Did this run finish normally, fail normally, or get force-reconciled after losing lifecycle truth? | Run identity, started or completed timing, outcome, plain-language failure explanation, whether the platform reconciled the run automatically | Structured infrastructure reason, stale classification evidence, queue failure linkage, reconciliation audit context | lifecycle, execution outcome, freshness, auditability | TenantPilot only | Back to Operations, inspect run history and failure detail | Any future retry or re-drive affordance remains out of scope for V1 |
|
||||
| Existing operation start surfaces | Tenant operator or workspace operator | Action entry | If I start work now, will the platform later tell me the truth even when infrastructure goes wrong? | Existing queued intent messaging, run title, mutation scope, confirmation language where already required | Lifecycle contract diagnostics stay secondary to avoid cluttering start surfaces | lifecycle readiness, mutation scope | TenantPilot only, Microsoft tenant, or simulation only depending on the initiating feature | Start operation, view run | Existing destructive operations such as restore remain confirmation-protected and auditable |
|
||||
|
||||
## User Scenarios & Testing *(mandatory)*
|
||||
|
||||
### User Story 1 - Force Terminal Truth For Orphaned Runs (Priority: P1)
|
||||
|
||||
As an operator, I need every covered queued operation to end in a trustworthy terminal state even when the queue infrastructure fails before normal execution cleanup happens, so that no run remains indefinitely ambiguous.
|
||||
|
||||
**Why this priority**: This is the core reliability failure. If terminal truth is not guaranteed, every higher-level governance workflow inherits false operational state.
|
||||
|
||||
**Independent Test**: Can be fully tested by creating a covered `OperationRun`, preventing the normal completion or failure path from finalizing it, advancing time past the stale threshold, and verifying that the system reconciles it to a terminal failed truth with an operator-safe reason.
|
||||
|
||||
**Acceptance Scenarios**:
|
||||
|
||||
1. **Given** a covered queued run moves to `running` and its normal lifecycle completion path never executes, **When** the platform detects that the run is stale, **Then** the run is forced to terminal failed truth with an explicit infrastructure-oriented reason.
|
||||
2. **Given** a covered queued run is failed by queue infrastructure before normal middleware finalization can update the domain record, **When** reconciliation or a direct failure bridge runs, **Then** the queue failure is reflected in the owning run instead of remaining only in infrastructure evidence.
|
||||
|
||||
---
|
||||
|
||||
### User Story 2 - Show Honest Liveness In Monitoring (Priority: P2)
|
||||
|
||||
As an operator viewing Monitoring, I need the UI to stop implying that obviously stale work is still normally active, so that I can decide whether to investigate, retry manually later, or move on.
|
||||
|
||||
**Why this priority**: The current misleading spinner is the visible symptom of the underlying integrity bug. Honest operator truth is required for trust and supportability.
|
||||
|
||||
**Independent Test**: Can be fully tested by presenting fresh active runs, stale non-terminal runs, and reconciled failed runs on the Operations surfaces and verifying that each state is represented distinctly without indefinite optimistic activity cues.
|
||||
|
||||
**Acceptance Scenarios**:
|
||||
|
||||
1. **Given** a run is fresh and plausibly progressing, **When** the operator opens the Operations index or run detail, **Then** the UI presents it as legitimately active.
|
||||
2. **Given** a run has exceeded the accepted freshness window without credible progress evidence, **When** the operator views it before or after reconciliation, **Then** the UI no longer implies normal active progress and surfaces stale or reconciled failure semantics clearly.
|
||||
|
||||
---
|
||||
|
||||
### User Story 3 - Prevent Repeat Incidents Through Lifecycle Contracts (Priority: P3)
|
||||
|
||||
As a platform owner, I need covered queued jobs and runtime defaults to satisfy explicit lifecycle and timing guarantees, so that reservation expiry, timeout misalignment, and silent lifecycle divergence stop being accepted as normal behavior.
|
||||
|
||||
**Why this priority**: Reconciliation heals truth after failure, but the platform also needs guardrails that reduce the chance of creating orphaned runs in the first place.
|
||||
|
||||
**Independent Test**: Can be fully tested by verifying that covered jobs declare explicit lifecycle bounds, that runtime timing relationships are documented and validated, and that misaligned or ambiguous lifecycle settings are caught by automated checks.
|
||||
|
||||
**Acceptance Scenarios**:
|
||||
|
||||
1. **Given** a covered long-running operation type is introduced or updated, **When** its lifecycle contract is reviewed, **Then** the job has an explicit timeout strategy and a credible path to terminal failure truth.
|
||||
2. **Given** deployment or worker timing values would allow legitimate work to outlive queue reservation semantics, **When** the platform validates lifecycle runtime assumptions, **Then** the mismatch is detected or documented as invalid rather than silently accepted.
|
||||
|
||||
### Edge Cases
|
||||
|
||||
- A run remains in `queued` because a worker never starts it, the worker crashes before status handoff, or dispatch-level failure evidence exists without a domain transition.
|
||||
- A run remains in `running` because the process dies, is killed, or times out after setting active state but before normal finalization.
|
||||
- Queue infrastructure records a decisive terminal failure but no matching middleware or in-job cleanup path executes.
|
||||
- A run becomes terminal through the normal lifecycle path shortly before reconciliation inspects it; reconciliation must not overwrite or flap a legitimately completed run.
|
||||
- A long-running but healthy job approaches the stale threshold; thresholds must be conservative enough to avoid false terminal failure for legitimate work.
|
||||
- A workspace-level run with no tenant reference must still converge to terminal truth without being mistaken for a tenant-leakage exception.
|
||||
- Scheduled or system-initiated runs with no initiator must still converge to terminal truth while respecting the initiator-null notification rule.
|
||||
|
||||
## Requirements *(mandatory)*
|
||||
|
||||
**Constitution alignment (required):** This feature governs long-running, queued, and scheduled `OperationRun` work. It introduces no new Microsoft Graph contracts by itself, but it does require explicit lifecycle guarantees for covered `OperationRun` types, run observability on Monitoring surfaces, audit-safe failure semantics, and regression tests. Existing operation start surfaces must preserve their current safety gates, previews, confirmations, and audit requirements. If a covered flow performs a dangerous mutation such as restore, that mutation remains confirmation-protected and auditable; this feature only guarantees that its run truth cannot remain orphaned.
|
||||
|
||||
**Constitution alignment (OPS-UX):** This feature reuses existing `OperationRun` records and remains fully subject to the three-surface Ops-UX contract. Queued intent feedback remains toast-only. Active awareness remains limited to the active-operations widget and Monitoring run views. Terminal truth remains represented by the canonical run record and terminal notification policy. `OperationRun.status` and `OperationRun.outcome` transitions remain service-owned and must continue to occur only through `OperationRunService`, including reconciliation transitions. `summary_counts` remain numeric-only and keyed from the canonical registry. Scheduled and system runs continue to omit terminal DB notifications when there is no initiator, while Monitoring remains the authoritative audit surface. Regression coverage must include service-owned lifecycle transitions, stale-run reconciliation, and failure bridging without reintroducing direct status mutation.
|
||||
|
||||
**Constitution alignment (RBAC-UX):** This feature does not broaden authorization scope but does change the truth semantics shown on `/admin/operations` and `/admin/operations/{run}`. The admin `/admin` plane remains the only plane involved. Cross-plane access remains deny-as-not-found. For this feature, 404 means the actor is not entitled to the workspace or tenant scope of the run; 403 means the actor is in scope but lacks a capability for an operation-start or dangerous follow-up action. Authorization remains server-side through existing Gates, Policies, and the canonical capability registry. Global search and linked Monitoring access remain tenant-safe. Existing destructive-like actions such as restore remain confirmation-required. Validation must include at least one positive Monitoring access test and one negative tenant-entitlement or capability test alongside the lifecycle tests.
|
||||
|
||||
**Constitution alignment (OPS-EX-AUTH-001):** Not applicable beyond reaffirming that lifecycle bridging and stale reconciliation belong to Monitoring and queued operation execution only. Authentication handshake exceptions on `/auth/*` remain unrelated and must not be used as an exception path here.
|
||||
|
||||
**Constitution alignment (BADGE-001):** This feature changes the meaning shown by lifecycle and outcome badges on Operations surfaces. Badge semantics for the `queued`, `running`, and `completed` lifecycle states, the `failed` outcome, and any stale or reconciled indicators must remain centralized so the same run state is not mapped differently across the index, detail view, and widgets. Tests must cover any newly exposed stale or reconciled display values.
|
||||
|
||||
**Constitution alignment (UI-NAMING-001):** The target object is the `OperationRun`. Primary operator verbs remain `View run` and existing start verbs from initiating surfaces. New operator-facing copy must favor domain language such as `stale`, `reconciled`, `infrastructure failure`, and `no longer active` over implementation-first phrasing such as `MaxAttemptsExceededException` or `retry_after mismatch` in primary labels. Low-level queue or reservation details may appear only as secondary diagnostics. The same lifecycle vocabulary must be preserved across run titles, status presentation, notifications, and audit prose.
|
||||
|
||||
**Constitution alignment (OPSURF-001):** This feature materially refactors the meaning of the Operations index and run detail without replacing their overall layout. Default-visible content on `/admin` must remain operator-first: what ran, whether it is still trustworthy as active, how it ended, and whether the platform reconciled it automatically. Raw queue evidence and stale-threshold reasoning remain diagnostics-only. Status dimensions must remain distinct: execution outcome, lifecycle state, and freshness or reconciliation state must not collapse into one misleading badge. Mutating start surfaces continue to disclose mutation scope before execution through their originating specs. Dangerous actions continue to follow the existing safe-execution pattern; this feature does not introduce new dangerous actions.
|
||||
|
||||
**Constitution alignment (Filament Action Surfaces):** This feature modifies existing Filament Monitoring surfaces and therefore includes the UI Action Matrix below. The Action Surface Contract is satisfied. No exemption is needed because the feature changes state semantics and diagnostics, not the basic action inventory.
|
||||
|
||||
**Constitution alignment (UX-001 — Layout & Information Architecture):** Existing Operations list and run detail layouts remain in place. UX-001 stays satisfied so long as the run detail continues to present grouped operational sections instead of raw dumps, and the Operations table preserves search, sort, and filtering over core dimensions such as type, status, outcome, freshness, and tenant scope. Empty states remain single-CTA and explanatory. This feature changes semantic truth, not form layout.
|
||||
|
||||
### Functional Requirements
|
||||
|
||||
- **FR-160-001**: The system MUST guarantee that every covered queued `OperationRun` reaches eventual terminal domain truth or remains demonstrably fresh and legitimately active.
|
||||
- **FR-160-002**: The system MUST define the covered `OperationRun` operation types for V1 as baseline capture, baseline compare, inventory sync, policy sync including single-policy sync, Entra group sync, directory role-definition sync, backup schedule execution, restore execution, review pack generation, tenant review composition, and evidence snapshot generation.
|
||||
- **FR-160-003**: The system MUST provide at least one deterministic bridge from queue or infrastructure terminal failure evidence back to the owning `OperationRun` for every covered run type.
|
||||
- **FR-160-004**: A queue-level terminal failure for a covered run MUST eventually be reflected on the owning `OperationRun` as terminal failed truth rather than remaining only in infrastructure records or logs.
|
||||
- **FR-160-005**: The system MUST detect stale covered runs in `queued` or `running` states using explicit freshness bounds that distinguish fresh active work from orphaned or abandoned work.
|
||||
- **FR-160-006**: The system MUST reconcile stale covered `queued` runs to terminal failed truth when no credible evidence exists that the run is still legitimately progressing.
|
||||
- **FR-160-007**: The system MUST reconcile stale covered `running` runs to terminal failed truth when no credible evidence exists that the run is still legitimately progressing.
|
||||
- **FR-160-008**: Reconciliation MUST be conservative, idempotent, and limited to non-terminal runs; it MUST never mutate a legitimately completed run.
|
||||
- **FR-160-009**: Reconciled terminal failures MUST preserve operator-safe, infrastructure-oriented reason semantics that distinguish normal execution failure from stale or orphaned lifecycle healing.
|
||||
- **FR-160-010**: Covered queued jobs MUST satisfy a minimum lifecycle contract that provides either a direct terminal-failure bridge, a shared inherited failure bridge, or a documented fallback reconciliation path to eventual terminal truth.
|
||||
- **FR-160-011**: Covered long-running jobs MUST declare explicit runtime bounds rather than relying solely on implicit worker defaults.
|
||||
- **FR-160-012**: The timeout behavior for covered long-running jobs MUST be intentionally defined so that timeout-related failure semantics are not ambiguous.
|
||||
- **FR-160-013**: The platform MUST preserve traceable linkage between a failed or reconciled infrastructure event and the owning `OperationRun`.
|
||||
- **FR-160-014**: The platform MUST establish and document a runtime timing invariant that keeps queue reservation timing safely above legitimate covered job execution duration.
|
||||
- **FR-160-015**: Lifecycle runtime validation MUST make misaligned timing relationships detectable rather than allowing silent divergence between queue truth and domain truth.
|
||||
- **FR-160-016**: The Operations index and run detail MUST distinguish legitimately active runs from likely stale runs and reconciled failures without implying indefinite normal activity for obviously stale work.
|
||||
- **FR-160-017**: Operator-facing Monitoring surfaces MUST communicate whether a run ended normally or was force-resolved by lifecycle reconciliation.
|
||||
- **FR-160-018**: Existing happy-path lifecycle handling such as middleware-based run tracking MUST remain valid but MUST no longer be treated as the only mechanism that can guarantee terminal truth.
|
||||
- **FR-160-019**: The system MUST support evidence-based reconciliation using available operational signals such as run freshness, queue failure evidence, and other lifecycle evidence that is gathered outside render time and does not require external calls during Monitoring page render.
|
||||
- **FR-160-020**: The system MUST preserve the current top-level lifecycle model of `queued`, `running`, and `completed` while enriching failure reason and freshness interpretation instead of introducing a second terminal-state model for V1.
|
||||
- **FR-160-021**: Reconciliation and direct failure bridging MUST remain auditable, including when reconciliation happened, why the run was judged stale or orphaned, and whether the normal lifecycle path was bypassed.
|
||||
- **FR-160-022**: The system SHOULD make it possible to recover aggregate visibility into how many runs were force-reconciled and which operation types are most affected, even if V1 does not add a dedicated observability dashboard.
|
||||
- **FR-160-023**: Manual database or Tinker intervention MUST no longer be the normal recovery path for orphaned covered runs.
|
||||
- **FR-160-024**: Validation coverage MUST include stale queued reconciliation, stale running reconciliation, idempotency, fresh-run non-interference, direct failure bridging, normal failure-path coexistence, a Run-126-style regression for an orphaned `running` run, and runtime timing guard coverage where practical.
|
||||
|
||||
## UI Action Matrix *(mandatory when Filament is changed)*
|
||||
|
||||
If this feature adds/modifies any Filament Resource / RelationManager / Page, fill out the matrix below.
|
||||
|
||||
For each surface, list the exact action labels, whether they are destructive (confirmation? typed confirmation?),
|
||||
RBAC gating (capability + enforcement helper), and whether the mutation writes an audit log.
|
||||
|
||||
| Surface | Location | Header Actions | Inspect Affordance (List/Table) | Row Actions (max 2 visible) | Bulk Actions (grouped) | Empty-State CTA(s) | View Header Actions | Create/Edit Save+Cancel | Audit log? | Notes / Exemptions |
|
||||
|---|---|---|---|---|---|---|---|---|---|---|
|
||||
| Operations index | `/admin/operations` | Existing filter and navigation actions remain | Existing linked rows to run detail remain the inspect affordance | `View run` remains the primary inspect action | Existing grouped bulk actions unchanged if present | Existing empty-state CTA remains | Not applicable | Not applicable | No direct mutation from index in V1 | Table semantics change to show freshness and reconciliation truth without changing the core action surface. |
|
||||
| Operation run detail | `/admin/operations/{run}` | `Back to Operations` and existing contextual navigation remain | Route-resolved record inspection | No new row actions | None | Not applicable | Existing view actions remain; no new destructive action is introduced | Not applicable | No direct mutation from the page in V1 | Detail page exposes reconciled or stale semantics and diagnostics but does not add retry or re-drive actions in V1. |
|
||||
| Existing operation start surfaces | Existing baseline, restore, backup schedule, inventory sync, and review generation surfaces | Existing start or preview actions remain | Existing route or view affordances remain | Existing `View run` or equivalent inspect affordance remains where already present | Existing grouped actions remain | Existing empty-state CTA remains | Existing view header actions remain | Existing save and cancel behavior unchanged | Yes where the originating feature already writes audit logs | Exemption: this spec changes lifecycle guarantees behind existing start actions, not the visible action inventory. Existing destructive operations remain confirmation-required under their originating specs. |
|
||||
|
||||
### Key Entities *(include if feature involves data)*
|
||||
|
||||
- **Covered Operation Run**: An operator-visible queued run whose lifecycle truth must converge even when queue infrastructure fails outside the normal happy path.
|
||||
- **Queue Failure Evidence**: Any infrastructure-originated signal that indicates a covered queued job terminally failed, timed out, exhausted attempts, or became otherwise non-viable.
|
||||
- **Lifecycle Reconciliation Decision**: The platform decision that a non-terminal run is stale or orphaned and must be force-resolved to terminal failed truth.
|
||||
- **Freshness Window**: The accepted age or progress boundary that separates plausibly active `queued` or `running` work from stale work.
|
||||
- **Lifecycle Contract**: The minimum expectations a covered queued job must satisfy so the platform can map infrastructure truth back to domain truth.
|
||||
- **Runtime Timing Invariant**: The deployment and worker relationship that prevents legitimate covered work from outliving queue reservation semantics.
|
||||
|
||||
## Success Criteria *(mandatory)*
|
||||
|
||||
### Measurable Outcomes
|
||||
|
||||
- **SC-160-001**: In focused lifecycle regression coverage, 100% of covered stale `queued` runs and stale `running` runs are force-resolved to terminal failed truth within the configured reconciliation window.
|
||||
- **SC-160-002**: In focused lifecycle regression coverage, 0 fresh covered runs are incorrectly reconciled while still within their accepted freshness window.
|
||||
- **SC-160-003**: In focused Run-126-style regression coverage, 100% of simulated orphaned runs that never receive normal completion or failure callbacks stop appearing as indefinitely active work.
|
||||
- **SC-160-004**: In focused Monitoring UX coverage, operators can distinguish normal failure from reconciled lifecycle failure on 100% of covered scenarios exercised by the spec tests.
|
||||
- **SC-160-005**: In focused lifecycle contract coverage, 100% of V1-covered queued operation types demonstrate a credible terminal-truth path through either direct failure bridging or reconciliation.
|
||||
- **SC-160-006**: In focused runtime guard coverage, timing relationships that would allow legitimate covered work to outlive queue reservation semantics are detected or documented rather than silently accepted.
|
||||
|
||||
## Assumptions
|
||||
|
||||
- Existing `OperationRun` top-level statuses and outcomes remain sufficient for V1 if failure reasons and freshness semantics are enriched.
|
||||
- Existing Monitoring pages remain the canonical operator-facing surfaces for run truth and do not perform external calls during render.
|
||||
- Existing happy-path middleware and service-based lifecycle handling remain useful and are preserved as first-line execution handling.
|
||||
- V1 prioritizes deterministic terminal truth over resumability, automatic re-drive, or advanced checkpoint-based recovery.
|
||||
- Covered operation types are limited to operator-visible runs that materially own or advance business-relevant work.
|
||||
|
||||
## Dependencies
|
||||
|
||||
- Existing `OperationRun` domain model and `OperationRunService` remain the canonical ownership boundary for lifecycle transitions.
|
||||
- Existing Monitoring and Operations surfaces remain the canonical render surfaces for run truth.
|
||||
- Existing initiating specs for restore, baseline, backup scheduling, inventory synchronization, and review generation remain the source of truth for their mutation scope and confirmation policy.
|
||||
|
||||
## Risks
|
||||
|
||||
- Stale thresholds that are too aggressive could force-fail legitimate long-running work.
|
||||
- A reconciliation safety net could hide recurring infrastructure weaknesses if aggregate visibility is not preserved.
|
||||
- Partial adoption across covered operation types could leave some orphaned paths untreated and create a false sense of completion.
|
||||
- Teams may mistake lifecycle reconciliation for full resiliency even though resumability and advanced retry orchestration remain out of scope.
|
||||
|
||||
## Summary
|
||||
|
||||
Run 126 showed that queue truth, domain truth, and operator-visible truth can diverge when infrastructure-level failure happens before the normal lifecycle path finishes its cleanup. This feature closes that gap by establishing a platform rule: no covered queued `OperationRun` may remain indefinitely ambiguous. If the system cannot prove that a run is still legitimately active, it must eventually reconcile that run to deterministic terminal truth.
|
||||
|
||||
V1 delivers that guarantee through three coordinated changes: a minimum lifecycle contract for covered queued jobs, a deterministic bridge from queue or infrastructure failure back to the owning run, and conservative stale-run reconciliation that heals orphaned `queued` or `running` runs. Monitoring surfaces then present that truth honestly by distinguishing fresh activity, likely stale work, and reconciled lifecycle failure without adding a second status model or promising resumability.
|
||||
220
specs/160-operation-lifecycle-guarantees/tasks.md
Normal file
220
specs/160-operation-lifecycle-guarantees/tasks.md
Normal file
@ -0,0 +1,220 @@
|
||||
# Tasks: Operation Lifecycle Guarantees & Queue-to-Domain Failure Reconciliation
|
||||
|
||||
**Input**: Design documents from `/specs/160-operation-lifecycle-guarantees/`
|
||||
**Prerequisites**: `plan.md` (required), `spec.md` (required), `research.md`, `data-model.md`, `contracts/`, `quickstart.md`
|
||||
|
||||
**Tests**: Runtime behavior changes in this repo require Pest coverage. This feature changes queue lifecycle handling, Monitoring semantics, authorization-adjacent Monitoring truth, and Ops-UX guarantees, so tests are required for every user story.
|
||||
**Operations**: This feature hardens long-running and queued `OperationRun` execution. Tasks below preserve the Ops-UX 3-surface feedback contract, keep terminal truth service-owned through `OperationRunService`, keep `summary_counts` numeric-only, prevent queued or running DB notifications, preserve initiator-null notification behavior for system runs, and keep canonical `View run` navigation pointed at `/admin/operations/{run}`.
|
||||
**RBAC**: This feature changes Monitoring truth semantics in the admin `/admin` plane. Tasks below preserve deny-as-not-found for non-entitled workspace or tenant access, keep capability denial as `403` where applicable, continue using the capability registry, and add positive and negative authorization coverage for canonical run viewing.
|
||||
**UI Naming**: Lifecycle copy must use operator-safe domain language such as `stale`, `reconciled`, and `infrastructure failure` in primary UI surfaces and keep low-level queue exceptions in diagnostics only.
|
||||
**Filament UI Action Surfaces**: This feature modifies existing Filament Monitoring pages and resources without changing their core action inventory. Tasks below preserve existing inspect affordances, keep destructive-action rules unchanged, and retrofit lifecycle semantics into current header, row, and empty-state behavior.
|
||||
**Filament UI UX-001**: This feature is not a layout redesign. Tasks below keep the current Operations layouts intact while updating badges, diagnostics, and operator-first truth messaging inside the existing pages.
|
||||
**Badges**: Status-like semantics must continue to flow through `BadgeCatalog`-backed domain badge mappers in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Badges/Domains/OperationRunStatusBadge.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Badges/Domains/OperationRunOutcomeBadge.php`.
|
||||
**Contract Artifact**: `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/160-operation-lifecycle-guarantees/contracts/operation-run-lifecycle.openapi.yaml` is an internal Monitoring contract for freshness and reconciliation semantics, not a requirement to add new public controller endpoints.
|
||||
|
||||
**Organization**: Tasks are grouped by user story so each story can be implemented and tested independently.
|
||||
|
||||
## Phase 1: Setup (Shared Infrastructure)
|
||||
|
||||
**Purpose**: Prepare the regression targets and touchpoints for lifecycle hardening.
|
||||
|
||||
- [X] T001 [P] Create or extend lifecycle service and middleware regression targets in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OperationRunServiceTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OperationRunServiceStaleQueuedRunTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/TrackOperationRunMiddlewareTest.php`
|
||||
- [X] T002 [P] Create or extend reconciliation and scheduler regression targets in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Console/ReconcileBackupScheduleOperationRunsCommandTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OpsUx/AdapterRunReconcilerTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/ReconcileAdapterRunsJobTrackingTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationLifecycleReconciliationTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Console/ReconcileOperationRunsCommandTest.php`
|
||||
- [X] T003 [P] Create or extend Monitoring and badge regression targets in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/MonitoringOperationsTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Filament/OperationRunEnterpriseDetailPageTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/OperationsDbOnlyRenderTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Unit/Badges/OperationRunBadgesTest.php`
|
||||
- [X] T004 [P] Create or extend authorization, queued-intent, canonical View run, and Ops-UX guard targets in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/TenantlessOperationRunViewerTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/RunAuthorizationTenantIsolationTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Notifications/OperationRunNotificationTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OpsUx/QueuedToastCopyTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OpsUx/NotificationViewRunLinkTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Guards/OperationLifecycleOpsUxGuardTest.php`
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Foundational (Blocking Prerequisites)
|
||||
|
||||
**Purpose**: Build the shared lifecycle policy and reconciliation infrastructure that all user stories depend on.
|
||||
|
||||
**⚠️ CRITICAL**: No user story work should begin until this phase is complete.
|
||||
|
||||
- [X] T005 Define the config-backed lifecycle coverage, terminal-truth-path matrix, and threshold registry for `baseline_capture`, `baseline_compare`, `inventory_sync`, `policy.sync`, `policy.sync_one`, `entra_group_sync`, `directory_role_definitions.sync`, `backup_schedule_run`, `restore.execute`, `tenant.review_pack.generate`, `tenant.review.compose`, and `tenant.evidence.snapshot.generate` in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/config/tenantpilot.php` and align queue timing defaults in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/config/queue.php`
|
||||
- [X] T006 Create shared lifecycle policy and freshness support types in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Operations/OperationLifecyclePolicy.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Operations/OperationRunFreshnessState.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Operations/LifecycleReconciliationReason.php`
|
||||
- [X] T007 Extend `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/OperationRunService.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Models/OperationRun.php` with generic stale-running assessment, standardized reconciliation metadata, and idempotent service-owned force-fail helpers for non-terminal runs
|
||||
- [X] T008 Create the generic active-run reconciler in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/Operations/OperationLifecycleReconciler.php` and reuse existing legitimacy signals from `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/Operations/QueuedExecutionLegitimacyGate.php`
|
||||
- [X] T009 Register the generic reconciliation entry point in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Console/Commands/TenantpilotReconcileOperationRuns.php` and schedule it from `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/routes/console.php`
|
||||
- [X] T010 Add foundational coverage for lifecycle policy parsing, stale-running service transitions, and idempotent reconciliation in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OperationRunServiceStaleQueuedRunTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationLifecycleReconciliationTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Console/ReconcileOperationRunsCommandTest.php`
|
||||
|
||||
**Checkpoint**: Foundation ready. The repo has one shared lifecycle policy, one generic reconciliation seam, and service-owned APIs that stories can adopt independently.
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: User Story 1 - Force Terminal Truth For Orphaned Runs (Priority: P1) 🎯 MVP
|
||||
|
||||
**Goal**: Ensure every covered queued operation converges to deterministic terminal truth when normal queue cleanup does not.
|
||||
|
||||
**Independent Test**: Create covered `OperationRun` records in `queued` and `running`, prevent normal finalization, advance time past the configured threshold, run the generic reconciler or direct failure bridge, and verify the run reaches status `completed` with outcome `failed` and carries operator-safe reconciliation evidence.
|
||||
|
||||
### Tests for User Story 1
|
||||
|
||||
- [X] T011 [P] [US1] Add stale queued, stale running, fresh-run non-interference, and idempotency coverage in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OperationRunServiceStaleQueuedRunTest.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationLifecycleReconciliationTest.php`
|
||||
- [X] T012 [P] [US1] Add direct queue-failure bridge coverage for exhausted-attempt and timeout paths in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/TrackOperationRunMiddlewareTest.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationRunFailedJobBridgeTest.php`
|
||||
- [X] T013 [P] [US1] Add reconciliation command and coexistence coverage for scheduled healing paths in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Console/ReconcileOperationRunsCommandTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Console/ReconcileBackupScheduleOperationRunsCommandTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OpsUx/AdapterRunReconcilerTest.php`
|
||||
|
||||
### Implementation for User Story 1
|
||||
|
||||
- [X] T014 [US1] Implement service-owned stale queued and stale running force-fail transitions in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/OperationRunService.php`
|
||||
- [X] T015 [US1] Implement the generic lifecycle reconciliation flow and structured reconciliation payloads in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/Operations/OperationLifecycleReconciler.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Console/Commands/TenantpilotReconcileOperationRuns.php`
|
||||
- [X] T016 [US1] Integrate the new generic reconciler with existing type-specific healing in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Console/Commands/TenantpilotReconcileBackupScheduleOperationRuns.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/AdapterRunReconciler.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/ReconcileAdapterRunsJob.php`
|
||||
- [X] T017 [US1] Create a reusable failed-job bridge in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/Concerns/BridgesFailedOperationRun.php`, normalize the existing direct bridge in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/BulkBackupSetRestoreJob.php`, and add missing direct bridges for `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/CaptureBaselineSnapshotJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/CompareBaselineToTenantJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/BulkTenantSyncJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/SyncPoliciesJob.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/ComposeTenantReviewJob.php` while preserving scheduled reconciliation for covered types marked fallback-only in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/config/tenantpilot.php`
|
||||
- [X] T018 [US1] Preserve queued-intent, canonical `View run`, completion, and initiator-only notification guarantees across representative start surfaces in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Pages/BaselineCompareLanding.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Resources/BackupScheduleResource.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Resources/RestoreRunResource.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Resources/InventoryItemResource/Pages/ListInventoryItems.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Resources/ReviewPackResource.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/Middleware/TrackOperationRun.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Notifications/OperationRunCompleted.php`
|
||||
|
||||
**Checkpoint**: User Story 1 is complete when orphaned covered runs no longer require manual DB repair and always converge to terminal failed truth through direct failure bridging or reconciliation.
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: User Story 2 - Show Honest Liveness In Monitoring (Priority: P2)
|
||||
|
||||
**Goal**: Make Monitoring distinguish fresh activity, likely stale activity, and reconciled failure without implying indefinite normal progress.
|
||||
|
||||
**Independent Test**: Seed fresh active runs, stale runs, and reconciled-failed runs, then verify the Operations index and canonical run detail show distinct operator-safe semantics while canonical authorization remains intact.
|
||||
|
||||
### Tests for User Story 2
|
||||
|
||||
- [X] T019 [P] [US2] Add Operations index, aggregate reconciliation visibility, and run-detail truth-semantics coverage in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/MonitoringOperationsTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/OperationLifecycleAggregateVisibilityTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Filament/OperationRunEnterpriseDetailPageTest.php`
|
||||
- [X] T020 [P] [US2] Add freshness-state and badge mapping coverage in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Unit/Badges/OperationRunBadgesTest.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/OperationLifecycleFreshnessPresentationTest.php`
|
||||
- [X] T021 [P] [US2] Add positive and negative canonical Monitoring authorization coverage for stale or reconciled runs in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/TenantlessOperationRunViewerTest.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/RunAuthorizationTenantIsolationTest.php`
|
||||
|
||||
### Implementation for User Story 2
|
||||
|
||||
- [X] T022 [US2] Extend centralized lifecycle presentation in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/OpsUx/OperationUxPresenter.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/OpsUx/RunDurationInsights.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/ReasonTranslation/ReasonPresenter.php`
|
||||
- [X] T023 [US2] Implement fresh, stale, and reconciled badge semantics in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Badges/Domains/OperationRunStatusBadge.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Badges/Domains/OperationRunOutcomeBadge.php`
|
||||
- [X] T024 [US2] Update Operations list filtering, minimal aggregate reconciliation visibility, query semantics, and default-visible lifecycle truth in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Pages/Monitoring/Operations.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Resources/OperationRunResource.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/resources/views/filament/pages/monitoring/operations.blade.php`
|
||||
- [X] T025 [US2] Update canonical run detail messaging and diagnostics disclosure in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Pages/Operations/TenantlessOperationRunViewer.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/resources/views/filament/pages/operations/tenantless-operation-run-viewer.blade.php`
|
||||
- [X] T026 [US2] Keep `/admin/operations` and `/admin/operations/{run}` DB-only and canonical-navigation-safe while exposing lifecycle truth in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Pages/Monitoring/Operations.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Pages/Operations/TenantlessOperationRunViewer.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Filament/Resources/OperationRunResource.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/OperationsDbOnlyRenderTest.php`
|
||||
|
||||
**Checkpoint**: User Story 2 is complete when operators can distinguish normal active work from stale or reconciled runs on Monitoring surfaces without losing canonical authorization or DB-only rendering guarantees.
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: User Story 3 - Prevent Repeat Incidents Through Lifecycle Contracts (Priority: P3)
|
||||
|
||||
**Goal**: Enforce explicit lifecycle policy, timeout strategy, and guardrails so covered jobs cannot silently drift back into ambiguous run truth.
|
||||
|
||||
**Independent Test**: Verify that the covered lifecycle policy rejects misaligned timeout versus `retry_after` settings (for example, a job timeout that is not safely shorter than the queue connection's `retry_after`), that covered jobs declare explicit lifecycle behavior, and that Ops-UX guard tests fail if service ownership or notification constraints regress.
|
||||
|
||||
### Tests for User Story 3
|
||||
|
||||
- [X] T027 [P] [US3] Add lifecycle policy and timeout invariant coverage in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationLifecycleTimingGuardTest.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Unit/Operations/OperationLifecyclePolicyValidatorTest.php`
|
||||
- [X] T028 [P] [US3] Add covered-job lifecycle contract coverage in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/BaselineOperationRunGuardTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Inventory/RunInventorySyncJobTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/BackupScheduling/RunBackupScheduleJobCompatibilityTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/TenantReview/TenantReviewOperationsUxTest.php`
|
||||
- [X] T029 [P] [US3] Add Ops-UX regression guard coverage for service-owned transitions, notification discipline, and initiator-null behavior in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Notifications/OperationRunNotificationTest.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Guards/OperationLifecycleOpsUxGuardTest.php`
|
||||
|
||||
### Implementation for User Story 3
|
||||
|
||||
- [X] T030 [US3] Implement lifecycle policy validation and timeout-versus-`retry_after` enforcement for the exact covered V1 operation set in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Services/Operations/OperationLifecyclePolicyValidator.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/config/tenantpilot.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/config/queue.php`
|
||||
- [X] T031 [US3] Align covered job timeout and failure-contract declarations in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/CaptureBaselineSnapshotJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/CompareBaselineToTenantJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/BulkBackupSetRestoreJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/BulkTenantSyncJob.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/SyncPoliciesJob.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Jobs/ComposeTenantReviewJob.php`
|
||||
- [X] T032 [US3] Preserve canonical Monitoring authorization and capability semantics for reconciled lifecycle states in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Policies/OperationRunPolicy.php` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/Operations/OperationRunCapabilityResolver.php`
|
||||
- [X] T033 [US3] Normalize operator-safe lifecycle copy and diagnostics boundaries in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/ReasonTranslation/ReasonPresenter.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app/Support/OpsUx/OperationUxPresenter.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/resources/views/filament/pages/monitoring/operations.blade.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/resources/views/filament/pages/operations/tenantless-operation-run-viewer.blade.php`
|
||||
|
||||
**Checkpoint**: User Story 3 is complete when covered jobs and runtime settings have explicit lifecycle contracts and guard tests catch timing or ownership regressions before they reintroduce orphaned runs.
|
||||
|
||||
---
|
||||
|
||||
## Phase 6: Polish & Cross-Cutting Concerns
|
||||
|
||||
**Purpose**: Validate the full feature slice, format touched files, and complete the manual smoke pass.
|
||||
|
||||
- [X] T034 [P] Run the focused Pest suites from `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/160-operation-lifecycle-guarantees/quickstart.md` covering `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OperationRunServiceTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/OperationRunServiceStaleQueuedRunTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/TrackOperationRunMiddlewareTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationLifecycleReconciliationTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Console/ReconcileOperationRunsCommandTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationRunFailedJobBridgeTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Monitoring/MonitoringOperationsTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Filament/OperationRunEnterpriseDetailPageTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/TenantlessOperationRunViewerTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/RunAuthorizationTenantIsolationTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Feature/Operations/OperationLifecycleTimingGuardTest.php`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Unit/Badges/OperationRunBadgesTest.php`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests/Unit/Operations/OperationLifecyclePolicyValidatorTest.php`
|
||||
- [X] T035 Run formatting for touched files under `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/app`, `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/config`, and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/tests` with `vendor/bin/sail bin pint --dirty --format agent`
|
||||
- [X] T036 [P] Validate the manual smoke checklist in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/160-operation-lifecycle-guarantees/quickstart.md` against `/admin/operations`, `/admin/operations/{run}`, and the affected operation start surfaces for baseline capture, baseline compare, restore execution, backup schedule execution, inventory sync, and tenant review generation
|
||||
- [X] T037 [P] Document worker timeout, `retry_after`, `queue:restart`, and stop-wait expectations in `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/specs/160-operation-lifecycle-guarantees/quickstart.md` and `/Users/ahmeddarrazi/Documents/projects/TenantAtlas/docs/HANDOVER.md`
|
||||
|
||||
---
|
||||
|
||||
## Dependencies & Execution Order
|
||||
|
||||
### Phase Dependencies
|
||||
|
||||
- **Phase 1: Setup** has no dependencies and can start immediately.
|
||||
- **Phase 2: Foundational** depends on Phase 1 and blocks all user story work.
|
||||
- **Phase 3: User Story 1** depends on Phase 2 and delivers the MVP.
|
||||
- **Phase 4: User Story 2** depends on Phase 2 and benefits from User Story 1 because it needs reconciled lifecycle evidence to display.
|
||||
- **Phase 5: User Story 3** depends on Phase 2 and is safest after User Story 1 because it formalizes the lifecycle policy used by the reconciliation and failed-job bridge paths.
|
||||
- **Phase 6: Polish** depends on all desired user stories being complete.
|
||||
|
||||
### User Story Dependencies
|
||||
|
||||
- **User Story 1 (P1)** can start immediately after the foundational phase and is the MVP slice.
|
||||
- **User Story 2 (P2)** can start after the foundational phase but is easiest once User Story 1 provides the reconciled lifecycle states to present.
|
||||
- **User Story 3 (P3)** can start after the foundational phase but should land after User Story 1 to avoid validating contracts against outdated bridge behavior.
|
||||
|
||||
### Within Each User Story
|
||||
|
||||
- Write or extend tests first and confirm they fail before implementation.
|
||||
- Policy and support-layer changes should land before command, job, and UI adoption.
|
||||
- Reconciliation flow should stabilize before UI semantics consume its metadata.
|
||||
- Story-level regression coverage should pass before moving to the next priority story.
|
||||
|
||||
### Parallel Opportunities
|
||||
|
||||
- `T001`, `T002`, `T003`, and `T004` can run in parallel because they prepare separate regression targets.
|
||||
- `T005` and `T006` can run in parallel before the service and reconciler wiring tasks.
|
||||
- `T011`, `T012`, and `T013` can run in parallel within User Story 1.
|
||||
- `T019`, `T020`, and `T021` can run in parallel within User Story 2.
|
||||
- `T027`, `T028`, and `T029` can run in parallel within User Story 3.
|
||||
- `T034`, `T036`, and `T037` can run in parallel after implementation is complete.
|
||||
|
||||
---
|
||||
|
||||
## Parallel Example: User Story 1
|
||||
|
||||
```bash
|
||||
# Run the P1 regression additions together:
|
||||
Task: "Add stale queued, stale running, fresh-run non-interference, and idempotency coverage in tests/Feature/OperationRunServiceStaleQueuedRunTest.php and tests/Feature/Operations/OperationLifecycleReconciliationTest.php"
|
||||
Task: "Add direct queue-failure bridge coverage for exhausted-attempt and timeout paths in tests/Feature/TrackOperationRunMiddlewareTest.php and tests/Feature/Operations/OperationRunFailedJobBridgeTest.php"
|
||||
Task: "Add reconciliation command and coexistence coverage for scheduled healing paths in tests/Feature/Console/ReconcileOperationRunsCommandTest.php, tests/Feature/Console/ReconcileBackupScheduleOperationRunsCommandTest.php, and tests/Feature/OpsUx/AdapterRunReconcilerTest.php"
|
||||
```
|
||||
|
||||
## Parallel Example: User Story 2
|
||||
|
||||
```bash
|
||||
# Split list/detail truth semantics, badge semantics, and auth coverage:
|
||||
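Task: "Add Operations index, aggregate reconciliation visibility, and run-detail truth-semantics coverage in tests/Feature/Monitoring/MonitoringOperationsTest.php, tests/Feature/Monitoring/OperationLifecycleAggregateVisibilityTest.php, and tests/Feature/Filament/OperationRunEnterpriseDetailPageTest.php"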
|
||||
Task: "Add freshness-state and badge mapping coverage in tests/Unit/Badges/OperationRunBadgesTest.php and tests/Feature/Monitoring/OperationLifecycleFreshnessPresentationTest.php"
|
||||
Task: "Add positive and negative canonical Monitoring authorization coverage for stale or reconciled runs in tests/Feature/Operations/TenantlessOperationRunViewerTest.php and tests/Feature/RunAuthorizationTenantIsolationTest.php"
|
||||
```
|
||||
|
||||
## Parallel Example: User Story 3
|
||||
|
||||
```bash
|
||||
# Split policy, job-contract, and Ops-UX guard work:
|
||||
Task: "Add lifecycle policy and timeout invariant coverage in tests/Feature/Operations/OperationLifecycleTimingGuardTest.php and tests/Unit/Operations/OperationLifecyclePolicyValidatorTest.php"
|
||||
Task: "Add covered-job lifecycle contract coverage in tests/Feature/Operations/BaselineOperationRunGuardTest.php, tests/Feature/Inventory/RunInventorySyncJobTest.php, tests/Feature/BackupScheduling/RunBackupScheduleJobCompatibilityTest.php, and tests/Feature/TenantReview/TenantReviewOperationsUxTest.php"
|
||||
Task: "Add Ops-UX regression guard coverage for service-owned transitions, notification discipline, and initiator-null behavior in tests/Feature/Notifications/OperationRunNotificationTest.php and tests/Feature/Guards/OperationLifecycleOpsUxGuardTest.php"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Strategy
|
||||
|
||||
### MVP First
|
||||
|
||||
1. Complete Phase 1: Setup.
|
||||
2. Complete Phase 2: Foundational.
|
||||
3. Complete Phase 3: User Story 1.
|
||||
4. **Stop and validate** that orphaned queued and running runs now converge to terminal failed truth without manual intervention.
|
||||
|
||||
### Incremental Delivery
|
||||
|
||||
1. Deliver User Story 1 to close the integrity gap and establish direct failure bridging plus stale-run healing.
|
||||
2. Deliver User Story 2 to make Monitoring surfaces reflect that lifecycle truth honestly.
|
||||
3. Deliver User Story 3 to formalize covered-job contracts and timing guards so the same incident class does not recur.
|
||||
4. Finish with Phase 6 regression execution, formatting, and manual smoke validation.
|
||||
|
||||
### Team Strategy
|
||||
|
||||
1. One engineer should own the foundational lifecycle policy and reconciliation seam in `app/Services/OperationRunService.php`, `app/Services/Operations/OperationLifecycleReconciler.php`, and `config/tenantpilot.php`.
|
||||
2. A second engineer can prepare User Story 1 regression coverage in parallel during the foundational phase.
|
||||
3. Monitoring and badge semantics for User Story 2 can be developed separately once the reconciled metadata contract is stable.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- `[P]` tasks touch separate files and can be executed in parallel.
|
||||
- Each user story remains independently testable after the foundational phase.
|
||||
- This feature does not require a schema migration in the first slice.
|
||||
- Keep lifecycle truth service-owned and operator-facing copy domain-safe across every touched surface.
|
||||
@ -41,10 +41,9 @@
|
||||
|
||||
Livewire::actingAs($user)->test(AuditLogPage::class)
|
||||
->assertCanSeeTableRecords([$audit])
|
||||
->callTableAction('inspect', $audit)
|
||||
->assertSet('selectedAuditLogId', (int) $audit->getKey())
|
||||
->assertSee('Drift finding #'.$finding->getKey())
|
||||
->assertSee('Open finding');
|
||||
->mountTableAction('inspect', $audit)
|
||||
->assertMountedActionModalSee('Drift finding #'.$finding->getKey())
|
||||
->assertMountedActionModalSee('Open finding');
|
||||
});
|
||||
|
||||
it('keeps deleted findings readable while suppressing finding drill-down links', function (): void {
|
||||
@ -81,10 +80,9 @@
|
||||
|
||||
Livewire::actingAs($user)->test(AuditLogPage::class)
|
||||
->assertCanSeeTableRecords([$audit])
|
||||
->callTableAction('inspect', $audit)
|
||||
->assertSet('selectedAuditLogId', (int) $audit->getKey())
|
||||
->assertSee('Permission posture finding #'.$findingId)
|
||||
->assertDontSee('Open finding');
|
||||
->mountTableAction('inspect', $audit)
|
||||
->assertMountedActionModalSee('Permission posture finding #'.$findingId)
|
||||
->assertMountedActionModalDontSee('Open finding');
|
||||
});
|
||||
|
||||
it('does not render internal audit bookkeeping metadata in the inspection view', function (): void {
|
||||
@ -120,12 +118,11 @@
|
||||
|
||||
Livewire::actingAs($user)->test(AuditLogPage::class)
|
||||
->assertCanSeeTableRecords([$audit])
|
||||
->callTableAction('inspect', $audit)
|
||||
->assertSet('selectedAuditLogId', (int) $audit->getKey())
|
||||
->assertDontSee('_dedupe_key')
|
||||
->assertDontSee('internal-bookkeeping-marker')
|
||||
->assertDontSee('_actor_type')
|
||||
->assertDontSee('hidden-actor-marker');
|
||||
->mountTableAction('inspect', $audit)
|
||||
->assertMountedActionModalDontSee('_dedupe_key')
|
||||
->assertMountedActionModalDontSee('internal-bookkeeping-marker')
|
||||
->assertMountedActionModalDontSee('_actor_type')
|
||||
->assertMountedActionModalDontSee('hidden-actor-marker');
|
||||
});
|
||||
|
||||
it('hides finding audit rows for tenants outside the viewer entitlement scope', function (): void {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
<?php
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\RunBackupScheduleJob;
|
||||
|
||||
it('serializes without the legacy backup schedule run id field', function (): void {
|
||||
@ -21,3 +22,11 @@
|
||||
expect($job)->toBeInstanceOf(RunBackupScheduleJob::class)
|
||||
->and($job->backupScheduleId)->toBe(42);
|
||||
});
|
||||
|
||||
// Pins the lifecycle contract declared on RunBackupScheduleJob: the failed-run bridge trait is
// not used, the timeout is 300, and failOnTimeout is enabled.
it('declares the backup schedule lifecycle contract explicitly', function (): void {
    $job = new RunBackupScheduleJob(operationRun: null, backupScheduleId: 42);
    $traits = class_uses_recursive($job);

    expect($traits)->not->toContain(BridgesFailedOperationRun::class);
    expect($job->timeout)->toBe(300);
    expect($job->failOnTimeout)->toBeTrue();
});
|
||||
|
||||
@ -52,28 +52,7 @@
|
||||
],
|
||||
]);
|
||||
|
||||
$snapshot2 = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $profile->getKey(),
|
||||
'captured_at' => now()->subMinute(),
|
||||
]);
|
||||
|
||||
BaselineSnapshotItem::factory()->create([
|
||||
'baseline_snapshot_id' => (int) $snapshot2->getKey(),
|
||||
'subject_type' => 'policy',
|
||||
'subject_external_id' => $workspaceSafeExternalId,
|
||||
'subject_key' => (string) $subjectKey,
|
||||
'policy_type' => 'deviceConfiguration',
|
||||
'baseline_hash' => hash('sha256', 'baseline-v2'),
|
||||
'meta_jsonb' => [
|
||||
'display_name' => $displayName,
|
||||
'evidence' => [
|
||||
'fidelity' => 'meta',
|
||||
'source' => 'inventory',
|
||||
'observed_at' => now()->toIso8601String(),
|
||||
],
|
||||
],
|
||||
]);
|
||||
$profile->update(['active_snapshot_id' => (int) $snapshot1->getKey()]);
|
||||
|
||||
$inventorySyncRun = createInventorySyncOperationRunWithCoverage(
|
||||
tenant: $tenant,
|
||||
@ -134,7 +113,32 @@
|
||||
expect($finding->times_seen)->toBe(1);
|
||||
expect((string) $finding->fingerprint)->toBe($fingerprint);
|
||||
|
||||
// Compare against a different baseline snapshot (hash changes), but recurrence identity stays stable.
|
||||
$snapshot2 = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $profile->getKey(),
|
||||
'captured_at' => now()->subMinute(),
|
||||
]);
|
||||
|
||||
BaselineSnapshotItem::factory()->create([
|
||||
'baseline_snapshot_id' => (int) $snapshot2->getKey(),
|
||||
'subject_type' => 'policy',
|
||||
'subject_external_id' => $workspaceSafeExternalId,
|
||||
'subject_key' => (string) $subjectKey,
|
||||
'policy_type' => 'deviceConfiguration',
|
||||
'baseline_hash' => hash('sha256', 'baseline-v2'),
|
||||
'meta_jsonb' => [
|
||||
'display_name' => $displayName,
|
||||
'evidence' => [
|
||||
'fidelity' => 'meta',
|
||||
'source' => 'inventory',
|
||||
'observed_at' => now()->toIso8601String(),
|
||||
],
|
||||
],
|
||||
]);
|
||||
|
||||
$profile->update(['active_snapshot_id' => (int) $snapshot2->getKey()]);
|
||||
|
||||
// Compare against a newer current baseline snapshot (hash changes), but recurrence identity stays stable.
|
||||
$run2 = $opService->ensureRunWithIdentity(
|
||||
tenant: $tenant,
|
||||
type: OperationRunType::BaselineCompare->value,
|
||||
@ -235,11 +239,7 @@ function rbacRecurrenceSnapshot(string $displayName, string $description, array
|
||||
'captured_at' => $capturedAt->copy(),
|
||||
]);
|
||||
|
||||
$snapshot2 = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $profile->getKey(),
|
||||
'captured_at' => $capturedAt->copy()->addMinute(),
|
||||
]);
|
||||
$profile->update(['active_snapshot_id' => (int) $snapshot1->getKey()]);
|
||||
|
||||
$provider = app(\App\Services\Baselines\Evidence\ContentEvidenceProvider::class);
|
||||
|
||||
@ -267,30 +267,6 @@ function rbacRecurrenceSnapshot(string $displayName, string $description, array
|
||||
],
|
||||
]);
|
||||
|
||||
BaselineSnapshotItem::factory()->create([
|
||||
'baseline_snapshot_id' => (int) $snapshot2->getKey(),
|
||||
'subject_type' => 'policy',
|
||||
'subject_external_id' => (string) $workspaceSafeExternalId,
|
||||
'subject_key' => (string) $subjectKey,
|
||||
'policy_type' => 'intuneRoleDefinition',
|
||||
'baseline_hash' => $provider->fromPolicyVersion($baselineVersionTwo, (string) $workspaceSafeExternalId)->hash,
|
||||
'meta_jsonb' => [
|
||||
'display_name' => $displayName,
|
||||
'evidence' => [
|
||||
'fidelity' => 'content',
|
||||
'source' => 'policy_version',
|
||||
'observed_at' => $baselineVersionTwo->captured_at?->toIso8601String(),
|
||||
],
|
||||
'version_reference' => [
|
||||
'policy_version_id' => (int) $baselineVersionTwo->getKey(),
|
||||
],
|
||||
'rbac' => [
|
||||
'is_built_in' => false,
|
||||
'role_permission_count' => 1,
|
||||
],
|
||||
],
|
||||
]);
|
||||
|
||||
$currentVersion = \App\Models\PolicyVersion::factory()->create([
|
||||
'tenant_id' => (int) $tenant->getKey(),
|
||||
'policy_id' => (int) $policy->getKey(),
|
||||
@ -361,6 +337,38 @@ function rbacRecurrenceSnapshot(string $displayName, string $description, array
|
||||
expect($finding->recurrence_key)->toBe($fingerprint)
|
||||
->and($firstDiffFingerprint)->not->toBe('');
|
||||
|
||||
$snapshot2 = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $profile->getKey(),
|
||||
'captured_at' => $capturedAt->copy()->addMinute(),
|
||||
]);
|
||||
|
||||
BaselineSnapshotItem::factory()->create([
|
||||
'baseline_snapshot_id' => (int) $snapshot2->getKey(),
|
||||
'subject_type' => 'policy',
|
||||
'subject_external_id' => (string) $workspaceSafeExternalId,
|
||||
'subject_key' => (string) $subjectKey,
|
||||
'policy_type' => 'intuneRoleDefinition',
|
||||
'baseline_hash' => $provider->fromPolicyVersion($baselineVersionTwo, (string) $workspaceSafeExternalId)->hash,
|
||||
'meta_jsonb' => [
|
||||
'display_name' => $displayName,
|
||||
'evidence' => [
|
||||
'fidelity' => 'content',
|
||||
'source' => 'policy_version',
|
||||
'observed_at' => $baselineVersionTwo->captured_at?->toIso8601String(),
|
||||
],
|
||||
'version_reference' => [
|
||||
'policy_version_id' => (int) $baselineVersionTwo->getKey(),
|
||||
],
|
||||
'rbac' => [
|
||||
'is_built_in' => false,
|
||||
'role_permission_count' => 1,
|
||||
],
|
||||
],
|
||||
]);
|
||||
|
||||
$profile->update(['active_snapshot_id' => (int) $snapshot2->getKey()]);
|
||||
|
||||
$run2 = $opService->ensureRunWithIdentity(
|
||||
tenant: $tenant,
|
||||
type: OperationRunType::BaselineCompare->value,
|
||||
|
||||
@ -59,12 +59,14 @@
|
||||
expect($operationRun->outcome)->toBe('failed');
|
||||
expect($operationRun->failure_summary)->toMatchArray([
|
||||
[
|
||||
'code' => 'backup_schedule.stalled',
|
||||
'message' => 'Backup schedule run exceeded reconciliation timeout and was marked failed.',
|
||||
'reason_code' => 'unknown_error',
|
||||
'code' => 'run.stale_running',
|
||||
'message' => 'The run stayed active past its lifecycle window and was marked failed.',
|
||||
'reason_code' => 'run.stale_running',
|
||||
],
|
||||
]);
|
||||
expect($operationRun->context)->toMatchArray([
|
||||
'backup_schedule_id' => (int) $schedule->id,
|
||||
]);
|
||||
expect(data_get($operationRun->context, 'backup_schedule_id'))->toBe((int) $schedule->id)
|
||||
->and(data_get($operationRun->context, 'reason_code'))->toBe('run.stale_running')
|
||||
->and(data_get($operationRun->context, 'reconciliation.reason'))->toBe('run.stale_running')
|
||||
->and(data_get($operationRun->context, 'reconciliation.reason_code'))->toBe('run.stale_running')
|
||||
->and(data_get($operationRun->context, 'reconciliation.source'))->toBe('scheduled_reconciler');
|
||||
});
|
||||
|
||||
57
tests/Feature/Console/ReconcileOperationRunsCommandTest.php
Normal file
57
tests/Feature/Console/ReconcileOperationRunsCommandTest.php
Normal file
@ -0,0 +1,57 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
|
||||
uses(RefreshDatabase::class);
|
||||
|
||||
// Runs the reconcile command scoped to one tenant and the policy.sync type, then checks that the
// queued run created 20 minutes ago is completed with the run.stale_queued reason code.
it('reconciles stale covered runs from the console command', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $attributes = [
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'type' => 'policy.sync',
        'status' => OperationRunStatus::Queued->value,
        'outcome' => OperationRunOutcome::Pending->value,
        'created_at' => now()->subMinutes(20),
    ];
    $run = OperationRun::factory()->create($attributes);

    $options = [
        '--tenant' => [(string) $tenant->getKey()],
        '--type' => ['policy.sync'],
    ];

    // Kept as a single expression statement so the pending command executes right here.
    $this->artisan('tenantpilot:operation-runs:reconcile', $options)
        ->assertSuccessful()
        ->expectsOutputToContain('reconciled 1');

    $reconciled = $run->fresh();

    expect($reconciled->status)->toBe(OperationRunStatus::Completed->value);
    expect(data_get($reconciled->context, 'reconciliation.reason_code'))->toBe('run.stale_queued');
});
|
||||
|
||||
// Runs the reconcile command with --dry-run and verifies the queued run keeps its original
// status and outcome afterwards.
it('supports dry-run mode without mutating runs', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $attributes = [
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'type' => 'policy.sync',
        'status' => OperationRunStatus::Queued->value,
        'outcome' => OperationRunOutcome::Pending->value,
        'created_at' => now()->subMinutes(20),
    ];
    $run = OperationRun::factory()->create($attributes);

    $options = [
        '--tenant' => [(string) $tenant->getKey()],
        '--type' => ['policy.sync'],
        '--dry-run' => true,
    ];

    // Kept as a single expression statement so the pending command executes right here.
    $this->artisan('tenantpilot:operation-runs:reconcile', $options)->assertSuccessful();

    $untouched = $run->fresh();

    expect($untouched->status)->toBe(OperationRunStatus::Queued->value);
    expect($untouched->outcome)->toBe(OperationRunOutcome::Pending->value);
});
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
use App\Filament\Widgets\Dashboard\BaselineCompareNow;
|
||||
use App\Models\BaselineProfile;
|
||||
use App\Models\BaselineSnapshot;
|
||||
use App\Models\BaselineTenantAssignment;
|
||||
use App\Models\OperationRun;
|
||||
use Filament\Facades\Filament;
|
||||
@ -20,6 +21,13 @@
|
||||
'name' => 'Baseline A',
|
||||
]);
|
||||
|
||||
$snapshot = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $profile->getKey(),
|
||||
]);
|
||||
|
||||
$profile->update(['active_snapshot_id' => (int) $snapshot->getKey()]);
|
||||
|
||||
BaselineTenantAssignment::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'tenant_id' => (int) $tenant->getKey(),
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
|
||||
uses(RefreshDatabase::class);
|
||||
|
||||
it('shows snapshot fidelity counts and gap state on list and view pages', function (): void {
|
||||
it('shows snapshot fidelity counts and evidence gap totals on list and view pages', function (): void {
|
||||
[$user, $tenant] = createUserWithTenant(role: 'readonly');
|
||||
|
||||
$profile = BaselineProfile::factory()->active()->create([
|
||||
@ -41,8 +41,6 @@
|
||||
$this->actingAs($user)
|
||||
->get(BaselineSnapshotResource::getUrl(panel: 'admin'))
|
||||
->assertOk()
|
||||
->assertSee('No follow-up needed')
|
||||
->assertSee('Coverage gaps need review')
|
||||
->assertSee('Detailed evidence 5, Metadata only 0')
|
||||
->assertSee('Detailed evidence 3, Metadata only 2');
|
||||
|
||||
@ -50,7 +48,6 @@
|
||||
->get(BaselineSnapshotResource::getUrl('view', ['record' => $withGaps], panel: 'admin'))
|
||||
->assertOk()
|
||||
->assertSee('Coverage summary')
|
||||
->assertSee('Coverage gaps need review')
|
||||
->assertSee('Detailed evidence 3, Metadata only 2')
|
||||
->assertSee('Evidence gaps')
|
||||
->assertSee('2');
|
||||
@ -59,7 +56,6 @@
|
||||
->get(BaselineSnapshotResource::getUrl('view', ['record' => $complete], panel: 'admin'))
|
||||
->assertOk()
|
||||
->assertSee('Coverage summary')
|
||||
->assertSee('No follow-up needed')
|
||||
->assertSee('Detailed evidence 5, Metadata only 0')
|
||||
->assertSee('Evidence gaps')
|
||||
->assertSee('0');
|
||||
|
||||
@ -34,7 +34,7 @@ function baselineSnapshotFilterIndicatorLabels($component): array
|
||||
->all();
|
||||
}
|
||||
|
||||
it('filters baseline snapshots by baseline, state, and captured date inside the current workspace', function (): void {
|
||||
it('filters baseline snapshots by baseline, lifecycle, and captured date inside the current workspace', function (): void {
|
||||
[$user, $tenant] = createUserWithTenant(role: 'owner');
|
||||
|
||||
session([WorkspaceContext::SESSION_KEY => (int) $tenant->workspace_id]);
|
||||
@ -59,11 +59,12 @@ function baselineSnapshotFilterIndicatorLabels($component): array
|
||||
$matching = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $baselineA->getKey(),
|
||||
'lifecycle_state' => 'complete',
|
||||
'captured_at' => now()->subDay(),
|
||||
'summary_jsonb' => baselineSnapshotSummary(3, 0, 2),
|
||||
]);
|
||||
|
||||
$wrongState = BaselineSnapshot::factory()->create([
|
||||
$wrongLifecycle = BaselineSnapshot::factory()->incomplete()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $baselineA->getKey(),
|
||||
'captured_at' => now()->subDay(),
|
||||
@ -94,11 +95,11 @@ function baselineSnapshotFilterIndicatorLabels($component): array
|
||||
$component = Livewire::actingAs($user)
|
||||
->test(ListBaselineSnapshots::class)
|
||||
->filterTable('baseline_profile_id', (string) $baselineA->getKey())
|
||||
->filterTable('snapshot_state', 'gaps_present')
|
||||
->filterTable('lifecycle_state', 'complete')
|
||||
->set('tableFilters.captured_at.from', now()->subDays(2)->toDateString())
|
||||
->set('tableFilters.captured_at.until', now()->toDateString())
|
||||
->assertCanSeeTableRecords([$matching])
|
||||
->assertCanNotSeeTableRecords([$wrongState, $wrongBaseline, $outsideWindow, $otherWorkspaceSnapshot]);
|
||||
->assertCanNotSeeTableRecords([$wrongLifecycle, $wrongBaseline, $outsideWindow, $otherWorkspaceSnapshot]);
|
||||
|
||||
expect(baselineSnapshotFilterIndicatorLabels($component))
|
||||
->toContain('Captured from '.now()->subDays(2)->toFormattedDateString())
|
||||
@ -117,11 +118,12 @@ function baselineSnapshotFilterIndicatorLabels($component): array
|
||||
$withGaps = BaselineSnapshot::factory()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $baseline->getKey(),
|
||||
'lifecycle_state' => 'complete',
|
||||
'captured_at' => now()->subHour(),
|
||||
'summary_jsonb' => baselineSnapshotSummary(2, 0, 1),
|
||||
]);
|
||||
|
||||
$complete = BaselineSnapshot::factory()->create([
|
||||
$building = BaselineSnapshot::factory()->building()->create([
|
||||
'workspace_id' => (int) $tenant->workspace_id,
|
||||
'baseline_profile_id' => (int) $baseline->getKey(),
|
||||
'captured_at' => now()->subDays(5),
|
||||
@ -130,15 +132,15 @@ function baselineSnapshotFilterIndicatorLabels($component): array
|
||||
|
||||
$component = Livewire::actingAs($user)
|
||||
->test(ListBaselineSnapshots::class)
|
||||
->filterTable('snapshot_state', 'gaps_present')
|
||||
->filterTable('lifecycle_state', 'complete')
|
||||
->set('tableFilters.captured_at.from', now()->subDays(2)->toDateString())
|
||||
->set('tableFilters.captured_at.until', now()->toDateString())
|
||||
->assertCanSeeTableRecords([$withGaps])
|
||||
->assertCanNotSeeTableRecords([$complete]);
|
||||
->assertCanNotSeeTableRecords([$building]);
|
||||
|
||||
$component
|
||||
->set('tableFilters.snapshot_state.value', null)
|
||||
->set('tableFilters.lifecycle_state.value', null)
|
||||
->set('tableFilters.captured_at.from', null)
|
||||
->set('tableFilters.captured_at.until', null)
|
||||
->assertCanSeeTableRecords([$withGaps, $complete]);
|
||||
->assertCanSeeTableRecords([$withGaps, $building]);
|
||||
});
|
||||
|
||||
@ -75,6 +75,10 @@ function visiblePageText(TestResponse $response): string
|
||||
->and($runSummaryPosition)->toBeLessThan($relatedContextPosition)
|
||||
->and($relatedContextPosition)->toBeLessThan($countsPosition)
|
||||
->and($countsPosition)->toBeLessThan($identityHashPosition);
|
||||
|
||||
expect((string) $response->getContent())
|
||||
->toMatch('/fi-section-header-heading[^>]*>\s*Current state\s*</')
|
||||
->toMatch('/fi-section-header-heading[^>]*>\s*Timing\s*</');
|
||||
});
|
||||
|
||||
it('keeps header navigation and related context visible for tenant-bound operation runs', function (): void {
|
||||
@ -184,3 +188,40 @@ function visiblePageText(TestResponse $response): string
|
||||
->assertSee('Verification report unavailable')
|
||||
->assertDontSee('Counts');
|
||||
});
|
||||
|
||||
// Seeds a completed/failed restore.execute run whose context carries reconciliation metadata and
// checks that the operation view page shows the lifecycle reconciliation labels.
it('renders lifecycle reconciliation diagnostics for reconciled runs', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    Filament::setTenant(null, true);

    $workspaceId = (int) $tenant->workspace_id;
    $reasonCode = 'run.adapter_out_of_sync';

    $reconciliation = [
        'reconciled_at' => now()->toIso8601String(),
        'reason' => $reasonCode,
        'reason_code' => $reasonCode,
        'source' => 'adapter_reconciler',
    ];

    $failure = [
        'code' => $reasonCode,
        'reason_code' => $reasonCode,
        'message' => 'A related restore record reached terminal truth before the operation run was updated.',
    ];

    $run = OperationRun::factory()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => (int) $tenant->getKey(),
        'type' => 'restore.execute',
        'status' => OperationRunStatus::Completed->value,
        'outcome' => OperationRunOutcome::Failed->value,
        'context' => [
            'reason_code' => $reasonCode,
            'reconciliation' => $reconciliation,
        ],
        'failure_summary' => [$failure],
    ]);

    $response = $this->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => $workspaceId])
        ->get(route('admin.operations.view', ['run' => (int) $run->getKey()]));

    $response->assertOk();

    foreach (['Lifecycle reconciliation', 'Automatically reconciled', 'Reconciled by', 'Adapter reconciler'] as $label) {
        $response->assertSee($label);
    }
});
|
||||
|
||||
34
tests/Feature/Guards/OperationLifecycleOpsUxGuardTest.php
Normal file
34
tests/Feature/Guards/OperationLifecycleOpsUxGuardTest.php
Normal file
@ -0,0 +1,34 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Services\Operations\OperationLifecyclePolicyValidator;
|
||||
use Tests\Support\OpsUx\SourceFileScanner;
|
||||
|
||||
// Source-level guard: the lifecycle policy validator must report a valid policy, the
// OperationRunService source must still contain the listed markers, and every listed job
// source must still mention the BridgesFailedOperationRun trait.
it('keeps lifecycle bridge ownership and initiator-null notification discipline intact', function (): void {
    $validation = app(OperationLifecyclePolicyValidator::class)->validate();

    expect($validation['valid'])->toBeTrue();

    $root = SourceFileScanner::projectRoot();
    $serviceSource = SourceFileScanner::read($root.'/app/Services/OperationRunService.php');

    expect($serviceSource)->toContain('if ($run->user instanceof User)');
    expect($serviceSource)->toContain('OperationRunCompletedNotification');
    expect($serviceSource)->toContain('bridgeFailedJobFailure');

    $bridgedJobFiles = [
        '/app/Jobs/CaptureBaselineSnapshotJob.php',
        '/app/Jobs/CompareBaselineToTenantJob.php',
        '/app/Jobs/RunInventorySyncJob.php',
        '/app/Jobs/SyncPoliciesJob.php',
        '/app/Jobs/BulkTenantSyncJob.php',
        '/app/Jobs/BulkBackupSetRestoreJob.php',
        '/app/Jobs/ComposeTenantReviewJob.php',
    ];

    foreach ($bridgedJobFiles as $relativePath) {
        $jobSource = SourceFileScanner::read($root.$relativePath);

        expect($jobSource)->toContain('BridgesFailedOperationRun');
    }
})->group('ops-ux');
|
||||
@ -1,6 +1,8 @@
|
||||
<?php
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Jobs\RunInventorySyncJob;
|
||||
use App\Models\OperationRun;
|
||||
use App\Notifications\OperationRunCompleted;
|
||||
use App\Services\Graph\GraphClientInterface;
|
||||
use App\Services\Intune\AuditLogger;
|
||||
@ -75,7 +77,7 @@
|
||||
'notifiable_id' => $user->getKey(),
|
||||
'notifiable_type' => $user->getMorphClass(),
|
||||
'type' => OperationRunCompleted::class,
|
||||
'data->title' => 'Inventory sync completed',
|
||||
'data->title' => 'Inventory sync completed successfully',
|
||||
]);
|
||||
});
|
||||
|
||||
@ -148,6 +150,18 @@
|
||||
'notifiable_id' => $user->getKey(),
|
||||
'notifiable_type' => $user->getMorphClass(),
|
||||
'type' => OperationRunCompleted::class,
|
||||
'data->title' => 'Inventory sync failed',
|
||||
'data->title' => 'Inventory sync execution failed',
|
||||
]);
|
||||
});
|
||||
|
||||
// Pins the lifecycle contract declared on RunInventorySyncJob: it uses the failed-run bridge
// trait, has a timeout of 240, and enables failOnTimeout.
it('declares the inventory sync lifecycle contract explicitly', function (): void {
    $operationRun = OperationRun::factory()->make();

    $job = new RunInventorySyncJob(
        tenantId: 1,
        userId: 1,
        operationRun: $operationRun,
    );

    $traits = class_uses_recursive($job);

    expect($traits)->toContain(BridgesFailedOperationRun::class);
    expect($job->timeout)->toBe(240);
    expect($job->failOnTimeout)->toBeTrue();
});
|
||||
|
||||
@ -2,6 +2,16 @@
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Support\Filament\PanelThemeAsset;
|
||||
|
||||
// The resolved theme asset must point at a built file under /build/assets/ and must not
// reference port 5173.
it('resolves the built Filament admin theme from the manifest', function (): void {
    $resolved = PanelThemeAsset::resolve('resources/css/filament/admin/theme.css');

    expect($resolved)->toContain('/build/assets/theme-');
    expect($resolved)->not->toContain(':5173');
});
|
||||
|
||||
it('injects the Livewire intercept shim into Filament pages', function (): void {
|
||||
$this->get('/admin/login')
|
||||
->assertSuccessful()
|
||||
|
||||
167
tests/Feature/Monitoring/AuditLogInspectFlowTest.php
Normal file
167
tests/Feature/Monitoring/AuditLogInspectFlowTest.php
Normal file
@ -0,0 +1,167 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Filament\Pages\Monitoring\AuditLog as AuditLogPage;
|
||||
use App\Models\AuditLog;
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use App\Support\Workspaces\WorkspaceContext;
|
||||
use Filament\Facades\Filament;
|
||||
use Livewire\Livewire;
|
||||
|
||||
// Mounts the "inspect" table action for a workspace.selected audit row and checks that the
// mounted modal shows the event summary plus the two context section labels.
it('opens the selected audit event in a slideover inspection surface', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $workspaceId = (int) $tenant->workspace_id;

    $audit = AuditLog::query()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => (int) $tenant->getKey(),
        'actor_email' => 'owner@example.com',
        'actor_name' => 'Owner',
        'actor_type' => 'human',
        'action' => 'workspace.selected',
        'status' => 'success',
        'resource_type' => 'workspace',
        'resource_id' => (string) $tenant->workspace_id,
        'target_label' => 'Workspace 1',
        'summary' => 'Workspace selected for Workspace 1',
        'metadata' => [
            'reason' => 'chooser',
            'method' => 'manual',
        ],
        'recorded_at' => now(),
    ]);

    session()->put(WorkspaceContext::SESSION_KEY, $workspaceId);
    Filament::setTenant(null, true);

    $page = Livewire::actingAs($user)->test(AuditLogPage::class);

    $page
        ->assertCanSeeTableRecords([$audit])
        ->mountTableAction('inspect', $audit)
        ->assertMountedActionModalSee('Workspace selected for Workspace 1')
        ->assertMountedActionModalSee('Readable context')
        ->assertMountedActionModalSee('Technical metadata');
});
|
||||
|
||||
// Inspects an operation_run audit row first (the run link is expected), then swaps the mounted
// action to a workspace audit row and checks the run link and the previous summary are gone.
it('shows operation-run navigation only for the currently inspected operation run event', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $workspaceId = (int) $tenant->workspace_id;

    $run = OperationRun::factory()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => (int) $tenant->getKey(),
        'type' => 'baseline_compare',
        'status' => OperationRunStatus::Completed->value,
        'outcome' => OperationRunOutcome::Succeeded->value,
    ]);

    $runAudit = AuditLog::query()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => (int) $tenant->getKey(),
        'actor_email' => 'owner@example.com',
        'actor_name' => 'Owner',
        'actor_type' => 'human',
        'action' => 'operation_run.completed',
        'status' => 'success',
        'resource_type' => 'operation_run',
        'resource_id' => (string) $run->getKey(),
        'target_label' => 'Baseline compare #'.$run->getKey(),
        'summary' => 'Baseline compare completed for Operation run',
        'recorded_at' => now(),
    ]);

    $workspaceAudit = AuditLog::query()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => null,
        'actor_email' => 'owner@example.com',
        'actor_name' => 'Owner',
        'actor_type' => 'human',
        'action' => 'workspace.selected',
        'status' => 'success',
        'resource_type' => 'workspace',
        'resource_id' => (string) $tenant->workspace_id,
        'target_label' => 'Workspace 1',
        'summary' => 'Workspace selected for Workspace 1',
        'recorded_at' => now()->addSecond(),
    ]);

    session()->put(WorkspaceContext::SESSION_KEY, $workspaceId);
    Filament::setTenant(null, true);

    $page = Livewire::actingAs($user)
        ->test(AuditLogPage::class)
        ->assertCanSeeTableRecords([$runAudit, $workspaceAudit])
        ->mountTableAction('inspect', $runAudit)
        ->assertMountedActionModalSee('Open operation run');

    // Replace the mounted inspect action with the row that has no operation run attached.
    $page->call('replaceMountedTableAction', 'inspect', (string) $workspaceAudit->getKey());

    $page
        ->assertMountedActionModalSee('Workspace selected for Workspace 1')
        ->assertMountedActionModalDontSee('Open operation run')
        ->assertMountedActionModalDontSee('Baseline compare completed for Operation run');
});
|
||||
|
||||
// Mounts and then unmounts the "inspect" table action, expecting no table action to remain
// mounted on the component afterwards.
it('clearing the slideover closes the inspection surface cleanly', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $workspaceId = (int) $tenant->workspace_id;

    $audit = AuditLog::query()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => (int) $tenant->getKey(),
        'actor_email' => 'owner@example.com',
        'actor_name' => 'Owner',
        'actor_type' => 'human',
        'action' => 'workspace.selected',
        'status' => 'success',
        'resource_type' => 'workspace',
        'resource_id' => (string) $tenant->workspace_id,
        'target_label' => 'Workspace 1',
        'summary' => 'Workspace selected for Workspace 1',
        'recorded_at' => now(),
    ]);

    session()->put(WorkspaceContext::SESSION_KEY, $workspaceId);
    Filament::setTenant(null, true);

    $page = Livewire::actingAs($user)->test(AuditLogPage::class);

    $page->mountTableAction('inspect', $audit);
    $page->unmountTableAction();
    $page->assertTableActionNotMounted('inspect');

    $mountedAction = $page->instance()->getMountedTableAction();

    expect($mountedAction)->toBeNull();
});
|
||||
|
||||
// Loads the audit log page over HTTP with an operation_run audit row present and checks that
// the record-level labels are not rendered while no record is being inspected.
it('keeps record inspection actions out of the global page header', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $workspaceId = (int) $tenant->workspace_id;
    $tenantId = (int) $tenant->getKey();

    $run = OperationRun::factory()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => $tenantId,
        'type' => 'baseline_compare',
        'status' => OperationRunStatus::Completed->value,
        'outcome' => OperationRunOutcome::Succeeded->value,
    ]);

    AuditLog::query()->create([
        'workspace_id' => $workspaceId,
        'tenant_id' => $tenantId,
        'actor_email' => 'owner@example.com',
        'actor_name' => 'Owner',
        'actor_type' => 'human',
        'action' => 'operation_run.completed',
        'status' => 'success',
        'resource_type' => 'operation_run',
        'resource_id' => (string) $run->getKey(),
        'target_label' => 'Baseline compare #'.$run->getKey(),
        'summary' => 'Baseline compare completed for Operation run',
        'recorded_at' => now(),
    ]);

    $response = $this->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => $workspaceId])
        ->get(route('admin.monitoring.audit-log'));

    $response->assertOk();
    $response->assertDontSee('Close details');
    $response->assertDontSee('Open operation run');
});
|
||||
@ -52,3 +52,66 @@
|
||||
|
||||
Bus::assertNothingDispatched();
|
||||
});
|
||||
|
||||
// Seeds one running policy.sync run started 30 minutes ago and one completed/failed run that
// carries reconciliation metadata, then expects both aggregate notices on the operations page.
it('shows lifecycle aggregate visibility for stale and reconciled runs', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $workspaceId = (int) $tenant->workspace_id;
    $tenantId = (int) $tenant->getKey();
    $reasonCode = 'run.adapter_out_of_sync';

    // Still active run.
    OperationRun::factory()->create([
        'tenant_id' => $tenantId,
        'workspace_id' => $workspaceId,
        'type' => 'policy.sync',
        'status' => 'running',
        'outcome' => 'pending',
        'started_at' => now()->subMinutes(30),
        'created_at' => now()->subMinutes(30),
    ]);

    // Run that already carries reconciliation metadata in its context.
    OperationRun::factory()->create([
        'tenant_id' => $tenantId,
        'workspace_id' => $workspaceId,
        'type' => 'restore.execute',
        'status' => 'completed',
        'outcome' => 'failed',
        'context' => [
            'reason_code' => $reasonCode,
            'reconciliation' => [
                'reconciled_at' => now()->toIso8601String(),
                'reason' => $reasonCode,
                'reason_code' => $reasonCode,
                'source' => 'adapter_reconciler',
            ],
        ],
    ]);

    $response = $this->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => $workspaceId])
        ->get('/admin/operations');

    $response->assertOk();
    $response->assertSee('active run(s) are beyond their lifecycle window');
    $response->assertSee('run(s) have already been automatically reconciled');
});
|
||||
|
||||
// A completed, partially succeeded run must render its status and outcome labels without an
// "Unknown" suffix attached to either of them.
it('renders completed operation rows without leaking array-state unknown badges', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $workspaceId = (int) $tenant->workspace_id;

    OperationRun::factory()->create([
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => $workspaceId,
        'type' => 'baseline_compare',
        'status' => 'completed',
        'outcome' => 'partially_succeeded',
        'initiator_name' => 'Ahmed Darrazi',
        'created_at' => now()->subHours(6),
        'started_at' => now()->subHours(6),
        'completed_at' => now()->subHours(6)->addMinute(),
    ]);

    $response = $this->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => $workspaceId])
        ->get('/admin/operations');

    $response->assertOk();
    $response->assertSee('Run finished');
    $response->assertSee('Completed with follow-up');
    $response->assertDontSee('Run finished Unknown');
    $response->assertDontSee('Completed with follow-up Unknown');
});
|
||||
|
||||
@ -0,0 +1,44 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\Workspaces\WorkspaceContext;
|
||||
|
||||
/**
 * The operations index must show one count line for active runs beyond their
 * lifecycle window and one for runs that were automatically reconciled.
 */
it('surfaces stale and reconciled lifecycle counts in operations monitoring', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    // Ownership attributes shared by both seeded runs.
    $scope = [
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
    ];

    $adapterReason = 'run.adapter_out_of_sync';

    // Still-running sync started 20 minutes ago; the assertions below expect it
    // to be counted as beyond its lifecycle window.
    OperationRun::factory()->create([
        ...$scope,
        'type' => 'policy.sync',
        'status' => 'running',
        'outcome' => 'pending',
        'started_at' => now()->subMinutes(20),
        'created_at' => now()->subMinutes(20),
    ]);

    // Failed restore that carries a reconciliation record in its context.
    OperationRun::factory()->create([
        ...$scope,
        'type' => 'restore.execute',
        'status' => 'completed',
        'outcome' => 'failed',
        'context' => [
            'reason_code' => $adapterReason,
            'reconciliation' => [
                'reconciled_at' => now()->toIso8601String(),
                'reason' => $adapterReason,
                'reason_code' => $adapterReason,
                'source' => 'adapter_reconciler',
            ],
        ],
    ]);

    $response = $this->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => (int) $tenant->workspace_id])
        ->get('/admin/operations');

    $response->assertOk();
    $response->assertSee('1 active run(s) are beyond their lifecycle window.');
    $response->assertSee('1 run(s) have already been automatically reconciled.');
});
|
||||
@ -0,0 +1,115 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\Workspaces\WorkspaceContext;
|
||||
|
||||
/**
 * Checks the lifecycle wording on three surfaces: the operations index, the
 * detail page of a reconciled run, and the detail page of a stale run.
 */
it('shows likely stale and reconciled lifecycle semantics on the operations surfaces', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    // Ownership attributes shared by both seeded runs.
    $scope = [
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
    ];

    $adapterReason = 'run.adapter_out_of_sync';

    // Authenticated GET with the tenant's workspace pinned in the session.
    $visit = fn (string $uri) => $this
        ->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => (int) $tenant->workspace_id])
        ->get($uri);

    // Running sync that started 20 minutes ago.
    $staleRun = OperationRun::factory()->create([
        ...$scope,
        'type' => 'policy.sync',
        'status' => 'running',
        'outcome' => 'pending',
        'started_at' => now()->subMinutes(20),
        'created_at' => now()->subMinutes(20),
    ]);

    // Failed restore with a reconciliation record and a matching failure summary.
    $reconciledRun = OperationRun::factory()->create([
        ...$scope,
        'type' => 'restore.execute',
        'status' => 'completed',
        'outcome' => 'failed',
        'context' => [
            'reason_code' => $adapterReason,
            'reconciliation' => [
                'reconciled_at' => now()->toIso8601String(),
                'reason' => $adapterReason,
                'reason_code' => $adapterReason,
                'source' => 'adapter_reconciler',
            ],
        ],
        'failure_summary' => [[
            'code' => $adapterReason,
            'reason_code' => $adapterReason,
            'message' => 'A related restore record reached terminal truth before the operation run was updated.',
        ]],
    ]);

    $index = $visit('/admin/operations');
    $index->assertOk();
    $index->assertSee('Likely stale');
    $index->assertSee('run(s) have already been automatically reconciled');

    $reconciledDetail = $visit(route('admin.operations.view', ['run' => (int) $reconciledRun->getKey()]));
    $reconciledDetail->assertOk();
    $reconciledDetail->assertSee('Automatically reconciled');

    $staleDetail = $visit(route('admin.operations.view', ['run' => (int) $staleRun->getKey()]));
    $staleDetail->assertOk();
    $staleDetail->assertSee('Likely stale run');
});
|
||||
|
||||
/**
 * Runs stored with an empty outcome string must still render an outcome
 * label: "Awaiting result" for the running run and "Reconciled failed" for
 * the reconciled one.
 */
it('renders lifecycle outcome fallbacks when historical runs are missing stored outcomes', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    // Ownership attributes shared by both seeded runs.
    $scope = [
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
    ];

    $adapterReason = 'run.adapter_out_of_sync';

    // Authenticated GET with the tenant's workspace pinned in the session.
    $visit = fn (string $uri) => $this
        ->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => (int) $tenant->workspace_id])
        ->get($uri);

    // Running sync whose outcome column is an empty string.
    $staleRun = OperationRun::factory()->create([
        ...$scope,
        'type' => 'policy.sync',
        'status' => 'running',
        'outcome' => '',
        'started_at' => now()->subMinutes(20),
        'created_at' => now()->subMinutes(20),
    ]);

    // Completed restore with an empty outcome but a reconciliation record.
    $reconciledRun = OperationRun::factory()->create([
        ...$scope,
        'type' => 'restore.execute',
        'status' => 'completed',
        'outcome' => '',
        'context' => [
            'reason_code' => $adapterReason,
            'reconciliation' => [
                'reconciled_at' => now()->toIso8601String(),
                'reason' => $adapterReason,
                'reason_code' => $adapterReason,
                'source' => 'adapter_reconciler',
            ],
        ],
        'failure_summary' => [[
            'code' => $adapterReason,
            'reason_code' => $adapterReason,
            'message' => 'A related restore record reached terminal truth before the operation run was updated.',
        ]],
    ]);

    $index = $visit('/admin/operations');
    $index->assertOk();
    $index->assertSee('Awaiting result');

    $staleDetail = $visit(route('admin.operations.view', ['run' => (int) $staleRun->getKey()]));
    $staleDetail->assertOk();
    $staleDetail->assertSee('Awaiting result');

    $reconciledDetail = $visit(route('admin.operations.view', ['run' => (int) $reconciledRun->getKey()]));
    $reconciledDetail->assertOk();
    $reconciledDetail->assertSee('Reconciled failed');
});
|
||||
@ -248,3 +248,34 @@
|
||||
expect($run->status)->toBe('completed');
|
||||
expect($run->outcome)->toBe('failed');
|
||||
});
|
||||
|
||||
/**
 * Force-failing a queued run must leave the initiating user with a database
 * notification whose title, body and diagnostic reason code describe the
 * reconciliation.
 */
it('renders reconciled terminal notifications with operator-safe lifecycle copy', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');
    $this->actingAs($user);

    // Queued restore owned by the acting user.
    $run = OperationRun::factory()->create([
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'initiator_name' => $user->name,
        'type' => 'restore.execute',
        'status' => 'queued',
        'outcome' => 'pending',
    ]);

    /** @var OperationRunService $runService */
    $runService = app(OperationRunService::class);

    $runService->forceFailNonTerminalRun(
        run: $run,
        reasonCode: 'run.infrastructure_timeout_or_abandonment',
        message: 'Queue infrastructure ended the job before normal completion could update the run.',
        source: 'failed_callback',
        evidence: ['exception_class' => 'TimeoutExceededException'],
    );

    $notification = $user->notifications()->latest('id')->first();

    expect($notification)->not->toBeNull();

    $payload = $notification?->data;
    $title = (string) data_get($payload, 'title');
    $body = (string) data_get($payload, 'body');

    expect($title)->toContain('was automatically reconciled');
    expect($body)->toContain('Infrastructure ended the run');
    expect($body)->toContain('Review worker health and logs before retrying this operation');
    expect(data_get($payload, 'diagnostic_reason_code'))->toBe('run.infrastructure_timeout_or_abandonment');
});
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Jobs\CompareBaselineToTenantJob;
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Models\BaselineProfile;
|
||||
use App\Models\BaselineSnapshot;
|
||||
use App\Models\OperationRun;
|
||||
@ -59,3 +60,11 @@
|
||||
->and(data_get($run->failure_summary, '0.reason_code'))->toBe(BaselineReasonCodes::COMPARE_SNAPSHOT_INCOMPLETE)
|
||||
->and(data_get($run->summary_counts, 'total'))->toBeInt();
|
||||
});
|
||||
|
||||
/**
 * Pins the lifecycle declarations of CompareBaselineToTenantJob: it uses the
 * failed-run bridge trait, has a 300 second timeout, and fails on timeout.
 */
it('declares the baseline compare lifecycle contract explicitly', function (): void {
    // make() builds the model without saving it; only the job's declarations are inspected.
    $unsavedRun = OperationRun::factory()->make();
    $job = new CompareBaselineToTenantJob($unsavedRun);

    $traits = class_uses_recursive($job);

    expect($traits)->toContain(BridgesFailedOperationRun::class);
    expect($job->timeout)->toBe(300);
    expect($job->failOnTimeout)->toBeTrue();
});
|
||||
|
||||
@ -0,0 +1,98 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Models\OperationRun;
|
||||
use App\Services\Operations\OperationLifecycleReconciler;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
|
||||
uses(RefreshDatabase::class);
|
||||
|
||||
/**
 * Seeds a stale queued run, a stale running run and a fresh running run, then
 * checks that one reconciler pass fails the two stale runs with their
 * respective reason codes and leaves the fresh run as it was.
 */
it('reconciles stale queued and running covered runs while leaving fresh runs untouched', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $tenantId = (int) $tenant->getKey();

    // Ownership attributes shared by all three seeded runs.
    $owner = [
        'tenant_id' => $tenantId,
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
    ];

    // Queued 15 minutes ago and never started.
    $staleQueued = OperationRun::factory()->create([
        ...$owner,
        'type' => 'policy.sync',
        'status' => OperationRunStatus::Queued->value,
        'outcome' => OperationRunOutcome::Pending->value,
        'started_at' => null,
        'created_at' => now()->subMinutes(15),
    ]);

    // Running for 30 minutes.
    $staleRunning = OperationRun::factory()->create([
        ...$owner,
        'type' => 'inventory_sync',
        'status' => OperationRunStatus::Running->value,
        'outcome' => OperationRunOutcome::Pending->value,
        'started_at' => now()->subMinutes(30),
        'created_at' => now()->subMinutes(30),
    ]);

    // Running for only 2 minutes.
    $freshRunning = OperationRun::factory()->create([
        ...$owner,
        'type' => 'inventory_sync',
        'status' => OperationRunStatus::Running->value,
        'outcome' => OperationRunOutcome::Pending->value,
        'started_at' => now()->subMinutes(2),
        'created_at' => now()->subMinutes(2),
    ]);

    /** @var OperationLifecycleReconciler $reconciler */
    $reconciler = app(OperationLifecycleReconciler::class);

    $result = $reconciler->reconcile([
        'types' => ['policy.sync', 'inventory_sync'],
        'tenant_ids' => [$tenantId],
        'dry_run' => false,
    ]);

    expect($result['reconciled'])->toBe(2);
    expect($result['skipped'])->toBe(1);

    $queuedAfter = $staleQueued->fresh();
    expect($queuedAfter->status)->toBe(OperationRunStatus::Completed->value);
    expect($queuedAfter->outcome)->toBe(OperationRunOutcome::Failed->value);
    expect(data_get($queuedAfter->context, 'reconciliation.reason_code'))->toBe('run.stale_queued');

    $runningAfter = $staleRunning->fresh();
    expect($runningAfter->status)->toBe(OperationRunStatus::Completed->value);
    expect($runningAfter->outcome)->toBe(OperationRunOutcome::Failed->value);
    expect(data_get($runningAfter->context, 'reconciliation.reason_code'))->toBe('run.stale_running');

    $freshAfter = $freshRunning->fresh();
    expect($freshAfter->status)->toBe(OperationRunStatus::Running->value);
    expect($freshAfter->outcome)->toBe(OperationRunOutcome::Pending->value);
});
|
||||
|
||||
/**
 * Running the reconciler twice with the same options must reconcile the stale
 * run on the first pass and report nothing to reconcile on the second.
 */
it('is idempotent when the reconciler is run repeatedly', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $tenantId = (int) $tenant->getKey();

    // Queued run created 20 minutes ago.
    $run = OperationRun::factory()->create([
        'tenant_id' => $tenantId,
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'type' => 'policy.sync',
        'status' => OperationRunStatus::Queued->value,
        'outcome' => OperationRunOutcome::Pending->value,
        'created_at' => now()->subMinutes(20),
    ]);

    $reconciler = app(OperationLifecycleReconciler::class);

    // Identical options for both passes.
    $options = [
        'types' => ['policy.sync'],
        'tenant_ids' => [$tenantId],
    ];

    $first = $reconciler->reconcile($options);
    $second = $reconciler->reconcile($options);

    expect($first['reconciled'])->toBe(1);
    expect($second['reconciled'])->toBe(0);
    expect($run->fresh()->status)->toBe(OperationRunStatus::Completed->value);
});
|
||||
@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Services\Operations\OperationLifecyclePolicyValidator;
|
||||
|
||||
/**
 * With the configuration untouched, the lifecycle policy validator must
 * report a valid result with an empty error list.
 */
it('accepts the current lifecycle timing configuration for covered operation types', function (): void {
    /** @var OperationLifecyclePolicyValidator $validator */
    $validator = app(OperationLifecyclePolicyValidator::class);

    $report = $validator->validate();

    expect($report['valid'])->toBeTrue();
    expect($report['errors'])->toBe([]);
});
|
||||
|
||||
/**
 * After overriding the database queue's retry_after to 200, validation must
 * fail and at least one reported error must mention "retry_after".
 */
it('detects retry_after mismatches for covered lifecycle policy entries', function (): void {
    config()->set('queue.connections.database.retry_after', 200);

    /** @var OperationLifecyclePolicyValidator $validator */
    $validator = app(OperationLifecyclePolicyValidator::class);

    $report = $validator->validate();

    // Join all messages so the keyword may appear in any one of them.
    $allErrors = collect($report['errors'])->join(' ');

    expect($report['valid'])->toBeFalse();
    expect($allErrors)->toContain('retry_after');
});
|
||||
68
tests/Feature/Operations/OperationRunFailedJobBridgeTest.php
Normal file
68
tests/Feature/Operations/OperationRunFailedJobBridgeTest.php
Normal file
@ -0,0 +1,68 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Models\OperationRun;
|
||||
use App\Support\OperationRunOutcome;
|
||||
use App\Support\OperationRunStatus;
|
||||
use Illuminate\Foundation\Testing\RefreshDatabase;
|
||||
|
||||
uses(RefreshDatabase::class);
|
||||
|
||||
/**
 * Test-only exception passed to a job's failed() callback to simulate a queue
 * timeout.
 *
 * NOTE(review): presumably the bridge classifies failures by exception class
 * name — confirm against BridgesFailedOperationRun.
 */
class FakeTimeoutExceededException extends RuntimeException {}
|
||||
|
||||
/**
 * Test-only exception passed to a job's failed() callback to simulate an
 * exhausted-attempts failure.
 *
 * NOTE(review): presumably the bridge classifies failures by exception class
 * name — confirm against BridgesFailedOperationRun.
 */
class FakeMaxAttemptsExceededException extends RuntimeException {}
|
||||
|
||||
/**
 * A job that holds its OperationRun model and uses the bridge trait must,
 * when failed() receives a timeout-like exception, mark the run as
 * completed/failed and record the reconciliation reason code and source.
 */
it('bridges timeout-like failed callbacks back to the owning operation run', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $run = OperationRun::factory()->create([
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'type' => 'policy.sync',
        'status' => OperationRunStatus::Running->value,
        'outcome' => OperationRunOutcome::Pending->value,
    ]);

    // Bare job stand-in: exposes the run as a public model property.
    $job = new class($run)
    {
        use BridgesFailedOperationRun;

        public function __construct(public OperationRun $operationRun) {}
    };

    $timeout = new FakeTimeoutExceededException('Queue worker timed out.');
    $job->failed($timeout);

    $reloaded = $run->fresh();

    expect($reloaded->status)->toBe(OperationRunStatus::Completed->value);
    expect($reloaded->outcome)->toBe(OperationRunOutcome::Failed->value);
    expect(data_get($reloaded->context, 'reconciliation.reason_code'))->toBe('run.infrastructure_timeout_or_abandonment');
    expect(data_get($reloaded->context, 'reconciliation.source'))->toBe('failed_callback');
});
|
||||
|
||||
/**
 * A job that only holds the run's integer id must, when failed() receives an
 * exhausted-attempts exception, mark the run as completed/failed with the
 * infrastructure reason code.
 */
it('bridges exhausted-attempt failures back to the owning operation run', function (): void {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    $run = OperationRun::factory()->create([
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'type' => 'inventory_sync',
        'status' => OperationRunStatus::Running->value,
        'outcome' => OperationRunOutcome::Pending->value,
    ]);

    $runId = (int) $run->getKey();

    // Bare job stand-in: exposes only the run id, not the model.
    $job = new class($runId)
    {
        use BridgesFailedOperationRun;

        public function __construct(public int $operationRunId) {}
    };

    $exhausted = new FakeMaxAttemptsExceededException('Max attempts exceeded.');
    $job->failed($exhausted);

    $reloaded = $run->fresh();

    expect($reloaded->status)->toBe(OperationRunStatus::Completed->value);
    expect($reloaded->outcome)->toBe(OperationRunOutcome::Failed->value);
    expect(data_get($reloaded->context, 'reconciliation.reason_code'))->toBe('run.infrastructure_timeout_or_abandonment');
});
|
||||
@ -213,6 +213,50 @@
|
||||
->assertSee('Review workspace or tenant access before retrying.');
|
||||
});
|
||||
|
||||
/**
 * An owner must be able to open the detail page of a reconciled run with no
 * Filament tenant selected, and see the reconciliation wording and the
 * retry guidance.
 */
it('keeps reconciled lifecycle runs viewable for entitled members', function (): void {
    $tenant = Tenant::factory()->create();
    [$user, $tenant] = createUserWithTenant(tenant: $tenant, role: 'owner');

    $reasonCode = 'run.infrastructure_timeout_or_abandonment';
    $reasonMessage = 'Infrastructure ended the run before completion.';

    // Failed restore carrying a failed-callback reconciliation record.
    $run = OperationRun::factory()->create([
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'user_id' => (int) $user->getKey(),
        'type' => 'restore.execute',
        'status' => OperationRunStatus::Completed->value,
        'outcome' => OperationRunOutcome::Failed->value,
        'context' => [
            'reason_code' => $reasonCode,
            'reconciliation' => [
                'reconciled_at' => now('UTC')->toIso8601String(),
                'reason' => $reasonMessage,
                'reason_code' => $reasonCode,
                'reason_message' => $reasonMessage,
                'source' => 'failed_callback',
                'evidence' => [
                    'exception_class' => 'TimeoutExceededException',
                ],
            ],
        ],
        'failure_summary' => [[
            'code' => 'operation.failed',
            'reason_code' => $reasonCode,
            'message' => $reasonMessage,
        ]],
    ]);

    // Clear the current Filament tenant before making the request.
    Filament::setTenant(null, true);

    $detailUrl = route('admin.operations.view', ['run' => (int) $run->getKey()]);

    $response = $this->actingAs($user)
        ->withSession([
            WorkspaceContext::SESSION_KEY => (int) $tenant->workspace_id,
        ])
        ->get($detailUrl);

    $response->assertSuccessful();
    $response->assertSee('Automatically reconciled');
    $response->assertSee('Infrastructure ended the run');
    $response->assertSee('Review worker health and logs before retrying this operation.');
});
|
||||
|
||||
it('keeps a canonical run viewer accessible when the remembered tenant differs from the run tenant', function (): void {
|
||||
$workspace = Workspace::factory()->create();
|
||||
$tenantA = Tenant::factory()->for($workspace)->create();
|
||||
|
||||
@ -75,7 +75,7 @@
|
||||
expect($opRun->summary_counts['skipped'] ?? null)->toBe(1);
|
||||
|
||||
$context = is_array($opRun->context) ? $opRun->context : [];
|
||||
expect($context['reconciliation']['reason'] ?? null)->toBe('adapter_out_of_sync');
|
||||
expect($context['reconciliation']['reason'] ?? null)->toBe('run.adapter_out_of_sync');
|
||||
expect($context['reconciliation']['reconciled_at'] ?? null)->toBeString();
|
||||
|
||||
expect($opRun->started_at)->not->toBeNull();
|
||||
|
||||
@ -5,6 +5,10 @@
|
||||
use Livewire\Livewire;
|
||||
|
||||
it('renders the progress widget poller script', function () {
|
||||
$this->get('/admin/login')
|
||||
->assertSuccessful()
|
||||
->assertSee('js/tenantpilot/ops-ux-progress-widget-poller.js', escape: false);
|
||||
|
||||
[$user, $tenant] = createUserWithTenant(role: 'owner');
|
||||
$this->actingAs($user);
|
||||
|
||||
@ -12,5 +16,5 @@
|
||||
|
||||
Livewire::test(BulkOperationProgress::class)
|
||||
->assertSee('opsUxProgressWidgetPoller()')
|
||||
->assertSee('window.opsUxProgressWidgetPoller');
|
||||
->assertDontSee('window.opsUxProgressWidgetPoller');
|
||||
})->group('ops-ux');
|
||||
|
||||
@ -210,6 +210,42 @@
|
||||
->assertForbidden();
|
||||
});
|
||||
|
||||
/**
 * A user with the "readonly" role must receive 403 when opening the detail
 * page of a reconciled run.
 */
test('reconciled lifecycle run detail keeps capability denial semantics', function (): void {
    $tenant = Tenant::factory()->create();

    $reasonCode = 'run.infrastructure_timeout_or_abandonment';
    $reasonMessage = 'Infrastructure ended the run before completion.';

    // Failed inventory sync carrying a failed-callback reconciliation record.
    $run = OperationRun::factory()->create([
        'tenant_id' => (int) $tenant->getKey(),
        'workspace_id' => (int) $tenant->workspace_id,
        'type' => 'inventory_sync',
        'status' => 'completed',
        'outcome' => 'failed',
        'context' => [
            'reason_code' => $reasonCode,
            'reconciliation' => [
                'reconciled_at' => now('UTC')->toIso8601String(),
                'reason' => $reasonMessage,
                'reason_code' => $reasonCode,
                'reason_message' => $reasonMessage,
                'source' => 'failed_callback',
            ],
        ],
        'failure_summary' => [[
            'code' => 'operation.failed',
            'reason_code' => $reasonCode,
            'message' => $reasonMessage,
        ]],
    ]);

    // The user is attached after the run exists, with the readonly role.
    [$user, $tenant] = createUserWithTenant($tenant, role: 'readonly');

    // Clear the current Filament tenant before making the request.
    Filament::setTenant(null, true);

    $detailUrl = route('admin.operations.view', ['run' => (int) $run->getKey()]);

    $response = $this->actingAs($user)
        ->withSession([WorkspaceContext::SESSION_KEY => (int) $tenant->workspace_id])
        ->get($detailUrl);

    $response->assertForbidden();
});
|
||||
|
||||
test('tenant-scoped restore run actions return 404 for forged foreign-tenant run keys', function (): void {
|
||||
$tenantA = Tenant::factory()->create();
|
||||
[$user, $tenantA] = createUserWithTenant($tenantA, role: 'owner');
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Jobs\ComposeTenantReviewJob;
|
||||
use App\Jobs\Concerns\BridgesFailedOperationRun;
|
||||
use App\Models\OperationRun;
|
||||
use App\Notifications\OperationRunCompleted;
|
||||
use App\Support\OperationCatalog;
|
||||
@ -37,3 +38,11 @@
|
||||
|
||||
Notification::assertSentTo($user, OperationRunCompleted::class);
|
||||
});
|
||||
|
||||
/**
 * Pins the lifecycle declarations of ComposeTenantReviewJob: it uses the
 * failed-run bridge trait, has a 240 second timeout, and fails on timeout.
 */
it('declares the tenant review lifecycle contract explicitly', function (): void {
    // Placeholder ids: the job is only constructed and inspected here.
    $job = new ComposeTenantReviewJob(tenantReviewId: 1, operationRunId: 1);

    $traits = class_uses_recursive($job);

    expect($traits)->toContain(BridgesFailedOperationRun::class);
    expect($job->timeout)->toBe(240);
    expect($job->failOnTimeout)->toBeTrue();
});
|
||||
|
||||
11
tests/Feature/Theme/FilamentThemeRadiusTokenTest.php
Normal file
11
tests/Feature/Theme/FilamentThemeRadiusTokenTest.php
Normal file
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
 * Both Filament panel theme stylesheets (admin and system) must declare the
 * `--radius-2xl: 1rem;` token.
 *
 * Each stylesheet is checked for readability first, so a missing or moved
 * theme file fails with a message naming the panel and path rather than with
 * an error from reading the file or from the string assertion.
 */
it('defines the rounded 2xl radius token for filament panel themes', function (): void {
    $themePaths = [
        'admin' => resource_path('css/filament/admin/theme.css'),
        'system' => resource_path('css/filament/system/theme.css'),
    ];

    foreach ($themePaths as $panel => $path) {
        expect(is_readable($path))
            ->toBeTrue("The {$panel} panel theme stylesheet is not readable at {$path}.");

        $css = file_get_contents($path);

        // file_get_contents() returns false on failure; require a string before searching it.
        expect($css)
            ->toBeString()
            ->toContain('--radius-2xl: 1rem;');
    }
});
|
||||
@ -41,3 +41,38 @@ public function __construct(public OperationRun $operationRun)
|
||||
expect($operationRun->status)->toBe('running');
|
||||
expect($operationRun->outcome)->toBe('pending');
|
||||
});
|
||||
|
||||
/**
 * When the callback passed to TrackOperationRun::handle() throws, the
 * exception must propagate and the run must end as completed/failed with an
 * `exception.unhandled` failure code.
 */
it('marks an operation run failed when the wrapped job throws inside middleware execution', function () {
    [$user, $tenant] = createUserWithTenant(role: 'owner');

    /** @var OperationRunService $runService */
    $runService = app(OperationRunService::class);

    $operationRun = $runService->ensureRun(
        tenant: $tenant,
        type: 'test.exception',
        inputs: ['foo' => 'bar'],
        initiator: $user,
    );

    // Minimal queueable job that exposes the run and fakes queue interactions.
    $job = new class($operationRun) implements ShouldQueue
    {
        use Dispatchable;
        use InteractsWithQueue;
        use Queueable;
        use SerializesModels;

        public function __construct(public OperationRun $operationRun)
        {
            $this->withFakeQueueInteractions();
        }
    };

    $middleware = new TrackOperationRun();

    // Stand-in for the job's own work: it always throws.
    $failingNext = function (): void {
        throw new RuntimeException('wrapped job failure');
    };

    expect(fn () => $middleware->handle($job, $failingNext))
        ->toThrow(RuntimeException::class);

    $operationRun->refresh();

    expect($operationRun->status)->toBe('completed');
    expect($operationRun->outcome)->toBe('failed');
    expect(data_get($operationRun->failure_summary, '0.code'))->toBe('exception.unhandled');
});
|
||||
|
||||
@ -38,7 +38,7 @@
|
||||
// trigger background updates (lazy-loaded database notifications, progress poller).
|
||||
expect($html)->not->toContain('Filament\\Livewire\\DatabaseNotifications');
|
||||
expect($html)->not->toContain('__lazyLoad');
|
||||
expect($html)->not->toContain('opsUxProgressWidgetPoller');
|
||||
expect($html)->not->toContain('x-data="opsUxProgressWidgetPoller()"');
|
||||
|
||||
// 2. Extract the first Livewire component snapshot
|
||||
preg_match('/wire:snapshot="([^"]+)"/', $html, $snapshotMatch);
|
||||
|
||||
@ -17,6 +17,20 @@
|
||||
$completed = BadgeCatalog::spec(BadgeDomain::OperationRunStatus, 'completed');
|
||||
expect($completed->label)->toBe('Run finished');
|
||||
expect($completed->color)->toBe('gray');
|
||||
|
||||
$stale = BadgeCatalog::spec(BadgeDomain::OperationRunStatus, [
|
||||
'status' => 'running',
|
||||
'freshness_state' => 'likely_stale',
|
||||
]);
|
||||
expect($stale->label)->toBe('Likely stale');
|
||||
expect($stale->color)->toBe('warning');
|
||||
|
||||
$completedArray = BadgeCatalog::spec(BadgeDomain::OperationRunStatus, [
|
||||
'status' => 'completed',
|
||||
'freshness_state' => 'terminal_normal',
|
||||
]);
|
||||
expect($completedArray->label)->toBe('Run finished');
|
||||
expect($completedArray->color)->toBe('gray');
|
||||
});
|
||||
|
||||
it('maps operation run outcome values to canonical badge semantics', function (): void {
|
||||
@ -40,11 +54,36 @@
|
||||
expect($failed->label)->toBe('Execution failed');
|
||||
expect($failed->color)->toBe('danger');
|
||||
|
||||
$reconciled = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, [
|
||||
'outcome' => 'failed',
|
||||
'freshness_state' => 'reconciled_failed',
|
||||
]);
|
||||
expect($reconciled->label)->toBe('Reconciled failed');
|
||||
expect($reconciled->color)->toBe('danger');
|
||||
|
||||
$cancelled = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'cancelled');
|
||||
expect($cancelled->label)->toBe('Cancelled');
|
||||
expect($cancelled->color)->toBe('gray');
|
||||
});
|
||||
|
||||
/**
 * When the outcome is an empty string, the outcome badge must be derived from
 * the remaining array state: "Awaiting result" (gray) for a likely-stale
 * running run and "Reconciled failed" (danger) for a reconciled run.
 */
it('normalizes missing operation run outcomes from lifecycle context', function (): void {
    $staleState = [
        'outcome' => '',
        'status' => 'running',
        'freshness_state' => 'likely_stale',
    ];

    $staleBadge = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, $staleState);

    expect($staleBadge->label)->toBe('Awaiting result');
    expect($staleBadge->color)->toBe('gray');

    $reconciledState = [
        'outcome' => '',
        'status' => 'completed',
        'freshness_state' => 'reconciled_failed',
    ];

    $reconciledBadge = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, $reconciledState);

    expect($reconciledBadge->label)->toBe('Reconciled failed');
    expect($reconciledBadge->color)->toBe('danger');
});
|
||||
|
||||
it('never represents a success outcome with warning/attention meaning', function (): void {
|
||||
$succeeded = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'succeeded');
|
||||
|
||||
|
||||
@ -7,36 +7,36 @@
|
||||
|
||||
/**
 * Pins the badge label and colour for each plain operation run status value.
 *
 * Only the current operator-facing labels are asserted. The earlier labels
 * ('Queued', 'Running', 'Completed') and the earlier 'warning' colour for
 * queued runs contradicted the assertions that follow them and were removed.
 */
it('maps operation run status values to canonical badge semantics', function (): void {
    $queued = BadgeCatalog::spec(BadgeDomain::OperationRunStatus, 'queued');
    expect($queued->label)->toBe('Queued for execution');
    expect($queued->color)->toBe('info');

    $running = BadgeCatalog::spec(BadgeDomain::OperationRunStatus, 'running');
    expect($running->label)->toBe('In progress');
    expect($running->color)->toBe('info');

    $completed = BadgeCatalog::spec(BadgeDomain::OperationRunStatus, 'completed');
    expect($completed->label)->toBe('Run finished');
    expect($completed->color)->toBe('gray');
});
|
||||
|
||||
/**
 * Pins the badge label and colour for each plain operation run outcome value.
 *
 * Only the current operator-facing labels are asserted. The earlier labels
 * ('Pending', 'Succeeded', 'Partially succeeded', 'Blocked', 'Failed')
 * contradicted the assertions that follow them and were removed.
 */
it('maps operation run outcome values to canonical badge semantics', function (): void {
    $pending = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'pending');
    expect($pending->label)->toBe('Awaiting result');
    expect($pending->color)->toBe('gray');

    $succeeded = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'succeeded');
    expect($succeeded->label)->toBe('Completed successfully');
    expect($succeeded->color)->toBe('success');

    $partial = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'partially_succeeded');
    expect($partial->label)->toBe('Completed with follow-up');
    expect($partial->color)->toBe('warning');

    $blocked = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'blocked');
    expect($blocked->label)->toBe('Blocked by prerequisite');
    expect($blocked->color)->toBe('warning');

    $failed = BadgeCatalog::spec(BadgeDomain::OperationRunOutcome, 'failed');
    expect($failed->label)->toBe('Execution failed');
    expect($failed->color)->toBe('danger');
});
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
use App\Services\Operations\OperationLifecyclePolicyValidator;
|
||||
use App\Support\Operations\OperationLifecyclePolicy;
|
||||
|
||||
/**
 * The lifecycle policy must report exactly these covered operation type
 * names, in this order (toBe compares the arrays strictly).
 */
it('exposes the exact covered v1 lifecycle operation set', function (): void {
    $expectedTypes = [
        'baseline_capture',
        'baseline_compare',
        'inventory_sync',
        'policy.sync',
        'policy.sync_one',
        'entra_group_sync',
        'directory_role_definitions.sync',
        'backup_schedule_run',
        'restore.execute',
        'tenant.review_pack.generate',
        'tenant.review.compose',
        'tenant.evidence.snapshot.generate',
    ];

    /** @var OperationLifecyclePolicy $policy */
    $policy = app(OperationLifecyclePolicy::class);

    $actualTypes = $policy->coveredTypeNames();

    expect($actualTypes)->toBe($expectedTypes);
});
|
||||
|
||||
/**
 * Pins which operation types the validator reports as using a direct
 * failed-job bridge: five types do, and `backup_schedule_run` does not.
 */
it('requires direct failed-job bridges for lifecycle policy entries that declare them', function (): void {
    $validator = app(OperationLifecyclePolicyValidator::class);

    expect($validator->jobUsesDirectFailedBridge('baseline_capture'))->toBeTrue();
    expect($validator->jobUsesDirectFailedBridge('baseline_compare'))->toBeTrue();
    expect($validator->jobUsesDirectFailedBridge('inventory_sync'))->toBeTrue();
    expect($validator->jobUsesDirectFailedBridge('policy.sync'))->toBeTrue();
    expect($validator->jobUsesDirectFailedBridge('tenant.review.compose'))->toBeTrue();

    // The one type asserted here to have no direct bridge.
    expect($validator->jobUsesDirectFailedBridge('backup_schedule_run'))->toBeFalse();
});
|
||||
|
||||
/**
 * Pins the timeout the validator reports for two covered job types
 * (300 s for baseline capture, 420 s for restore execute) and that both are
 * reported as failing on timeout.
 */
it('requires explicit timeout and fail-on-timeout declarations for covered jobs', function (): void {
    $validator = app(OperationLifecyclePolicyValidator::class);

    expect($validator->jobTimeoutSeconds('baseline_capture'))->toBe(300);
    expect($validator->jobFailsOnTimeout('baseline_capture'))->toBeTrue();

    expect($validator->jobTimeoutSeconds('restore.execute'))->toBe(420);
    expect($validator->jobFailsOnTimeout('restore.execute'))->toBeTrue();
});
|
||||
Loading…
Reference in New Issue
Block a user