TenantAtlas/app/Services/BulkOperationService.php
ahmido bcf4996a1e feat/049-backup-restore-job-orchestration (#56)
Summary

This PR implements Spec 049 – Backup/Restore Job Orchestration: all critical Backup/Restore execution paths are job-only, idempotent, tenant-scoped, and observable via run records + DB notifications (Phase 1). The UI no longer performs heavy Graph work inside request/Filament actions for these flows.

Why

We want predictable UX and operations at MSP scale:
	•	no timeouts / long-running requests
	•	reproducible run state + per-item results
	•	safe error persistence (no secrets / no token leakage)
	•	strict tenant isolation + auditability for write paths

What changed

Foundational (Runs + Idempotency + Observability)
	•	Added a shared RunIdempotency helper (dedupe while queued/running).
	•	Added a read-only BulkOperationRuns surface (list + view) for status/progress.
	•	Added DB notifications for run status changes (with “View run” link).

US1 – Policy “Capture snapshot” is job-only
	•	Policy detail “Capture snapshot” now:
		•	creates/reuses a run (dedupe key: tenant + policy.capture_snapshot + policy DB id)
		•	dispatches a queued job
		•	returns immediately with notification + link to run detail
	•	Graph capture work moved fully into the job; request path stays Graph-free.

US3 – Restore runs orchestration is job-only + safe
	•	Live restore execution is queued and updates RestoreRun status/progress.
	•	Per-item outcomes are persisted deterministically (per internal DB record).
	•	Audit logging is written for live restore.
	•	Preview/dry-run is enforced as read-only (no writes).

Tenant isolation / authorization (non-negotiable)
	•	Run list/view/start are tenant-scoped and policy-guarded (cross-tenant access => 403, not 404).
	•	Explicit Pest tests cover cross-tenant denial and start authorization.

Tests / Verification
	•	./vendor/bin/pint --dirty
	•	Targeted suite (examples):
		•	policy capture snapshot queued + idempotency tests
		•	restore orchestration + audit logging + preview read-only tests
		•	run authorization / tenant isolation tests

Notes / Scope boundaries
	•	Phase 1 UX = DB notifications + run detail page. A global “progress widget” is tracked as Phase 2 and not required for merge.
	•	Resilience/backoff is tracked in tasks but can be iterated further after merge.

Review focus
	•	Dedupe behavior for queued/running runs (reuse vs create-new)
	•	Tenant scoping & policy gates for all run surfaces
	•	Restore safety: audit event + preview no-writes

Co-authored-by: Ahmed Darrazi <ahmeddarrazi@adsmac.local>
Reviewed-on: #56
2026-01-11 15:59:06 +00:00

238 lines
6.7 KiB
PHP

<?php
namespace App\Services;
use App\Models\BulkOperationRun;
use App\Models\Tenant;
use App\Models\User;
use App\Services\Intune\AuditLogger;
/**
 * Manages the lifecycle of BulkOperationRun records.
 *
 * Creates runs, records per-item outcomes (success / failure / skipped),
 * moves runs between statuses, and writes an audit log entry for the
 * created, completed, failed and aborted transitions.
 */
class BulkOperationService
{
    /**
     * Lower-case substrings that mark a failure reason as possibly containing
     * credentials. A reason containing any of them is replaced by "redacted".
     */
    private const SENSITIVE_MARKERS = [
        'bearer ',
        'access_token',
        'client_secret',
        'authorization',
    ];

    /** Maximum number of characters kept from a sanitized failure reason. */
    private const MAX_REASON_LENGTH = 200;

    /** Statuses from which complete() is allowed to move a run forward. */
    private const ACTIVE_STATUSES = ['pending', 'running'];

    public function __construct(
        protected AuditLogger $auditLogger
    ) {}

    /**
     * Turn an arbitrary error message into a short string that is safe to persist.
     *
     * - Empty / whitespace-only input becomes "error".
     * - Input that contains any sensitive marker becomes "redacted".
     * - Otherwise whitespace runs are collapsed to a single space and the
     *   result is cut to MAX_REASON_LENGTH characters.
     *
     * @param  string  $reason  Raw failure message.
     * @return string Sanitized message.
     */
    public function sanitizeFailureReason(string $reason): string
    {
        $reason = trim($reason);

        if ($reason === '') {
            return 'error';
        }

        // Collapse whitespace BEFORE scanning for markers: the "bearer " marker
        // relies on a literal space, so "Bearer\t<token>" or "Bearer\n<token>"
        // would otherwise slip through and be stored as "Bearer <token>".
        // With /u, preg_replace() returns null on invalid UTF-8; fall back to a
        // byte-wise collapse, and finally to the untouched text.
        $reason = preg_replace('/\s+/u', ' ', $reason)
            ?? preg_replace('/\s+/', ' ', $reason)
            ?? $reason;

        $lower = mb_strtolower($reason);

        foreach (self::SENSITIVE_MARKERS as $marker) {
            if (str_contains($lower, $marker)) {
                return 'redacted';
            }
        }

        return mb_substr($reason, 0, self::MAX_REASON_LENGTH);
    }

    /**
     * Create a new run in "pending" status and write the matching "created" audit entry.
     *
     * The id of the audit entry is stored on the run as `audit_log_id`.
     *
     * @param  Tenant  $tenant  Tenant the run belongs to.
     * @param  User  $user  User recorded as owner of the run and actor of the audit entry.
     * @param  string  $resource  Resource name; used in the audit action "bulk.{resource}.{action}.created".
     * @param  string  $action  Action name; used in the audit action.
     * @param  array  $itemIds  Stored on the run as `item_ids`.
     * @param  int  $totalItems  Stored on the run as `total_items` (not derived from $itemIds).
     * @return BulkOperationRun The persisted run.
     */
    public function createRun(
        Tenant $tenant,
        User $user,
        string $resource,
        string $action,
        array $itemIds,
        int $totalItems
    ): BulkOperationRun {
        $run = BulkOperationRun::create([
            'tenant_id' => $tenant->id,
            'user_id' => $user->id,
            'resource' => $resource,
            'action' => $action,
            'status' => 'pending',
            'item_ids' => $itemIds,
            'total_items' => $totalItems,
            'processed_items' => 0,
            'succeeded' => 0,
            'failed' => 0,
            'skipped' => 0,
            'failures' => [],
        ]);

        $auditLog = $this->auditLogger->log(
            tenant: $tenant,
            action: "bulk.{$resource}.{$action}.created",
            context: [
                'metadata' => [
                    'bulk_run_id' => $run->id,
                    'total_items' => $totalItems,
                ],
            ],
            actorId: $user->id,
            actorEmail: $user->email,
            actorName: $user->name,
            resourceType: 'bulk_operation_run',
            resourceId: (string) $run->id
        );

        $run->update(['audit_log_id' => $auditLog->id]);

        return $run;
    }

    /**
     * Set the run's status to "running".
     */
    public function start(BulkOperationRun $run): void
    {
        $run->update(['status' => 'running']);
    }

    /**
     * Count one item as processed and succeeded.
     */
    public function recordSuccess(BulkOperationRun $run): void
    {
        $run->increment('processed_items');
        $run->increment('succeeded');
    }

    /**
     * Count one item as processed and failed, and append a failure entry.
     *
     * @param  string  $itemId  Identifier stored in the failure entry.
     * @param  string  $reason  Raw reason; passed through sanitizeFailureReason() before storage.
     */
    public function recordFailure(BulkOperationRun $run, string $itemId, string $reason): void
    {
        $this->appendFailureEntry(
            $run,
            [
                'item_id' => $itemId,
                'reason' => $this->sanitizeFailureReason($reason),
                'timestamp' => now()->toIso8601String(),
            ],
            'failed'
        );
    }

    /**
     * Count one item as processed and skipped (no reason recorded).
     */
    public function recordSkipped(BulkOperationRun $run): void
    {
        $run->increment('processed_items');
        $run->increment('skipped');
    }

    /**
     * Count one item as processed and skipped, and append an entry of type "skipped"
     * to the run's failures list.
     *
     * @param  string  $itemId  Identifier stored in the entry.
     * @param  string  $reason  Raw reason; passed through sanitizeFailureReason() before storage.
     */
    public function recordSkippedWithReason(BulkOperationRun $run, string $itemId, string $reason): void
    {
        $this->appendFailureEntry(
            $run,
            [
                'item_id' => $itemId,
                'reason' => $this->sanitizeFailureReason($reason),
                'type' => 'skipped',
                'timestamp' => now()->toIso8601String(),
            ],
            'skipped'
        );
    }

    /**
     * Move an active run to "completed" (no failures) or "completed_with_errors"
     * and write the matching audit entry with outcome counters and per-reason counts.
     *
     * Does nothing when the run is no longer "pending"/"running", or when the
     * conditional status update affects no row (another writer got there first).
     */
    public function complete(BulkOperationRun $run): void
    {
        $run->refresh();

        if (! in_array($run->status, self::ACTIVE_STATUSES, true)) {
            return;
        }

        $status = $run->failed > 0 ? 'completed_with_errors' : 'completed';

        // Conditional update: only one caller can win the transition, so the
        // audit entry below is written at most once per run.
        $updated = BulkOperationRun::query()
            ->whereKey($run->id)
            ->whereIn('status', self::ACTIVE_STATUSES)
            ->update(['status' => $status]);

        if ($updated === 0) {
            return;
        }

        $run->refresh();

        $failureEntries = collect($run->failures ?? []);

        // Entries without a "type" key are treated as failures.
        $failedReasons = $failureEntries
            ->filter(fn (array $entry) => ($entry['type'] ?? 'failed') !== 'skipped')
            ->groupBy('reason')
            ->map(fn ($group) => $group->count())
            ->all();

        $skippedReasons = $failureEntries
            ->filter(fn (array $entry) => ($entry['type'] ?? null) === 'skipped')
            ->groupBy('reason')
            ->map(fn ($group) => $group->count())
            ->all();

        $this->auditLogger->log(
            tenant: $run->tenant,
            action: "bulk.{$run->resource}.{$run->action}.{$status}",
            context: [
                'metadata' => [
                    'bulk_run_id' => $run->id,
                    'succeeded' => $run->succeeded,
                    'failed' => $run->failed,
                    'skipped' => $run->skipped,
                    'failed_reasons' => $failedReasons,
                    'skipped_reasons' => $skippedReasons,
                ],
            ],
            actorId: $run->user_id,
            resourceType: 'bulk_operation_run',
            resourceId: (string) $run->id
        );
    }

    /**
     * Set the run's status to "failed" and write a "failed" audit entry
     * carrying the sanitized reason.
     *
     * NOTE(review): unlike complete(), the status is overwritten regardless of
     * the current status — confirm this is intended for already-finished runs.
     *
     * @param  string  $reason  Raw reason; sanitized before it is logged.
     */
    public function fail(BulkOperationRun $run, string $reason): void
    {
        $run->update(['status' => 'failed']);

        $reason = $this->sanitizeFailureReason($reason);

        $this->auditLogger->log(
            tenant: $run->tenant,
            action: "bulk.{$run->resource}.{$run->action}.failed",
            context: [
                'reason' => $reason,
                'metadata' => [
                    'bulk_run_id' => $run->id,
                ],
            ],
            actorId: $run->user_id,
            status: 'failure',
            resourceType: 'bulk_operation_run',
            resourceId: (string) $run->id
        );
    }

    /**
     * Set the run's status to "aborted" and write an "aborted" audit entry
     * carrying the sanitized reason and the run's current outcome counters.
     *
     * NOTE(review): unlike complete(), the status is overwritten regardless of
     * the current status — confirm this is intended for already-finished runs.
     *
     * @param  string  $reason  Raw reason; sanitized before it is logged.
     */
    public function abort(BulkOperationRun $run, string $reason): void
    {
        $run->update(['status' => 'aborted']);

        $reason = $this->sanitizeFailureReason($reason);

        $this->auditLogger->log(
            tenant: $run->tenant,
            action: "bulk.{$run->resource}.{$run->action}.aborted",
            context: [
                'reason' => $reason,
                'metadata' => [
                    'bulk_run_id' => $run->id,
                    'succeeded' => $run->succeeded,
                    'failed' => $run->failed,
                    'skipped' => $run->skipped,
                ],
            ],
            actorId: $run->user_id,
            status: 'failure',
            resourceType: 'bulk_operation_run',
            resourceId: (string) $run->id
        );
    }

    /**
     * Append one entry to the run's failures list and bump `processed_items`
     * plus the given outcome counter, based on the row's current database state.
     *
     * The row is re-read under a row lock inside a transaction so that a stale
     * in-memory model (e.g. a copy held by another queue worker) cannot
     * overwrite counters or failure entries written in the meantime.
     *
     * @param  array  $entry  Failure entry to append.
     * @param  string  $counterColumn  Outcome counter to increment: "failed" or "skipped".
     */
    private function appendFailureEntry(BulkOperationRun $run, array $entry, string $counterColumn): void
    {
        $persisted = $run->getConnection()->transaction(
            function () use ($run, $entry, $counterColumn): bool {
                $locked = BulkOperationRun::query()
                    ->whereKey($run->getKey())
                    ->lockForUpdate()
                    ->first();

                // Row no longer exists: nothing to update.
                if ($locked === null) {
                    return false;
                }

                $failures = $locked->failures ?? [];
                $failures[] = $entry;

                $locked->update([
                    'failures' => $failures,
                    'processed_items' => $locked->processed_items + 1,
                    $counterColumn => $locked->{$counterColumn} + 1,
                ]);

                return true;
            }
        );

        // Bring the caller's instance in line with what was just persisted.
        if ($persisted) {
            $run->refresh();
        }
    }
}