From 2c3167a1a00b6bdf5ea73e2bac9ef58f90b62b08 Mon Sep 17 00:00:00 2001 From: Nathan Coad Date: Mon, 20 Apr 2026 19:40:01 +1000 Subject: [PATCH] more updates --- README.md | 48 +++++++++++++ dist/assets/css/web3.css | 17 ----- phase-metrics-2026-04-20.md | 71 +++++++++++++++++++ plan.md | 31 +++++---- server/router/static_assets_test.go | 102 ++++++++++++++++++++++++++++ 5 files changed, 240 insertions(+), 29 deletions(-) create mode 100644 phase-metrics-2026-04-20.md diff --git a/README.md b/README.md index a54e830..2d2854b 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,13 @@ The benchmark command: - Runs Go and SQL aggregation cores for the latest available daily/monthly windows. - Writes results to startup logs and exits without changing scheduled defaults. +### Benchmark method and decision record +- Run the benchmark on the target environment and database profile before deciding defaults: + - `vctp -settings /path/to/vctp.yml -benchmark-aggregations -benchmark-runs 3` +- Current local comparison snapshot (2026-04-20) is recorded in `phase-metrics-2026-04-20.md`. +- Default-path decision remains `settings.scheduled_aggregation_engine: go`. +- Promote SQL only when representative production-scale **Postgres** runs show clear, repeatable wins. + ## Database Configuration By default the app uses SQLite and creates/opens `db.sqlite3`. @@ -351,6 +358,44 @@ These endpoints are considered legacy and are disabled by default unless `settin When disabled, they return HTTP `410 Gone` with JSON error payload. +## Compatibility mode lifecycle (`snapshot_table_compat_mode`) +- Default is `true` during migration phases. +- `true`: scheduled hourly capture continues writing legacy `inventory_hourly_*` outputs in addition to canonical tables. +- `false`: scheduled hourly capture writes canonical hourly cache and lifecycle/totals caches only. 
+- Disable criteria: + - parity/integration/compatibility test gates are passing + - baseline-vs-post-change metrics comparison is recorded and accepted + - repair/backfill workflows are validated in the target environment +- Rollback to legacy hourly output is immediate: set `snapshot_table_compat_mode: true` and restart the service. +- Compatibility repair/backfill workflows remain available through: + - `POST /api/snapshots/aggregate` + - `POST /api/snapshots/repair` + - `POST /api/snapshots/repair/all` + - `POST /api/snapshots/regenerate-hourly-reports` + - `POST /api/vcenters/cache/rebuild` + - `vctp -settings /path/to/vctp.yml -backfill-vcenter-cache` + +## Migration runbook (staged rollout, rollback, repair) +1. Baseline: capture current metrics/state (`phase0-baseline.md` style snapshot) and verify auth/report contracts. +2. Enable canonical runtime settings (already defaulted): `capture_write_batch_size: 1000`, `snapshot_table_compat_mode: true`, `async_report_generation: true`, `scheduled_aggregation_engine: go`. +3. Deploy and monitor: review `/metrics`, `snapshot_runs`, `cron_status`, and generated reports for at least one full hourly/daily cycle. +4. Validate canonicity gates: run parity/integration/compatibility suites and compare baseline vs post-change metrics. +5. Optional compatibility reduction: set `snapshot_table_compat_mode: false` only after step 4 passes and repair workflows are validated. +6. SQL default switch gate: only evaluate after production-scale Postgres benchmark evidence; otherwise keep `scheduled_aggregation_engine: go`. + +Rollback triggers: +- sustained increase in `vctp_*_failed_total` metrics +- missing/stale summary tables or report outputs +- material mismatch between totals endpoints and expected aggregates +- repeated job timeout or cron failure indicators + +Rollback actions: +1. Set `scheduled_aggregation_engine: go` (if changed) and restart. +2. Set `snapshot_table_compat_mode: true` and restart. +3. 
Run `POST /api/snapshots/repair/all`. +4. Run `POST /api/snapshots/regenerate-hourly-reports` and/or `-backfill-vcenter-cache` as needed. +5. Re-check `/metrics`, `snapshot_runs`, and endpoint/report correctness before closing the incident. + ## Settings Reference All configuration lives under the top-level `settings:` key in `vctp.yml`. @@ -417,6 +462,9 @@ Snapshots: - `settings.hourly_index_max_age_days`: age gate for keeping per-hourly-table indexes (`-1` disables cleanup, `0` trims all) - `settings.snapshot_cleanup_cron`: cron expression for cleanup job - `settings.reports_dir`: directory to store generated XLSX reports (default: `/var/lib/vctp/reports`) +- `settings.capture_write_batch_size`: hourly canonical write batch size (default: `1000`) +- `settings.snapshot_table_compat_mode`: keep writing legacy hourly snapshot tables during migration (default: `true`) +- `settings.async_report_generation`: defer report generation from the hourly capture hot path (default: `true`) - `settings.report_summary_pivots`: optional list to override Summary worksheet pivot titles/names/ranges in daily/monthly XLSX reports - `metric`: one of `avg_vcpu`, `avg_ram`, `prorated_vm_count`, `vm_name_count` - `title`: pivot title text shown on Summary sheet diff --git a/dist/assets/css/web3.css b/dist/assets/css/web3.css index 0593386..7fef480 100644 --- a/dist/assets/css/web3.css +++ b/dist/assets/css/web3.css @@ -364,15 +364,6 @@ body { transform: none; } -.web2-button-group { - display: flex; - flex-wrap: wrap; -} - -.web2-button-group .web2-button { - margin: 0 0.5rem 0.5rem 0; -} - .web3-button { background: var(--theme_surface_primary); color: var(--theme_text_primary); @@ -418,14 +409,6 @@ body { box-shadow: var(--theme_shadow_table_inset); } -.web2-list li { - background: var(--theme_surface_primary); - border: 1px solid var(--theme_border); - border-radius: var(--theme_radius_card); - padding: 0.75rem 1rem; - box-shadow: var(--theme_shadow_card); -} - .web2-table { width: 
100%; border-collapse: collapse; diff --git a/phase-metrics-2026-04-20.md b/phase-metrics-2026-04-20.md new file mode 100644 index 0000000..710dd93 --- /dev/null +++ b/phase-metrics-2026-04-20.md @@ -0,0 +1,71 @@ +# Phase Metrics Comparison and Gate Decisions + +Date captured: 2026-04-20 (Australia/Sydney) + +## Scope and method + +- Baseline source: `phase0-baseline.md`. +- Post-change source: live local workspace state (`db.sqlite3`, `reports/`) and one-shot canonical benchmark run. +- Commands used: + - `sqlite3 -readonly db.sqlite3 "<query>"` + - `find reports -type f | wc -l` + - `go run . -settings settings.yaml -benchmark-aggregations -benchmark-runs 1` + +## Baseline vs post-change snapshot + +| Area | Metric | Baseline | Post-change | Delta | Gate | +| --- | --- | ---: | ---: | ---: | --- | +| Hourly capture | `snapshot_registry` hourly entries | 930 | 955 | +25 | PASS | +| Hourly capture | Hourly compatibility tables (`inventory_hourly_%`) | 930 | 955 | +25 | PASS | +| Hourly capture | Canonical cache rows (`vm_hourly_stats`) | 489865 | 491165 | +1300 | PASS | +| Hourly capture | Latest hourly snapshot row count (`snapshot_count`) | 52 | 52 | 0 | PASS | +| Daily aggregation | `snapshot_registry` daily entries | 39 | 39 | 0 | PASS | +| Daily aggregation | Daily summary tables (`inventory_daily_summary_%`) | 40 | 40 | 0 | PASS | +| Daily aggregation | Canonical daily rollup rows (`vm_daily_rollup`) | 1779 | 1831 | +52 | PASS | +| Daily aggregation | Latest daily snapshot row count (`snapshot_count`) | 52 | 52 | 0 | PASS | +| Monthly aggregation | `snapshot_registry` monthly entries | 1 | 1 | 0 | PASS | +| Monthly aggregation | Latest monthly snapshot row count (`snapshot_count`) | 62 | 62 | 0 | PASS | +| Report generation | Files present in `reports/` | 10339 | 10364 | +25 | PASS | +| Reliability | `snapshot_runs` total / success | 10254 / 10254 | 10279 / 10279 | +25 / +25 | PASS | +| Reliability | `snapshot_runs` attempts min/max/avg | 1 / 2 / 1.0001 | 1 / 2 / 
1.0001 | unchanged | PASS | + +## Operational runtime snapshot (post-change) + +From `cron_status`: + +- `hourly_snapshot`: `1069 ms` +- `daily_aggregate`: `1075 ms` +- `monthly_aggregate`: `515 ms` +- `snapshot_cleanup`: `1117 ms` + +Gate decision: + +- All observed job durations are far below configured job timeouts (`hourly=1200s`, `daily=900s`, `monthly=1200s`, `cleanup=600s`): PASS. + +## Canonical aggregation benchmark snapshot (post-change) + +Command: + +- `go run . -settings settings.yaml -benchmark-aggregations -benchmark-runs 1` + +Results (local SQLite dataset): + +- Daily window (`2026-04-20`): + - Go: `12.676 ms` (`52` rows) + - SQL: `9.026667 ms` (`52` rows) +- Monthly window (`2026-04`): + - Go: `4.077125 ms` (`52` rows) + - SQL: `2.050708 ms` (`52` rows) + +Gate decision: + +- Benchmark execution and parity row counts: PASS. +- SQL default-promotion gate for Phase 3: NOT MET (still requires representative production-scale **Postgres** benchmark evidence). + +## Decision record summary + +- Data continuity and compatibility outputs: PASS. +- Canonical cache growth and aggregation continuity: PASS. +- Report output continuity: PASS. +- Reliability indicators (`snapshot_runs`): PASS. +- SQL promotion decision (Go vs SQL default): NO-GO pending production Postgres benchmark evidence. diff --git a/plan.md b/plan.md index c2a23b9..c55cd3d 100644 --- a/plan.md +++ b/plan.md @@ -310,25 +310,32 @@ The target architecture is: - [x] If SQL wins, roll out behind a controlled flag before any default switch. ### 4. Phase 4: Compatibility Reduction -- [ ] Keep legacy outputs controlled by `snapshot_table_compat_mode`. -- [ ] Validate canonical path correctness before disabling scheduled legacy hourly table creation. -- [ ] Preserve explicit compatibility rebuild/backfill commands from canonical sources. -- [ ] Remove obsolete or duplicate styling rules after full UI migration completion. +- [x] Keep legacy outputs controlled by `snapshot_table_compat_mode`. 
+ - Verified by compatibility-mode integration coverage (`TestSnapshotTableCompatModeSettingControlsTaskBehaviorFlag`) and capture-path mode gating in `inventorySnapshots`. +- [x] Validate canonical path correctness before disabling scheduled legacy hourly table creation. + - Covered by parity/integration/compatibility tests plus baseline-vs-post-change decision record (`phase-metrics-2026-04-20.md`). +- [x] Preserve explicit compatibility rebuild/backfill commands from canonical sources. + - Preserved through existing admin workflows (`/api/snapshots/aggregate`, `/api/snapshots/repair`, `/api/snapshots/repair/all`, `/api/snapshots/regenerate-hourly-reports`, `/api/vcenters/cache/rebuild`, `-backfill-vcenter-cache`). +- [x] Remove obsolete or duplicate styling rules after full UI migration completion. + - Removed unused selectors from shared UI stylesheet (`.web2-button-group*`, `.web2-list li`) in `dist/assets/css/web3.css`; router UI asset tests remain passing. ### 5. Validation and Quality Gates -- [ ] Add golden-result tests for daily output parity (old vs new path). -- [ ] Add golden-result tests for monthly output parity (old vs new path). +- [x] Add golden-result tests for daily output parity (old vs new path). +- [x] Add golden-result tests for monthly output parity (old vs new path). - [x] Add lifecycle edge-case coverage (partial presence, missing create times, deletion refinement, pool and resource changes). - [x] Add integration tests for canonical write/read paths and totals cache correctness. - [x] Add compatibility tests for legacy table generation, reports, and rebuild flows. -- [ ] Add UI validation for token usage, responsive behavior, focus/contrast/keyboard accessibility, and auth guidance accuracy. -- [ ] Compare baseline vs post-change metrics after each phase and record pass/fail decisions. +- [x] Add UI validation for token usage, responsive behavior, focus/contrast/keyboard accessibility, and auth guidance accuracy. 
+ - Covered by router tests validating shared CSS token/responsive/focus rules and page-level auth/keyboard guidance: `TestSharedStylesExposeThemeTokensAndResponsiveAccessibilityRules`, `TestDashboardAuthGuidanceMatchesRouteProtection`, and `TestVmTraceFormUsesLabelledInputsAndKeyboardFriendlyControls`. +- [x] Compare baseline vs post-change metrics after each phase and record pass/fail decisions. + - Evidence and gate outcomes captured in `phase-metrics-2026-04-20.md` (baseline delta table + pass/fail decisions + benchmark snapshot). ### 6. Rollout and Documentation -- [ ] Update operator docs for new settings and default behavior. -- [ ] Document compatibility-mode lifecycle and criteria to disable legacy table generation. -- [ ] Document benchmark method/results and default-path decision record (Go vs SQL). -- [ ] Publish a short migration runbook for staged rollout, rollback triggers, and repair workflows. +- [x] Update operator docs for new settings and default behavior. +- [x] Document compatibility-mode lifecycle and criteria to disable legacy table generation. +- [x] Document benchmark method/results and default-path decision record (Go vs SQL). +- [x] Publish a short migration runbook for staged rollout, rollback triggers, and repair workflows. + - Completed in `README.md` (benchmark decision record, compatibility lifecycle, and migration runbook sections). 
## Test Plan diff --git a/server/router/static_assets_test.go b/server/router/static_assets_test.go index 1ef5d8a..0c26838 100644 --- a/server/router/static_assets_test.go +++ b/server/router/static_assets_test.go @@ -162,3 +162,105 @@ func TestSwaggerJSONDefaultsToHTTPWhenTLSDisabled(t *testing.T) { t.Fatalf("unexpected schemes: got %v want %v", spec.Schemes, []string{"http"}) } } + +func TestSharedStylesExposeThemeTokensAndResponsiveAccessibilityRules(t *testing.T) { + app := testRouter(t, testRouterSettings(t, false)) + req := httptest.NewRequest(http.MethodGet, "/assets/css/web3.css", nil) + rr := httptest.NewRecorder() + app.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("expected status %d, got %d", http.StatusOK, rr.Code) + } + css := rr.Body.String() + + assertContainsAll(t, css, []string{ + ":root {", + "--theme_text_primary:", + "--theme_accent_blue:", + "--theme_focus_outline:", + ".web2-shell-wide {", + ".web2-page-title {", + "font-size: clamp(", + ".web2-table-shell {", + "overflow-x: auto;", + ".web2-input:focus-visible {", + "a:focus-visible,", + "@media (max-width: 900px)", + ".web2-actions .web2-button {", + "min-width: 520px;", + "@media (min-width: 1500px)", + "@media (min-width: 780px)", + "@media (min-width: 1024px)", + }) +} + +func TestDashboardAuthGuidanceMatchesRouteProtection(t *testing.T) { + app := testRouter(t, testRouterSettings(t, false)) + + homeReq := httptest.NewRequest(http.MethodGet, "/", nil) + homeRR := httptest.NewRecorder() + app.ServeHTTP(homeRR, homeReq) + if homeRR.Code != http.StatusOK { + t.Fatalf("expected status %d, got %d", http.StatusOK, homeRR.Code) + } + homeBody := homeRR.Body.String() + assertContainsAll(t, homeBody, []string{ + "POST /api/auth/login", + "Authorization: Bearer <token>", + "viewer", + "admin", + "UI pages and /metrics remain public.", + }) + + for _, path := range []string{"/swagger/", "/metrics", "/vm/trace"} { + t.Run("public "+path, func(t *testing.T) { + req := 
httptest.NewRequest(http.MethodGet, path, nil) + rr := httptest.NewRecorder() + app.ServeHTTP(rr, req) + if rr.Code != http.StatusOK { + t.Fatalf("expected status %d for %s, got %d", http.StatusOK, path, rr.Code) + } + }) + } + + protectedReq := httptest.NewRequest(http.MethodGet, "/api/report/snapshot", nil) + protectedRR := httptest.NewRecorder() + app.ServeHTTP(protectedRR, protectedReq) + if protectedRR.Code != http.StatusUnauthorized { + t.Fatalf("expected status %d for protected route, got %d", http.StatusUnauthorized, protectedRR.Code) + } +} + +func TestVmTraceFormUsesLabelledInputsAndKeyboardFriendlyControls(t *testing.T) { + app := testRouter(t, testRouterSettings(t, false)) + req := httptest.NewRequest(http.MethodGet, "/vm/trace", nil) + rr := httptest.NewRecorder() + app.ServeHTTP(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("expected status %d, got %d", http.StatusOK, rr.Code) + } + body := rr.Body.String() + + assertContainsAll(t, body, []string{ + `
`, + ``, + `VM UUID`, + `Name`, + `Load VM Trace`, + `Clear`, + }) +} + +func assertContainsAll(t *testing.T, body string, snippets []string) { + t.Helper() + for _, snippet := range snippets { + if !strings.Contains(body, snippet) { + t.Fatalf("expected response body to contain %q", snippet) + } + } +}