From 1e750e35d17ef31b7e6739db4a6fa1b1c54c7689 Mon Sep 17 00:00:00 2001 From: Nathan Coad Date: Mon, 6 Apr 2026 19:09:20 +1000 Subject: [PATCH] Implemented the next 4h-plan phase: dual-run support + explicit cutover gate. --- README.md | 1 + docker-compose.yml | 54 ++++++++++-- docs/rain_model_runbook.md | 29 ++++++- scripts/check_rain_cutover_gate.py | 134 +++++++++++++++++++++++++++++ scripts/compare_rain_reports.py | 23 ++++- scripts/rainml_py.sh | 8 +- todo.md | 9 +- 7 files changed, 238 insertions(+), 20 deletions(-) create mode 100644 scripts/check_rain_cutover_gate.py diff --git a/README.md b/README.md index 5d5dfd7..d2ad5cd 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ Runbook/docs: - `docs/rain_model_runbook.md` - `scripts/recommend_rain_model.py` (rank reports and recommend deploy candidate) - `scripts/rainml_py.sh` (run ML Python scripts inside the `rainml` container; avoids host virtualenv/dependency setup) +- `scripts/check_rain_cutover_gate.py` (automated pass/fail gate using baseline vs candidate test metrics) ## Publish a test WS90 payload ```sh diff --git a/docker-compose.yml b/docker-compose.yml index b275c4f..6b9a2df 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -36,11 +36,11 @@ services: RAIN_SITE: "home" RAIN_HORIZON_HOURS: "4" RAIN_MODEL_NAME: "rain_next_4h" - RAIN_MODEL_VERSION_BASE: "rain-auto-v2-extended-4h" + RAIN_MODEL_VERSION_BASE: "rain-auto-v2-extended-calendar-4h" RAIN_MODEL_FAMILY: "auto" - RAIN_FEATURE_SET: "extended" + RAIN_FEATURE_SET: "extended_calendar" RAIN_FORECAST_MODEL: "ecmwf" - RAIN_LOOKBACK_DAYS: "30" + RAIN_LOOKBACK_DAYS: "60" RAIN_TRAIN_INTERVAL_HOURS: "24" RAIN_PREDICT_INTERVAL_MINUTES: "10" RAIN_MIN_PRECISION: "0.70" @@ -50,12 +50,48 @@ services: RAIN_THRESHOLD_POLICY: "walk_forward" RAIN_WALK_FORWARD_FOLDS: "4" RAIN_ALLOW_EMPTY_DATA: "true" - RAIN_MODEL_PATH: "/app/models/rain_model.pkl" - RAIN_MODEL_BACKUP_PATH: "/app/models/rain_model.pkl.last_good" - RAIN_REPORT_PATH: 
"/app/models/rain_model_report.json" - RAIN_AUDIT_PATH: "/app/models/rain_data_audit.json" - RAIN_DATASET_PATH: "/app/models/datasets/rain_dataset_{model_version}_{feature_set}.csv" - RAIN_MODEL_CARD_PATH: "/app/models/model_card_{model_version}.md" + RAIN_MODEL_PATH: "/app/models/rain_model_4h.pkl" + RAIN_MODEL_BACKUP_PATH: "/app/models/rain_model_4h.pkl.last_good" + RAIN_REPORT_PATH: "/app/models/rain_model_report_4h.json" + RAIN_AUDIT_PATH: "/app/models/rain_data_audit_4h.json" + RAIN_DATASET_PATH: "/app/models/datasets/rain_dataset_4h_{model_version}_{feature_set}.csv" + RAIN_MODEL_CARD_PATH: "/app/models/model_card_4h_{model_version}.md" + volumes: + - ./models:/app/models + + rainml_1h: + build: + context: . + dockerfile: Dockerfile.train + depends_on: + - timescaledb + restart: unless-stopped + profiles: ["shadow"] + environment: + DATABASE_URL: "postgres://postgres:postgres@timescaledb:5432/micrometeo?sslmode=disable" + RAIN_SITE: "home" + RAIN_HORIZON_HOURS: "1" + RAIN_MODEL_NAME: "rain_next_1h" + RAIN_MODEL_VERSION_BASE: "rain-auto-v1-extended-1h" + RAIN_MODEL_FAMILY: "auto" + RAIN_FEATURE_SET: "extended" + RAIN_FORECAST_MODEL: "ecmwf" + RAIN_LOOKBACK_DAYS: "60" + RAIN_TRAIN_INTERVAL_HOURS: "24" + RAIN_PREDICT_INTERVAL_MINUTES: "10" + RAIN_MIN_PRECISION: "0.70" + RAIN_TUNE_HYPERPARAMETERS: "true" + RAIN_MAX_HYPERPARAM_TRIALS: "12" + RAIN_CALIBRATION_METHODS: "none,sigmoid,isotonic" + RAIN_THRESHOLD_POLICY: "walk_forward" + RAIN_WALK_FORWARD_FOLDS: "4" + RAIN_ALLOW_EMPTY_DATA: "true" + RAIN_MODEL_PATH: "/app/models/rain_model_1h.pkl" + RAIN_MODEL_BACKUP_PATH: "/app/models/rain_model_1h.pkl.last_good" + RAIN_REPORT_PATH: "/app/models/rain_model_report_1h.json" + RAIN_AUDIT_PATH: "/app/models/rain_data_audit_1h.json" + RAIN_DATASET_PATH: "/app/models/datasets/rain_dataset_1h_{model_version}_{feature_set}.csv" + RAIN_MODEL_CARD_PATH: "/app/models/model_card_1h_{model_version}.md" volumes: - ./models:/app/models diff --git a/docs/rain_model_runbook.md 
b/docs/rain_model_runbook.md index 10a65d5..0c2032d 100644 --- a/docs/rain_model_runbook.md +++ b/docs/rain_model_runbook.md @@ -39,8 +39,8 @@ scripts/rainml_py.sh scripts/train_rain_model.py \ --threshold-policy "walk_forward" \ --walk-forward-folds 4 \ --model-version "rain-auto-v2-extended-4h" \ - --out "models/rain_model.pkl" \ - --report-out "models/rain_model_report.json" \ + --out "models/rain_model_4h.pkl" \ + --report-out "models/rain_model_report_4h.json" \ --model-card-out "models/model_card_{model_version}.md" \ --dataset-out "models/datasets/rain_dataset_{model_version}_{feature_set}.csv" ``` @@ -61,7 +61,7 @@ Review in report: ```sh scripts/rainml_py.sh scripts/predict_rain_model.py \ --site home \ - --model-path "models/rain_model.pkl" \ + --model-path "models/rain_model_4h.pkl" \ --model-name "rain_next_4h" \ --horizon-hours 4 \ --dry-run @@ -72,7 +72,7 @@ scripts/rainml_py.sh scripts/predict_rain_model.py \ ```sh scripts/rainml_py.sh scripts/predict_rain_model.py \ --site home \ - --model-path "models/rain_model.pkl" \ + --model-path "models/rain_model_4h.pkl" \ --model-name "rain_next_4h" \ --horizon-hours 4 ``` @@ -155,6 +155,14 @@ The script exits non-zero on failure, so it can directly drive alerting. - `RAIN_MODEL_BACKUP_PATH` - `RAIN_MODEL_CARD_PATH` +Dual-run note: +- `rainml` is configured as 4-hour model training/inference with dedicated artifact paths. +- `rainml_1h` is available as an optional shadow/baseline service via profile `shadow`. 
+- Start both (4h + 1h shadow): + `docker compose --profile shadow up -d rainml rainml_1h` +- Run one-off script against the 1h service: + `RAINML_PY_SERVICE=rainml_1h scripts/rainml_py.sh scripts/train_rain_model.py ...` + Recommended production defaults: - Enable tuning daily or weekly (`RAIN_TUNE_HYPERPARAMETERS=true`) - Set `RAIN_THRESHOLD_POLICY=walk_forward` with `RAIN_WALK_FORWARD_FOLDS=4` for temporally robust threshold selection @@ -217,6 +225,19 @@ scripts/rainml_py.sh scripts/compare_rain_reports.py \ --baseline "models/rain_model_report_1h.json" \ --candidate "models/rain_model_report_4h.json" ``` + +4b. Apply an explicit cutover gate (exit code 0 = pass): + +```sh +scripts/rainml_py.sh scripts/check_rain_cutover_gate.py \ + --baseline "models/rain_model_report_1h.json" \ + --candidate "models/rain_model_report_4h.json" \ + --min-candidate-precision 0.60 \ + --max-precision-drop 0.05 \ + --max-pr-auc-drop 0.05 \ + --max-roc-auc-drop 0.05 \ + --max-brier-increase 0.03 +``` 5. 
#!/usr/bin/env python3
# Go/no-go cutover gate for rain model reports.
#
# Reads the ``test_metrics`` object of a baseline report and a candidate
# report, compares precision, PR-AUC, ROC-AUC and Brier score against
# configurable tolerances, and prints one line per check.
#
# Exit status:
#   0 - every check passed
#   1 - at least one check failed
#   2 - the gate could not be evaluated (report missing or unreadable,
#       metric missing, non-numeric or non-finite)
from __future__ import annotations

import argparse
import json
import math
import sys
from pathlib import Path
from typing import Any

# Metrics every report must provide under "test_metrics".
_METRIC_KEYS = ("precision", "pr_auc", "roc_auc", "brier")


class _GateInputError(Exception):
    """Raised when a report cannot be read or lacks a usable metric."""


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        Namespace with ``baseline``, ``candidate`` and the five tolerance
        values.
    """
    parser = argparse.ArgumentParser(description="Evaluate go/no-go cutover gate for rain model reports.")
    parser.add_argument("--baseline", required=True, help="Baseline report JSON path (for example 1h production).")
    parser.add_argument("--candidate", required=True, help="Candidate report JSON path (for example 4h shadow).")
    parser.add_argument(
        "--min-candidate-precision",
        type=float,
        default=0.60,
        help="Minimum allowed candidate test precision.",
    )
    parser.add_argument(
        "--max-precision-drop",
        type=float,
        default=0.05,
        help="Maximum allowed drop: candidate_precision >= baseline_precision - value.",
    )
    parser.add_argument(
        "--max-pr-auc-drop",
        type=float,
        default=0.05,
        help="Maximum allowed drop: candidate_pr_auc >= baseline_pr_auc - value.",
    )
    parser.add_argument(
        "--max-roc-auc-drop",
        type=float,
        default=0.05,
        help="Maximum allowed drop: candidate_roc_auc >= baseline_roc_auc - value.",
    )
    parser.add_argument(
        "--max-brier-increase",
        type=float,
        default=0.03,
        help="Maximum allowed increase: candidate_brier <= baseline_brier + value.",
    )
    return parser.parse_args(argv)


def load_report(path: str) -> dict[str, Any]:
    """Load a report JSON file.

    Args:
        path: Filesystem path of the report.

    Returns:
        The decoded top-level JSON object.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or its top level is not an object.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(path)
    with p.open("r", encoding="utf-8") as f:
        report = json.load(f)
    if not isinstance(report, dict):
        # metric() relies on dict access; reject other JSON shapes here.
        raise ValueError("report is not a JSON object")
    return report


def metric(report: dict[str, Any], key: str) -> float:
    """Return ``report["test_metrics"][key]`` as a finite float.

    Args:
        report: Decoded report object.
        key: Metric name inside ``test_metrics``.

    Returns:
        The metric converted with ``float()``.

    Raises:
        ValueError: If ``test_metrics`` is absent or not an object, or the
            metric is missing, ``None``, not convertible to float, or
            NaN/infinite.
    """
    test_metrics = report.get("test_metrics")
    if not isinstance(test_metrics, dict):
        # Covers both an absent section and an explicit JSON null.
        raise ValueError(f"missing test metric: {key}")
    value = test_metrics.get(key)
    if value is None:
        raise ValueError(f"missing test metric: {key}")
    try:
        number = float(value)
    except (TypeError, ValueError) as err:
        raise ValueError(f"non-numeric test metric: {key}={value!r}") from err
    if not math.isfinite(number):
        # Every comparison against NaN is False, so report it as unusable
        # input rather than as a metric regression.
        raise ValueError(f"non-finite test metric: {key}={value!r}")
    return number


def _collect_metrics(label: str, path: str) -> tuple[dict[str, Any], dict[str, float]]:
    """Load one report and extract every gate metric, labelling any error."""
    try:
        report = load_report(path)
    except FileNotFoundError as err:
        raise _GateInputError(f"{label} report not found: {path}") from err
    except (OSError, ValueError) as err:
        raise _GateInputError(f"cannot read {label} report {path}: {err}") from err
    try:
        values = {key: metric(report, key) for key in _METRIC_KEYS}
    except ValueError as err:
        raise _GateInputError(f"{label} report {path}: {err}") from err
    return report, values


def _build_checks(
    base: dict[str, float],
    cand: dict[str, float],
    args: argparse.Namespace,
) -> list[tuple[str, bool, str]]:
    """Return ``(name, passed, detail)`` for every gate check, in print order."""
    precision_floor = args.min_candidate_precision
    precision_limit = base["precision"] - args.max_precision_drop
    pr_auc_limit = base["pr_auc"] - args.max_pr_auc_drop
    roc_auc_limit = base["roc_auc"] - args.max_roc_auc_drop
    brier_limit = base["brier"] + args.max_brier_increase
    return [
        (
            "candidate_precision_floor",
            cand["precision"] >= precision_floor,
            f"{cand['precision']:.4f} >= {precision_floor:.4f}",
        ),
        (
            "precision_drop",
            cand["precision"] >= precision_limit,
            f"{cand['precision']:.4f} >= {precision_limit:.4f}",
        ),
        (
            "pr_auc_drop",
            cand["pr_auc"] >= pr_auc_limit,
            f"{cand['pr_auc']:.4f} >= {pr_auc_limit:.4f}",
        ),
        (
            "roc_auc_drop",
            cand["roc_auc"] >= roc_auc_limit,
            f"{cand['roc_auc']:.4f} >= {roc_auc_limit:.4f}",
        ),
        (
            # Brier is an error score, so the candidate must not exceed the limit.
            "brier_increase",
            cand["brier"] <= brier_limit,
            f"{cand['brier']:.4f} <= {brier_limit:.4f}",
        ),
    ]


def main(argv: list[str] | None = None) -> int:
    """Run the cutover gate and return the process exit status.

    Args:
        argv: Optional argument list; ``None`` uses ``sys.argv[1:]``.

    Returns:
        0 when every check passes, 1 when at least one check fails, 2 when
        the inputs could not be evaluated.
    """
    args = parse_args(argv)
    try:
        baseline, base_metrics = _collect_metrics("baseline", args.baseline)
        candidate, cand_metrics = _collect_metrics("candidate", args.candidate)
    except _GateInputError as err:
        # Keep "could not evaluate" distinguishable from "gate failed".
        print(f"error: {err}", file=sys.stderr)
        return 2

    checks = _build_checks(base_metrics, cand_metrics, args)

    print("Rain cutover gate:")
    print(f" baseline: {args.baseline}")
    print(f" candidate: {args.candidate}")
    print(f" baseline_version={baseline.get('model_version')} candidate_version={candidate.get('model_version')}")

    for name, ok, detail in checks:
        status = "ok" if ok else "fail"
        print(f" {name}: {status} ({detail})")

    failures = [name for name, ok, _ in checks if not ok]
    if failures:
        print(f"cutover_decision: FAIL ({', '.join(failures)})")
        return 1

    print("cutover_decision: PASS")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
b/scripts/compare_rain_reports.py index 712486b..caac3be 100644 --- a/scripts/compare_rain_reports.py +++ b/scripts/compare_rain_reports.py @@ -16,6 +16,8 @@ def parse_args() -> argparse.Namespace: def load_json(path: str) -> dict[str, Any]: p = Path(path) + if not p.exists(): + raise FileNotFoundError(path) with p.open("r", encoding="utf-8") as f: return json.load(f) @@ -42,8 +44,25 @@ def delta_str(base: float | None, cand: float | None) -> str: def main() -> int: args = parse_args() - baseline = load_json(args.baseline) - candidate = load_json(args.candidate) + try: + baseline = load_json(args.baseline) + except FileNotFoundError: + print(f"error: baseline report not found: {args.baseline}") + model_dir = Path("models") + if model_dir.exists(): + candidates = sorted(model_dir.glob("rain_model_report*.json")) + if candidates: + print("available report files:") + for c in candidates: + print(f" - {c}") + print("hint: provide an existing 1h report path, or train a new 1h report first.") + return 2 + + try: + candidate = load_json(args.candidate) + except FileNotFoundError: + print(f"error: candidate report not found: {args.candidate}") + return 2 pairs = [ ("precision", metric(baseline, "test_metrics", "precision"), metric(candidate, "test_metrics", "precision")), diff --git a/scripts/rainml_py.sh b/scripts/rainml_py.sh index 323438e..545bc8b 100755 --- a/scripts/rainml_py.sh +++ b/scripts/rainml_py.sh @@ -16,12 +16,16 @@ Examples: Optional: RAINML_PY_BUILD=1 scripts/rainml_py.sh ... (builds the rainml image before running) + RAINML_PY_SERVICE=rainml_1h scripts/rainml_py.sh ... 
+ (runs against a specific compose service; default is rainml) EOF exit 1 fi +SERVICE="${RAINML_PY_SERVICE:-rainml}" + if [[ "${RAINML_PY_BUILD:-0}" == "1" ]]; then - docker compose build rainml + docker compose build "$SERVICE" fi -docker compose run --rm --no-deps --entrypoint python3 rainml "$@" +docker compose run --rm --no-deps --entrypoint python3 "$SERVICE" "$@" diff --git a/todo.md b/todo.md index 43111e6..d078387 100644 --- a/todo.md +++ b/todo.md @@ -56,7 +56,7 @@ Priority key: `P0` = critical/blocking, `P1` = important, `P2` = later optimizat - [x] [P1] Add one expanded feature set and rerun evaluation. (completed on runtime machine 2026-03-12 with `feature_set=extended`, `model_version=rain-auto-v1-extended-202603120932`) - [x] [P0] Decide v1 threshold and define deployment interface. -## 9) Extension Plan: 4-Hour Precipitation Window (Not Started) +## 9) Extension Plan: 4-Hour Precipitation Window (In Progress) - [x] [P0] Lock v2 target definition for horizon extension: `rain_next_4h_mm >= ` and explicitly decide whether the threshold remains `0.2mm` or is increased for 4-hour labeling. (implemented with `0.2mm` carry-forward) - [x] [P0] Decide rollout strategy: additive dual-horizon support (`1h` + `4h`) vs direct replacement; prefer dual-horizon for safe cutover. (implemented as additive dual-horizon) - [x] [P0] Parameterize label horizon in shared ML code (`scripts/rain_model_common.py`) so target columns are generated for 4-hour windows (48 x 5-minute buckets) instead of hard-coded 1-hour columns. @@ -72,8 +72,11 @@ Priority key: `P0` = critical/blocking, `P1` = important, `P2` = later optimizat - [x] [P1] Update dashboard API defaults (`cmd/ingestd/web.go`) from `rain_next_1h` to the selected 4-hour model name (or make model name configurable). - [x] [P1] Update web UI labels/semantics (`cmd/ingestd/web/index.html`, `cmd/ingestd/web/app.js`) from “Rain 1h %” to “Rain 4h %” and verify chart legends/tooltips match the new horizon. 
- [x] [P1] Update worker/runtime defaults (`docker-compose.yml`, `scripts/run_rain_ml_worker.py`, `scripts/run_p0_rain_workflow.sh`) to use `rain_next_4h` naming/versioning. +- [x] [P1] Add dual-run deployment support with isolated artifacts for 4h and 1h workers (`docker-compose.yml` + `scripts/rainml_py.sh` service targeting). - [x] [P0] Update health-check defaults (`scripts/check_rain_pipeline_health.py`) for 4-hour evaluation latency (e.g., pending-eval age threshold > 4h). - [x] [P1] Update docs and runbooks (`README.md`, `docs/rain_prediction.md`, `docs/rain_model_runbook.md`) so commands, table names, and target definitions match the 4-hour system. -- [ ] [P0] Run full retraining/evaluation for the 4-hour target and compare against current 1-hour model metrics before production cutover. -- [ ] [P0] Execute staged rollout: deploy schema + views, deploy model + inference, verify dashboard/health checks, then switch default model name. +- [x] [P1] Add explicit automated cutover gate script for baseline-vs-candidate decisioning. (`scripts/check_rain_cutover_gate.py`) +- [x] [P0] Run full retraining/evaluation for the 4-hour target and compare against current 1-hour model metrics before production cutover. (completed on runtime machine 2026-04-06; comparison vs `rain_model_report_extended_eval.json` showed regression: precision `0.7103 -> 0.5545`, PR-AUC `0.7245 -> 0.5850`, ROC-AUC `0.9184 -> 0.7843`, Brier `0.0931 -> 0.2276`) +- [ ] [P0] Execute staged rollout: deploy schema + views, deploy model + inference, verify dashboard/health checks, then switch default model name. (schema/views/deploy/inference/health completed on runtime machine 2026-04-06; final production cutover decision remains open due to 4h metric regression vs 1h baseline) +- [ ] [P0] Improve 4-hour model quality to meet cutover gate (at minimum recover precision and calibration relative to current 1-hour production baseline) before declaring rollout complete. 
- [x] [P1] Keep rollback path documented: retain `rain_next_1h` artifacts/table access until 4-hour monitoring is stable. (documented in `docs/rain_model_runbook.md` staged rollout/rollback section)