Implemented the next 4h-plan phase: dual-run support + explicit cutover gate.

2026-04-06 19:09:20 +10:00
parent 1ef300d25e
commit 1e750e35d1
7 changed files with 238 additions and 20 deletions
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the cutover gate.
+
+    Two required report paths (``--baseline`` and ``--candidate``) plus five
+    optional float thresholds, each with a default, that bound how far the
+    candidate's test metrics may trail the baseline's.
+
+    Returns:
+        The namespace produced by ``ArgumentParser.parse_args()``.
+    """
+    cli = argparse.ArgumentParser(description="Evaluate go/no-go cutover gate for rain model reports.")
+
+    # Required inputs: (flag, help).
+    report_options = (
+        ("--baseline", "Baseline report JSON path (for example 1h production)."),
+        ("--candidate", "Candidate report JSON path (for example 4h shadow)."),
+    )
+    for flag, help_text in report_options:
+        cli.add_argument(flag, required=True, help=help_text)
+
+    # Optional float thresholds: (flag, default, help).
+    threshold_options = (
+        ("--min-candidate-precision", 0.60, "Minimum allowed candidate test precision."),
+        ("--max-precision-drop", 0.05, "Maximum allowed drop: candidate_precision >= baseline_precision - value."),
+        ("--max-pr-auc-drop", 0.05, "Maximum allowed drop: candidate_pr_auc >= baseline_pr_auc - value."),
+        ("--max-roc-auc-drop", 0.05, "Maximum allowed drop: candidate_roc_auc >= baseline_roc_auc - value."),
+        ("--max-brier-increase", 0.03, "Maximum allowed increase: candidate_brier <= baseline_brier + value."),
+    )
+    for flag, default_value, help_text in threshold_options:
+        cli.add_argument(flag, type=float, default=default_value, help=help_text)
+
+    return cli.parse_args()
+
+
+def load_report(path: str) -> dict[str, Any]:
+    """Read a model report from disk and return the decoded JSON.
+
+    Args:
+        path: Location of the report file.
+
+    Returns:
+        The value decoded from the file's UTF-8 JSON text.
+
+    Raises:
+        FileNotFoundError: ``path`` does not exist; the exception carries
+            the path exactly as it was passed in.
+    """
+    report_file = Path(path)
+    if report_file.exists():
+        return json.loads(report_file.read_text(encoding="utf-8"))
+    raise FileNotFoundError(path)
+
+
+def metric(report: dict[str, Any], key: str) -> float:
+    """Return one entry of the report's ``test_metrics`` section as a float.
+
+    Args:
+        report: Decoded report JSON.
+        key: Metric name to look up inside ``report["test_metrics"]``.
+
+    Returns:
+        The metric converted with ``float()``.
+
+    Raises:
+        ValueError: The ``test_metrics`` section is absent, null or not an
+            object, or the metric itself is absent or null.
+    """
+    section = report.get("test_metrics")
+    # A report may carry ``"test_metrics": null`` (or another non-object
+    # value); treat that as a missing metric instead of failing on ``.get``.
+    value = section.get(key) if isinstance(section, dict) else None
+    if value is None:
+        raise ValueError(f"missing test metric: {key}")
+    return float(value)
+
+
+def main() -> int:
+    """Evaluate the cutover gate and print a go/no-go decision.
+
+    Loads the baseline and candidate reports, reads their test
+    ``precision``, ``pr_auc``, ``roc_auc`` and ``brier`` values, compares
+    them against the command-line thresholds, and prints one line per check
+    followed by the overall decision.
+
+    Returns:
+        0 when every check passes, 1 when at least one check fails, and 2
+        when the gate cannot be evaluated at all (report file missing,
+        malformed JSON, or a metric that is missing or not numeric).
+    """
+    args = parse_args()
+    metric_names = ("precision", "pr_auc", "roc_auc", "brier")
+
+    # Keep "could not evaluate" (2) apart from a real no-go (1): an uncaught
+    # exception would also exit with status 1 and look like a gate failure.
+    scores: dict[str, dict[str, float]] = {}
+    versions: dict[str, Any] = {}
+    for role, path in (("baseline", args.baseline), ("candidate", args.candidate)):
+        try:
+            report = load_report(path)
+            if not isinstance(report, dict):
+                raise ValueError("top-level JSON value is not an object")
+            scores[role] = {name: metric(report, name) for name in metric_names}
+        except FileNotFoundError:
+            print(f"error: {role} report not found: {path}")
+            return 2
+        except (TypeError, ValueError) as exc:
+            # ValueError also covers json.JSONDecodeError (malformed file).
+            print(f"error: {role} report is unusable: {path} ({exc})")
+            return 2
+        versions[role] = report.get("model_version")
+
+    base, cand = scores["baseline"], scores["candidate"]
+
+    # (check name, candidate value, comparison, bound). The first four bound
+    # the candidate from below; the brier check bounds it from above.
+    rules = (
+        ("candidate_precision_floor", cand["precision"], ">=", args.min_candidate_precision),
+        ("precision_drop", cand["precision"], ">=", base["precision"] - args.max_precision_drop),
+        ("pr_auc_drop", cand["pr_auc"], ">=", base["pr_auc"] - args.max_pr_auc_drop),
+        ("roc_auc_drop", cand["roc_auc"], ">=", base["roc_auc"] - args.max_roc_auc_drop),
+        ("brier_increase", cand["brier"], "<=", base["brier"] + args.max_brier_increase),
+    )
+
+    print("Rain cutover gate:")
+    print(f"  baseline: {args.baseline}")
+    print(f"  candidate: {args.candidate}")
+    print(f"  baseline_version={versions['baseline']} candidate_version={versions['candidate']}")
+
+    failures: list[str] = []
+    for name, observed, comparison, bound in rules:
+        # A NaN on either side compares False, so such a check fails closed.
+        ok = observed >= bound if comparison == ">=" else observed <= bound
+        print(f"  {name}: {'ok' if ok else 'fail'} ({observed:.4f} {comparison} {bound:.4f})")
+        if not ok:
+            failures.append(name)
+
+    if failures:
+        print(f"cutover_decision: FAIL ({', '.join(failures)})")
+        return 1
+
+    print("cutover_decision: PASS")
+    return 0
+
+
+# Script entry point: main()'s integer return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -16,6 +16,8 @@ def parse_args() -> argparse.Namespace:

 def load_json(path: str) -> dict[str, Any]:
    p = Path(path)
+    # Explicit existence check: raise FileNotFoundError carrying the path
+    # exactly as given; main() catches it and prints an error.
+    if not p.exists():
+        raise FileNotFoundError(path)
    with p.open("r", encoding="utf-8") as f:
        return json.load(f)

@@ -42,8 +44,25 @@ def delta_str(base: float | None, cand: float | None) -> str:

 def main() -> int:
    args = parse_args()
-    baseline = load_json(args.baseline)
-    candidate = load_json(args.candidate)
+    try:
+        baseline = load_json(args.baseline)
+    except FileNotFoundError:
+        print(f"error: baseline report not found: {args.baseline}")
+        model_dir = Path("models")
+        if model_dir.exists():
+            candidates = sorted(model_dir.glob("rain_model_report*.json"))
+            if candidates:
+                print("available report files:")
+                for c in candidates:
+                    print(f"  - {c}")
+        print("hint: provide an existing 1h report path, or train a new 1h report first.")
+        return 2
+
+    try:
+        candidate = load_json(args.candidate)
+    except FileNotFoundError:
+        print(f"error: candidate report not found: {args.candidate}")
+        return 2

    pairs = [
        ("precision", metric(baseline, "test_metrics", "precision"), metric(candidate, "test_metrics", "precision")),
@@ -16,12 +16,16 @@ Examples:
 Optional:
  RAINML_PY_BUILD=1 scripts/rainml_py.sh ...
  (builds the rainml image before running)
+  RAINML_PY_SERVICE=rainml_1h scripts/rainml_py.sh ...
+  (runs against a specific compose service; default is rainml)
 EOF
  exit 1
 fi

+SERVICE="${RAINML_PY_SERVICE:-rainml}"
+
 if [[ "${RAINML_PY_BUILD:-0}" == "1" ]]; then
-  docker compose build rainml
+  docker compose build "$SERVICE"
 fi

-docker compose run --rm --no-deps --entrypoint python3 rainml "$@"
+docker compose run --rm --no-deps --entrypoint python3 "$SERVICE" "$@"