From 9785fc02356a7c3126d046dd60fadb8b250a5038 Mon Sep 17 00:00:00 2001 From: Nathan Coad Date: Thu, 12 Mar 2026 20:39:44 +1100 Subject: [PATCH] improve model training --- README.md | 1 + docker-compose.yml | 1 + docs/rain_model_runbook.md | 15 ++ docs/rain_prediction.md | 28 ++- scripts/recommend_rain_model.py | 324 ++++++++++++++++++++++++++++++++ scripts/run_rain_ml_worker.py | 5 + scripts/train_rain_model.py | 164 +++++++++++++++- todo.md | 2 +- 8 files changed, 536 insertions(+), 4 deletions(-) create mode 100644 scripts/recommend_rain_model.py diff --git a/README.md b/README.md index 6bb6710..ff30bd1 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ Runbook/docs: - `docs/rain_prediction.md` - `docs/rain_data_issues.md` - `docs/rain_model_runbook.md` +- `scripts/recommend_rain_model.py` (rank reports and recommend deploy candidate) ## Publish a test WS90 payload ```sh diff --git a/docker-compose.yml b/docker-compose.yml index 17f8295..1143ab3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -46,6 +46,7 @@ services: RAIN_TUNE_HYPERPARAMETERS: "true" RAIN_MAX_HYPERPARAM_TRIALS: "12" RAIN_CALIBRATION_METHODS: "none,sigmoid,isotonic" + RAIN_THRESHOLD_POLICY: "validation" RAIN_WALK_FORWARD_FOLDS: "0" RAIN_ALLOW_EMPTY_DATA: "true" RAIN_MODEL_PATH: "/app/models/rain_model.pkl" diff --git a/docs/rain_model_runbook.md b/docs/rain_model_runbook.md index 9d08593..3fc743d 100644 --- a/docs/rain_model_runbook.md +++ b/docs/rain_model_runbook.md @@ -27,6 +27,7 @@ python scripts/train_rain_model.py \ --tune-hyperparameters \ --max-hyperparam-trials 12 \ --calibration-methods "none,sigmoid,isotonic" \ + --threshold-policy "walk_forward" \ --walk-forward-folds 4 \ --model-version "rain-auto-v1-extended" \ --out "models/rain_model.pkl" \ @@ -40,6 +41,7 @@ Review in report: - `candidate_models[*].calibration_comparison` - `naive_baselines_test` - `sliced_performance_test` +- `threshold_tuning_walk_forward` - `walk_forward_backtest` ## 3) Deploy @@ -133,6 +135,7 @@ The script exits non-zero on failure, so it can directly drive alerting. - `RAIN_TUNE_HYPERPARAMETERS` - `RAIN_MAX_HYPERPARAM_TRIALS` - `RAIN_CALIBRATION_METHODS` +- `RAIN_THRESHOLD_POLICY` - `RAIN_WALK_FORWARD_FOLDS` - `RAIN_ALLOW_EMPTY_DATA` - `RAIN_MODEL_BACKUP_PATH` @@ -141,3 +144,15 @@ The script exits non-zero on failure, so it can directly drive alerting. Recommended production defaults: - Enable tuning daily or weekly (`RAIN_TUNE_HYPERPARAMETERS=true`) - Keep walk-forward folds `0` in continuous mode, run fold backtests in scheduled evaluation jobs + +## 8) Auto-Recommend Candidate + +To compare saved training reports and pick a deployment candidate automatically: + +```sh +python scripts/recommend_rain_model.py \ + --reports-glob "models/rain_model_report*.json" \ + --require-walk-forward \ + --top-k 5 \ + --json-out "models/rain_model_recommendation.json" +``` diff --git a/docs/rain_prediction.md b/docs/rain_prediction.md index f814246..bc0277a 100644 --- a/docs/rain_prediction.md +++ b/docs/rain_prediction.md @@ -43,6 +43,7 @@ pip install -r scripts/requirements.txt `predictions_rain_1h`. - `scripts/run_rain_ml_worker.py`: long-running worker for periodic training + prediction. - `scripts/check_rain_pipeline_health.py`: freshness/failure check for alerting. +- `scripts/recommend_rain_model.py`: rank saved training reports and recommend a deployment candidate. Feature-set options: - `baseline`: original 5 local observation features. @@ -181,6 +182,22 @@ python scripts/train_rain_model.py \ --model-card-out "models/model_card_{model_version}.md" ``` +### 3f) Walk-forward threshold policy (more temporally robust alert threshold) +```sh +python scripts/train_rain_model.py \ + --site "home" \ + --start "2026-02-01T00:00:00Z" \ + --end "2026-03-03T23:55:00Z" \ + --feature-set "extended" \ + --model-family "auto" \ + --forecast-model "ecmwf" \ + --threshold-policy "walk_forward" \ + --walk-forward-folds 4 \ + --model-version "rain-auto-v1-extended-wf-threshold" \ + --out "models/rain_model_auto.pkl" \ + --report-out "models/rain_model_report_auto.json" +``` + ### 4) Run inference and store prediction ```sh python scripts/predict_rain_model.py \ @@ -200,7 +217,7 @@ The `rainml` service in `docker-compose.yml` now runs: - periodic retraining (default every 24 hours) - periodic prediction writes (default every 10 minutes) - configurable tuning/calibration behavior (`RAIN_TUNE_HYPERPARAMETERS`, - `RAIN_MAX_HYPERPARAM_TRIALS`, `RAIN_CALIBRATION_METHODS`) + `RAIN_MAX_HYPERPARAM_TRIALS`, `RAIN_CALIBRATION_METHODS`, `RAIN_THRESHOLD_POLICY`) - graceful gap handling for temporary source outages (`RAIN_ALLOW_EMPTY_DATA=true`) - automatic rollback path for last-known-good model (`RAIN_MODEL_BACKUP_PATH`) - optional model-card output (`RAIN_MODEL_CARD_PATH`) @@ -222,6 +239,15 @@ docker compose logs -f rainml - Prediction rows: `predictions_rain_1h` (probability + threshold decision + realized outcome fields once available) +### 7) Recommend deploy candidate from saved reports +```sh +python scripts/recommend_rain_model.py \ + --reports-glob "models/rain_model_report*.json" \ + --require-walk-forward \ + --top-k 5 \ + --json-out "models/rain_model_recommendation.json" +``` + ## Model Features (v1 baseline) - `pressure_trend_1h` - `humidity` diff --git a/scripts/recommend_rain_model.py b/scripts/recommend_rain_model.py new file mode 100644 index 0000000..9f9029f --- /dev/null +++ b/scripts/recommend_rain_model.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import glob +import json +import math +import os +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any + + +@dataclass +class Candidate: + path: Path + model_version: str + feature_set: str + model_family: str + generated_at: str | None + test_precision: float | None + test_recall: float | None + test_pr_auc: float | None + test_roc_auc: float | None + test_brier: float | None + wf_precision: float | None + wf_recall: float | None + wf_pr_auc: float | None + wf_brier: float | None + score: float + eligible: bool + ineligible_reasons: list[str] + report: dict[str, Any] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Rank rain-model training reports and recommend a deploy candidate.") + parser.add_argument( + "--reports-glob", + default="models/rain_model_report*.json", + help="Glob for report JSON files.", + ) + parser.add_argument("--min-test-precision", type=float, default=0.65) + parser.add_argument("--min-test-recall", type=float, default=0.50) + parser.add_argument("--min-test-pr-auc", type=float, default=0.40) + parser.add_argument("--min-walk-forward-precision", type=float, default=0.30) + parser.add_argument("--min-walk-forward-recall", type=float, default=0.25) + parser.add_argument( + "--require-walk-forward", + action="store_true", + help="Require walk-forward summary metrics to be present and pass minimums.", + ) + parser.add_argument("--top-k", type=int, default=5) + parser.add_argument("--json-out", help="Optional output JSON path.") + return parser.parse_args() + + +def as_float(v: Any) -> float | None: + if v is None: + return None + try: + out = float(v) + except (TypeError, ValueError): + return None + if math.isnan(out): + return None + return out + + +def load_report(path: Path) -> dict[str, Any]: + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + +def naive_precision_baseline(report: dict[str, Any]) -> float | None: + baselines = report.get("naive_baselines_test") or {} + out: float | None = None + for baseline in baselines.values(): + metrics = baseline.get("metrics", {}) + precision = as_float(metrics.get("precision")) + if precision is None: + continue + if out is None or precision > out: + out = precision + return out + + +def score_candidate( + report: dict[str, Any], + min_test_precision: float, + min_test_recall: float, + min_test_pr_auc: float, + min_wf_precision: float, + min_wf_recall: float, + require_walk_forward: bool, +) -> tuple[float, bool, list[str], dict[str, float | None]]: + test = report.get("test_metrics") or {} + wf_summary = (report.get("walk_forward_backtest") or {}).get("summary") or {} + + test_precision = as_float(test.get("precision")) + test_recall = as_float(test.get("recall")) + test_pr_auc = as_float(test.get("pr_auc")) + test_roc_auc = as_float(test.get("roc_auc")) + test_brier = as_float(test.get("brier")) + + wf_precision = as_float(wf_summary.get("mean_precision")) + wf_recall = as_float(wf_summary.get("mean_recall")) + wf_pr_auc = as_float(wf_summary.get("mean_pr_auc")) + wf_brier = as_float(wf_summary.get("mean_brier")) + + metrics = { + "test_precision": test_precision, + "test_recall": test_recall, + "test_pr_auc": test_pr_auc, + "test_roc_auc": test_roc_auc, + "test_brier": test_brier, + "wf_precision": wf_precision, + "wf_recall": wf_recall, + "wf_pr_auc": wf_pr_auc, + "wf_brier": wf_brier, + } + + reasons: list[str] = [] + if test_precision is None or test_precision < min_test_precision: + reasons.append(f"test_precision<{min_test_precision:.2f}") + if test_recall is None or test_recall < min_test_recall: + reasons.append(f"test_recall<{min_test_recall:.2f}") + if test_pr_auc is None or test_pr_auc < min_test_pr_auc: + reasons.append(f"test_pr_auc<{min_test_pr_auc:.2f}") + + if require_walk_forward and (wf_precision is None or wf_recall is None): + reasons.append("walk_forward_missing") + if wf_precision is not None and wf_precision < min_wf_precision: + reasons.append(f"wf_precision<{min_wf_precision:.2f}") + if wf_recall is not None and wf_recall < min_wf_recall: + reasons.append(f"wf_recall<{min_wf_recall:.2f}") + + eligible = len(reasons) == 0 + + # Weighted utility score with stability penalty. + score = 0.0 + if test_precision is not None: + score += 3.0 * test_precision + if test_recall is not None: + score += 2.5 * test_recall + if test_pr_auc is not None: + score += 2.5 * test_pr_auc + if test_roc_auc is not None: + score += 1.0 * test_roc_auc + if test_brier is not None: + score += 1.5 * (1.0 - min(max(test_brier, 0.0), 1.0)) + + if wf_precision is not None: + score += 2.0 * wf_precision + else: + score -= 0.25 + if wf_recall is not None: + score += 1.5 * wf_recall + if wf_pr_auc is not None: + score += 1.0 * wf_pr_auc + if wf_brier is not None: + score += 1.0 * (1.0 - min(max(wf_brier, 0.0), 1.0)) + + if test_precision is not None and wf_precision is not None: + score -= 1.5 * abs(test_precision - wf_precision) + if test_recall is not None and wf_recall is not None: + score -= 1.0 * abs(test_recall - wf_recall) + + best_naive_precision = naive_precision_baseline(report) + if best_naive_precision is not None and test_precision is not None: + gap = test_precision - best_naive_precision + score += 0.5 * gap + + return score, eligible, reasons, metrics + + +def parse_generated_at(value: str | None) -> datetime: + if not value: + return datetime.min + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError: + return datetime.min + + +def build_candidate(path: Path, report: dict[str, Any], args: argparse.Namespace) -> Candidate: + score, eligible, reasons, metrics = score_candidate( + report=report, + min_test_precision=args.min_test_precision, + min_test_recall=args.min_test_recall, + min_test_pr_auc=args.min_test_pr_auc, + min_wf_precision=args.min_walk_forward_precision, + min_wf_recall=args.min_walk_forward_recall, + require_walk_forward=args.require_walk_forward, + ) + return Candidate( + path=path, + model_version=str(report.get("model_version") or "unknown"), + feature_set=str(report.get("feature_set") or "unknown"), + model_family=str(report.get("model_family") or "unknown"), + generated_at=report.get("generated_at"), + test_precision=metrics["test_precision"], + test_recall=metrics["test_recall"], + test_pr_auc=metrics["test_pr_auc"], + test_roc_auc=metrics["test_roc_auc"], + test_brier=metrics["test_brier"], + wf_precision=metrics["wf_precision"], + wf_recall=metrics["wf_recall"], + wf_pr_auc=metrics["wf_pr_auc"], + wf_brier=metrics["wf_brier"], + score=score, + eligible=eligible, + ineligible_reasons=reasons, + report=report, + ) + + +def main() -> int: + args = parse_args() + paths = sorted(Path(p) for p in glob.glob(args.reports_glob)) + if not paths: + print(f"No report files matched: {args.reports_glob}") + return 1 + + candidates: list[Candidate] = [] + for path in paths: + try: + report = load_report(path) + except Exception as exc: + print(f"skip {path}: {exc}") + continue + candidates.append(build_candidate(path=path, report=report, args=args)) + + if not candidates: + print("No valid reports loaded.") + return 1 + + candidates.sort( + key=lambda c: ( + 1 if c.eligible else 0, + c.score, + parse_generated_at(c.generated_at), + ), + reverse=True, + ) + + print(f"Scanned {len(candidates)} report(s). Top {min(args.top_k, len(candidates))}:") + for idx, c in enumerate(candidates[: args.top_k], start=1): + wf_part = ( + f"wf_prec={c.wf_precision:.3f} wf_rec={c.wf_recall:.3f}" + if c.wf_precision is not None and c.wf_recall is not None + else "wf=n/a" + ) + gate_part = "eligible" if c.eligible else f"ineligible({','.join(c.ineligible_reasons)})" + print( + f"{idx}. {gate_part} score={c.score:.3f} " + f"version={c.model_version} feature_set={c.feature_set} family={c.model_family} " + f"test_prec={c.test_precision if c.test_precision is not None else 'n/a'} " + f"test_rec={c.test_recall if c.test_recall is not None else 'n/a'} " + f"test_pr_auc={c.test_pr_auc if c.test_pr_auc is not None else 'n/a'} " + f"{wf_part} " + f"path={c.path}" + ) + + recommendation = next((c for c in candidates if c.eligible), candidates[0]) + print("") + print("Recommended candidate:") + print(f" model_version={recommendation.model_version}") + print(f" feature_set={recommendation.feature_set}") + print(f" model_family={recommendation.model_family}") + print(f" report_path={recommendation.path}") + print(f" score={recommendation.score:.3f}") + if not recommendation.eligible: + print(f" note=no fully eligible report; selected highest score with reasons={recommendation.ineligible_reasons}") + + if args.json_out: + payload = { + "generated_at": datetime.utcnow().isoformat() + "Z", + "reports_glob": args.reports_glob, + "recommendation": { + "model_version": recommendation.model_version, + "feature_set": recommendation.feature_set, + "model_family": recommendation.model_family, + "report_path": str(recommendation.path), + "score": recommendation.score, + "eligible": recommendation.eligible, + "ineligible_reasons": recommendation.ineligible_reasons, + }, + "ranked": [ + { + "model_version": c.model_version, + "feature_set": c.feature_set, + "model_family": c.model_family, + "report_path": str(c.path), + "generated_at": c.generated_at, + "score": c.score, + "eligible": c.eligible, + "ineligible_reasons": c.ineligible_reasons, + "test_precision": c.test_precision, + "test_recall": c.test_recall, + "test_pr_auc": c.test_pr_auc, + "test_roc_auc": c.test_roc_auc, + "test_brier": c.test_brier, + "wf_precision": c.wf_precision, + "wf_recall": c.wf_recall, + "wf_pr_auc": c.wf_pr_auc, + "wf_brier": c.wf_brier, + } + for c in candidates + ], + } + out_dir = os.path.dirname(args.json_out) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + with open(args.json_out, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2) + print(f"Saved recommendation JSON to {args.json_out}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_rain_ml_worker.py b/scripts/run_rain_ml_worker.py index 84b5051..20a0494 100644 --- a/scripts/run_rain_ml_worker.py +++ b/scripts/run_rain_ml_worker.py @@ -54,6 +54,7 @@ class WorkerConfig: tune_hyperparameters: bool max_hyperparam_trials: int calibration_methods: str + threshold_policy: str walk_forward_folds: int allow_empty_data: bool dataset_path_template: str @@ -194,6 +195,8 @@ def run_training_cycle(cfg: WorkerConfig, env: dict[str, str]) -> None: str(cfg.max_hyperparam_trials), "--calibration-methods", cfg.calibration_methods, + "--threshold-policy", + cfg.threshold_policy, "--walk-forward-folds", str(cfg.walk_forward_folds), "--feature-set", @@ -300,6 +303,7 @@ def load_config() -> WorkerConfig: tune_hyperparameters=read_env_bool("RAIN_TUNE_HYPERPARAMETERS", False), max_hyperparam_trials=read_env_int("RAIN_MAX_HYPERPARAM_TRIALS", 12), calibration_methods=read_env("RAIN_CALIBRATION_METHODS", "none,sigmoid,isotonic"), + threshold_policy=read_env("RAIN_THRESHOLD_POLICY", "validation"), walk_forward_folds=read_env_int("RAIN_WALK_FORWARD_FOLDS", 0), allow_empty_data=read_env_bool("RAIN_ALLOW_EMPTY_DATA", True), dataset_path_template=read_env( @@ -341,6 +345,7 @@ def main() -> int: f"train_interval_hours={cfg.train_interval_hours} " f"predict_interval_minutes={cfg.predict_interval_minutes} " f"tune_hyperparameters={cfg.tune_hyperparameters} " + f"threshold_policy={cfg.threshold_policy} " f"walk_forward_folds={cfg.walk_forward_folds} " f"allow_empty_data={cfg.allow_empty_data} " f"model_backup_path={cfg.model_backup_path}", diff --git a/scripts/train_rain_model.py b/scripts/train_rain_model.py index 66482d3..96e2d53 100644 --- a/scripts/train_rain_model.py +++ b/scripts/train_rain_model.py @@ -45,6 +45,7 @@ except ImportError: # pragma: no cover - optional dependency MODEL_FAMILIES = ("logreg", "hist_gb", "auto") CALIBRATION_METHODS = ("none", "sigmoid", "isotonic") +THRESHOLD_POLICIES = ("validation", "walk_forward") def parse_args() -> argparse.Namespace: @@ -62,6 +63,12 @@ def parse_args() -> argparse.Namespace: help="Minimum validation precision for threshold selection.", ) parser.add_argument("--threshold", type=float, help="Optional fixed classification threshold.") + parser.add_argument( + "--threshold-policy", + default="validation", + choices=THRESHOLD_POLICIES, + help="How to choose operating threshold when --threshold is not set.", + ) parser.add_argument("--min-rows", type=int, default=200, help="Minimum model-ready rows required.") parser.set_defaults(allow_empty=True) parser.add_argument( @@ -575,6 +582,127 @@ def evaluate_sliced_performance( return out +def tune_threshold_walk_forward( + model_df, + feature_cols: list[str], + model_family: str, + model_params: dict[str, Any], + calibration_method: str, + random_state: int, + min_precision: float, + folds: int, +) -> dict[str, Any]: + if folds <= 0: + return { + "enabled": False, + "status": "disabled", + "reason": "walk_forward_folds <= 0", + } + + n = len(model_df) + min_train_rows = max(200, int(0.4 * n)) + remaining = n - min_train_rows + if remaining < 50: + return { + "enabled": True, + "status": "insufficient_data", + "reason": "not enough rows for walk-forward threshold tuning", + "requested_folds": folds, + "min_train_rows": min_train_rows, + } + + fold_size = max(25, remaining // folds) + fold_details: list[dict[str, Any]] = [] + y_true_chunks: list[np.ndarray] = [] + y_prob_chunks: list[np.ndarray] = [] + + for idx in range(folds): + train_end = min_train_rows + idx * fold_size + test_end = n if idx == folds - 1 else min(min_train_rows + (idx + 1) * fold_size, n) + if train_end >= test_end: + continue + + fold_train = model_df.iloc[:train_end] + fold_test = model_df.iloc[train_end:test_end] + if len(fold_train) < 160 or len(fold_test) < 25: + continue + + y_fold_train = fold_train["rain_next_1h"].astype(int).to_numpy() + y_fold_test = fold_test["rain_next_1h"].astype(int).to_numpy() + if len(np.unique(y_fold_train)) < 2: + continue + + try: + fold_model, fold_fit = fit_with_optional_calibration( + model_family=model_family, + model_params=model_params, + random_state=random_state, + x_train=fold_train[feature_cols], + y_train=y_fold_train, + calibration_method=calibration_method, + fallback_to_none=True, + ) + fold_test_prob = fold_model.predict_proba(fold_test[feature_cols])[:, 1] + + y_true_chunks.append(y_fold_test) + y_prob_chunks.append(fold_test_prob) + fold_details.append( + { + "fold_index": idx + 1, + "train_rows": len(fold_train), + "test_rows": len(fold_test), + "train_start": fold_train.index.min(), + "train_end": fold_train.index.max(), + "test_start": fold_test.index.min(), + "test_end": fold_test.index.max(), + "fit": fold_fit, + "test_positive_rate": float(np.mean(y_fold_test)), + } + ) + except Exception as exc: + fold_details.append( + { + "fold_index": idx + 1, + "error": str(exc), + } + ) + + if not y_true_chunks: + return { + "enabled": True, + "status": "failed", + "reason": "no successful folds produced out-of-fold predictions", + "requested_folds": folds, + "folds": fold_details, + } + + y_oof_true = np.concatenate(y_true_chunks) + y_oof_prob = np.concatenate(y_prob_chunks) + tuned_threshold, tuned_info = select_threshold( + y_true=y_oof_true, + y_prob=y_oof_prob, + min_precision=min_precision, + ) + tuned_info = dict(tuned_info) + tuned_info["selection_rule"] = f"walk_forward_{tuned_info['selection_rule']}" + + return { + "enabled": True, + "status": "ok", + "requested_folds": folds, + "successful_folds": int(len(y_true_chunks)), + "rows_used": int(len(y_oof_true)), + "threshold": float(tuned_threshold), + "threshold_selection": tuned_info, + "oof_metrics_at_threshold": evaluate_probs( + y_true=y_oof_true, + y_prob=y_oof_prob, + threshold=tuned_threshold, + ), + "folds": fold_details, + } + + def walk_forward_backtest( model_df, feature_cols: list[str], @@ -935,7 +1063,32 @@ def main() -> int: selected_model_params = best_candidate["model_params"] selected_calibration_method = str(best_candidate["calibration_method"]) chosen_threshold = float(best_candidate["threshold"]) - threshold_info = best_candidate["threshold_info"] + threshold_info = dict(best_candidate["threshold_info"]) + threshold_policy_applied = "fixed" if args.threshold is not None else "validation" + threshold_tuning_walk_forward = { + "enabled": args.threshold_policy == "walk_forward", + "status": "not_run", + } + if args.threshold is None and args.threshold_policy == "walk_forward": + threshold_tuning_walk_forward = tune_threshold_walk_forward( + model_df=model_df.iloc[: len(train_df) + len(val_df)], + feature_cols=feature_cols, + model_family=selected_model_family, + model_params=selected_model_params, + calibration_method=selected_calibration_method, + random_state=args.random_state, + min_precision=args.min_precision, + folds=args.walk_forward_folds, + ) + if threshold_tuning_walk_forward.get("status") == "ok": + chosen_threshold = float(threshold_tuning_walk_forward["threshold"]) + threshold_info = dict(threshold_tuning_walk_forward["threshold_selection"]) + threshold_policy_applied = "walk_forward" + else: + threshold_info["warning"] = ( + "walk-forward threshold tuning unavailable; fell back to validation-selected threshold" + ) + threshold_policy_applied = "validation_fallback" val_metrics = best_candidate["validation_metrics"] train_val_df = model_df.iloc[: len(train_df) + len(val_df)] @@ -971,7 +1124,7 @@ def main() -> int: calibration_method=selected_calibration_method, random_state=args.random_state, min_precision=args.min_precision, - fixed_threshold=args.threshold, + fixed_threshold=chosen_threshold if threshold_policy_applied == "walk_forward" else args.threshold, folds=args.walk_forward_folds, ) @@ -989,6 +1142,8 @@ def main() -> int: "calibration_method_requested": calibration_methods, "calibration_method": selected_calibration_method, "calibration_fit": final_fit_info, + "threshold_policy_requested": args.threshold_policy, + "threshold_policy_applied": threshold_policy_applied, "data_window": { "requested_start": start or None, "requested_end": end or None, @@ -1043,6 +1198,7 @@ def main() -> int: "test_calibration_quality": test_calibration, "naive_baselines_test": naive_baselines_test, "sliced_performance_test": sliced_performance, + "threshold_tuning_walk_forward": threshold_tuning_walk_forward, "walk_forward_backtest": walk_forward, } report = to_builtin(report) @@ -1053,6 +1209,10 @@ def main() -> int: print(f" model_family: {selected_model_family} (requested={args.model_family})") print(f" model_params: {selected_model_params}") print(f" calibration_method: {report['calibration_method']}") + print( + f" threshold_policy: requested={report['threshold_policy_requested']} " + f"applied={report['threshold_policy_applied']}" + ) print(f" feature_set: {args.feature_set} ({len(feature_cols)} features)") print( " rows: " diff --git a/todo.md b/todo.md index ee4d371..894bf38 100644 --- a/todo.md +++ b/todo.md @@ -53,5 +53,5 @@ Priority key: `P0` = critical/blocking, `P1` = important, `P2` = later optimizat ## 8) Immediate Next Steps (This Week) - [x] [P0] Run first full data audit and label-quality checks. (completed on runtime machine) - [x] [P0] Train baseline model on full available history and capture metrics. (completed on runtime machine) -- [ ] [P1] Add one expanded feature set and rerun evaluation. (feature-set plumbing implemented; rerun pending on runtime machine) +- [x] [P1] Add one expanded feature set and rerun evaluation. (completed on runtime machine 2026-03-12 with `feature_set=extended`, `model_version=rain-auto-v1-extended-202603120932`) - [x] [P0] Decide v1 threshold and define deployment interface.