bugfixes
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
@@ -7,8 +10,10 @@ from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import TimeSeriesSplit
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
@@ -38,6 +43,7 @@ except ImportError: # pragma: no cover - optional dependency
|
||||
|
||||
|
||||
MODEL_FAMILIES = ("logreg", "hist_gb", "auto")
|
||||
CALIBRATION_METHODS = ("none", "sigmoid", "isotonic")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
@@ -56,6 +62,19 @@ def parse_args() -> argparse.Namespace:
|
||||
)
|
||||
parser.add_argument("--threshold", type=float, help="Optional fixed classification threshold.")
|
||||
parser.add_argument("--min-rows", type=int, default=200, help="Minimum model-ready rows required.")
|
||||
parser.set_defaults(allow_empty=True)
|
||||
parser.add_argument(
|
||||
"--allow-empty",
|
||||
dest="allow_empty",
|
||||
action="store_true",
|
||||
help="Exit successfully when source/model-ready rows are temporarily unavailable (default: enabled).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--strict-source-data",
|
||||
dest="allow_empty",
|
||||
action="store_false",
|
||||
help="Fail when source/model-ready rows are unavailable.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--feature-set",
|
||||
default="baseline",
|
||||
@@ -76,6 +95,28 @@ def parse_args() -> argparse.Namespace:
|
||||
"'auto' compares logreg and hist_gb on validation and selects best by PR-AUC/ROC-AUC/F1."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tune-hyperparameters",
|
||||
action="store_true",
|
||||
help="Run a lightweight validation-only hyperparameter search before final training.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-hyperparam-trials",
|
||||
type=int,
|
||||
default=12,
|
||||
help="Maximum hyperparameter trials per model family when tuning is enabled.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--calibration-methods",
|
||||
default="none,sigmoid,isotonic",
|
||||
help="Comma-separated methods from: none,sigmoid,isotonic. Best method selected by validation Brier/ECE.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--walk-forward-folds",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Number of temporal folds for walk-forward backtest (0 to disable).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-state",
|
||||
type=int,
|
||||
@@ -88,6 +129,11 @@ def parse_args() -> argparse.Namespace:
|
||||
default="models/rain_model_report.json",
|
||||
help="Path to save JSON training report.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-card-out",
|
||||
default="models/model_card_{model_version}.md",
|
||||
help="Path (or template) for markdown model card. Supports {model_version}. Use empty string to disable.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-out",
|
||||
default="models/datasets/rain_dataset_{model_version}_{feature_set}.csv",
|
||||
@@ -104,25 +150,556 @@ def parse_args() -> argparse.Namespace:
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def make_model(model_family: str, random_state: int):
|
||||
def parse_calibration_methods(value: str) -> list[str]:
    """Parse a comma-separated calibration-method string into a deduplicated list.

    Tokens are stripped and lower-cased; empty tokens are ignored.  Raises
    ValueError for any token not in CALIBRATION_METHODS.  Returns ["none"]
    when no valid tokens are present.
    """
    seen: list[str] = []
    for raw in (value or "").split(","):
        name = raw.strip().lower()
        if not name:
            continue
        if name not in CALIBRATION_METHODS:
            raise ValueError(f"unknown calibration method: {name}")
        if name not in seen:
            seen.append(name)
    return seen if seen else ["none"]
|
||||
|
||||
|
||||
def default_model_params(model_family: str) -> dict[str, Any]:
    """Return the default hyperparameters for a supported model family.

    Raises ValueError when *model_family* is not recognized.
    """
    defaults: dict[str, dict[str, Any]] = {
        "logreg": {"c": 1.0},
        "hist_gb": {
            "max_iter": 300,
            "learning_rate": 0.05,
            "max_depth": 5,
            "min_samples_leaf": 20,
        },
    }
    try:
        return defaults[model_family]
    except KeyError:
        raise ValueError(f"unknown model_family: {model_family}") from None
|
||||
|
||||
|
||||
def model_param_grid(model_family: str) -> list[dict[str, Any]]:
    """Enumerate the hyperparameter search grid for a model family.

    Raises ValueError when *model_family* is not recognized.
    """
    if model_family == "logreg":
        return [{"c": c} for c in (0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0)]

    if model_family == "hist_gb":
        combos = itertools.product(
            [0.03, 0.05, 0.08],  # learning_rate
            [3, 5],              # max_depth
            [200, 300],          # max_iter
            [10, 20],            # min_samples_leaf
        )
        return [
            {
                "learning_rate": lr,
                "max_depth": depth,
                "max_iter": iters,
                "min_samples_leaf": leaf,
            }
            for lr, depth, iters, leaf in combos
        ]

    raise ValueError(f"unknown model_family: {model_family}")
|
||||
|
||||
|
||||
def limit_trials(grid: list[dict[str, Any]], max_trials: int) -> list[dict[str, Any]]:
    """Downsample *grid* to at most *max_trials* entries by even striding.

    A non-positive *max_trials* disables the limit; a grid already small
    enough is returned unchanged.
    """
    size = len(grid)
    if max_trials <= 0 or size <= max_trials:
        return grid
    stride = max(1, size // max_trials)
    sampled = grid[::stride]
    return sampled[:max_trials]
|
||||
|
||||
|
||||
def make_model(model_family: str, random_state: int, params: dict[str, Any] | None = None):
    """Construct an unfitted estimator for *model_family*.

    Starts from default_model_params(model_family) and overlays *params*.
    "logreg" is wrapped in a scaling Pipeline (logistic regression is
    scale-sensitive); "hist_gb" needs no scaling.  Raises ValueError for an
    unknown family.

    Fix: the previous text contained merged diff residue — the Pipeline held
    two "clf" steps and the HistGradientBoostingClassifier call repeated the
    max_iter/learning_rate/max_depth/min_samples_leaf keywords, which is a
    SyntaxError.  Only the parameterized versions are kept.
    """
    model_params = default_model_params(model_family)
    if params:
        model_params.update(params)

    if model_family == "logreg":
        return Pipeline(
            [
                ("scaler", StandardScaler()),
                (
                    "clf",
                    LogisticRegression(
                        C=float(model_params["c"]),
                        max_iter=1000,
                        class_weight="balanced",
                        random_state=random_state,
                    ),
                ),
            ]
        )
    if model_family == "hist_gb":
        return HistGradientBoostingClassifier(
            max_iter=int(model_params["max_iter"]),
            learning_rate=float(model_params["learning_rate"]),
            max_depth=int(model_params["max_depth"]),
            min_samples_leaf=int(model_params["min_samples_leaf"]),
            random_state=random_state,
        )
    raise ValueError(f"unknown model_family: {model_family}")
|
||||
|
||||
|
||||
def threshold_from_probs(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    min_precision: float,
    fixed_threshold: float | None,
) -> tuple[float, dict[str, Any]]:
    """Resolve the operating threshold and describe how it was chosen.

    A caller-supplied *fixed_threshold* is honored verbatim; otherwise
    select_threshold searches one subject to the *min_precision* constraint.
    Returns (threshold, selection_info).
    """
    if fixed_threshold is None:
        return select_threshold(y_true=y_true, y_prob=y_prob, min_precision=min_precision)
    info = {
        "selection_rule": "fixed_cli_threshold",
        "threshold": float(fixed_threshold),
    }
    return float(fixed_threshold), info
|
||||
|
||||
|
||||
def metric_key(metrics: dict[str, Any]) -> tuple[float, float, float]:
    """Sort key ranking candidates by PR-AUC, then ROC-AUC, then F1."""
    pr_auc = safe_pr_auc(metrics)
    roc_auc = safe_roc_auc(metrics)
    f1 = float(metrics["f1"])
    return pr_auc, roc_auc, f1
|
||||
|
||||
|
||||
def expected_calibration_error(y_true: np.ndarray, y_prob: np.ndarray, bins: int = 10) -> float:
    """Expected calibration error over equal-width probability bins.

    ECE is the bucket-weight-averaged |empirical frequency - mean predicted
    probability|.  The last bin is closed on the right so probabilities of
    exactly 1.0 are counted.  Returns NaN for empty input.
    """
    n = len(y_true)
    if n == 0:
        return float("nan")

    edges = np.linspace(0.0, 1.0, bins + 1)
    error = 0.0
    for b in range(bins):
        lo, hi = edges[b], edges[b + 1]
        if b == bins - 1:
            in_bin = (y_prob >= lo) & (y_prob <= hi)
        else:
            in_bin = (y_prob >= lo) & (y_prob < hi)
        count = int(np.sum(in_bin))
        if count == 0:
            continue
        confidence = float(np.mean(y_prob[in_bin]))
        frequency = float(np.mean(y_true[in_bin]))
        error += (count / n) * abs(frequency - confidence)
    return float(error)
|
||||
|
||||
|
||||
def calibration_cv_splits(rows: int) -> int:
    """Pick a CV fold count for probability calibration from training size.

    Fewer than 140 rows yields 0, signalling calibration is infeasible.
    """
    ladder = ((800, 5), (400, 4), (240, 3), (140, 2))
    for min_rows, splits in ladder:
        if rows >= min_rows:
            return splits
    return 0
|
||||
|
||||
|
||||
def fit_with_optional_calibration(
    model_family: str,
    model_params: dict[str, Any],
    random_state: int,
    x_train,
    y_train: np.ndarray,
    calibration_method: str,
    fallback_to_none: bool = True,
):
    """Fit a model, optionally wrapped in temporal-CV probability calibration.

    Returns (fitted_estimator, fit_info).  fit_info records requested vs
    effective calibration method and the CV split count; when calibration is
    skipped (too few rows) or fails and *fallback_to_none* is set, the plain
    estimator is fitted instead and a "warning" entry explains why.  With
    *fallback_to_none* False those situations raise instead.
    """
    estimator = make_model(model_family=model_family, random_state=random_state, params=model_params)

    def _fit_uncalibrated(requested: str, splits: int, warning: str | None):
        # Shared fallback path: fit the bare estimator and describe the outcome.
        estimator.fit(x_train, y_train)
        info: dict[str, Any] = {
            "requested_method": requested,
            "effective_method": "none",
            "cv_splits": splits,
        }
        if warning is not None:
            info["warning"] = warning
        return estimator, info

    if calibration_method == "none":
        return _fit_uncalibrated("none", 0, None)

    n_splits = calibration_cv_splits(len(y_train))
    if n_splits < 2:
        if not fallback_to_none:
            raise RuntimeError("not enough rows for calibration folds")
        return _fit_uncalibrated(
            calibration_method,
            n_splits,
            "insufficient rows for calibration folds; falling back to uncalibrated model",
        )

    try:
        wrapper = CalibratedClassifierCV(
            estimator=estimator,
            method=calibration_method,
            cv=TimeSeriesSplit(n_splits=n_splits),
        )
        wrapper.fit(x_train, y_train)
    except Exception as exc:
        if not fallback_to_none:
            raise
        return _fit_uncalibrated(
            calibration_method,
            n_splits,
            f"calibration failed ({exc}); falling back to uncalibrated model",
        )
    return wrapper, {
        "requested_method": calibration_method,
        "effective_method": calibration_method,
        "cv_splits": n_splits,
    }
|
||||
|
||||
|
||||
def hyperparameter_search(
    model_family: str,
    x_train,
    y_train: np.ndarray,
    x_val,
    y_val: np.ndarray,
    random_state: int,
    min_precision: float,
    fixed_threshold: float | None,
    enabled: bool,
    max_trials: int,
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Validation-only hyperparameter search for one model family.

    When *enabled* is False only the family defaults are evaluated.  Every
    candidate is fitted on the training split, thresholded and scored on the
    validation split, and the best trial by metric_key (PR-AUC, ROC-AUC, F1)
    supplies the winning params.  Returns (best_params, all_trials).
    """
    if not enabled:
        candidates = [default_model_params(model_family)]
    else:
        candidates = limit_trials(model_param_grid(model_family), max_trials=max_trials)

    def _run_trial(params: dict[str, Any]) -> dict[str, Any]:
        # One fit/score cycle for a single parameter combination.
        candidate = make_model(model_family=model_family, random_state=random_state, params=params)
        candidate.fit(x_train, y_train)
        probs = candidate.predict_proba(x_val)[:, 1]
        threshold, threshold_info = threshold_from_probs(
            y_true=y_val,
            y_prob=probs,
            min_precision=min_precision,
            fixed_threshold=fixed_threshold,
        )
        return {
            "model_family": model_family,
            "model_params": params,
            "threshold": float(threshold),
            "threshold_info": threshold_info,
            "validation_metrics": evaluate_probs(y_true=y_val, y_prob=probs, threshold=threshold),
        }

    trials = [_run_trial(params) for params in candidates]
    best = max(trials, key=lambda trial: metric_key(trial["validation_metrics"]))
    return best["model_params"], trials
|
||||
|
||||
|
||||
def calibration_selection_key(result: dict[str, Any]) -> tuple[float, float, float, float, float]:
    """Min-sort key for calibration candidates.

    Lower Brier, then lower 10-bin ECE wins; remaining ties break toward
    higher PR-AUC, ROC-AUC and F1 (negated so min() prefers larger values).
    """
    metrics = result["validation_metrics"]
    key = (
        float(metrics["brier"]),
        float(result["calibration_quality"]["ece_10"]),
        -safe_pr_auc(metrics),
        -safe_roc_auc(metrics),
        -float(metrics["f1"]),
    )
    return key
|
||||
|
||||
|
||||
def evaluate_calibration_methods(
    model_family: str,
    model_params: dict[str, Any],
    calibration_methods: list[str],
    x_train,
    y_train: np.ndarray,
    x_val,
    y_val: np.ndarray,
    random_state: int,
    min_precision: float,
    fixed_threshold: float | None,
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Fit one candidate per calibration method and pick the best on validation.

    Each method is fitted via fit_with_optional_calibration (with fallback to
    an uncalibrated model), thresholded and scored on the validation split,
    and annotated with a 10-bin ECE.  The winner minimizes
    calibration_selection_key.  Returns (selected_result, all_results).
    """
    evaluated: list[dict[str, Any]] = []
    for candidate_method in calibration_methods:
        model, fit_info = fit_with_optional_calibration(
            model_family=model_family,
            model_params=model_params,
            random_state=random_state,
            x_train=x_train,
            y_train=y_train,
            calibration_method=candidate_method,
            fallback_to_none=True,
        )
        probs = model.predict_proba(x_val)[:, 1]
        threshold, threshold_info = threshold_from_probs(
            y_true=y_val,
            y_prob=probs,
            min_precision=min_precision,
            fixed_threshold=fixed_threshold,
        )
        evaluated.append(
            {
                "calibration_method": candidate_method,
                "fit": fit_info,
                "threshold": float(threshold),
                "threshold_info": threshold_info,
                "validation_metrics": evaluate_probs(y_true=y_val, y_prob=probs, threshold=threshold),
                "calibration_quality": {
                    "ece_10": expected_calibration_error(y_true=y_val, y_prob=probs, bins=10),
                },
            }
        )

    winner = min(evaluated, key=calibration_selection_key)
    return winner, evaluated
|
||||
|
||||
|
||||
def evaluate_naive_baselines(test_df, y_test: np.ndarray) -> dict[str, Any]:
    """Score simple non-ML reference predictors on the test split.

    Candidates (each included only when its inputs exist in *test_df*):
    persistence of the last observed hour, the raw forecast precipitation
    probability, and an OR rule over forecast probability/amount thresholds.
    """
    baselines: dict[str, Any] = {}

    def _clipped_fc_prob() -> np.ndarray:
        # Forecast probability with NaNs zeroed and values clamped to [0, 1];
        # zeros when the column is absent entirely.
        if "fc_precip_prob" in test_df.columns:
            return test_df["fc_precip_prob"].fillna(0.0).clip(lower=0.0, upper=1.0).to_numpy(dtype=float)
        return np.zeros(len(test_df), dtype=float)

    if "rain_last_1h_mm" in test_df.columns:
        last_hour = test_df["rain_last_1h_mm"].to_numpy(dtype=float)
        baselines["persistence_last_1h"] = {
            "rule": f"predict rain when rain_last_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
            "metrics": evaluate_probs(
                y_true=y_test,
                y_prob=(last_hour >= RAIN_EVENT_THRESHOLD_MM).astype(float),
                threshold=0.5,
            ),
        }

    has_fc_prob = "fc_precip_prob" in test_df.columns and test_df["fc_precip_prob"].notna().any()
    has_fc_mm = "fc_precip_mm" in test_df.columns and test_df["fc_precip_mm"].notna().any()

    if has_fc_prob:
        baselines["forecast_precip_prob"] = {
            "rule": "use fc_precip_prob directly as baseline probability",
            "metrics": evaluate_probs(y_true=y_test, y_prob=_clipped_fc_prob(), threshold=0.5),
        }

    if has_fc_prob or has_fc_mm:
        fc_prob = _clipped_fc_prob()
        if "fc_precip_mm" in test_df.columns:
            fc_mm = test_df["fc_precip_mm"].fillna(0.0).to_numpy(dtype=float)
        else:
            fc_mm = np.zeros(len(test_df), dtype=float)
        rule_hits = ((fc_prob >= 0.5) | (fc_mm >= RAIN_EVENT_THRESHOLD_MM)).astype(float)
        baselines["forecast_threshold_rule"] = {
            "rule": (
                "predict rain when (fc_precip_prob >= 0.50) "
                f"OR (fc_precip_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f})"
            ),
            "metrics": evaluate_probs(y_true=y_test, y_prob=rule_hits, threshold=0.5),
        }

    return baselines
|
||||
|
||||
|
||||
def walk_forward_backtest(
    model_df,
    feature_cols: list[str],
    model_family: str,
    model_params: dict[str, Any],
    calibration_method: str,
    random_state: int,
    min_precision: float,
    fixed_threshold: float | None,
    folds: int,
) -> dict[str, Any]:
    """Temporal walk-forward backtest of the selected model configuration.

    Trains on an expanding prefix of the time-ordered *model_df* and tests on
    the next contiguous slice, once per fold.  Within each fold the threshold
    is re-selected on an inner validation tail before the full fold-train
    refit, mimicking production threshold selection.  Returns a dict with
    ``enabled``, per-fold ``folds`` entries (metrics or per-fold error), and
    an aggregate ``summary`` (or an error summary when data is insufficient).
    """
    if folds <= 0:
        # Backtesting disabled via CLI.
        return {"enabled": False, "folds": [], "summary": None}

    n = len(model_df)
    # First fold always trains on at least 200 rows (or 40% of data, whichever
    # is larger) so early folds are not hopelessly undertrained.
    min_train_rows = max(200, int(0.4 * n))
    remaining = n - min_train_rows
    if remaining < 50:
        return {
            "enabled": True,
            "folds": [],
            "summary": {
                "error": "not enough rows for walk-forward folds",
                "requested_folds": folds,
                "min_train_rows": min_train_rows,
            },
        }

    fold_size = max(25, remaining // folds)
    results: list[dict[str, Any]] = []

    for idx in range(folds):
        train_end = min_train_rows + idx * fold_size
        # Last fold absorbs any remainder so all rows are evaluated.
        test_end = n if idx == folds - 1 else min(min_train_rows + (idx + 1) * fold_size, n)
        if train_end >= test_end:
            continue

        fold_train = model_df.iloc[:train_end]
        fold_test = model_df.iloc[train_end:test_end]
        if len(fold_train) < 160 or len(fold_test) < 25:
            continue

        y_fold_train = fold_train["rain_next_1h"].astype(int).to_numpy()
        y_fold_test = fold_test["rain_next_1h"].astype(int).to_numpy()
        # A classifier needs both classes in the training labels.
        if len(np.unique(y_fold_train)) < 2:
            continue

        # Carve an inner validation tail off the fold's training data purely
        # for threshold selection (the final fold model retrains on all of it).
        inner_val_rows = max(40, int(0.2 * len(fold_train)))
        if len(fold_train) - inner_val_rows < 80:
            continue
        inner_train = fold_train.iloc[:-inner_val_rows]
        inner_val = fold_train.iloc[-inner_val_rows:]
        y_inner_train = inner_train["rain_next_1h"].astype(int).to_numpy()
        y_inner_val = inner_val["rain_next_1h"].astype(int).to_numpy()
        if len(np.unique(y_inner_train)) < 2:
            continue

        try:
            # Fit on the inner-train slice to pick this fold's threshold.
            threshold_model, threshold_fit = fit_with_optional_calibration(
                model_family=model_family,
                model_params=model_params,
                random_state=random_state,
                x_train=inner_train[feature_cols],
                y_train=y_inner_train,
                calibration_method=calibration_method,
                fallback_to_none=True,
            )
            inner_val_prob = threshold_model.predict_proba(inner_val[feature_cols])[:, 1]
            fold_threshold, fold_threshold_info = threshold_from_probs(
                y_true=y_inner_val,
                y_prob=inner_val_prob,
                min_precision=min_precision,
                fixed_threshold=fixed_threshold,
            )

            # Refit on the entire fold-train window for the actual evaluation.
            fold_model, fold_fit = fit_with_optional_calibration(
                model_family=model_family,
                model_params=model_params,
                random_state=random_state,
                x_train=fold_train[feature_cols],
                y_train=y_fold_train,
                calibration_method=calibration_method,
                fallback_to_none=True,
            )
            fold_test_prob = fold_model.predict_proba(fold_test[feature_cols])[:, 1]
            fold_metrics = evaluate_probs(y_true=y_fold_test, y_prob=fold_test_prob, threshold=fold_threshold)
            fold_result = {
                "fold_index": idx + 1,
                "train_rows": len(fold_train),
                "test_rows": len(fold_test),
                "train_start": fold_train.index.min(),
                "train_end": fold_train.index.max(),
                "test_start": fold_test.index.min(),
                "test_end": fold_test.index.max(),
                "threshold": float(fold_threshold),
                "threshold_selection": fold_threshold_info,
                "threshold_fit": threshold_fit,
                "model_fit": fold_fit,
                "metrics": fold_metrics,
            }
            results.append(fold_result)
        except Exception as exc:
            # Record the failure for this fold and keep going; the summary
            # aggregates only folds that produced metrics.
            results.append(
                {
                    "fold_index": idx + 1,
                    "train_rows": len(fold_train),
                    "test_rows": len(fold_test),
                    "error": str(exc),
                }
            )

    # Successful folds are those whose dict carries a "metrics" key.
    good = [r for r in results if "metrics" in r]
    if not good:
        return {
            "enabled": True,
            "folds": results,
            "summary": {
                "error": "no successful fold evaluations",
                "requested_folds": folds,
            },
        }

    summary = {
        "successful_folds": len(good),
        "requested_folds": folds,
        "mean_precision": float(np.mean([f["metrics"]["precision"] for f in good])),
        "mean_recall": float(np.mean([f["metrics"]["recall"] for f in good])),
        "mean_f1": float(np.mean([f["metrics"]["f1"] for f in good])),
        "mean_brier": float(np.mean([f["metrics"]["brier"] for f in good])),
        # AUC means skip folds where the metric was unavailable (single-class
        # test slices); None when no fold produced it.
        "mean_pr_auc": float(np.mean([f["metrics"]["pr_auc"] for f in good if f["metrics"]["pr_auc"] is not None]))
        if any(f["metrics"]["pr_auc"] is not None for f in good)
        else None,
        "mean_roc_auc": float(np.mean([f["metrics"]["roc_auc"] for f in good if f["metrics"]["roc_auc"] is not None]))
        if any(f["metrics"]["roc_auc"] is not None for f in good)
        else None,
    }
    return {"enabled": True, "folds": results, "summary": summary}
|
||||
|
||||
|
||||
def write_model_card(path: str, report: dict[str, Any]) -> None:
    """Render *report* as a markdown model card and write it to *path*.

    Creates the parent directory when needed.  The card covers identity and
    configuration, the data window, the feature list, headline validation and
    test metrics, and known limitations.
    """
    header = [
        "# Rain Model Card",
        "",
        f"- Model version: `{report['model_version']}`",
        f"- Generated at (UTC): `{report['generated_at']}`",
        f"- Site: `{report['site']}`",
        f"- Target: `{report['target_definition']}`",
        f"- Feature set: `{report['feature_set']}`",
        f"- Model family: `{report['model_family']}`",
        f"- Model params: `{report['model_params']}`",
        f"- Calibration method: `{report['calibration_method']}`",
        f"- Operating threshold: `{report['threshold_selection']['threshold']:.3f}`",
        "",
        "## Data Window",
        "",
        f"- Requested start: `{report['data_window']['requested_start']}`",
        f"- Requested end: `{report['data_window']['requested_end']}`",
        f"- Actual start: `{report['data_window']['actual_start']}`",
        f"- Actual end: `{report['data_window']['actual_end']}`",
        f"- Rows: train `{report['split']['train_rows']}`, val `{report['split']['val_rows']}`, test `{report['split']['test_rows']}`",
        "",
        "## Features",
        "",
    ]
    feature_section = [f"- `{col}`" for col in report["feature_columns"]]
    tail = [
        "",
        "## Performance",
        "",
        "- Validation:",
        f"  precision `{report['validation_metrics']['precision']:.3f}`, "
        f"recall `{report['validation_metrics']['recall']:.3f}`, "
        f"PR-AUC `{report['validation_metrics']['pr_auc']}`, "
        f"ROC-AUC `{report['validation_metrics']['roc_auc']}`, "
        f"Brier `{report['validation_metrics']['brier']:.4f}`",
        "- Test:",
        f"  precision `{report['test_metrics']['precision']:.3f}`, "
        f"recall `{report['test_metrics']['recall']:.3f}`, "
        f"PR-AUC `{report['test_metrics']['pr_auc']}`, "
        f"ROC-AUC `{report['test_metrics']['roc_auc']}`, "
        f"Brier `{report['test_metrics']['brier']:.4f}`",
        "",
        "## Known Limitations",
        "",
        "- Sensor rain counter resets are clipped at zero increment; extreme spikes are flagged but not fully removed.",
        "- Forecast feature availability can vary by upstream model freshness.",
        "- Performance may drift seasonally and should be monitored with the drift queries in docs.",
        "",
    ]

    card_dir = os.path.dirname(path)
    if card_dir:
        os.makedirs(card_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(header + feature_section + tail))
|
||||
|
||||
|
||||
def train_candidate(
|
||||
model_family: str,
|
||||
x_train,
|
||||
@@ -132,30 +709,51 @@ def train_candidate(
|
||||
random_state: int,
|
||||
min_precision: float,
|
||||
fixed_threshold: float | None,
|
||||
tune_hyperparameters: bool,
|
||||
max_hyperparam_trials: int,
|
||||
calibration_methods: list[str],
|
||||
) -> dict[str, Any]:
|
||||
model = make_model(model_family=model_family, random_state=random_state)
|
||||
model.fit(x_train, y_train)
|
||||
y_val_prob = model.predict_proba(x_val)[:, 1]
|
||||
best_params, tuning_trials = hyperparameter_search(
|
||||
model_family=model_family,
|
||||
x_train=x_train,
|
||||
y_train=y_train,
|
||||
x_val=x_val,
|
||||
y_val=y_val,
|
||||
random_state=random_state,
|
||||
min_precision=min_precision,
|
||||
fixed_threshold=fixed_threshold,
|
||||
enabled=tune_hyperparameters,
|
||||
max_trials=max_hyperparam_trials,
|
||||
)
|
||||
|
||||
if fixed_threshold is not None:
|
||||
threshold = fixed_threshold
|
||||
threshold_info = {
|
||||
"selection_rule": "fixed_cli_threshold",
|
||||
"threshold": float(fixed_threshold),
|
||||
}
|
||||
else:
|
||||
threshold, threshold_info = select_threshold(
|
||||
y_true=y_val,
|
||||
y_prob=y_val_prob,
|
||||
min_precision=min_precision,
|
||||
)
|
||||
selected_calibration, calibration_trials = evaluate_calibration_methods(
|
||||
model_family=model_family,
|
||||
model_params=best_params,
|
||||
calibration_methods=calibration_methods,
|
||||
x_train=x_train,
|
||||
y_train=y_train,
|
||||
x_val=x_val,
|
||||
y_val=y_val,
|
||||
random_state=random_state,
|
||||
min_precision=min_precision,
|
||||
fixed_threshold=fixed_threshold,
|
||||
)
|
||||
|
||||
val_metrics = evaluate_probs(y_true=y_val, y_prob=y_val_prob, threshold=threshold)
|
||||
return {
|
||||
"model_family": model_family,
|
||||
"threshold": float(threshold),
|
||||
"threshold_info": threshold_info,
|
||||
"validation_metrics": val_metrics,
|
||||
"model_params": best_params,
|
||||
"hyperparameter_tuning": {
|
||||
"enabled": tune_hyperparameters,
|
||||
"trial_count": len(tuning_trials),
|
||||
"trials": tuning_trials,
|
||||
},
|
||||
"calibration_comparison": calibration_trials,
|
||||
"calibration_method": selected_calibration["calibration_method"],
|
||||
"calibration_fit": selected_calibration["fit"],
|
||||
"threshold": float(selected_calibration["threshold"]),
|
||||
"threshold_info": selected_calibration["threshold_info"],
|
||||
"validation_metrics": selected_calibration["validation_metrics"],
|
||||
"calibration_quality": selected_calibration["calibration_quality"],
|
||||
}
|
||||
|
||||
|
||||
@@ -168,6 +766,7 @@ def main() -> int:
|
||||
end = parse_time(args.end) if args.end else ""
|
||||
feature_cols = feature_columns_for_set(args.feature_set)
|
||||
needs_forecast = feature_columns_need_forecast(feature_cols)
|
||||
calibration_methods = parse_calibration_methods(args.calibration_methods)
|
||||
|
||||
with psycopg2.connect(args.db_url) as conn:
|
||||
ws90 = fetch_ws90(conn, args.site, start, end)
|
||||
@@ -176,10 +775,27 @@ def main() -> int:
|
||||
if needs_forecast:
|
||||
forecast = fetch_forecast(conn, args.site, start, end, model=args.forecast_model)
|
||||
|
||||
if ws90.empty:
|
||||
message = "no ws90 observations found in requested window"
|
||||
if args.allow_empty:
|
||||
print(f"Rain model training skipped: {message}.")
|
||||
return 0
|
||||
raise RuntimeError(message)
|
||||
if baro.empty:
|
||||
message = "no barometer observations found in requested window"
|
||||
if args.allow_empty:
|
||||
print(f"Rain model training skipped: {message}.")
|
||||
return 0
|
||||
raise RuntimeError(message)
|
||||
|
||||
full_df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
|
||||
model_df = model_frame(full_df, feature_cols, require_target=True)
|
||||
if len(model_df) < args.min_rows:
|
||||
raise RuntimeError(f"not enough model-ready rows after filtering (need >= {args.min_rows})")
|
||||
message = f"not enough model-ready rows after filtering (need >= {args.min_rows})"
|
||||
if args.allow_empty:
|
||||
print(f"Rain model training skipped: {message}.")
|
||||
return 0
|
||||
raise RuntimeError(message)
|
||||
|
||||
train_df, val_df, test_df = split_time_ordered(
|
||||
model_df,
|
||||
@@ -194,6 +810,11 @@ def main() -> int:
|
||||
x_test = test_df[feature_cols]
|
||||
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
|
||||
|
||||
if len(np.unique(y_train)) < 2:
|
||||
raise RuntimeError("training split does not contain both classes; cannot train classifier")
|
||||
if len(np.unique(y_val)) < 2:
|
||||
print("warning: validation split has one class; AUC metrics may be unavailable", flush=True)
|
||||
|
||||
candidate_families = ["logreg", "hist_gb"] if args.model_family == "auto" else [args.model_family]
|
||||
candidates = [
|
||||
train_candidate(
|
||||
@@ -205,18 +826,19 @@ def main() -> int:
|
||||
random_state=args.random_state,
|
||||
min_precision=args.min_precision,
|
||||
fixed_threshold=args.threshold,
|
||||
tune_hyperparameters=args.tune_hyperparameters,
|
||||
max_hyperparam_trials=args.max_hyperparam_trials,
|
||||
calibration_methods=calibration_methods,
|
||||
)
|
||||
for family in candidate_families
|
||||
]
|
||||
best_candidate = max(
|
||||
candidates,
|
||||
key=lambda c: (
|
||||
safe_pr_auc(c["validation_metrics"]),
|
||||
safe_roc_auc(c["validation_metrics"]),
|
||||
float(c["validation_metrics"]["f1"]),
|
||||
),
|
||||
key=lambda c: metric_key(c["validation_metrics"]),
|
||||
)
|
||||
selected_model_family = str(best_candidate["model_family"])
|
||||
selected_model_params = best_candidate["model_params"]
|
||||
selected_calibration_method = str(best_candidate["calibration_method"])
|
||||
chosen_threshold = float(best_candidate["threshold"])
|
||||
threshold_info = best_candidate["threshold_info"]
|
||||
val_metrics = best_candidate["validation_metrics"]
|
||||
@@ -225,10 +847,32 @@ def main() -> int:
|
||||
x_train_val = train_val_df[feature_cols]
|
||||
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
|
||||
|
||||
final_model = make_model(model_family=selected_model_family, random_state=args.random_state)
|
||||
final_model.fit(x_train_val, y_train_val)
|
||||
final_model, final_fit_info = fit_with_optional_calibration(
|
||||
model_family=selected_model_family,
|
||||
model_params=selected_model_params,
|
||||
random_state=args.random_state,
|
||||
x_train=x_train_val,
|
||||
y_train=y_train_val,
|
||||
calibration_method=selected_calibration_method,
|
||||
fallback_to_none=True,
|
||||
)
|
||||
y_test_prob = final_model.predict_proba(x_test)[:, 1]
|
||||
test_metrics = evaluate_probs(y_true=y_test, y_prob=y_test_prob, threshold=chosen_threshold)
|
||||
test_calibration = {
|
||||
"ece_10": expected_calibration_error(y_true=y_test, y_prob=y_test_prob, bins=10),
|
||||
}
|
||||
naive_baselines_test = evaluate_naive_baselines(test_df=test_df, y_test=y_test)
|
||||
walk_forward = walk_forward_backtest(
|
||||
model_df=model_df,
|
||||
feature_cols=feature_cols,
|
||||
model_family=selected_model_family,
|
||||
model_params=selected_model_params,
|
||||
calibration_method=selected_calibration_method,
|
||||
random_state=args.random_state,
|
||||
min_precision=args.min_precision,
|
||||
fixed_threshold=args.threshold,
|
||||
folds=args.walk_forward_folds,
|
||||
)
|
||||
|
||||
report = {
|
||||
"generated_at": datetime.now(timezone.utc).isoformat(),
|
||||
@@ -236,10 +880,14 @@ def main() -> int:
|
||||
"model_version": args.model_version,
|
||||
"model_family_requested": args.model_family,
|
||||
"model_family": selected_model_family,
|
||||
"model_params": selected_model_params,
|
||||
"feature_set": args.feature_set,
|
||||
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
"feature_columns": feature_cols,
|
||||
"forecast_model": args.forecast_model if needs_forecast else None,
|
||||
"calibration_method_requested": calibration_methods,
|
||||
"calibration_method": selected_calibration_method,
|
||||
"calibration_fit": final_fit_info,
|
||||
"data_window": {
|
||||
"requested_start": start or None,
|
||||
"requested_end": end or None,
|
||||
@@ -275,16 +923,25 @@ def main() -> int:
|
||||
"candidate_models": [
|
||||
{
|
||||
"model_family": c["model_family"],
|
||||
"model_params": c["model_params"],
|
||||
"hyperparameter_tuning": c["hyperparameter_tuning"],
|
||||
"calibration_method": c["calibration_method"],
|
||||
"calibration_fit": c["calibration_fit"],
|
||||
"calibration_comparison": c["calibration_comparison"],
|
||||
"threshold_selection": {
|
||||
**c["threshold_info"],
|
||||
"min_precision_constraint": args.min_precision,
|
||||
},
|
||||
"calibration_quality": c["calibration_quality"],
|
||||
"validation_metrics": c["validation_metrics"],
|
||||
}
|
||||
for c in candidates
|
||||
],
|
||||
"validation_metrics": val_metrics,
|
||||
"test_metrics": test_metrics,
|
||||
"test_calibration_quality": test_calibration,
|
||||
"naive_baselines_test": naive_baselines_test,
|
||||
"walk_forward_backtest": walk_forward,
|
||||
}
|
||||
report = to_builtin(report)
|
||||
|
||||
@@ -292,8 +949,16 @@ def main() -> int:
|
||||
print(f" site: {args.site}")
|
||||
print(f" model_version: {args.model_version}")
|
||||
print(f" model_family: {selected_model_family} (requested={args.model_family})")
|
||||
print(f" model_params: {selected_model_params}")
|
||||
print(f" calibration_method: {report['calibration_method']}")
|
||||
print(f" feature_set: {args.feature_set} ({len(feature_cols)} features)")
|
||||
print(f" rows: total={report['data_window']['model_rows']} train={report['split']['train_rows']} val={report['split']['val_rows']} test={report['split']['test_rows']}")
|
||||
print(
|
||||
" rows: "
|
||||
f"total={report['data_window']['model_rows']} "
|
||||
f"train={report['split']['train_rows']} "
|
||||
f"val={report['split']['val_rows']} "
|
||||
f"test={report['split']['test_rows']}"
|
||||
)
|
||||
print(
|
||||
" threshold: "
|
||||
f"{report['threshold_selection']['threshold']:.3f} "
|
||||
@@ -311,8 +976,32 @@ def main() -> int:
|
||||
f"precision={report['test_metrics']['precision']:.3f} "
|
||||
f"recall={report['test_metrics']['recall']:.3f} "
|
||||
f"roc_auc={report['test_metrics']['roc_auc'] if report['test_metrics']['roc_auc'] is not None else 'n/a'} "
|
||||
f"pr_auc={report['test_metrics']['pr_auc'] if report['test_metrics']['pr_auc'] is not None else 'n/a'}"
|
||||
f"pr_auc={report['test_metrics']['pr_auc'] if report['test_metrics']['pr_auc'] is not None else 'n/a'} "
|
||||
f"brier={report['test_metrics']['brier']:.4f} "
|
||||
f"ece10={report['test_calibration_quality']['ece_10']:.4f}"
|
||||
)
|
||||
if report["walk_forward_backtest"]["summary"] is not None:
|
||||
summary = report["walk_forward_backtest"]["summary"]
|
||||
if "error" in summary:
|
||||
print(f" walk-forward: {summary['error']}")
|
||||
else:
|
||||
print(
|
||||
" walk-forward: "
|
||||
f"folds={summary['successful_folds']}/{summary['requested_folds']} "
|
||||
f"mean_precision={summary['mean_precision']:.3f} "
|
||||
f"mean_recall={summary['mean_recall']:.3f} "
|
||||
f"mean_pr_auc={summary['mean_pr_auc'] if summary['mean_pr_auc'] is not None else 'n/a'}"
|
||||
)
|
||||
if report["naive_baselines_test"]:
|
||||
print(" naive baselines (test):")
|
||||
for name, baseline in report["naive_baselines_test"].items():
|
||||
m = baseline["metrics"]
|
||||
print(
|
||||
f" {name}: "
|
||||
f"precision={m['precision']:.3f} recall={m['recall']:.3f} "
|
||||
f"pr_auc={m['pr_auc'] if m['pr_auc'] is not None else 'n/a'} "
|
||||
f"brier={m['brier']:.4f}"
|
||||
)
|
||||
|
||||
if args.report_out:
|
||||
report_dir = os.path.dirname(args.report_out)
|
||||
@@ -322,6 +1011,11 @@ def main() -> int:
|
||||
json.dump(report, f, indent=2)
|
||||
print(f"Saved report to {args.report_out}")
|
||||
|
||||
if args.model_card_out:
|
||||
model_card_out = args.model_card_out.format(model_version=args.model_version)
|
||||
write_model_card(model_card_out, report)
|
||||
print(f"Saved model card to {model_card_out}")
|
||||
|
||||
if args.dataset_out:
|
||||
dataset_out = args.dataset_out.format(model_version=args.model_version, feature_set=args.feature_set)
|
||||
dataset_dir = os.path.dirname(dataset_out)
|
||||
@@ -341,6 +1035,9 @@ def main() -> int:
|
||||
artifact = {
|
||||
"model": final_model,
|
||||
"model_family": selected_model_family,
|
||||
"model_params": selected_model_params,
|
||||
"calibration_method": selected_calibration_method,
|
||||
"calibration_fit": final_fit_info,
|
||||
"features": feature_cols,
|
||||
"feature_set": args.feature_set,
|
||||
"forecast_model": args.forecast_model if needs_forecast else None,
|
||||
|
||||
Reference in New Issue
Block a user