bugfix wunderground reporting

2026-03-09 09:19:45 +11:00
parent 5b8cad905f
commit c796f1324e
12 changed files with 253 additions and 33 deletions
@@ -3,9 +3,11 @@ import argparse
 import json
 import os
 from datetime import datetime, timezone
+from typing import Any

 import numpy as np
 import psycopg2
+from sklearn.ensemble import HistGradientBoostingClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
@@ -22,6 +24,8 @@ from rain_model_common import (
    feature_columns_need_forecast,
    model_frame,
    parse_time,
+    safe_pr_auc,
+    safe_roc_auc,
    select_threshold,
    split_time_ordered,
    to_builtin,
@@ -33,6 +37,9 @@ except ImportError:  # pragma: no cover - optional dependency
    joblib = None


+MODEL_FAMILIES = ("logreg", "hist_gb", "auto")  # "auto" trains logreg and hist_gb, then keeps the best on validation
+
+
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
    parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
@@ -60,6 +67,21 @@ def parse_args() -> argparse.Namespace:
        default="ecmwf",
        help="Forecast model name when feature set requires forecast columns.",
    )
+    parser.add_argument(
+        "--model-family",
+        default="logreg",
+        choices=MODEL_FAMILIES,
+        help=(
+            "Estimator family. "
+            "'auto' compares logreg and hist_gb on validation and selects best by PR-AUC/ROC-AUC/F1."
+        ),
+    )
+    parser.add_argument(
+        "--random-state",
+        type=int,
+        default=42,
+        help="Random seed for stochastic estimators.",
+    )
    parser.add_argument("--out", default="models/rain_model.pkl", help="Path to save model.")
    parser.add_argument(
        "--report-out",
@@ -82,13 +104,59 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def make_model() -> Pipeline:
-    return Pipeline(
-        [
-            ("scaler", StandardScaler()),
-            ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
-        ]
-    )
+def make_model(model_family: str, random_state: int):
+    """Return an unfitted estimator for ``model_family``.
+
+    Raises ValueError for any family other than "logreg" or "hist_gb".
+    """
+    if model_family == "hist_gb":
+        return HistGradientBoostingClassifier(
+            max_iter=300,
+            learning_rate=0.05,
+            max_depth=5,
+            min_samples_leaf=20,
+            random_state=random_state,
+        )
+    if model_family != "logreg":
+        raise ValueError(f"unknown model_family: {model_family}")
+    classifier = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=random_state)
+    return Pipeline([("scaler", StandardScaler()), ("clf", classifier)])
+
+
+def train_candidate(
+    model_family: str,
+    x_train,
+    y_train: np.ndarray,
+    x_val,
+    y_val: np.ndarray,
+    random_state: int,
+    min_precision: float,
+    fixed_threshold: float | None,
+) -> dict[str, Any]:
+    """Fit one model family on the training split and score it on validation.
+
+    Uses ``fixed_threshold`` when given; otherwise the threshold comes from
+    ``select_threshold`` applied to the validation probabilities.
+    """
+    estimator = make_model(model_family=model_family, random_state=random_state)
+    estimator.fit(x_train, y_train)
+    val_prob = estimator.predict_proba(x_val)[:, 1]
+
+    if fixed_threshold is None:
+        cutoff, cutoff_info = select_threshold(
+            y_true=y_val, y_prob=val_prob, min_precision=min_precision,
+        )
+    else:
+        cutoff = fixed_threshold
+        cutoff_info = {"selection_rule": "fixed_cli_threshold", "threshold": float(fixed_threshold)}
+
+    val_metrics = evaluate_probs(y_true=y_val, y_prob=val_prob, threshold=cutoff)
+    return {
+        "model_family": model_family,
+        "threshold": float(cutoff),
+        "threshold_info": cutoff_info,
+        "validation_metrics": val_metrics,
+    }


 def main() -> int:
@@ -126,30 +194,38 @@ def main() -> int:
    x_test = test_df[feature_cols]
    y_test = test_df["rain_next_1h"].astype(int).to_numpy()

-    base_model = make_model()
-    base_model.fit(x_train, y_train)
-    y_val_prob = base_model.predict_proba(x_val)[:, 1]
-
-    if args.threshold is not None:
-        chosen_threshold = args.threshold
-        threshold_info = {
-            "selection_rule": "fixed_cli_threshold",
-            "threshold": float(args.threshold),
-        }
-    else:
-        chosen_threshold, threshold_info = select_threshold(
-            y_true=y_val,
-            y_prob=y_val_prob,
+    candidate_families = ["logreg", "hist_gb"] if args.model_family == "auto" else [args.model_family]
+    candidates = [
+        train_candidate(
+            model_family=family,
+            x_train=x_train,
+            y_train=y_train,
+            x_val=x_val,
+            y_val=y_val,
+            random_state=args.random_state,
            min_precision=args.min_precision,
+            fixed_threshold=args.threshold,
        )
-
-    val_metrics = evaluate_probs(y_true=y_val, y_prob=y_val_prob, threshold=chosen_threshold)
+        for family in candidate_families
+    ]
+    best_candidate = max(
+        candidates,
+        key=lambda c: (
+            safe_pr_auc(c["validation_metrics"]),
+            safe_roc_auc(c["validation_metrics"]),
+            float(c["validation_metrics"]["f1"]),
+        ),
+    )
+    selected_model_family = str(best_candidate["model_family"])
+    chosen_threshold = float(best_candidate["threshold"])
+    threshold_info = best_candidate["threshold_info"]
+    val_metrics = best_candidate["validation_metrics"]

    train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
    x_train_val = train_val_df[feature_cols]
    y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()

-    final_model = make_model()
+    final_model = make_model(model_family=selected_model_family, random_state=args.random_state)
    final_model.fit(x_train_val, y_train_val)
    y_test_prob = final_model.predict_proba(x_test)[:, 1]
    test_metrics = evaluate_probs(y_true=y_test, y_prob=y_test_prob, threshold=chosen_threshold)
@@ -158,6 +234,8 @@ def main() -> int:
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "site": args.site,
        "model_version": args.model_version,
+        "model_family_requested": args.model_family,
+        "model_family": selected_model_family,
        "feature_set": args.feature_set,
        "target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
        "feature_columns": feature_cols,
@@ -194,6 +272,17 @@ def main() -> int:
            **threshold_info,
            "min_precision_constraint": args.min_precision,
        },
+        "candidate_models": [
+            {
+                "model_family": c["model_family"],
+                "threshold_selection": {
+                    **c["threshold_info"],
+                    "min_precision_constraint": args.min_precision,
+                },
+                "validation_metrics": c["validation_metrics"],
+            }
+            for c in candidates
+        ],
        "validation_metrics": val_metrics,
        "test_metrics": test_metrics,
    }
@@ -202,6 +291,7 @@ def main() -> int:
    print("Rain model training summary:")
    print(f"  site: {args.site}")
    print(f"  model_version: {args.model_version}")
+    print(f"  model_family: {selected_model_family} (requested={args.model_family})")
    print(f"  feature_set: {args.feature_set} ({len(feature_cols)} features)")
    print(f"  rows: total={report['data_window']['model_rows']} train={report['split']['train_rows']} val={report['split']['val_rows']} test={report['split']['test_rows']}")
    print(
@@ -250,6 +340,7 @@ def main() -> int:
        else:
            artifact = {
                "model": final_model,
+                "model_family": selected_model_family,
                "features": feature_cols,
                "feature_set": args.feature_set,
                "forecast_model": args.forecast_model if needs_forecast else None,