feat: add rain data audit and prediction scripts

This commit is contained in:
2026-03-05 08:01:54 +11:00
parent 5bfa910495
commit 96e72d7c43
13 changed files with 1004 additions and 182 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

186
scripts/audit_rain_data.py Normal file
View File

@@ -0,0 +1,186 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import numpy as np
import psycopg2
from rain_model_common import (
FEATURE_COLUMNS,
RAIN_EVENT_THRESHOLD_MM,
build_dataset,
fetch_baro,
fetch_ws90,
model_frame,
parse_time,
to_builtin,
)
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the rain-data audit script."""
    cli = argparse.ArgumentParser(description="Audit weather time-series quality for rain model training.")
    cli.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
    cli.add_argument("--site", required=True, help="Site name (e.g. home).")
    cli.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
    cli.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
    cli.add_argument("--out", default="models/rain_data_audit.json", help="Path to save JSON audit report.")
    return cli.parse_args()
def longest_zero_run(counts: np.ndarray) -> int:
    """Return the length of the longest consecutive run of zeros in *counts*."""
    longest = 0
    run = 0
    for value in counts:
        if value != 0:
            run = 0
            continue
        run += 1
        if run > longest:
            longest = run
    return longest
def build_weekly_balance(model_df):
    """Summarize per-ISO-week row counts and positive-label rates.

    Returns a list of records (year_week, total_rows, positive_rows,
    positive_rate) sorted by year_week.
    """
    frame = model_df.copy()
    calendar = frame.index.to_series().dt.isocalendar()
    frame["year_week"] = calendar["year"].astype(str) + "-W" + calendar["week"].astype(str).str.zfill(2)
    summary = frame.groupby("year_week")["rain_next_1h"].agg(
        total_rows="count", positive_rows="sum"
    )
    summary = summary.reset_index().sort_values("year_week")
    summary["positive_rate"] = summary["positive_rows"] / summary["total_rows"]
    return summary.to_dict(orient="records")
def main() -> int:
    """Run the rain-data quality audit and emit a JSON report.

    Pulls WS90 and barometer observations for one site, rebuilds the model
    dataset, and reports duplicates, out-of-order arrivals, 5-minute coverage
    gaps, per-column missingness, rain-label sanity counts, and class balance.
    Prints a short summary and optionally writes the full report to --out.
    Returns 0 on success; raises SystemExit when no DB URL is configured.
    """
    args = parse_args()
    if not args.db_url:
        raise SystemExit("missing --db-url or DATABASE_URL")
    # Empty strings mean "no bound" in the fetch_* SQL filters.
    start = parse_time(args.start) if args.start else ""
    end = parse_time(args.end) if args.end else ""
    with psycopg2.connect(args.db_url) as conn:
        ws90 = fetch_ws90(conn, args.site, start, end)
        baro = fetch_baro(conn, args.site, start, end)
    df = build_dataset(ws90, baro, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
    model_df = model_frame(df, FEATURE_COLUMNS, require_target=True)
    # Duplicate (ts, key) pairs would double-count observations downstream.
    ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
    baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0
    # Count rows whose observation ts goes backwards when ordered by arrival
    # time (received_at) — a signal of late / out-of-order ingestion.
    ws90_out_of_order = 0
    if not ws90.empty:
        ws90_by_received = ws90.sort_values("received_at")
        ws90_out_of_order = int((ws90_by_received["ts"].diff().dropna() < np.timedelta64(0, "ns")).sum())
    baro_out_of_order = 0
    if not baro.empty:
        baro_by_received = baro.sort_values("received_at")
        baro_out_of_order = int((baro_by_received["ts"].diff().dropna() < np.timedelta64(0, "ns")).sum())
    # Bucketize on the 5-minute grid; empty buckets are coverage gaps, and the
    # longest run of empty buckets gives the worst gap in minutes.
    ws90_counts = ws90.set_index("ts").resample("5min").size() if not ws90.empty else np.array([])
    baro_counts = baro.set_index("ts").resample("5min").size() if not baro.empty else np.array([])
    ws90_gap_buckets = int((ws90_counts == 0).sum()) if len(ws90_counts) else 0
    baro_gap_buckets = int((baro_counts == 0).sum()) if len(baro_counts) else 0
    ws90_max_gap_min = longest_zero_run(np.array(ws90_counts)) * 5 if len(ws90_counts) else 0
    baro_max_gap_min = longest_zero_run(np.array(baro_counts)) * 5 if len(baro_counts) else 0
    # Fraction of NaN values per feature / label column after resampling.
    missingness = {}
    for col in FEATURE_COLUMNS + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
        if col in df.columns:
            missingness[col] = float(df[col].isna().mean())
    # Largest single 5-minute rain increment (None when all values are NaN/inf).
    max_rain_inc = None
    if "rain_inc" in df.columns and np.isfinite(df["rain_inc"].to_numpy(dtype=float)).any():
        max_rain_inc = float(np.nanmax(df["rain_inc"].to_numpy(dtype=float)))
    report = {
        "site": args.site,
        "target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
        "requested_window": {
            "start": start or None,
            "end": end or None,
        },
        "observed_window": {
            "ws90_start": ws90["ts"].min() if not ws90.empty else None,
            "ws90_end": ws90["ts"].max() if not ws90.empty else None,
            "baro_start": baro["ts"].min() if not baro.empty else None,
            "baro_end": baro["ts"].max() if not baro.empty else None,
            "model_start": model_df.index.min() if not model_df.empty else None,
            "model_end": model_df.index.max() if not model_df.empty else None,
        },
        "row_counts": {
            "ws90_rows": int(len(ws90)),
            "baro_rows": int(len(baro)),
            "model_rows": int(len(model_df)),
        },
        "duplicates": {
            "ws90_ts_station_duplicates": ws90_dupes,
            "baro_ts_source_duplicates": baro_dupes,
        },
        "out_of_order": {
            "ws90_by_received_count": ws90_out_of_order,
            "baro_by_received_count": baro_out_of_order,
        },
        "gaps_5m": {
            "ws90_empty_buckets": ws90_gap_buckets,
            "baro_empty_buckets": baro_gap_buckets,
            "ws90_max_gap_minutes": ws90_max_gap_min,
            "baro_max_gap_minutes": baro_max_gap_min,
        },
        "missingness_ratio": missingness,
        "label_quality": {
            "rain_reset_count": int(np.nansum(df["rain_reset"].fillna(False).to_numpy(dtype=int))),
            "rain_spike_5m_count": int(np.nansum(df["rain_spike_5m"].fillna(False).to_numpy(dtype=int))),
            "max_rain_increment_5m_mm": max_rain_inc,
        },
        "class_balance": {
            "overall_positive_rate": float(model_df["rain_next_1h"].mean()) if not model_df.empty else None,
            "weekly": build_weekly_balance(model_df) if not model_df.empty else [],
        },
    }
    # Convert numpy/pandas scalars to plain Python so json.dump succeeds.
    report = to_builtin(report)
    print("Rain data audit summary:")
    print(f" site: {report['site']}")
    print(
        " rows: "
        f"ws90={report['row_counts']['ws90_rows']} "
        f"baro={report['row_counts']['baro_rows']} "
        f"model={report['row_counts']['model_rows']}"
    )
    print(
        " duplicates: "
        f"ws90={report['duplicates']['ws90_ts_station_duplicates']} "
        f"baro={report['duplicates']['baro_ts_source_duplicates']}"
    )
    print(
        " rain label checks: "
        f"resets={report['label_quality']['rain_reset_count']} "
        f"spikes_5m={report['label_quality']['rain_spike_5m_count']} "
        f"max_inc_5m={report['label_quality']['max_rain_increment_5m_mm']}"
    )
    print(f" overall positive rate: {report['class_balance']['overall_positive_rate']}")
    if args.out:
        out_dir = os.path.dirname(args.out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"Saved audit report to {args.out}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,181 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import os
from datetime import datetime, timedelta, timezone
import psycopg2
from psycopg2.extras import Json
from rain_model_common import build_dataset, fetch_baro, fetch_ws90, model_frame, parse_time, to_builtin
try:
import joblib
except ImportError: # pragma: no cover - optional dependency
joblib = None
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the rain inference script."""
    cli = argparse.ArgumentParser(description="Run rain model inference and upsert prediction to Postgres.")
    cli.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
    cli.add_argument("--site", required=True, help="Site name (e.g. home).")
    cli.add_argument("--model-path", default="models/rain_model.pkl", help="Path to trained model artifact.")
    cli.add_argument("--model-name", default="rain_next_1h", help="Logical prediction model name.")
    cli.add_argument("--model-version", help="Override artifact model_version.")
    cli.add_argument(
        "--at",
        help="Prediction timestamp (RFC3339 or YYYY-MM-DD). Default: current UTC time.",
    )
    cli.add_argument(
        "--history-hours",
        type=int,
        default=6,
        help="History lookback window used to build features.",
    )
    cli.add_argument("--dry-run", action="store_true", help="Do not write prediction to DB.")
    return cli.parse_args()
def load_artifact(path: str):
    """Load a trained model artifact from *path* and validate required keys.

    Raises RuntimeError when joblib is unavailable, the file does not exist,
    or the artifact dict lacks its 'model' or 'features' entries.
    """
    if joblib is None:
        raise RuntimeError("joblib not installed; cannot load model artifact")
    if not os.path.exists(path):
        raise RuntimeError(f"model artifact not found: {path}")
    artifact = joblib.load(path)
    for key in ("model", "features"):
        if key not in artifact:
            raise RuntimeError(f"invalid artifact: missing '{key}'")
    return artifact
def parse_at(value: str | None) -> datetime:
    """Resolve the prediction timestamp as an aware UTC datetime.

    Falsy *value* means "now" (UTC). Otherwise *value* is validated via
    parse_time and converted to UTC. Naive inputs (e.g. a plain YYYY-MM-DD)
    are interpreted as UTC — the previous behavior fed the naive datetime to
    astimezone(), which assumes the machine's *local* timezone and made the
    prediction timestamp host-dependent.
    """
    if not value:
        return datetime.now(timezone.utc)
    parsed = parse_time(value)
    dt = datetime.fromisoformat(parsed.replace("Z", "+00:00"))
    if dt.tzinfo is None:
        # Bug fix: treat naive timestamps as UTC instead of local time.
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
def main() -> int:
    """Run one rain-probability inference and upsert it into Postgres.

    Loads the trained artifact, rebuilds features from recent observations,
    scores the most recent feature-complete row at or before --at, backfills
    the realized outcome when a full hour of data already exists, and writes
    an idempotent upsert keyed by (site, model_name, model_version, ts).
    Returns 0 on success; raises SystemExit when no DB URL is configured.
    """
    args = parse_args()
    if not args.db_url:
        raise SystemExit("missing --db-url or DATABASE_URL")
    at = parse_at(args.at)
    artifact = load_artifact(args.model_path)
    model = artifact["model"]
    features = artifact["features"]
    # Decision threshold chosen at training time; 0.5 for older artifacts
    # saved without one.
    threshold = float(artifact.get("threshold", 0.5))
    model_version = args.model_version or artifact.get("model_version") or "unknown"
    # Fetch slightly past +1h so the realized outcome can be evaluated
    # immediately when enough data already exists.
    fetch_start = (at - timedelta(hours=args.history_hours)).isoformat()
    fetch_end = (at + timedelta(hours=1, minutes=5)).isoformat()
    with psycopg2.connect(args.db_url) as conn:
        ws90 = fetch_ws90(conn, args.site, fetch_start, fetch_end)
        baro = fetch_baro(conn, args.site, fetch_start, fetch_end)
        full_df = build_dataset(ws90, baro)
        feature_df = model_frame(full_df, feature_cols=features, require_target=False)
        # Score the newest feature-complete row at or before the requested time.
        candidates = feature_df.loc[feature_df.index <= at]
        if candidates.empty:
            raise RuntimeError("no feature-complete row available at or before requested timestamp")
        row = candidates.tail(1)
        pred_ts = row.index[0].to_pydatetime()
        x = row[features]
        probability = float(model.predict_proba(x)[:, 1][0])
        predict_rain = probability >= threshold
        # Backfill the observed outcome only when the full following hour is
        # already covered by the fetched data.
        actual_mm = None
        actual_flag = None
        evaluated_at = None
        latest_available = full_df.index.max().to_pydatetime()
        if pred_ts + timedelta(hours=1) <= latest_available:
            next_mm = full_df.loc[pred_ts, "rain_next_1h_mm"]
            next_flag = full_df.loc[pred_ts, "rain_next_1h"]
            if next_mm == next_mm:  # NaN-safe check
                actual_mm = float(next_mm)
            if next_flag == next_flag:
                actual_flag = bool(next_flag)
            evaluated_at = datetime.now(timezone.utc)
        # Provenance recorded alongside the prediction for later debugging.
        metadata = {
            "artifact_path": args.model_path,
            "artifact_model_version": artifact.get("model_version"),
            "feature_values": {col: float(row.iloc[0][col]) for col in features},
            "source_window_start": fetch_start,
            "source_window_end": fetch_end,
            "requested_at": at.isoformat(),
            "pred_ts": pred_ts.isoformat(),
        }
        metadata = to_builtin(metadata)
        print("Rain inference summary:")
        print(f" site: {args.site}")
        print(f" model_name: {args.model_name}")
        print(f" model_version: {model_version}")
        print(f" pred_ts: {pred_ts.isoformat()}")
        print(f" threshold: {threshold:.3f}")
        print(f" probability: {probability:.4f}")
        print(f" predict_rain: {predict_rain}")
        print(f" outcome_available: {actual_flag is not None}")
        if args.dry_run:
            print("dry-run enabled; skipping DB upsert.")
            return 0
        # Upsert: re-running the script refreshes the prediction but the
        # COALESCEs preserve any previously-backfilled outcome columns.
        with conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO predictions_rain_1h (
                    ts,
                    generated_at,
                    site,
                    model_name,
                    model_version,
                    threshold,
                    probability,
                    predict_rain,
                    rain_next_1h_mm_actual,
                    rain_next_1h_actual,
                    evaluated_at,
                    metadata
                ) VALUES (
                    %s, now(), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
                )
                ON CONFLICT (site, model_name, model_version, ts)
                DO UPDATE SET
                    generated_at = EXCLUDED.generated_at,
                    threshold = EXCLUDED.threshold,
                    probability = EXCLUDED.probability,
                    predict_rain = EXCLUDED.predict_rain,
                    rain_next_1h_mm_actual = COALESCE(EXCLUDED.rain_next_1h_mm_actual, predictions_rain_1h.rain_next_1h_mm_actual),
                    rain_next_1h_actual = COALESCE(EXCLUDED.rain_next_1h_actual, predictions_rain_1h.rain_next_1h_actual),
                    evaluated_at = COALESCE(EXCLUDED.evaluated_at, predictions_rain_1h.evaluated_at),
                    metadata = EXCLUDED.metadata
                """,
                (
                    pred_ts,
                    args.site,
                    args.model_name,
                    model_version,
                    threshold,
                    probability,
                    predict_rain,
                    actual_mm,
                    actual_flag,
                    evaluated_at,
                    Json(metadata),
                ),
            )
        conn.commit()
        print("Prediction upserted into predictions_rain_1h.")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,226 @@
#!/usr/bin/env python3
from __future__ import annotations
from datetime import datetime
from typing import Any
import numpy as np
import pandas as pd
from sklearn.metrics import (
accuracy_score,
average_precision_score,
brier_score_loss,
confusion_matrix,
f1_score,
precision_score,
recall_score,
roc_auc_score,
)
# Feature set consumed by the rain classifier (column order defines the
# model input order).
FEATURE_COLUMNS = [
    "pressure_trend_1h",
    "humidity",
    "temperature_c",
    "wind_avg_m_s",
    "wind_max_m_s",
]
# A forward 1h rain total at or above this (mm) counts as a rain event.
RAIN_EVENT_THRESHOLD_MM = 0.2
# 5-minute rain increments at or above this (mm) are flagged as suspect spikes.
RAIN_SPIKE_THRESHOLD_MM_5M = 5.0
RAIN_HORIZON_BUCKETS = 12  # 12 * 5m = 1h
def parse_time(value: str) -> str:
    """Validate *value* as an ISO-8601/RFC3339 string and return it unchanged.

    An empty/falsy value maps to "" (used downstream to mean "no bound").
    Raises ValueError when datetime.fromisoformat cannot parse the value
    (after mapping a trailing 'Z' to '+00:00').
    """
    if not value:
        return ""
    normalized = value.replace("Z", "+00:00")
    try:
        datetime.fromisoformat(normalized)
    except ValueError as exc:
        raise ValueError(f"invalid time format: {value}") from exc
    return value
def fetch_ws90(conn, site: str, start: str, end: str) -> pd.DataFrame:
    """Load WS90 weather-station rows for *site*, ordered by observation ts.

    Empty *start*/*end* strings disable the corresponding time bound;
    non-empty values are cast to timestamptz inside the query. Returns a
    DataFrame with ts/received_at parsed as datetimes.
    """
    sql = """
    SELECT ts, station_id, received_at, temperature_c, humidity, wind_avg_m_s, wind_max_m_s, wind_dir_deg, rain_mm
    FROM observations_ws90
    WHERE site = %s
      AND (%s = '' OR ts >= %s::timestamptz)
      AND (%s = '' OR ts <= %s::timestamptz)
    ORDER BY ts ASC
    """
    # start/end are bound twice each: once for the empty-string guard and
    # once for the actual comparison.
    return pd.read_sql_query(sql, conn, params=(site, start, start, end, end), parse_dates=["ts", "received_at"])
def fetch_baro(conn, site: str, start: str, end: str) -> pd.DataFrame:
    """Load barometer rows for *site*, ordered by observation ts.

    Same bound semantics as fetch_ws90: empty *start*/*end* strings disable
    the corresponding filter. Returns ts/received_at parsed as datetimes.
    """
    sql = """
    SELECT ts, source, received_at, pressure_hpa
    FROM observations_baro
    WHERE site = %s
      AND (%s = '' OR ts >= %s::timestamptz)
      AND (%s = '' OR ts <= %s::timestamptz)
    ORDER BY ts ASC
    """
    # start/end are bound twice each: guard clause plus comparison.
    return pd.read_sql_query(sql, conn, params=(site, start, start, end, end), parse_dates=["ts", "received_at"])
def build_dataset(
    ws90: pd.DataFrame,
    baro: pd.DataFrame,
    rain_event_threshold_mm: float = RAIN_EVENT_THRESHOLD_MM,
) -> pd.DataFrame:
    """Join WS90 and barometer streams on a 5-minute grid and derive labels.

    Raises RuntimeError when either input is empty. The returned frame adds:
      - rain_inc_raw / rain_inc: per-bucket rain from the cumulative rain_mm
        gauge (rain_inc clips negative diffs from counter resets to 0)
      - rain_reset / rain_spike_5m: label-quality flags
      - rain_next_1h_mm / rain_next_1h: forward 1h rain total and the binary
        event label at rain_event_threshold_mm
      - pressure_trend_1h: pressure change over the trailing hour
    """
    if ws90.empty:
        raise RuntimeError("no ws90 observations found")
    if baro.empty:
        raise RuntimeError("no barometer observations found")
    ws90 = ws90.set_index("ts").sort_index()
    baro = baro.set_index("ts").sort_index()
    # 5-minute aggregation: gusts keep the max, rain_mm keeps the last
    # (cumulative) gauge reading, everything else is averaged.
    # NOTE(review): arithmetic mean of wind_dir_deg misbehaves across the
    # 0/360 wrap — consider vector averaging; confirm intended.
    ws90_5m = ws90.resample("5min").agg(
        {
            "temperature_c": "mean",
            "humidity": "mean",
            "wind_avg_m_s": "mean",
            "wind_max_m_s": "max",
            "wind_dir_deg": "mean",
            "rain_mm": "last",
        }
    )
    baro_5m = baro.resample("5min").agg({"pressure_hpa": "mean"})
    df = ws90_5m.join(baro_5m, how="outer")
    # Bridge short barometer dropouts only (up to 3 buckets = 15 minutes).
    df["pressure_hpa"] = df["pressure_hpa"].interpolate(limit=3)
    # rain_mm behaves as a cumulative counter: diff() yields per-bucket
    # increments; negative diffs mark counter resets and are clipped to 0.
    df["rain_inc_raw"] = df["rain_mm"].diff()
    df["rain_reset"] = df["rain_inc_raw"] < 0
    df["rain_inc"] = df["rain_inc_raw"].clip(lower=0)
    df["rain_spike_5m"] = df["rain_inc"] >= RAIN_SPIKE_THRESHOLD_MM_5M
    window = RAIN_HORIZON_BUCKETS
    # Forward 1h rain: the rolling sum ending at t+window-1 is shifted back so
    # the value at t covers buckets t .. t+window-1 (the hour starting at t).
    df["rain_next_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum().shift(-(window - 1))
    df["rain_next_1h"] = df["rain_next_1h_mm"] >= rain_event_threshold_mm
    df["pressure_trend_1h"] = df["pressure_hpa"] - df["pressure_hpa"].shift(window)
    return df
def model_frame(df: pd.DataFrame, feature_cols: list[str] | None = None, require_target: bool = True) -> pd.DataFrame:
    """Return the rows of *df* that are complete in the feature columns
    (and in the rain_next_1h target when *require_target*), sorted by time."""
    columns = list(feature_cols or FEATURE_COLUMNS)
    if require_target:
        columns = columns + ["rain_next_1h"]
    return df.dropna(subset=columns).copy().sort_index()
def split_time_ordered(df: pd.DataFrame, train_ratio: float = 0.7, val_ratio: float = 0.15) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split a time-sorted frame into contiguous train/val/test partitions.

    Ratios are validated, the boundary indices are clamped so each partition
    is non-empty, and a RuntimeError is raised for inputs under 100 rows.
    """
    if not (0 < train_ratio < 1):
        raise ValueError("train_ratio must be between 0 and 1")
    if not (0 <= val_ratio < 1):
        raise ValueError("val_ratio must be between 0 and 1")
    if train_ratio + val_ratio >= 1:
        raise ValueError("train_ratio + val_ratio must be < 1")
    total = len(df)
    if total < 100:
        raise RuntimeError("not enough rows after filtering (need >= 100)")
    # Clamp cut points so every partition keeps at least one row.
    cut_train = min(max(int(total * train_ratio), 1), total - 2)
    cut_val = min(max(int(total * (train_ratio + val_ratio)), cut_train + 1), total - 1)
    parts = (df.iloc[:cut_train], df.iloc[cut_train:cut_val], df.iloc[cut_val:])
    if any(part.empty for part in parts):
        raise RuntimeError("time split produced empty train/val/test set")
    return parts
def evaluate_probs(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> dict[str, Any]:
    """Score probabilistic predictions at a fixed decision *threshold*.

    ROC-AUC and PR-AUC are NaN when y_true contains a single class (they are
    undefined there); to_builtin serializes those NaNs to None.
    """
    y_hat = (y_prob >= threshold).astype(int)
    single_class = len(np.unique(y_true)) <= 1
    roc = float("nan") if single_class else roc_auc_score(y_true, y_prob)
    pr = float("nan") if single_class else average_precision_score(y_true, y_prob)
    return to_builtin(
        {
            "rows": int(len(y_true)),
            "positive_rate": float(np.mean(y_true)),
            "threshold": float(threshold),
            "accuracy": accuracy_score(y_true, y_hat),
            "precision": precision_score(y_true, y_hat, zero_division=0),
            "recall": recall_score(y_true, y_hat, zero_division=0),
            "f1": f1_score(y_true, y_hat, zero_division=0),
            "roc_auc": roc,
            "pr_auc": pr,
            "brier": brier_score_loss(y_true, y_prob),
            "confusion_matrix": confusion_matrix(y_true, y_hat, labels=[0, 1]).tolist(),
        }
    )
def select_threshold(y_true: np.ndarray, y_prob: np.ndarray, min_precision: float = 0.7) -> tuple[float, dict[str, Any]]:
    """Sweep thresholds 0.05..0.95 (step 0.01) and pick an operating point.

    Preferred rule: among thresholds whose precision meets *min_precision*,
    pick the highest recall (ties broken by higher F1; the earliest threshold
    wins remaining ties). Fallback when no threshold satisfies the
    constraint: the threshold with the highest F1.
    """
    best_f1: dict[str, Any] | None = None
    best_ok: dict[str, Any] | None = None
    for t in np.linspace(0.05, 0.95, 91):
        predicted = (y_prob >= t).astype(int)
        candidate = {
            "threshold": float(t),
            "precision": float(precision_score(y_true, predicted, zero_division=0)),
            "recall": float(recall_score(y_true, predicted, zero_division=0)),
            "f1": float(f1_score(y_true, predicted, zero_division=0)),
        }
        # Unconstrained fallback: strictly-better F1 wins.
        if best_f1 is None or candidate["f1"] > best_f1["f1"]:
            best_f1 = candidate
        if candidate["precision"] < min_precision:
            continue
        # Constrained pick: max recall, then max F1, earliest threshold wins.
        if best_ok is None:
            best_ok = candidate
        elif candidate["recall"] > best_ok["recall"]:
            best_ok = candidate
        elif candidate["recall"] == best_ok["recall"] and candidate["f1"] > best_ok["f1"]:
            best_ok = candidate
    if best_ok is not None:
        best_ok["selection_rule"] = f"max_recall_where_precision>={min_precision:.2f}"
        return float(best_ok["threshold"]), best_ok
    assert best_f1 is not None
    best_f1["selection_rule"] = "fallback_max_f1"
    return float(best_f1["threshold"]), best_f1
def to_builtin(v: Any) -> Any:
    """Recursively convert numpy/pandas values to plain JSON-friendly types.

    Dicts/lists/tuples are walked (tuples become lists), numpy integers and
    floats become Python int/float (NaN floats become None), and timestamps
    or datetimes become ISO-8601 strings. Everything else passes through.
    """
    if isinstance(v, dict):
        return {key: to_builtin(item) for key, item in v.items()}
    if isinstance(v, (list, tuple)):
        return [to_builtin(item) for item in v]
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, np.floating):
        number = float(v)
        return None if np.isnan(number) else number
    # pd.Timestamp subclasses datetime; both serialize via isoformat().
    if isinstance(v, (pd.Timestamp, datetime)):
        return v.isoformat()
    return v

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env bash
# P0 rain workflow: audit the raw data, train the baseline model, then write
# a current prediction. Every knob below can be overridden via environment
# variables, e.g. SITE=farm MODEL_VERSION=rain-logreg-v2 ./this_script.sh
set -euo pipefail

# Site and training window defaults.
SITE="${SITE:-home}"
START="${START:-2026-02-01T00:00:00Z}"
END="${END:-2026-03-03T23:55:00Z}"
# Artifact / report output locations.
MODEL_VERSION="${MODEL_VERSION:-rain-logreg-v1}"
MODEL_PATH="${MODEL_PATH:-models/rain_model.pkl}"
REPORT_PATH="${REPORT_PATH:-models/rain_model_report.json}"
AUDIT_PATH="${AUDIT_PATH:-models/rain_data_audit.json}"

# All three Python scripts read the connection string from DATABASE_URL.
if [[ -z "${DATABASE_URL:-}" ]]; then
echo "DATABASE_URL is required"
echo "example: export DATABASE_URL='postgres://postgres:postgres@localhost:5432/micrometeo?sslmode=disable'"
exit 1
fi

# Step 1: data-quality audit (duplicates, gaps, label sanity, class balance).
echo "Running rain data audit..."
python scripts/audit_rain_data.py \
--site "$SITE" \
--start "$START" \
--end "$END" \
--out "$AUDIT_PATH"

# Step 2: train the baseline classifier with a time-ordered 70/15/15 split.
echo "Training baseline rain model..."
python scripts/train_rain_model.py \
--site "$SITE" \
--start "$START" \
--end "$END" \
--train-ratio 0.7 \
--val-ratio 0.15 \
--min-precision 0.70 \
--model-version "$MODEL_VERSION" \
--out "$MODEL_PATH" \
--report-out "$REPORT_PATH"

# Step 3: score "now" with the fresh artifact and upsert the prediction row.
echo "Writing current prediction..."
python scripts/predict_rain_model.py \
--site "$SITE" \
--model-path "$MODEL_PATH" \
--model-name "rain_next_1h"

echo "P0 rain workflow complete."

View File

@@ -1,16 +1,29 @@
#!/usr/bin/env python3
import argparse
import json
import os
from datetime import datetime
from datetime import datetime, timezone
import numpy as np
import pandas as pd
import psycopg2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from rain_model_common import (
FEATURE_COLUMNS,
RAIN_EVENT_THRESHOLD_MM,
build_dataset,
evaluate_probs,
fetch_baro,
fetch_ws90,
model_frame,
parse_time,
select_threshold,
split_time_ordered,
to_builtin,
)
try:
import joblib
except ImportError: # pragma: no cover - optional dependency
@@ -18,128 +31,42 @@ except ImportError: # pragma: no cover - optional dependency
def parse_args() -> argparse.Namespace:
    """Parse command-line options for training the rain classifier.

    Note: this span previously contained two ArgumentParser constructions
    (stale diff/merge residue); only the current one is kept.
    """
    parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
    parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
    parser.add_argument("--site", required=True, help="Site name (e.g. home).")
    parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
    parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
    parser.add_argument("--train-ratio", type=float, default=0.7, help="Time-ordered train split ratio.")
    parser.add_argument("--val-ratio", type=float, default=0.15, help="Time-ordered validation split ratio.")
    parser.add_argument(
        "--min-precision",
        type=float,
        default=0.7,
        help="Minimum validation precision for threshold selection.",
    )
    parser.add_argument("--threshold", type=float, help="Optional fixed classification threshold.")
    parser.add_argument("--min-rows", type=int, default=200, help="Minimum model-ready rows required.")
    parser.add_argument("--out", default="models/rain_model.pkl", help="Path to save model.")
    parser.add_argument(
        "--report-out",
        default="models/rain_model_report.json",
        help="Path to save JSON training report.",
    )
    parser.add_argument(
        "--model-version",
        default="rain-logreg-v1",
        help="Version label stored in artifact metadata.",
    )
    return parser.parse_args()
def parse_time(value: str) -> str:
if not value:
return ""
try:
datetime.fromisoformat(value.replace("Z", "+00:00"))
return value
except ValueError:
raise ValueError(f"invalid time format: {value}")
def fetch_ws90(conn, site, start, end):
sql = """
SELECT ts, temperature_c, humidity, wind_avg_m_s, wind_max_m_s, wind_dir_deg, rain_mm
FROM observations_ws90
WHERE site = %s
AND (%s = '' OR ts >= %s::timestamptz)
AND (%s = '' OR ts <= %s::timestamptz)
ORDER BY ts ASC
"""
return pd.read_sql_query(sql, conn, params=(site, start, start, end, end), parse_dates=["ts"])
def fetch_baro(conn, site, start, end):
sql = """
SELECT ts, pressure_hpa
FROM observations_baro
WHERE site = %s
AND (%s = '' OR ts >= %s::timestamptz)
AND (%s = '' OR ts <= %s::timestamptz)
ORDER BY ts ASC
"""
return pd.read_sql_query(sql, conn, params=(site, start, start, end, end), parse_dates=["ts"])
def build_dataset(ws90: pd.DataFrame, baro: pd.DataFrame) -> pd.DataFrame:
if ws90.empty:
raise RuntimeError("no ws90 observations found")
if baro.empty:
raise RuntimeError("no barometer observations found")
ws90 = ws90.set_index("ts").sort_index()
baro = baro.set_index("ts").sort_index()
ws90_5m = ws90.resample("5min").agg(
{
"temperature_c": "mean",
"humidity": "mean",
"wind_avg_m_s": "mean",
"wind_max_m_s": "max",
"wind_dir_deg": "mean",
"rain_mm": "last",
}
)
baro_5m = baro.resample("5min").mean()
df = ws90_5m.join(baro_5m, how="outer")
df["pressure_hpa"] = df["pressure_hpa"].interpolate(limit=3)
# Compute incremental rain and future 1-hour sum.
df["rain_inc"] = df["rain_mm"].diff().clip(lower=0)
window = 12 # 12 * 5min = 1 hour
df["rain_next_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum().shift(-(window - 1))
df["rain_next_1h"] = df["rain_next_1h_mm"] >= 0.2
# Pressure trend over the previous hour.
df["pressure_trend_1h"] = df["pressure_hpa"] - df["pressure_hpa"].shift(12)
return df
def train_model(df: pd.DataFrame):
feature_cols = [
"pressure_trend_1h",
"humidity",
"temperature_c",
"wind_avg_m_s",
"wind_max_m_s",
]
df = df.dropna(subset=feature_cols + ["rain_next_1h"])
if len(df) < 200:
raise RuntimeError("not enough data after filtering (need >= 200 rows)")
X = df[feature_cols]
y = df["rain_next_1h"].astype(int)
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]
def make_model() -> Pipeline:
    """Build the rain classifier pipeline: feature scaling followed by a
    class-balanced logistic regression.

    Note: this span previously interleaved the new make_model with residue
    of the removed train_model body (stale `model = Pipeline(` / fit /
    metrics lines); only the current function is kept.
    """
    return Pipeline(
        [
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
        ]
    )
def main() -> int:
@@ -154,12 +81,124 @@ def main() -> int:
ws90 = fetch_ws90(conn, args.site, start, end)
baro = fetch_baro(conn, args.site, start, end)
df = build_dataset(ws90, baro)
model, metrics, features = train_model(df)
full_df = build_dataset(ws90, baro, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
model_df = model_frame(full_df, FEATURE_COLUMNS, require_target=True)
if len(model_df) < args.min_rows:
raise RuntimeError(f"not enough model-ready rows after filtering (need >= {args.min_rows})")
print("Rain model metrics:")
for k, v in metrics.items():
print(f" {k}: {v}")
train_df, val_df, test_df = split_time_ordered(
model_df,
train_ratio=args.train_ratio,
val_ratio=args.val_ratio,
)
x_train = train_df[FEATURE_COLUMNS]
y_train = train_df["rain_next_1h"].astype(int).to_numpy()
x_val = val_df[FEATURE_COLUMNS]
y_val = val_df["rain_next_1h"].astype(int).to_numpy()
x_test = test_df[FEATURE_COLUMNS]
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
base_model = make_model()
base_model.fit(x_train, y_train)
y_val_prob = base_model.predict_proba(x_val)[:, 1]
if args.threshold is not None:
chosen_threshold = args.threshold
threshold_info = {
"selection_rule": "fixed_cli_threshold",
"threshold": float(args.threshold),
}
else:
chosen_threshold, threshold_info = select_threshold(
y_true=y_val,
y_prob=y_val_prob,
min_precision=args.min_precision,
)
val_metrics = evaluate_probs(y_true=y_val, y_prob=y_val_prob, threshold=chosen_threshold)
train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
x_train_val = train_val_df[FEATURE_COLUMNS]
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
final_model = make_model()
final_model.fit(x_train_val, y_train_val)
y_test_prob = final_model.predict_proba(x_test)[:, 1]
test_metrics = evaluate_probs(y_true=y_test, y_prob=y_test_prob, threshold=chosen_threshold)
report = {
"generated_at": datetime.now(timezone.utc).isoformat(),
"site": args.site,
"model_version": args.model_version,
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"feature_columns": FEATURE_COLUMNS,
"data_window": {
"requested_start": start or None,
"requested_end": end or None,
"actual_start": model_df.index.min(),
"actual_end": model_df.index.max(),
"model_rows": len(model_df),
"ws90_rows": len(ws90),
"baro_rows": len(baro),
},
"label_quality": {
"rain_reset_count": int(np.nansum(full_df["rain_reset"].fillna(False).to_numpy(dtype=int))),
"rain_spike_5m_count": int(np.nansum(full_df["rain_spike_5m"].fillna(False).to_numpy(dtype=int))),
},
"split": {
"train_ratio": args.train_ratio,
"val_ratio": args.val_ratio,
"train_rows": len(train_df),
"val_rows": len(val_df),
"test_rows": len(test_df),
"train_start": train_df.index.min(),
"train_end": train_df.index.max(),
"val_start": val_df.index.min(),
"val_end": val_df.index.max(),
"test_start": test_df.index.min(),
"test_end": test_df.index.max(),
},
"threshold_selection": {
**threshold_info,
"min_precision_constraint": args.min_precision,
},
"validation_metrics": val_metrics,
"test_metrics": test_metrics,
}
report = to_builtin(report)
print("Rain model training summary:")
print(f" site: {args.site}")
print(f" model_version: {args.model_version}")
print(f" rows: total={report['data_window']['model_rows']} train={report['split']['train_rows']} val={report['split']['val_rows']} test={report['split']['test_rows']}")
print(
" threshold: "
f"{report['threshold_selection']['threshold']:.3f} "
f"({report['threshold_selection']['selection_rule']})"
)
print(
" val metrics: "
f"precision={report['validation_metrics']['precision']:.3f} "
f"recall={report['validation_metrics']['recall']:.3f} "
f"roc_auc={report['validation_metrics']['roc_auc'] if report['validation_metrics']['roc_auc'] is not None else 'n/a'} "
f"pr_auc={report['validation_metrics']['pr_auc'] if report['validation_metrics']['pr_auc'] is not None else 'n/a'}"
)
print(
" test metrics: "
f"precision={report['test_metrics']['precision']:.3f} "
f"recall={report['test_metrics']['recall']:.3f} "
f"roc_auc={report['test_metrics']['roc_auc'] if report['test_metrics']['roc_auc'] is not None else 'n/a'} "
f"pr_auc={report['test_metrics']['pr_auc'] if report['test_metrics']['pr_auc'] is not None else 'n/a'}"
)
if args.report_out:
report_dir = os.path.dirname(args.report_out)
if report_dir:
os.makedirs(report_dir, exist_ok=True)
with open(args.report_out, "w", encoding="utf-8") as f:
json.dump(report, f, indent=2)
print(f"Saved report to {args.report_out}")
if args.out:
out_dir = os.path.dirname(args.out)
@@ -168,7 +207,17 @@ def main() -> int:
if joblib is None:
print("joblib not installed; skipping model save.")
else:
joblib.dump({"model": model, "features": features}, args.out)
artifact = {
"model": final_model,
"features": FEATURE_COLUMNS,
"threshold": float(chosen_threshold),
"target_mm": float(RAIN_EVENT_THRESHOLD_MM),
"model_version": args.model_version,
"trained_at": datetime.now(timezone.utc).isoformat(),
"split": report["split"],
"threshold_selection": report["threshold_selection"],
}
joblib.dump(artifact, args.out)
print(f"Saved model to {args.out}")
return 0