feat: add rain data audit and prediction scripts
This commit is contained in:
186
scripts/audit_rain_data.py
Normal file
186
scripts/audit_rain_data.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
|
||||
from rain_model_common import (
|
||||
FEATURE_COLUMNS,
|
||||
RAIN_EVENT_THRESHOLD_MM,
|
||||
build_dataset,
|
||||
fetch_baro,
|
||||
fetch_ws90,
|
||||
model_frame,
|
||||
parse_time,
|
||||
to_builtin,
|
||||
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the audit run.

    --site is the only required flag; --db-url falls back to the
    DATABASE_URL environment variable.
    """
    cli = argparse.ArgumentParser(
        description="Audit weather time-series quality for rain model training."
    )
    cli.add_argument(
        "--db-url",
        default=os.getenv("DATABASE_URL"),
        help="Postgres connection string.",
    )
    cli.add_argument("--site", required=True, help="Site name (e.g. home).")
    cli.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
    cli.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
    cli.add_argument(
        "--out",
        default="models/rain_data_audit.json",
        help="Path to save JSON audit report.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def longest_zero_run(counts: np.ndarray) -> int:
    """Return the length of the longest consecutive run of zeros in *counts*."""
    longest = 0
    run = 0
    for value in counts:
        # Extend the current run on a zero, otherwise start over.
        run = run + 1 if value == 0 else 0
        longest = max(longest, run)
    return longest
|
||||
|
||||
|
||||
def build_weekly_balance(model_df):
    """Summarize the positive-label balance of *model_df* per ISO week.

    Returns a list of record dicts, sorted by week key, each carrying
    year_week, total_rows, positive_rows and positive_rate.
    """
    frame = model_df.copy()
    iso_parts = frame.index.to_series().dt.isocalendar()
    # Key like "2024-W05"; zero-padded week keeps lexicographic sort correct.
    frame["year_week"] = (
        iso_parts["year"].astype(str) + "-W" + iso_parts["week"].astype(str).str.zfill(2)
    )

    stats = frame.groupby("year_week")["rain_next_1h"].agg(
        total_rows="count", positive_rows="sum"
    )
    stats = stats.reset_index().sort_values("year_week")
    stats["positive_rate"] = stats["positive_rows"] / stats["total_rows"]
    return stats.to_dict(orient="records")
|
||||
|
||||
|
||||
def main() -> int:
    """Run the rain-data audit end to end.

    Fetches raw WS90 and barometer rows from Postgres, builds the modeling
    dataset, computes data-quality metrics (duplicates, out-of-order arrival,
    5-minute gaps, per-column missingness, rain-label sanity, class balance),
    prints a summary, and optionally writes the full report as JSON.

    Returns:
        Process exit code (0 on success).

    Raises:
        SystemExit: if no database URL was supplied via --db-url or
            DATABASE_URL.
    """
    args = parse_args()
    if not args.db_url:
        raise SystemExit("missing --db-url or DATABASE_URL")

    # Empty string means "unbounded" on that side of the window for the
    # fetch helpers.
    start = parse_time(args.start) if args.start else ""
    end = parse_time(args.end) if args.end else ""

    with psycopg2.connect(args.db_url) as conn:
        ws90 = fetch_ws90(conn, args.site, start, end)
        baro = fetch_baro(conn, args.site, start, end)

    df = build_dataset(ws90, baro, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
    model_df = model_frame(df, FEATURE_COLUMNS, require_target=True)

    # Exact duplicates on each stream's natural key.
    ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
    baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0

    # Count rows whose measurement timestamp moves backwards when rows are
    # ordered by arrival time, i.e. late / out-of-order ingestion.
    ws90_out_of_order = 0
    if not ws90.empty:
        ws90_by_received = ws90.sort_values("received_at")
        ws90_out_of_order = int((ws90_by_received["ts"].diff().dropna() < np.timedelta64(0, "ns")).sum())

    baro_out_of_order = 0
    if not baro.empty:
        baro_by_received = baro.sort_values("received_at")
        baro_out_of_order = int((baro_by_received["ts"].diff().dropna() < np.timedelta64(0, "ns")).sum())

    # Bucket each stream into 5-minute bins; empty bins mark coverage gaps.
    ws90_counts = ws90.set_index("ts").resample("5min").size() if not ws90.empty else np.array([])
    baro_counts = baro.set_index("ts").resample("5min").size() if not baro.empty else np.array([])

    ws90_gap_buckets = int((ws90_counts == 0).sum()) if len(ws90_counts) else 0
    baro_gap_buckets = int((baro_counts == 0).sum()) if len(baro_counts) else 0
    # Longest contiguous run of empty buckets, converted to minutes.
    ws90_max_gap_min = longest_zero_run(np.array(ws90_counts)) * 5 if len(ws90_counts) else 0
    baro_max_gap_min = longest_zero_run(np.array(baro_counts)) * 5 if len(baro_counts) else 0

    # NaN fraction per feature/label column actually present in df.
    missingness = {}
    for col in FEATURE_COLUMNS + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
        if col in df.columns:
            missingness[col] = float(df[col].isna().mean())

    # Largest 5-minute rain increment, if any finite values exist.
    max_rain_inc = None
    if "rain_inc" in df.columns and np.isfinite(df["rain_inc"].to_numpy(dtype=float)).any():
        max_rain_inc = float(np.nanmax(df["rain_inc"].to_numpy(dtype=float)))

    report = {
        "site": args.site,
        "target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
        "requested_window": {
            "start": start or None,
            "end": end or None,
        },
        "observed_window": {
            "ws90_start": ws90["ts"].min() if not ws90.empty else None,
            "ws90_end": ws90["ts"].max() if not ws90.empty else None,
            "baro_start": baro["ts"].min() if not baro.empty else None,
            "baro_end": baro["ts"].max() if not baro.empty else None,
            "model_start": model_df.index.min() if not model_df.empty else None,
            "model_end": model_df.index.max() if not model_df.empty else None,
        },
        "row_counts": {
            "ws90_rows": int(len(ws90)),
            "baro_rows": int(len(baro)),
            "model_rows": int(len(model_df)),
        },
        "duplicates": {
            "ws90_ts_station_duplicates": ws90_dupes,
            "baro_ts_source_duplicates": baro_dupes,
        },
        "out_of_order": {
            "ws90_by_received_count": ws90_out_of_order,
            "baro_by_received_count": baro_out_of_order,
        },
        "gaps_5m": {
            "ws90_empty_buckets": ws90_gap_buckets,
            "baro_empty_buckets": baro_gap_buckets,
            "ws90_max_gap_minutes": ws90_max_gap_min,
            "baro_max_gap_minutes": baro_max_gap_min,
        },
        "missingness_ratio": missingness,
        "label_quality": {
            "rain_reset_count": int(np.nansum(df["rain_reset"].fillna(False).to_numpy(dtype=int))),
            "rain_spike_5m_count": int(np.nansum(df["rain_spike_5m"].fillna(False).to_numpy(dtype=int))),
            "max_rain_increment_5m_mm": max_rain_inc,
        },
        "class_balance": {
            "overall_positive_rate": float(model_df["rain_next_1h"].mean()) if not model_df.empty else None,
            "weekly": build_weekly_balance(model_df) if not model_df.empty else [],
        },
    }
    # Convert numpy/pandas scalars and timestamps into JSON-safe builtins.
    report = to_builtin(report)

    print("Rain data audit summary:")
    print(f"  site: {report['site']}")
    print(
        "  rows: "
        f"ws90={report['row_counts']['ws90_rows']} "
        f"baro={report['row_counts']['baro_rows']} "
        f"model={report['row_counts']['model_rows']}"
    )
    print(
        "  duplicates: "
        f"ws90={report['duplicates']['ws90_ts_station_duplicates']} "
        f"baro={report['duplicates']['baro_ts_source_duplicates']}"
    )
    print(
        "  rain label checks: "
        f"resets={report['label_quality']['rain_reset_count']} "
        f"spikes_5m={report['label_quality']['rain_spike_5m_count']} "
        f"max_inc_5m={report['label_quality']['max_rain_increment_5m_mm']}"
    )
    print(f"  overall positive rate: {report['class_balance']['overall_positive_rate']}")

    if args.out:
        out_dir = os.path.dirname(args.out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"Saved audit report to {args.out}")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value to the shell as the process exit code.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user