#!/usr/bin/env python3
"""Audit weather time-series quality for rain model training.

Fetches the ws90 and baro rows for one site, builds the model dataset, and
reports duplicates, out-of-order arrivals, 5-minute gaps, missingness, label
sanity checks and class balance as a JSON report plus a console summary.
"""
from __future__ import annotations

import argparse
import json
import os

import numpy as np
import psycopg2

from rain_model_common import (
    FEATURE_COLUMNS,
    RAIN_EVENT_THRESHOLD_MM,
    build_dataset,
    fetch_baro,
    fetch_ws90,
    model_frame,
    parse_time,
    to_builtin,
)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the audit script."""
    parser = argparse.ArgumentParser(description="Audit weather time-series quality for rain model training.")
    parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
    parser.add_argument("--site", required=True, help="Site name (e.g. home).")
    parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
    parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
    parser.add_argument("--out", default="models/rain_data_audit.json", help="Path to save JSON audit report.")
    return parser.parse_args()


def longest_zero_run(counts: np.ndarray) -> int:
    """Return the length of the longest run of consecutive zeros in *counts*.

    An empty input yields 0.
    """
    best = 0
    cur = 0
    for v in counts:
        if v == 0:
            cur += 1
            best = max(best, cur)
        else:
            cur = 0
    return best


def build_weekly_balance(model_df):
    """Summarise the ``rain_next_1h`` label per ISO calendar week.

    The index of *model_df* must be datetime-like (its ISO calendar is used to
    build the ``YYYY-Www`` week label).

    Returns:
        A list of dicts, one per week in ascending ``year_week`` order, with
        the keys ``year_week``, ``total_rows``, ``positive_rows`` and
        ``positive_rate``.
    """
    weekly = model_df.copy()
    iso = weekly.index.to_series().dt.isocalendar()
    # Zero-pad the week number so the string sort below is chronological.
    weekly["year_week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)

    grouped = (
        weekly.groupby("year_week")["rain_next_1h"]
        .agg(total_rows="count", positive_rows="sum")
        .reset_index()
        .sort_values("year_week")
    )
    grouped["positive_rate"] = grouped["positive_rows"] / grouped["total_rows"]
    return grouped.to_dict(orient="records")


def _count_out_of_order(frame) -> int:
    """Count rows whose ``ts`` goes backwards when ordered by ``received_at``."""
    if frame.empty:
        return 0
    # Stable sort: rows sharing a received_at keep their fetched order, so
    # ties cannot be shuffled into spurious inversions and the count is
    # deterministic from run to run.
    by_received = frame.sort_values("received_at", kind="mergesort")
    deltas = by_received["ts"].diff().dropna()
    return int((deltas < np.timedelta64(0, "ns")).sum())


def _gap_stats_5m(frame) -> tuple[int, int]:
    """Return ``(empty_bucket_count, longest_gap_minutes)`` for 5-minute buckets of ``ts``."""
    if frame.empty:
        return 0, 0
    counts = np.asarray(frame.set_index("ts").resample("5min").size())
    empty_buckets = int((counts == 0).sum())
    # Each bucket is 5 minutes wide, so a run of k empty buckets is 5*k minutes.
    return empty_buckets, longest_zero_run(counts) * 5


def _print_summary(report: dict) -> None:
    """Print the short human-readable summary of an already-built report."""
    print("Rain data audit summary:")
    print(f" site: {report['site']}")
    print(
        " rows: "
        f"ws90={report['row_counts']['ws90_rows']} "
        f"baro={report['row_counts']['baro_rows']} "
        f"model={report['row_counts']['model_rows']}"
    )
    print(
        " duplicates: "
        f"ws90={report['duplicates']['ws90_ts_station_duplicates']} "
        f"baro={report['duplicates']['baro_ts_source_duplicates']}"
    )
    print(
        " rain label checks: "
        f"resets={report['label_quality']['rain_reset_count']} "
        f"spikes_5m={report['label_quality']['rain_spike_5m_count']} "
        f"max_inc_5m={report['label_quality']['max_rain_increment_5m_mm']}"
    )
    print(f" overall positive rate: {report['class_balance']['overall_positive_rate']}")


def main() -> int:
    """Run the audit: fetch data, build the report, print and optionally save it.

    Returns:
        0 on success (used as the process exit code).

    Raises:
        SystemExit: if neither ``--db-url`` nor ``DATABASE_URL`` is set.
    """
    args = parse_args()
    if not args.db_url:
        raise SystemExit("missing --db-url or DATABASE_URL")

    # An empty string stands for "no bound"; it is reported as None below.
    start = parse_time(args.start) if args.start else ""
    end = parse_time(args.end) if args.end else ""

    conn = psycopg2.connect(args.db_url)
    try:
        # psycopg2's connection context manager only ends the transaction
        # (commit/rollback); it does not close the connection, hence the
        # explicit close() in the finally clause.
        with conn:
            ws90 = fetch_ws90(conn, args.site, start, end)
            baro = fetch_baro(conn, args.site, start, end)
    finally:
        conn.close()

    df = build_dataset(ws90, baro, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
    model_df = model_frame(df, FEATURE_COLUMNS, require_target=True)

    ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
    baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0

    ws90_out_of_order = _count_out_of_order(ws90)
    baro_out_of_order = _count_out_of_order(baro)

    ws90_gap_buckets, ws90_max_gap_min = _gap_stats_5m(ws90)
    baro_gap_buckets, baro_max_gap_min = _gap_stats_5m(baro)

    # Fraction of NaN values per audited column; columns absent from df are skipped.
    audited_columns = [*FEATURE_COLUMNS, "pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]
    missingness = {col: float(df[col].isna().mean()) for col in audited_columns if col in df.columns}

    # Stays None when the column is missing or holds no finite value.
    max_rain_inc = None
    if "rain_inc" in df.columns:
        rain_inc = df["rain_inc"].to_numpy(dtype=float)
        if np.isfinite(rain_inc).any():
            max_rain_inc = float(np.nanmax(rain_inc))

    report = {
        "site": args.site,
        "target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
        "requested_window": {
            "start": start or None,
            "end": end or None,
        },
        "observed_window": {
            "ws90_start": ws90["ts"].min() if not ws90.empty else None,
            "ws90_end": ws90["ts"].max() if not ws90.empty else None,
            "baro_start": baro["ts"].min() if not baro.empty else None,
            "baro_end": baro["ts"].max() if not baro.empty else None,
            "model_start": model_df.index.min() if not model_df.empty else None,
            "model_end": model_df.index.max() if not model_df.empty else None,
        },
        "row_counts": {
            "ws90_rows": int(len(ws90)),
            "baro_rows": int(len(baro)),
            "model_rows": int(len(model_df)),
        },
        "duplicates": {
            "ws90_ts_station_duplicates": ws90_dupes,
            "baro_ts_source_duplicates": baro_dupes,
        },
        "out_of_order": {
            "ws90_by_received_count": ws90_out_of_order,
            "baro_by_received_count": baro_out_of_order,
        },
        "gaps_5m": {
            "ws90_empty_buckets": ws90_gap_buckets,
            "baro_empty_buckets": baro_gap_buckets,
            "ws90_max_gap_minutes": ws90_max_gap_min,
            "baro_max_gap_minutes": baro_max_gap_min,
        },
        "missingness_ratio": missingness,
        "label_quality": {
            "rain_reset_count": int(np.nansum(df["rain_reset"].fillna(False).to_numpy(dtype=int))),
            "rain_spike_5m_count": int(np.nansum(df["rain_spike_5m"].fillna(False).to_numpy(dtype=int))),
            "max_rain_increment_5m_mm": max_rain_inc,
        },
        "class_balance": {
            "overall_positive_rate": float(model_df["rain_next_1h"].mean()) if not model_df.empty else None,
            "weekly": build_weekly_balance(model_df) if not model_df.empty else [],
        },
    }
    # Convert the report through the shared helper before printing and
    # json.dump (presumably to plain built-in types -- confirm in
    # rain_model_common).
    report = to_builtin(report)

    _print_summary(report)

    if args.out:
        out_dir = os.path.dirname(args.out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"Saved audit report to {args.out}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())