212 lines
7.9 KiB
Python
212 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
|
|
import numpy as np
|
|
import psycopg2
|
|
|
|
from rain_model_common import (
|
|
AVAILABLE_FEATURE_SETS,
|
|
RAIN_EVENT_THRESHOLD_MM,
|
|
build_dataset,
|
|
feature_columns_for_set,
|
|
feature_columns_need_forecast,
|
|
fetch_baro,
|
|
fetch_forecast,
|
|
fetch_ws90,
|
|
model_frame,
|
|
parse_time,
|
|
to_builtin,
|
|
)
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Build and evaluate the CLI for the rain-data audit script.

    Returns the parsed namespace; argparse itself exits on bad input.
    """
    cli = argparse.ArgumentParser(
        description="Audit weather time-series quality for rain model training."
    )
    # Connection/site selection.
    cli.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
    cli.add_argument("--site", required=True, help="Site name (e.g. home).")
    # Optional audit window.
    cli.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
    cli.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
    # Model-readiness options.
    cli.add_argument(
        "--feature-set",
        default="baseline",
        choices=AVAILABLE_FEATURE_SETS,
        help="Named feature set used for model-readiness auditing.",
    )
    cli.add_argument(
        "--forecast-model",
        default="ecmwf",
        help="Forecast model name when feature set requires forecast columns.",
    )
    cli.add_argument("--out", default="models/rain_data_audit.json", help="Path to save JSON audit report.")
    return cli.parse_args()
|
|
|
|
|
|
def longest_zero_run(counts: np.ndarray) -> int:
    """Return the length of the longest consecutive run of zeros in *counts*.

    An empty input yields 0.
    """
    longest = 0
    run = 0
    for value in counts:
        # Extend the current zero streak, or reset it on any non-zero value.
        run = run + 1 if value == 0 else 0
        longest = max(longest, run)
    return longest
|
|
|
|
|
|
def build_weekly_balance(model_df):
    """Summarise per-ISO-week row counts and positive-label rate.

    Groups rows by an ISO "YYYY-Www" key derived from the datetime index and
    returns a list of records with total_rows, positive_rows and positive_rate.
    """
    frame = model_df.copy()
    iso = frame.index.to_series().dt.isocalendar()
    # Zero-padded week number keeps lexicographic order == chronological order.
    frame["year_week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)

    summary = frame.groupby("year_week")["rain_next_1h"].agg(
        total_rows="count", positive_rows="sum"
    )
    summary["positive_rate"] = summary["positive_rows"] / summary["total_rows"]
    return summary.sort_index().reset_index().to_dict(orient="records")
|
|
|
|
|
|
def main() -> int:
    """Audit raw weather feeds and the derived model frame, print a summary,
    and optionally save a JSON report.

    Returns 0 on success; raises SystemExit when no DB URL is configured.
    """
    args = parse_args()
    if not args.db_url:
        raise SystemExit("missing --db-url or DATABASE_URL")

    # Empty string appears to mean "no bound" for the fetch helpers — TODO confirm
    # against rain_model_common.parse_time/fetch_* semantics.
    start = parse_time(args.start) if args.start else ""
    end = parse_time(args.end) if args.end else ""
    feature_cols = feature_columns_for_set(args.feature_set)
    needs_forecast = feature_columns_need_forecast(feature_cols)

    # Fetch the raw station (ws90), barometer and — only when the feature set
    # requires it — forecast rows for the requested window.
    with psycopg2.connect(args.db_url) as conn:
        ws90 = fetch_ws90(conn, args.site, start, end)
        baro = fetch_baro(conn, args.site, start, end)
        forecast = None
        if needs_forecast:
            forecast = fetch_forecast(conn, args.site, start, end, model=args.forecast_model)

    df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
    model_df = model_frame(df, feature_cols, require_target=True)

    # Duplicate (timestamp, origin) keys in the raw feeds indicate ingestion problems.
    ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
    baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0

    # Count rows whose observation time (ts) moves backwards when ordered by
    # arrival time (received_at) — i.e. late/out-of-order delivery.
    ws90_out_of_order = 0
    if not ws90.empty:
        ws90_by_received = ws90.sort_values("received_at")
        ws90_out_of_order = int((ws90_by_received["ts"].diff().dropna() < np.timedelta64(0, "ns")).sum())

    baro_out_of_order = 0
    if not baro.empty:
        baro_by_received = baro.sort_values("received_at")
        baro_out_of_order = int((baro_by_received["ts"].diff().dropna() < np.timedelta64(0, "ns")).sum())

    # Bucket each feed into 5-minute bins; empty bins expose coverage gaps.
    ws90_counts = ws90.set_index("ts").resample("5min").size() if not ws90.empty else np.array([])
    baro_counts = baro.set_index("ts").resample("5min").size() if not baro.empty else np.array([])

    ws90_gap_buckets = int((ws90_counts == 0).sum()) if len(ws90_counts) else 0
    baro_gap_buckets = int((baro_counts == 0).sum()) if len(baro_counts) else 0
    # Longest run of empty 5-minute buckets, expressed in minutes.
    ws90_max_gap_min = longest_zero_run(np.array(ws90_counts)) * 5 if len(ws90_counts) else 0
    baro_max_gap_min = longest_zero_run(np.array(baro_counts)) * 5 if len(baro_counts) else 0

    # NaN fraction per feature/label column actually present in the dataset.
    missingness = {}
    for col in feature_cols + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
        if col in df.columns:
            missingness[col] = float(df[col].isna().mean())

    # Largest single 5-minute rain increment, if any finite value exists
    # (guards against an all-NaN column, where nanmax would warn/raise).
    max_rain_inc = None
    if "rain_inc" in df.columns and np.isfinite(df["rain_inc"].to_numpy(dtype=float)).any():
        max_rain_inc = float(np.nanmax(df["rain_inc"].to_numpy(dtype=float)))

    report = {
        "site": args.site,
        "feature_set": args.feature_set,
        "feature_columns": feature_cols,
        "forecast_model": args.forecast_model if needs_forecast else None,
        "target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
        "requested_window": {
            "start": start or None,
            "end": end or None,
        },
        "observed_window": {
            "ws90_start": ws90["ts"].min() if not ws90.empty else None,
            "ws90_end": ws90["ts"].max() if not ws90.empty else None,
            "baro_start": baro["ts"].min() if not baro.empty else None,
            "baro_end": baro["ts"].max() if not baro.empty else None,
            "model_start": model_df.index.min() if not model_df.empty else None,
            "model_end": model_df.index.max() if not model_df.empty else None,
        },
        "row_counts": {
            "ws90_rows": int(len(ws90)),
            "baro_rows": int(len(baro)),
            "forecast_rows": int(len(forecast)) if forecast is not None else 0,
            "model_rows": int(len(model_df)),
        },
        "duplicates": {
            "ws90_ts_station_duplicates": ws90_dupes,
            "baro_ts_source_duplicates": baro_dupes,
        },
        "out_of_order": {
            "ws90_by_received_count": ws90_out_of_order,
            "baro_by_received_count": baro_out_of_order,
        },
        "gaps_5m": {
            "ws90_empty_buckets": ws90_gap_buckets,
            "baro_empty_buckets": baro_gap_buckets,
            "ws90_max_gap_minutes": ws90_max_gap_min,
            "baro_max_gap_minutes": baro_max_gap_min,
        },
        "missingness_ratio": missingness,
        "label_quality": {
            # rain_reset / rain_spike_5m are flag columns from build_dataset —
            # presumably gauge-counter resets and implausible jumps; verify there.
            "rain_reset_count": int(np.nansum(df["rain_reset"].fillna(False).to_numpy(dtype=int))),
            "rain_spike_5m_count": int(np.nansum(df["rain_spike_5m"].fillna(False).to_numpy(dtype=int))),
            "max_rain_increment_5m_mm": max_rain_inc,
        },
        "class_balance": {
            "overall_positive_rate": float(model_df["rain_next_1h"].mean()) if not model_df.empty else None,
            "weekly": build_weekly_balance(model_df) if not model_df.empty else [],
        },
    }
    # Convert pandas/numpy scalars and timestamps into JSON-serialisable builtins.
    report = to_builtin(report)

    print("Rain data audit summary:")
    print(f"  site: {report['site']}")
    print(f"  feature_set: {report['feature_set']}")
    print(
        "  rows: "
        f"ws90={report['row_counts']['ws90_rows']} "
        f"baro={report['row_counts']['baro_rows']} "
        f"forecast={report['row_counts']['forecast_rows']} "
        f"model={report['row_counts']['model_rows']}"
    )
    print(
        "  duplicates: "
        f"ws90={report['duplicates']['ws90_ts_station_duplicates']} "
        f"baro={report['duplicates']['baro_ts_source_duplicates']}"
    )
    print(
        "  rain label checks: "
        f"resets={report['label_quality']['rain_reset_count']} "
        f"spikes_5m={report['label_quality']['rain_spike_5m_count']} "
        f"max_inc_5m={report['label_quality']['max_rain_increment_5m_mm']}"
    )
    print(f"  overall positive rate: {report['class_balance']['overall_positive_rate']}")

    if args.out:
        out_dir = os.path.dirname(args.out)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)
        print(f"Saved audit report to {args.out}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit status.
    raise SystemExit(main())
|