Update for 4-hour rain forecast

This commit is contained in:
2026-04-06 18:32:33 +10:00
parent fb50c8ed71
commit 3a7309b2cf
20 changed files with 716 additions and 132 deletions
+30 -8
View File
@@ -10,6 +10,7 @@ import psycopg2
from rain_model_common import (
AVAILABLE_FEATURE_SETS,
DEFAULT_HORIZON_HOURS,
RAIN_EVENT_THRESHOLD_MM,
build_dataset,
feature_columns_for_set,
@@ -17,8 +18,11 @@ from rain_model_common import (
fetch_baro,
fetch_forecast,
fetch_ws90,
normalize_horizon_hours,
model_frame,
parse_time,
rain_next_flag_col,
rain_next_mm_col,
to_builtin,
)
@@ -29,6 +33,12 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--site", required=True, help="Site name (e.g. home).")
parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
parser.add_argument(
"--horizon-hours",
type=int,
default=DEFAULT_HORIZON_HOURS,
help="Prediction horizon in hours for target/label auditing.",
)
parser.add_argument(
"--feature-set",
default="baseline",
@@ -57,13 +67,13 @@ def longest_zero_run(counts: np.ndarray) -> int:
return best
def build_weekly_balance(model_df):
def build_weekly_balance(model_df, target_col: str):
weekly = model_df.copy()
iso = weekly.index.to_series().dt.isocalendar()
weekly["year_week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
grouped = (
weekly.groupby("year_week")["rain_next_1h"]
weekly.groupby("year_week")[target_col]
.agg(total_rows="count", positive_rows="sum")
.reset_index()
.sort_values("year_week")
@@ -79,6 +89,9 @@ def main() -> int:
start = parse_time(args.start) if args.start else ""
end = parse_time(args.end) if args.end else ""
horizon_hours = normalize_horizon_hours(args.horizon_hours)
target_col = rain_next_flag_col(horizon_hours)
target_mm_col = rain_next_mm_col(horizon_hours)
feature_cols = feature_columns_for_set(args.feature_set)
needs_forecast = feature_columns_need_forecast(feature_cols)
@@ -89,8 +102,14 @@ def main() -> int:
if needs_forecast:
forecast = fetch_forecast(conn, args.site, start, end, model=args.forecast_model)
df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
model_df = model_frame(df, feature_cols, require_target=True)
df = build_dataset(
ws90,
baro,
forecast=forecast,
rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM,
horizon_hours=horizon_hours,
)
model_df = model_frame(df, feature_cols, require_target=True, target_col=target_col)
ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0
@@ -114,7 +133,7 @@ def main() -> int:
baro_max_gap_min = longest_zero_run(np.array(baro_counts)) * 5 if len(baro_counts) else 0
missingness = {}
for col in feature_cols + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
for col in feature_cols + ["pressure_hpa", "rain_mm", "rain_inc", target_mm_col]:
if col in df.columns:
missingness[col] = float(df[col].isna().mean())
@@ -125,9 +144,12 @@ def main() -> int:
report = {
"site": args.site,
"feature_set": args.feature_set,
"horizon_hours": horizon_hours,
"target_column": target_col,
"target_mm_column": target_mm_col,
"feature_columns": feature_cols,
"forecast_model": args.forecast_model if needs_forecast else None,
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"target_definition": f"{target_mm_col} >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"requested_window": {
"start": start or None,
"end": end or None,
@@ -167,8 +189,8 @@ def main() -> int:
"max_rain_increment_5m_mm": max_rain_inc,
},
"class_balance": {
"overall_positive_rate": float(model_df["rain_next_1h"].mean()) if not model_df.empty else None,
"weekly": build_weekly_balance(model_df) if not model_df.empty else [],
"overall_positive_rate": float(model_df[target_col].mean()) if not model_df.empty else None,
"weekly": build_weekly_balance(model_df, target_col=target_col) if not model_df.empty else [],
},
}
report = to_builtin(report)