update for 4 hour rain forecast
This commit is contained in:
+83
-24
@@ -20,6 +20,7 @@ from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from rain_model_common import (
|
||||
AVAILABLE_FEATURE_SETS,
|
||||
DEFAULT_HORIZON_HOURS,
|
||||
RAIN_EVENT_THRESHOLD_MM,
|
||||
build_dataset,
|
||||
evaluate_probs,
|
||||
@@ -28,8 +29,13 @@ from rain_model_common import (
|
||||
fetch_ws90,
|
||||
feature_columns_for_set,
|
||||
feature_columns_need_forecast,
|
||||
horizon_suffix,
|
||||
model_frame,
|
||||
normalize_horizon_hours,
|
||||
parse_time,
|
||||
rain_last_mm_col,
|
||||
rain_next_flag_col,
|
||||
rain_next_mm_col,
|
||||
safe_pr_auc,
|
||||
safe_roc_auc,
|
||||
select_threshold,
|
||||
@@ -49,11 +55,17 @@ THRESHOLD_POLICIES = ("validation", "walk_forward")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
|
||||
parser = argparse.ArgumentParser(description="Train a rain prediction model (next Nh >= threshold).")
|
||||
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
|
||||
parser.add_argument("--site", required=True, help="Site name (e.g. home).")
|
||||
parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
|
||||
parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
|
||||
parser.add_argument(
|
||||
"--horizon-hours",
|
||||
type=int,
|
||||
default=DEFAULT_HORIZON_HOURS,
|
||||
help="Prediction horizon in hours (for example 1 or 4).",
|
||||
)
|
||||
parser.add_argument("--train-ratio", type=float, default=0.7, help="Time-ordered train split ratio.")
|
||||
parser.add_argument("--val-ratio", type=float, default=0.15, help="Time-ordered validation split ratio.")
|
||||
parser.add_argument(
|
||||
@@ -464,14 +476,18 @@ def evaluate_calibration_methods(
|
||||
return selected, results
|
||||
|
||||
|
||||
def evaluate_naive_baselines(test_df, y_test: np.ndarray) -> dict[str, Any]:
|
||||
def evaluate_naive_baselines(
|
||||
test_df,
|
||||
y_test: np.ndarray,
|
||||
persistence_context_col: str,
|
||||
) -> dict[str, Any]:
|
||||
out: dict[str, Any] = {}
|
||||
|
||||
if "rain_last_1h_mm" in test_df.columns:
|
||||
rain_last = test_df["rain_last_1h_mm"].to_numpy(dtype=float)
|
||||
if persistence_context_col in test_df.columns:
|
||||
rain_last = test_df[persistence_context_col].to_numpy(dtype=float)
|
||||
persistence_prob = (rain_last >= RAIN_EVENT_THRESHOLD_MM).astype(float)
|
||||
out["persistence_last_1h"] = {
|
||||
"rule": f"predict rain when rain_last_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
out[f"persistence_{persistence_context_col}"] = {
|
||||
"rule": f"predict rain when {persistence_context_col} >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
"metrics": evaluate_probs(y_true=y_test, y_prob=persistence_prob, threshold=0.5),
|
||||
}
|
||||
|
||||
@@ -512,6 +528,8 @@ def evaluate_sliced_performance(
|
||||
y_true: np.ndarray,
|
||||
y_prob: np.ndarray,
|
||||
threshold: float,
|
||||
context_col: str,
|
||||
context_label: str,
|
||||
min_rows_per_slice: int = 30,
|
||||
) -> dict[str, Any]:
|
||||
frame = pd.DataFrame(
|
||||
@@ -530,7 +548,11 @@ def evaluate_sliced_performance(
|
||||
weekly_positive_rate = frame.groupby(week_label)["y_true"].transform("mean")
|
||||
rainy_week = weekly_positive_rate >= overall_rate
|
||||
|
||||
rain_context = test_df["rain_last_1h_mm"].to_numpy(dtype=float) if "rain_last_1h_mm" in test_df.columns else np.zeros(len(test_df))
|
||||
rain_context = (
|
||||
test_df[context_col].to_numpy(dtype=float)
|
||||
if context_col in test_df.columns
|
||||
else np.zeros(len(test_df))
|
||||
)
|
||||
wet_context = rain_context >= RAIN_EVENT_THRESHOLD_MM
|
||||
|
||||
wind_values = test_df["wind_max_m_s"].to_numpy(dtype=float) if "wind_max_m_s" in test_df.columns else np.full(len(test_df), np.nan)
|
||||
@@ -545,8 +567,8 @@ def evaluate_sliced_performance(
|
||||
("nighttime_utc", np.asarray(~is_day, dtype=bool), "18:00-05:59 UTC"),
|
||||
("rainy_weeks", np.asarray(rainy_week, dtype=bool), "weeks with positive-rate >= test positive-rate"),
|
||||
("non_rainy_weeks", np.asarray(~rainy_week, dtype=bool), "weeks with positive-rate < test positive-rate"),
|
||||
("wet_context_last_1h", np.asarray(wet_context, dtype=bool), f"rain_last_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}"),
|
||||
("dry_context_last_1h", np.asarray(~wet_context, dtype=bool), f"rain_last_1h_mm < {RAIN_EVENT_THRESHOLD_MM:.2f}"),
|
||||
("wet_context_recent_rain", np.asarray(wet_context, dtype=bool), f"{context_label} >= {RAIN_EVENT_THRESHOLD_MM:.2f}"),
|
||||
("dry_context_recent_rain", np.asarray(~wet_context, dtype=bool), f"{context_label} < {RAIN_EVENT_THRESHOLD_MM:.2f}"),
|
||||
("windy_q75", np.asarray(windy, dtype=bool), "wind_max_m_s >= test 75th percentile"),
|
||||
("calm_below_q75", np.asarray(~windy, dtype=bool), "wind_max_m_s < test 75th percentile"),
|
||||
]
|
||||
@@ -585,6 +607,7 @@ def evaluate_sliced_performance(
|
||||
def tune_threshold_walk_forward(
|
||||
model_df,
|
||||
feature_cols: list[str],
|
||||
target_col: str,
|
||||
model_family: str,
|
||||
model_params: dict[str, Any],
|
||||
calibration_method: str,
|
||||
@@ -627,8 +650,8 @@ def tune_threshold_walk_forward(
|
||||
if len(fold_train) < 160 or len(fold_test) < 25:
|
||||
continue
|
||||
|
||||
y_fold_train = fold_train["rain_next_1h"].astype(int).to_numpy()
|
||||
y_fold_test = fold_test["rain_next_1h"].astype(int).to_numpy()
|
||||
y_fold_train = fold_train[target_col].astype(int).to_numpy()
|
||||
y_fold_test = fold_test[target_col].astype(int).to_numpy()
|
||||
if len(np.unique(y_fold_train)) < 2:
|
||||
continue
|
||||
|
||||
@@ -706,6 +729,7 @@ def tune_threshold_walk_forward(
|
||||
def walk_forward_backtest(
|
||||
model_df,
|
||||
feature_cols: list[str],
|
||||
target_col: str,
|
||||
model_family: str,
|
||||
model_params: dict[str, Any],
|
||||
calibration_method: str,
|
||||
@@ -745,8 +769,8 @@ def walk_forward_backtest(
|
||||
if len(fold_train) < 160 or len(fold_test) < 25:
|
||||
continue
|
||||
|
||||
y_fold_train = fold_train["rain_next_1h"].astype(int).to_numpy()
|
||||
y_fold_test = fold_test["rain_next_1h"].astype(int).to_numpy()
|
||||
y_fold_train = fold_train[target_col].astype(int).to_numpy()
|
||||
y_fold_test = fold_test[target_col].astype(int).to_numpy()
|
||||
if len(np.unique(y_fold_train)) < 2:
|
||||
continue
|
||||
|
||||
@@ -755,8 +779,8 @@ def walk_forward_backtest(
|
||||
continue
|
||||
inner_train = fold_train.iloc[:-inner_val_rows]
|
||||
inner_val = fold_train.iloc[-inner_val_rows:]
|
||||
y_inner_train = inner_train["rain_next_1h"].astype(int).to_numpy()
|
||||
y_inner_val = inner_val["rain_next_1h"].astype(int).to_numpy()
|
||||
y_inner_train = inner_train[target_col].astype(int).to_numpy()
|
||||
y_inner_val = inner_val[target_col].astype(int).to_numpy()
|
||||
if len(np.unique(y_inner_train)) < 2:
|
||||
continue
|
||||
|
||||
@@ -987,6 +1011,11 @@ def main() -> int:
|
||||
|
||||
start = parse_time(args.start) if args.start else ""
|
||||
end = parse_time(args.end) if args.end else ""
|
||||
horizon_hours = normalize_horizon_hours(args.horizon_hours)
|
||||
horizon_label = horizon_suffix(horizon_hours)
|
||||
target_col = rain_next_flag_col(horizon_hours)
|
||||
target_mm_col = rain_next_mm_col(horizon_hours)
|
||||
persistence_context_col = rain_last_mm_col(horizon_hours)
|
||||
feature_cols = feature_columns_for_set(args.feature_set)
|
||||
needs_forecast = feature_columns_need_forecast(feature_cols)
|
||||
calibration_methods = parse_calibration_methods(args.calibration_methods)
|
||||
@@ -1011,8 +1040,21 @@ def main() -> int:
|
||||
return 0
|
||||
raise RuntimeError(message)
|
||||
|
||||
full_df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
|
||||
model_df = model_frame(full_df, feature_cols, require_target=True)
|
||||
full_df = build_dataset(
|
||||
ws90,
|
||||
baro,
|
||||
forecast=forecast,
|
||||
rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM,
|
||||
horizon_hours=horizon_hours,
|
||||
)
|
||||
if persistence_context_col not in full_df.columns:
|
||||
persistence_context_col = rain_last_mm_col(1)
|
||||
model_df = model_frame(
|
||||
full_df,
|
||||
feature_cols,
|
||||
require_target=True,
|
||||
target_col=target_col,
|
||||
)
|
||||
if len(model_df) < args.min_rows:
|
||||
message = f"not enough model-ready rows after filtering (need >= {args.min_rows})"
|
||||
if args.allow_empty:
|
||||
@@ -1027,11 +1069,11 @@ def main() -> int:
|
||||
)
|
||||
|
||||
x_train = train_df[feature_cols]
|
||||
y_train = train_df["rain_next_1h"].astype(int).to_numpy()
|
||||
y_train = train_df[target_col].astype(int).to_numpy()
|
||||
x_val = val_df[feature_cols]
|
||||
y_val = val_df["rain_next_1h"].astype(int).to_numpy()
|
||||
y_val = val_df[target_col].astype(int).to_numpy()
|
||||
x_test = test_df[feature_cols]
|
||||
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
|
||||
y_test = test_df[target_col].astype(int).to_numpy()
|
||||
|
||||
if len(np.unique(y_train)) < 2:
|
||||
raise RuntimeError("training split does not contain both classes; cannot train classifier")
|
||||
@@ -1073,6 +1115,7 @@ def main() -> int:
|
||||
threshold_tuning_walk_forward = tune_threshold_walk_forward(
|
||||
model_df=model_df.iloc[: len(train_df) + len(val_df)],
|
||||
feature_cols=feature_cols,
|
||||
target_col=target_col,
|
||||
model_family=selected_model_family,
|
||||
model_params=selected_model_params,
|
||||
calibration_method=selected_calibration_method,
|
||||
@@ -1093,7 +1136,7 @@ def main() -> int:
|
||||
|
||||
train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
|
||||
x_train_val = train_val_df[feature_cols]
|
||||
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
|
||||
y_train_val = train_val_df[target_col].astype(int).to_numpy()
|
||||
|
||||
final_model, final_fit_info = fit_with_optional_calibration(
|
||||
model_family=selected_model_family,
|
||||
@@ -1109,16 +1152,23 @@ def main() -> int:
|
||||
test_calibration = {
|
||||
"ece_10": expected_calibration_error(y_true=y_test, y_prob=y_test_prob, bins=10),
|
||||
}
|
||||
naive_baselines_test = evaluate_naive_baselines(test_df=test_df, y_test=y_test)
|
||||
naive_baselines_test = evaluate_naive_baselines(
|
||||
test_df=test_df,
|
||||
y_test=y_test,
|
||||
persistence_context_col=persistence_context_col,
|
||||
)
|
||||
sliced_performance = evaluate_sliced_performance(
|
||||
test_df=test_df,
|
||||
y_true=y_test,
|
||||
y_prob=y_test_prob,
|
||||
threshold=chosen_threshold,
|
||||
context_col=persistence_context_col,
|
||||
context_label=persistence_context_col,
|
||||
)
|
||||
walk_forward = walk_forward_backtest(
|
||||
model_df=model_df,
|
||||
feature_cols=feature_cols,
|
||||
target_col=target_col,
|
||||
model_family=selected_model_family,
|
||||
model_params=selected_model_params,
|
||||
calibration_method=selected_calibration_method,
|
||||
@@ -1135,8 +1185,12 @@ def main() -> int:
|
||||
"model_family_requested": args.model_family,
|
||||
"model_family": selected_model_family,
|
||||
"model_params": selected_model_params,
|
||||
"horizon_hours": horizon_hours,
|
||||
"horizon_label": horizon_label,
|
||||
"feature_set": args.feature_set,
|
||||
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
"target_column": target_col,
|
||||
"target_mm_column": target_mm_col,
|
||||
"target_definition": f"{target_mm_col} >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
"feature_columns": feature_cols,
|
||||
"forecast_model": args.forecast_model if needs_forecast else None,
|
||||
"calibration_method_requested": calibration_methods,
|
||||
@@ -1207,6 +1261,7 @@ def main() -> int:
|
||||
print(f" site: {args.site}")
|
||||
print(f" model_version: {args.model_version}")
|
||||
print(f" model_family: {selected_model_family} (requested={args.model_family})")
|
||||
print(f" horizon: {horizon_hours}h")
|
||||
print(f" model_params: {selected_model_params}")
|
||||
print(f" calibration_method: {report['calibration_method']}")
|
||||
print(
|
||||
@@ -1298,7 +1353,7 @@ def main() -> int:
|
||||
dataset_dir = os.path.dirname(dataset_out)
|
||||
if dataset_dir:
|
||||
os.makedirs(dataset_dir, exist_ok=True)
|
||||
snapshot_cols = list(dict.fromkeys(feature_cols + ["rain_next_1h", "rain_next_1h_mm"]))
|
||||
snapshot_cols = list(dict.fromkeys(feature_cols + [target_col, target_mm_col]))
|
||||
model_df[snapshot_cols].to_csv(dataset_out, index=True, index_label="ts")
|
||||
print(f"Saved dataset snapshot to {dataset_out}")
|
||||
|
||||
@@ -1320,6 +1375,10 @@ def main() -> int:
|
||||
"forecast_model": args.forecast_model if needs_forecast else None,
|
||||
"threshold": float(chosen_threshold),
|
||||
"target_mm": float(RAIN_EVENT_THRESHOLD_MM),
|
||||
"horizon_hours": horizon_hours,
|
||||
"target_col": target_col,
|
||||
"target_mm_col": target_mm_col,
|
||||
"persistence_context_col": persistence_context_col,
|
||||
"model_version": args.model_version,
|
||||
"trained_at": datetime.now(timezone.utc).isoformat(),
|
||||
"split": report["split"],
|
||||
|
||||
Reference in New Issue
Block a user