Update for 4-hour rain forecast
@@ -10,6 +10,7 @@ import psycopg2
from rain_model_common import (
AVAILABLE_FEATURE_SETS,
DEFAULT_HORIZON_HOURS,
RAIN_EVENT_THRESHOLD_MM,
build_dataset,
feature_columns_for_set,

@@ -17,8 +18,11 @@ from rain_model_common import (
fetch_baro,
fetch_forecast,
fetch_ws90,
normalize_horizon_hours,
model_frame,
parse_time,
rain_next_flag_col,
rain_next_mm_col,
to_builtin,
)

@@ -29,6 +33,12 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--site", required=True, help="Site name (e.g. home).")
parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
parser.add_argument(
"--horizon-hours",
type=int,
default=DEFAULT_HORIZON_HOURS,
help="Prediction horizon in hours for target/label auditing.",
)
parser.add_argument(
"--feature-set",
default="baseline",

@@ -57,13 +67,13 @@ def longest_zero_run(counts: np.ndarray) -> int:
return best

def build_weekly_balance(model_df):
def build_weekly_balance(model_df, target_col: str):
weekly = model_df.copy()
iso = weekly.index.to_series().dt.isocalendar()
weekly["year_week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)

grouped = (
weekly.groupby("year_week")["rain_next_1h"]
weekly.groupby("year_week")[target_col]
.agg(total_rows="count", positive_rows="sum")
.reset_index()
.sort_values("year_week")

@@ -79,6 +89,9 @@ def main() -> int:
start = parse_time(args.start) if args.start else ""
end = parse_time(args.end) if args.end else ""
horizon_hours = normalize_horizon_hours(args.horizon_hours)
target_col = rain_next_flag_col(horizon_hours)
target_mm_col = rain_next_mm_col(horizon_hours)
feature_cols = feature_columns_for_set(args.feature_set)
needs_forecast = feature_columns_need_forecast(feature_cols)

@@ -89,8 +102,14 @@ def main() -> int:
if needs_forecast:
forecast = fetch_forecast(conn, args.site, start, end, model=args.forecast_model)

df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
model_df = model_frame(df, feature_cols, require_target=True)
df = build_dataset(
ws90,
baro,
forecast=forecast,
rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM,
horizon_hours=horizon_hours,
)
model_df = model_frame(df, feature_cols, require_target=True, target_col=target_col)

ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0

@@ -114,7 +133,7 @@ def main() -> int:
baro_max_gap_min = longest_zero_run(np.array(baro_counts)) * 5 if len(baro_counts) else 0

missingness = {}
for col in feature_cols + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
for col in feature_cols + ["pressure_hpa", "rain_mm", "rain_inc", target_mm_col]:
if col in df.columns:
missingness[col] = float(df[col].isna().mean())

@@ -125,9 +144,12 @@ def main() -> int:
report = {
"site": args.site,
"feature_set": args.feature_set,
"horizon_hours": horizon_hours,
"target_column": target_col,
"target_mm_column": target_mm_col,
"feature_columns": feature_cols,
"forecast_model": args.forecast_model if needs_forecast else None,
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"target_definition": f"{target_mm_col} >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"requested_window": {
"start": start or None,
"end": end or None,

@@ -167,8 +189,8 @@ def main() -> int:
"max_rain_increment_5m_mm": max_rain_inc,
},
"class_balance": {
"overall_positive_rate": float(model_df["rain_next_1h"].mean()) if not model_df.empty else None,
"weekly": build_weekly_balance(model_df) if not model_df.empty else [],
"overall_positive_rate": float(model_df[target_col].mean()) if not model_df.empty else None,
"weekly": build_weekly_balance(model_df, target_col=target_col) if not model_df.empty else [],
},
}
report = to_builtin(report)
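The weekly class-balance helper now takes the target column explicitly. As a quick orientation, here is a minimal standalone sketch of the grouping it performs, using hand-made toy data and the 4h flag column rather than real audit output:

import pandas as pd

idx = pd.date_range("2026-02-02", periods=4, freq="D", tz="UTC")
model_df = pd.DataFrame({"rain_next_4h": [False, True, True, False]}, index=idx)

weekly = model_df.copy()
iso = weekly.index.to_series().dt.isocalendar()
weekly["year_week"] = iso["year"].astype(str) + "-W" + iso["week"].astype(str).str.zfill(2)
# One row per ISO week with total and positive counts for the chosen target column.
print(weekly.groupby("year_week")["rain_next_4h"].agg(total_rows="count", positive_rows="sum"))
# 2026-W06: total_rows=4, positive_rows=2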
@@ -9,6 +9,24 @@ from typing import Any
import psycopg2

DEFAULT_HORIZON_HOURS = 4

def normalize_horizon_hours(horizon_hours: int) -> int:
out = int(horizon_hours)
if out <= 0:
raise ValueError("horizon_hours must be > 0")
return out

def prediction_table_for_horizon(horizon_hours: int) -> str:
horizon = normalize_horizon_hours(horizon_hours)
if horizon == 1:
return "predictions_rain_1h"
if horizon == 4:
return "predictions_rain_4h"
raise ValueError(f"unsupported prediction-table horizon: {horizon_hours}")

def parse_duration(value: str) -> timedelta:
raw = value.strip().lower()

@@ -27,14 +45,20 @@ def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Check freshness/health of rain-model data and predictions.")
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
parser.add_argument("--site", required=True, help="Site name.")
parser.add_argument("--model-name", default="rain_next_1h", help="Prediction model_name to check.")
parser.add_argument("--model-name", default="rain_next_4h", help="Prediction model_name to check.")
parser.add_argument(
"--horizon-hours",
type=int,
default=DEFAULT_HORIZON_HOURS,
help="Prediction horizon in hours used to select prediction storage table.",
)
parser.add_argument("--max-ws90-age", default="20m", help="Max allowed age for ws90 latest row.")
parser.add_argument("--max-baro-age", default="30m", help="Max allowed age for barometer latest row.")
parser.add_argument("--max-forecast-age", default="3h", help="Max allowed age for forecast latest row.")
parser.add_argument("--max-prediction-age", default="30m", help="Max allowed age for latest prediction write.")
parser.add_argument(
"--max-pending-eval-age",
default="3h",
default="6h",
help="Pending evaluations older than this count toward alert.",
)
parser.add_argument(

@@ -91,6 +115,8 @@ def main() -> int:
max_forecast_age = parse_duration(args.max_forecast_age)
max_prediction_age = parse_duration(args.max_prediction_age)
max_pending_eval_age = parse_duration(args.max_pending_eval_age)
horizon_hours = normalize_horizon_hours(args.horizon_hours)
prediction_table = prediction_table_for_horizon(horizon_hours)

with psycopg2.connect(args.db_url) as conn:
ws90_latest = fetch_latest_ts(

@@ -110,9 +136,9 @@ def main() -> int:
)
prediction_latest = fetch_latest_ts(
conn,
"""
f"""
SELECT max(generated_at)
FROM predictions_rain_1h
FROM {prediction_table}
WHERE site = %s
AND model_name = %s
""",

@@ -120,9 +146,9 @@ def main() -> int:
)
pending_eval_rows = fetch_count(
conn,
"""
f"""
SELECT count(*)
FROM predictions_rain_1h
FROM {prediction_table}
WHERE site = %s
AND model_name = %s
AND evaluated_at IS NULL

@@ -188,6 +214,8 @@ def main() -> int:
"generated_at": now.isoformat(),
"site": args.site,
"model_name": args.model_name,
"horizon_hours": horizon_hours,
"prediction_table": prediction_table,
"status": overall_status,
"failing_checks": failing,
"checks": checks,
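A minimal sketch of the table-selection pattern used above (names taken from this diff; the connection and cursor handling are assumed, not shown): the table name is interpolated with an f-string because SQL identifiers cannot be bound as query parameters, while the site and model_name values still go through psycopg2 %s placeholders.

# Illustrative only: compose the freshness query for a given horizon.
prediction_table = prediction_table_for_horizon(4)  # -> "predictions_rain_4h"
query = f"""
SELECT max(generated_at)
FROM {prediction_table}
WHERE site = %s
  AND model_name = %s
"""
# cur.execute(query, (args.site, args.model_name))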
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any

def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Compare two rain-model training reports.")
parser.add_argument("--baseline", required=True, help="Baseline report path (for example 1h).")
parser.add_argument("--candidate", required=True, help="Candidate report path (for example 4h).")
return parser.parse_args()

def load_json(path: str) -> dict[str, Any]:
p = Path(path)
with p.open("r", encoding="utf-8") as f:
return json.load(f)

def to_float(v: Any) -> float | None:
if v is None:
return None
try:
return float(v)
except (TypeError, ValueError):
return None

def metric(report: dict[str, Any], split: str, key: str) -> float | None:
return to_float(report.get(split, {}).get(key))

def delta_str(base: float | None, cand: float | None) -> str:
if base is None or cand is None:
return "n/a"
d = cand - base
return f"{d:+.4f}"

def main() -> int:
args = parse_args()
baseline = load_json(args.baseline)
candidate = load_json(args.candidate)

pairs = [
("precision", metric(baseline, "test_metrics", "precision"), metric(candidate, "test_metrics", "precision")),
("recall", metric(baseline, "test_metrics", "recall"), metric(candidate, "test_metrics", "recall")),
("f1", metric(baseline, "test_metrics", "f1"), metric(candidate, "test_metrics", "f1")),
("pr_auc", metric(baseline, "test_metrics", "pr_auc"), metric(candidate, "test_metrics", "pr_auc")),
("roc_auc", metric(baseline, "test_metrics", "roc_auc"), metric(candidate, "test_metrics", "roc_auc")),
("brier", metric(baseline, "test_metrics", "brier"), metric(candidate, "test_metrics", "brier")),
]

print("Rain report comparison:")
print(
f" baseline: version={baseline.get('model_version')} "
f"horizon={baseline.get('horizon_hours')}h "
f"target={baseline.get('target_definition')}"
)
print(
f" candidate: version={candidate.get('model_version')} "
f"horizon={candidate.get('horizon_hours')}h "
f"target={candidate.get('target_definition')}"
)
print(" metrics (candidate - baseline):")
for name, base, cand in pairs:
base_txt = "n/a" if base is None else f"{base:.4f}"
cand_txt = "n/a" if cand is None else f"{cand:.4f}"
print(f" {name}: baseline={base_txt} candidate={cand_txt} delta={delta_str(base, cand)}")

return 0

if __name__ == "__main__":
raise SystemExit(main())
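For orientation, a small sketch of how the comparison helpers above behave on hand-made inputs; the report fragments and values here are invented for illustration, not real training output:

baseline = {"test_metrics": {"f1": 0.61}}
candidate = {"test_metrics": {"f1": 0.6450}}
print(metric(baseline, "test_metrics", "f1"))      # 0.61
print(delta_str(metric(baseline, "test_metrics", "f1"),
                metric(candidate, "test_metrics", "f1")))   # "+0.0350"
print(delta_str(metric(baseline, "test_metrics", "brier"),
                metric(candidate, "test_metrics", "brier")))  # "n/a" (missing metric)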
@@ -9,13 +9,18 @@ import psycopg2
from psycopg2.extras import Json

from rain_model_common import (
DEFAULT_HORIZON_HOURS,
build_dataset,
feature_columns_need_forecast,
fetch_baro,
fetch_forecast,
fetch_ws90,
model_frame,
normalize_horizon_hours,
parse_time,
prediction_table_for_horizon,
rain_next_flag_col,
rain_next_mm_col,
to_builtin,
)

@@ -30,8 +35,13 @@ def parse_args() -> argparse.Namespace:
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
parser.add_argument("--site", required=True, help="Site name (e.g. home).")
parser.add_argument("--model-path", default="models/rain_model.pkl", help="Path to trained model artifact.")
parser.add_argument("--model-name", default="rain_next_1h", help="Logical prediction model name.")
parser.add_argument("--model-name", default="rain_next_4h", help="Logical prediction model name.")
parser.add_argument("--model-version", help="Override artifact model_version.")
parser.add_argument(
"--horizon-hours",
type=int,
help="Prediction horizon in hours. Defaults to artifact horizon when present, else 4.",
)
parser.add_argument(
"--at",
help="Prediction timestamp (RFC3339 or YYYY-MM-DD). Default: current UTC time.",

@@ -98,9 +108,21 @@ def main() -> int:
threshold = float(artifact.get("threshold", 0.5))
model_version = args.model_version or artifact.get("model_version") or "unknown"
forecast_model = str(artifact.get("forecast_model") or args.forecast_model)
artifact_horizon = artifact.get("horizon_hours")
if args.horizon_hours is not None:
horizon_hours = normalize_horizon_hours(args.horizon_hours)
elif artifact_horizon is not None:
horizon_hours = normalize_horizon_hours(int(artifact_horizon))
else:
horizon_hours = DEFAULT_HORIZON_HOURS
target_col = str(artifact.get("target_col") or rain_next_flag_col(horizon_hours))
target_mm_col = str(artifact.get("target_mm_col") or rain_next_mm_col(horizon_hours))
prediction_table = prediction_table_for_horizon(horizon_hours)
actual_mm_col = f"{target_mm_col}_actual"
actual_flag_col = f"{target_col}_actual"

fetch_start = (at - timedelta(hours=args.history_hours)).isoformat()
fetch_end = (at + timedelta(hours=1, minutes=5)).isoformat()
fetch_end = (at + timedelta(hours=horizon_hours, minutes=5)).isoformat()

with psycopg2.connect(args.db_url) as conn:
ws90 = fetch_ws90(conn, args.site, fetch_start, fetch_end)

@@ -122,7 +144,7 @@ def main() -> int:
return 0
raise RuntimeError(message)

full_df = build_dataset(ws90, baro, forecast=forecast)
full_df = build_dataset(ws90, baro, forecast=forecast, horizon_hours=horizon_hours)
feature_df = model_frame(full_df, feature_cols=features, require_target=False)
candidates = feature_df.loc[feature_df.index <= at]
if candidates.empty:

@@ -143,9 +165,9 @@ def main() -> int:
actual_flag = None
evaluated_at = None
latest_available = full_df.index.max().to_pydatetime()
if pred_ts + timedelta(hours=1) <= latest_available:
next_mm = full_df.loc[pred_ts, "rain_next_1h_mm"]
next_flag = full_df.loc[pred_ts, "rain_next_1h"]
if pred_ts + timedelta(hours=horizon_hours) <= latest_available:
next_mm = full_df.loc[pred_ts, target_mm_col]
next_flag = full_df.loc[pred_ts, target_col]
if next_mm == next_mm: # NaN-safe check
actual_mm = float(next_mm)
if next_flag == next_flag:

@@ -156,6 +178,10 @@ def main() -> int:
"artifact_path": args.model_path,
"artifact_model_version": artifact.get("model_version"),
"artifact_feature_set": feature_set,
"horizon_hours": horizon_hours,
"target_col": target_col,
"target_mm_col": target_mm_col,
"prediction_table": prediction_table,
"forecast_model": forecast_model if needs_forecast else None,
"needs_forecast_features": needs_forecast,
"feature_values": {col: float(row.iloc[0][col]) for col in features},

@@ -170,6 +196,7 @@ def main() -> int:
print(f" site: {args.site}")
print(f" model_name: {args.model_name}")
print(f" model_version: {model_version}")
print(f" horizon_hours: {horizon_hours}")
if feature_set:
print(f" feature_set: {feature_set}")
print(f" pred_ts: {pred_ts.isoformat()}")

@@ -182,10 +209,8 @@ def main() -> int:
print("dry-run enabled; skipping DB upsert.")
return 0

with conn.cursor() as cur:
cur.execute(
"""
INSERT INTO predictions_rain_1h (
query = f"""
INSERT INTO {prediction_table} (
ts,
generated_at,
site,

@@ -194,8 +219,8 @@ def main() -> int:
threshold,
probability,
predict_rain,
rain_next_1h_mm_actual,
rain_next_1h_actual,
{actual_mm_col},
{actual_flag_col},
evaluated_at,
metadata
) VALUES (

@@ -207,11 +232,14 @@ def main() -> int:
threshold = EXCLUDED.threshold,
probability = EXCLUDED.probability,
predict_rain = EXCLUDED.predict_rain,
rain_next_1h_mm_actual = COALESCE(EXCLUDED.rain_next_1h_mm_actual, predictions_rain_1h.rain_next_1h_mm_actual),
rain_next_1h_actual = COALESCE(EXCLUDED.rain_next_1h_actual, predictions_rain_1h.rain_next_1h_actual),
evaluated_at = COALESCE(EXCLUDED.evaluated_at, predictions_rain_1h.evaluated_at),
{actual_mm_col} = COALESCE(EXCLUDED.{actual_mm_col}, {prediction_table}.{actual_mm_col}),
{actual_flag_col} = COALESCE(EXCLUDED.{actual_flag_col}, {prediction_table}.{actual_flag_col}),
evaluated_at = COALESCE(EXCLUDED.evaluated_at, {prediction_table}.evaluated_at),
metadata = EXCLUDED.metadata
""",
"""
with conn.cursor() as cur:
cur.execute(
query,
(
pred_ts,
args.site,

@@ -227,7 +255,7 @@ def main() -> int:
),
)
conn.commit()
print("Prediction upserted into predictions_rain_1h.")
print(f"Prediction upserted into {prediction_table}.")

return 0
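Two details in the hunks above are easy to miss when reading a diff, shown here as a standalone sketch with made-up values: the `next_mm == next_mm` comparison is a NaN-safe truth test (NaN is the only float not equal to itself), and the `*_actual` column names are derived from the horizon-specific target columns.

import math

next_mm = float("nan")
print(next_mm == next_mm)   # False -> label not observable yet, actual_mm stays None
print(math.isnan(next_mm))  # True; the more explicit spelling of the same check

target_mm_col = "rain_next_4h_mm"   # what rain_next_mm_col(4) returns
actual_mm_col = f"{target_mm_col}_actual"
print(actual_mm_col)        # rain_next_4h_mm_actual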
@@ -86,7 +86,46 @@ FEATURE_COLUMNS = BASELINE_FEATURE_COLUMNS

RAIN_EVENT_THRESHOLD_MM = 0.2
RAIN_SPIKE_THRESHOLD_MM_5M = 5.0
RAIN_HORIZON_BUCKETS = 12 # 12 * 5m = 1h
BUCKET_MINUTES = 5
DEFAULT_HORIZON_HOURS = 4
SUPPORTED_PREDICTION_HORIZONS = (1, 4)

def normalize_horizon_hours(horizon_hours: int) -> int:
out = int(horizon_hours)
if out <= 0:
raise ValueError("horizon_hours must be > 0")
return out

def horizon_suffix(horizon_hours: int) -> str:
return f"{normalize_horizon_hours(horizon_hours)}h"

def horizon_buckets(horizon_hours: int) -> int:
hours = normalize_horizon_hours(horizon_hours)
return (hours * 60) // BUCKET_MINUTES

def rain_last_mm_col(horizon_hours: int) -> str:
return f"rain_last_{horizon_suffix(horizon_hours)}_mm"

def rain_next_mm_col(horizon_hours: int) -> str:
return f"rain_next_{horizon_suffix(horizon_hours)}_mm"

def rain_next_flag_col(horizon_hours: int) -> str:
return f"rain_next_{horizon_suffix(horizon_hours)}"

def prediction_table_for_horizon(horizon_hours: int) -> str:
horizon = normalize_horizon_hours(horizon_hours)
if horizon == 1:
return "predictions_rain_1h"
if horizon == 4:
return "predictions_rain_4h"
raise ValueError(f"unsupported prediction-table horizon: {horizon_hours}")

def parse_time(value: str) -> str:
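Taken together, the helpers above give every script a single horizon-derived vocabulary. A quick sketch of what they evaluate to for the new default horizon, assuming the functions exactly as added in this hunk:

print(horizon_suffix(4))                 # "4h"
print(horizon_buckets(4))                # 48 -> 48 five-minute buckets = 4 hours
print(rain_last_mm_col(4))               # "rain_last_4h_mm"
print(rain_next_mm_col(4))               # "rain_next_4h_mm"
print(rain_next_flag_col(4))             # "rain_next_4h"
print(prediction_table_for_horizon(4))   # "predictions_rain_4h"
# prediction_table_for_horizon(2) would raise ValueError: only 1 and 4 map to tables.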
@@ -232,6 +271,7 @@ def build_dataset(
baro: pd.DataFrame,
forecast: pd.DataFrame | None = None,
rain_event_threshold_mm: float = RAIN_EVENT_THRESHOLD_MM,
horizon_hours: int = 1,
) -> pd.DataFrame:
if ws90.empty:
raise RuntimeError("no ws90 observations found")

@@ -261,12 +301,20 @@ def build_dataset(
df["rain_inc"] = df["rain_inc_raw"].clip(lower=0)
df["rain_spike_5m"] = df["rain_inc"] >= RAIN_SPIKE_THRESHOLD_MM_5M

window = RAIN_HORIZON_BUCKETS
df["rain_last_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum()
df["rain_next_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum().shift(-(window - 1))
df["rain_next_1h"] = df["rain_next_1h_mm"] >= rain_event_threshold_mm
windows: dict[int, int] = {
1: horizon_buckets(1),
normalize_horizon_hours(horizon_hours): horizon_buckets(horizon_hours),
}
for hours, window in windows.items():
rain_last_col = rain_last_mm_col(hours)
rain_next_mm = rain_next_mm_col(hours)
rain_next_flag = rain_next_flag_col(hours)
df[rain_last_col] = df["rain_inc"].rolling(window=window, min_periods=1).sum()
df[rain_next_mm] = df["rain_inc"].rolling(window=window, min_periods=1).sum().shift(-(window - 1))
df[rain_next_flag] = df[rain_next_mm] >= rain_event_threshold_mm

df["pressure_trend_1h"] = df["pressure_hpa"] - df["pressure_hpa"].shift(window)
window_1h = horizon_buckets(1)
df["pressure_trend_1h"] = df["pressure_hpa"] - df["pressure_hpa"].shift(window_1h)

# Wind direction cyclical encoding.
radians = np.deg2rad(df["wind_dir_deg"] % 360.0)

@@ -279,14 +327,14 @@ def build_dataset(
df["wind_avg_lag_5m"] = df["wind_avg_m_s"].shift(1)
df["pressure_lag_5m"] = df["pressure_hpa"].shift(1)

df["temp_roll_1h_mean"] = df["temperature_c"].rolling(window=window, min_periods=3).mean()
df["temp_roll_1h_std"] = df["temperature_c"].rolling(window=window, min_periods=3).std()
df["humidity_roll_1h_mean"] = df["humidity"].rolling(window=window, min_periods=3).mean()
df["humidity_roll_1h_std"] = df["humidity"].rolling(window=window, min_periods=3).std()
df["wind_avg_roll_1h_mean"] = df["wind_avg_m_s"].rolling(window=window, min_periods=3).mean()
df["wind_gust_roll_1h_max"] = df["wind_max_m_s"].rolling(window=window, min_periods=3).max()
df["pressure_roll_1h_mean"] = df["pressure_hpa"].rolling(window=window, min_periods=3).mean()
df["pressure_roll_1h_std"] = df["pressure_hpa"].rolling(window=window, min_periods=3).std()
df["temp_roll_1h_mean"] = df["temperature_c"].rolling(window=window_1h, min_periods=3).mean()
df["temp_roll_1h_std"] = df["temperature_c"].rolling(window=window_1h, min_periods=3).std()
df["humidity_roll_1h_mean"] = df["humidity"].rolling(window=window_1h, min_periods=3).mean()
df["humidity_roll_1h_std"] = df["humidity"].rolling(window=window_1h, min_periods=3).std()
df["wind_avg_roll_1h_mean"] = df["wind_avg_m_s"].rolling(window=window_1h, min_periods=3).mean()
df["wind_gust_roll_1h_max"] = df["wind_max_m_s"].rolling(window=window_1h, min_periods=3).max()
df["pressure_roll_1h_mean"] = df["pressure_hpa"].rolling(window=window_1h, min_periods=3).mean()
df["pressure_roll_1h_std"] = df["pressure_hpa"].rolling(window=window_1h, min_periods=3).std()

# Calendar/seasonality features (UTC based).
hour_of_day = df.index.hour + (df.index.minute / 60.0)

@@ -304,11 +352,16 @@ def build_dataset(
return df

def model_frame(df: pd.DataFrame, feature_cols: list[str] | None = None, require_target: bool = True) -> pd.DataFrame:
def model_frame(
df: pd.DataFrame,
feature_cols: list[str] | None = None,
require_target: bool = True,
target_col: str | None = None,
) -> pd.DataFrame:
features = feature_cols or FEATURE_COLUMNS
required = list(features)
if require_target:
required.append("rain_next_1h")
required.append(target_col or rain_next_flag_col(1))
out = df.dropna(subset=required).copy()
return out.sort_index()
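The forward-looking label construction in build_dataset can be hard to read out of a diff. Here is a minimal standalone sketch of the same rolling/shift trick for the 4h horizon, on toy 5-minute data and using pandas directly rather than the project helpers:

import pandas as pd

BUCKET_MINUTES = 5
w4 = (4 * 60) // BUCKET_MINUTES  # 48 buckets, as horizon_buckets(4) computes

idx = pd.date_range("2026-02-01 00:00", periods=60, freq="5min", tz="UTC")
rain_inc = pd.Series(0.0, index=idx)
rain_inc.iloc[30] = 1.0  # a single wet 5-minute bucket

# The backward rolling sum gives "rain over the last 4h"; shifting it forward by
# (w4 - 1) turns it into "rain over the next 4h starting at this row".
rain_next_4h_mm = rain_inc.rolling(window=w4, min_periods=1).sum().shift(-(w4 - 1))
rain_next_4h = rain_next_4h_mm >= 0.2

print(rain_next_4h.iloc[0])      # True: the wet bucket falls inside this row's next-4h window
print(rain_next_4h_mm.iloc[-1])  # NaN: the label is not observable near the end of the series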
@@ -4,7 +4,9 @@ set -euo pipefail
SITE="${SITE:-home}"
START="${START:-2026-02-01T00:00:00Z}"
END="${END:-2026-03-03T23:55:00Z}"
MODEL_VERSION="${MODEL_VERSION:-rain-logreg-v1}"
HORIZON_HOURS="${HORIZON_HOURS:-4}"
MODEL_NAME="${MODEL_NAME:-rain_next_${HORIZON_HOURS}h}"
MODEL_VERSION="${MODEL_VERSION:-rain-logreg-v2-${HORIZON_HOURS}h}"
MODEL_PATH="${MODEL_PATH:-models/rain_model.pkl}"
REPORT_PATH="${REPORT_PATH:-models/rain_model_report.json}"
AUDIT_PATH="${AUDIT_PATH:-models/rain_data_audit.json}"

@@ -24,6 +26,7 @@ python scripts/audit_rain_data.py \
--site "$SITE" \
--start "$START" \
--end "$END" \
--horizon-hours "$HORIZON_HOURS" \
--feature-set "$FEATURE_SET" \
--forecast-model "$FORECAST_MODEL" \
--out "$AUDIT_PATH"

@@ -33,6 +36,7 @@ python scripts/train_rain_model.py \
--site "$SITE" \
--start "$START" \
--end "$END" \
--horizon-hours "$HORIZON_HOURS" \
--train-ratio 0.7 \
--val-ratio 0.15 \
--min-precision 0.70 \

@@ -50,7 +54,8 @@ echo "Writing current prediction..."
python scripts/predict_rain_model.py \
--site "$SITE" \
--model-path "$MODEL_PATH" \
--model-name "rain_next_1h" \
--model-name "$MODEL_NAME" \
--horizon-hours "$HORIZON_HOURS" \
--forecast-model "$FORECAST_MODEL"

echo "P0 rain workflow complete."
@@ -40,6 +40,7 @@ def read_env_bool(name: str, default: bool) -> bool:
class WorkerConfig:
database_url: str
site: str
horizon_hours: int
model_name: str
model_version_base: str
model_family: str

@@ -166,6 +167,8 @@ def run_training_cycle(cfg: WorkerConfig, env: dict[str, str]) -> None:
start,
"--end",
end,
"--horizon-hours",
str(cfg.horizon_hours),
"--feature-set",
cfg.feature_set,
"--forecast-model",

@@ -185,6 +188,8 @@ def run_training_cycle(cfg: WorkerConfig, env: dict[str, str]) -> None:
start,
"--end",
end,
"--horizon-hours",
str(cfg.horizon_hours),
"--train-ratio",
str(cfg.train_ratio),
"--val-ratio",

@@ -269,6 +274,8 @@ def run_predict_once(cfg: WorkerConfig, env: dict[str, str]) -> None:
str(cfg.model_path),
"--model-name",
cfg.model_name,
"--horizon-hours",
str(cfg.horizon_hours),
"--forecast-model",
cfg.forecast_model,
*(["--allow-empty"] if cfg.allow_empty_data else ["--strict-source-data"]),

@@ -289,8 +296,9 @@ def load_config() -> WorkerConfig:
return WorkerConfig(
database_url=database_url,
site=read_env("RAIN_SITE", "home"),
model_name=read_env("RAIN_MODEL_NAME", "rain_next_1h"),
model_version_base=read_env("RAIN_MODEL_VERSION_BASE", "rain-auto-v1-extended"),
horizon_hours=read_env_int("RAIN_HORIZON_HOURS", 4),
model_name=read_env("RAIN_MODEL_NAME", "rain_next_4h"),
model_version_base=read_env("RAIN_MODEL_VERSION_BASE", "rain-auto-v2-extended-4h"),
model_family=read_env("RAIN_MODEL_FAMILY", "auto"),
feature_set=read_env("RAIN_FEATURE_SET", "extended"),
forecast_model=read_env("RAIN_FORECAST_MODEL", "ecmwf"),

@@ -338,6 +346,7 @@ def main() -> int:
print(
"[rain-ml] worker start "
f"site={cfg.site} "
f"horizon_hours={cfg.horizon_hours} "
f"model_name={cfg.model_name} "
f"model_family={cfg.model_family} "
f"feature_set={cfg.feature_set} "
@@ -20,6 +20,7 @@ from sklearn.preprocessing import StandardScaler
from rain_model_common import (
AVAILABLE_FEATURE_SETS,
DEFAULT_HORIZON_HOURS,
RAIN_EVENT_THRESHOLD_MM,
build_dataset,
evaluate_probs,

@@ -28,8 +29,13 @@ from rain_model_common import (
fetch_ws90,
feature_columns_for_set,
feature_columns_need_forecast,
horizon_suffix,
model_frame,
normalize_horizon_hours,
parse_time,
rain_last_mm_col,
rain_next_flag_col,
rain_next_mm_col,
safe_pr_auc,
safe_roc_auc,
select_threshold,

@@ -49,11 +55,17 @@ THRESHOLD_POLICIES = ("validation", "walk_forward")

def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
parser = argparse.ArgumentParser(description="Train a rain prediction model (next Nh >= threshold).")
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
parser.add_argument("--site", required=True, help="Site name (e.g. home).")
parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
parser.add_argument(
"--horizon-hours",
type=int,
default=DEFAULT_HORIZON_HOURS,
help="Prediction horizon in hours (for example 1 or 4).",
)
parser.add_argument("--train-ratio", type=float, default=0.7, help="Time-ordered train split ratio.")
parser.add_argument("--val-ratio", type=float, default=0.15, help="Time-ordered validation split ratio.")
parser.add_argument(

@@ -464,14 +476,18 @@ def evaluate_calibration_methods(
return selected, results

def evaluate_naive_baselines(test_df, y_test: np.ndarray) -> dict[str, Any]:
def evaluate_naive_baselines(
test_df,
y_test: np.ndarray,
persistence_context_col: str,
) -> dict[str, Any]:
out: dict[str, Any] = {}

if "rain_last_1h_mm" in test_df.columns:
rain_last = test_df["rain_last_1h_mm"].to_numpy(dtype=float)
if persistence_context_col in test_df.columns:
rain_last = test_df[persistence_context_col].to_numpy(dtype=float)
persistence_prob = (rain_last >= RAIN_EVENT_THRESHOLD_MM).astype(float)
out["persistence_last_1h"] = {
"rule": f"predict rain when rain_last_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
out[f"persistence_{persistence_context_col}"] = {
"rule": f"predict rain when {persistence_context_col} >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"metrics": evaluate_probs(y_true=y_test, y_prob=persistence_prob, threshold=0.5),
}
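A compact sketch of the persistence baseline this hunk generalises, in NumPy only and with made-up arrays (evaluate_probs itself is not reproduced here): the rule predicts rain whenever at least the event threshold fell in the matching trailing window.

import numpy as np

RAIN_EVENT_THRESHOLD_MM = 0.2
rain_last_4h_mm = np.array([0.0, 0.1, 0.6, 1.4])  # hypothetical test-row context values
y_test = np.array([0, 0, 1, 1])

persistence_prob = (rain_last_4h_mm >= RAIN_EVENT_THRESHOLD_MM).astype(float)
print(persistence_prob)                     # [0. 0. 1. 1.]
print((persistence_prob >= 0.5) == y_test)  # the toy baseline matches all four labels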
@@ -512,6 +528,8 @@ def evaluate_sliced_performance(
y_true: np.ndarray,
y_prob: np.ndarray,
threshold: float,
context_col: str,
context_label: str,
min_rows_per_slice: int = 30,
) -> dict[str, Any]:
frame = pd.DataFrame(

@@ -530,7 +548,11 @@ def evaluate_sliced_performance(
weekly_positive_rate = frame.groupby(week_label)["y_true"].transform("mean")
rainy_week = weekly_positive_rate >= overall_rate

rain_context = test_df["rain_last_1h_mm"].to_numpy(dtype=float) if "rain_last_1h_mm" in test_df.columns else np.zeros(len(test_df))
rain_context = (
test_df[context_col].to_numpy(dtype=float)
if context_col in test_df.columns
else np.zeros(len(test_df))
)
wet_context = rain_context >= RAIN_EVENT_THRESHOLD_MM

wind_values = test_df["wind_max_m_s"].to_numpy(dtype=float) if "wind_max_m_s" in test_df.columns else np.full(len(test_df), np.nan)

@@ -545,8 +567,8 @@ def evaluate_sliced_performance(
("nighttime_utc", np.asarray(~is_day, dtype=bool), "18:00-05:59 UTC"),
("rainy_weeks", np.asarray(rainy_week, dtype=bool), "weeks with positive-rate >= test positive-rate"),
("non_rainy_weeks", np.asarray(~rainy_week, dtype=bool), "weeks with positive-rate < test positive-rate"),
("wet_context_last_1h", np.asarray(wet_context, dtype=bool), f"rain_last_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}"),
("dry_context_last_1h", np.asarray(~wet_context, dtype=bool), f"rain_last_1h_mm < {RAIN_EVENT_THRESHOLD_MM:.2f}"),
("wet_context_recent_rain", np.asarray(wet_context, dtype=bool), f"{context_label} >= {RAIN_EVENT_THRESHOLD_MM:.2f}"),
("dry_context_recent_rain", np.asarray(~wet_context, dtype=bool), f"{context_label} < {RAIN_EVENT_THRESHOLD_MM:.2f}"),
("windy_q75", np.asarray(windy, dtype=bool), "wind_max_m_s >= test 75th percentile"),
("calm_below_q75", np.asarray(~windy, dtype=bool), "wind_max_m_s < test 75th percentile"),
]

@@ -585,6 +607,7 @@ def evaluate_sliced_performance(
def tune_threshold_walk_forward(
model_df,
feature_cols: list[str],
target_col: str,
model_family: str,
model_params: dict[str, Any],
calibration_method: str,

@@ -627,8 +650,8 @@ def tune_threshold_walk_forward(
if len(fold_train) < 160 or len(fold_test) < 25:
continue

y_fold_train = fold_train["rain_next_1h"].astype(int).to_numpy()
y_fold_test = fold_test["rain_next_1h"].astype(int).to_numpy()
y_fold_train = fold_train[target_col].astype(int).to_numpy()
y_fold_test = fold_test[target_col].astype(int).to_numpy()
if len(np.unique(y_fold_train)) < 2:
continue

@@ -706,6 +729,7 @@ def tune_threshold_walk_forward(
def walk_forward_backtest(
model_df,
feature_cols: list[str],
target_col: str,
model_family: str,
model_params: dict[str, Any],
calibration_method: str,

@@ -745,8 +769,8 @@ def walk_forward_backtest(
if len(fold_train) < 160 or len(fold_test) < 25:
continue

y_fold_train = fold_train["rain_next_1h"].astype(int).to_numpy()
y_fold_test = fold_test["rain_next_1h"].astype(int).to_numpy()
y_fold_train = fold_train[target_col].astype(int).to_numpy()
y_fold_test = fold_test[target_col].astype(int).to_numpy()
if len(np.unique(y_fold_train)) < 2:
continue

@@ -755,8 +779,8 @@ def walk_forward_backtest(
continue
inner_train = fold_train.iloc[:-inner_val_rows]
inner_val = fold_train.iloc[-inner_val_rows:]
y_inner_train = inner_train["rain_next_1h"].astype(int).to_numpy()
y_inner_val = inner_val["rain_next_1h"].astype(int).to_numpy()
y_inner_train = inner_train[target_col].astype(int).to_numpy()
y_inner_val = inner_val[target_col].astype(int).to_numpy()
if len(np.unique(y_inner_train)) < 2:
continue

@@ -987,6 +1011,11 @@ def main() -> int:
start = parse_time(args.start) if args.start else ""
end = parse_time(args.end) if args.end else ""
horizon_hours = normalize_horizon_hours(args.horizon_hours)
horizon_label = horizon_suffix(horizon_hours)
target_col = rain_next_flag_col(horizon_hours)
target_mm_col = rain_next_mm_col(horizon_hours)
persistence_context_col = rain_last_mm_col(horizon_hours)
feature_cols = feature_columns_for_set(args.feature_set)
needs_forecast = feature_columns_need_forecast(feature_cols)
calibration_methods = parse_calibration_methods(args.calibration_methods)

@@ -1011,8 +1040,21 @@ def main() -> int:
return 0
raise RuntimeError(message)

full_df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
model_df = model_frame(full_df, feature_cols, require_target=True)
full_df = build_dataset(
ws90,
baro,
forecast=forecast,
rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM,
horizon_hours=horizon_hours,
)
if persistence_context_col not in full_df.columns:
persistence_context_col = rain_last_mm_col(1)
model_df = model_frame(
full_df,
feature_cols,
require_target=True,
target_col=target_col,
)
if len(model_df) < args.min_rows:
message = f"not enough model-ready rows after filtering (need >= {args.min_rows})"
if args.allow_empty:

@@ -1027,11 +1069,11 @@ def main() -> int:
)

x_train = train_df[feature_cols]
y_train = train_df["rain_next_1h"].astype(int).to_numpy()
y_train = train_df[target_col].astype(int).to_numpy()
x_val = val_df[feature_cols]
y_val = val_df["rain_next_1h"].astype(int).to_numpy()
y_val = val_df[target_col].astype(int).to_numpy()
x_test = test_df[feature_cols]
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
y_test = test_df[target_col].astype(int).to_numpy()

if len(np.unique(y_train)) < 2:
raise RuntimeError("training split does not contain both classes; cannot train classifier")

@@ -1073,6 +1115,7 @@ def main() -> int:
threshold_tuning_walk_forward = tune_threshold_walk_forward(
model_df=model_df.iloc[: len(train_df) + len(val_df)],
feature_cols=feature_cols,
target_col=target_col,
model_family=selected_model_family,
model_params=selected_model_params,
calibration_method=selected_calibration_method,

@@ -1093,7 +1136,7 @@ def main() -> int:
train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
x_train_val = train_val_df[feature_cols]
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
y_train_val = train_val_df[target_col].astype(int).to_numpy()

final_model, final_fit_info = fit_with_optional_calibration(
model_family=selected_model_family,

@@ -1109,16 +1152,23 @@ def main() -> int:
test_calibration = {
"ece_10": expected_calibration_error(y_true=y_test, y_prob=y_test_prob, bins=10),
}
naive_baselines_test = evaluate_naive_baselines(test_df=test_df, y_test=y_test)
naive_baselines_test = evaluate_naive_baselines(
test_df=test_df,
y_test=y_test,
persistence_context_col=persistence_context_col,
)
sliced_performance = evaluate_sliced_performance(
test_df=test_df,
y_true=y_test,
y_prob=y_test_prob,
threshold=chosen_threshold,
context_col=persistence_context_col,
context_label=persistence_context_col,
)
walk_forward = walk_forward_backtest(
model_df=model_df,
feature_cols=feature_cols,
target_col=target_col,
model_family=selected_model_family,
model_params=selected_model_params,
calibration_method=selected_calibration_method,

@@ -1135,8 +1185,12 @@ def main() -> int:
"model_family_requested": args.model_family,
"model_family": selected_model_family,
"model_params": selected_model_params,
"horizon_hours": horizon_hours,
"horizon_label": horizon_label,
"feature_set": args.feature_set,
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"target_column": target_col,
"target_mm_column": target_mm_col,
"target_definition": f"{target_mm_col} >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"feature_columns": feature_cols,
"forecast_model": args.forecast_model if needs_forecast else None,
"calibration_method_requested": calibration_methods,

@@ -1207,6 +1261,7 @@ def main() -> int:
print(f" site: {args.site}")
print(f" model_version: {args.model_version}")
print(f" model_family: {selected_model_family} (requested={args.model_family})")
print(f" horizon: {horizon_hours}h")
print(f" model_params: {selected_model_params}")
print(f" calibration_method: {report['calibration_method']}")
print(

@@ -1298,7 +1353,7 @@ def main() -> int:
dataset_dir = os.path.dirname(dataset_out)
if dataset_dir:
os.makedirs(dataset_dir, exist_ok=True)
snapshot_cols = list(dict.fromkeys(feature_cols + ["rain_next_1h", "rain_next_1h_mm"]))
snapshot_cols = list(dict.fromkeys(feature_cols + [target_col, target_mm_col]))
model_df[snapshot_cols].to_csv(dataset_out, index=True, index_label="ts")
print(f"Saved dataset snapshot to {dataset_out}")

@@ -1320,6 +1375,10 @@ def main() -> int:
"forecast_model": args.forecast_model if needs_forecast else None,
"threshold": float(chosen_threshold),
"target_mm": float(RAIN_EVENT_THRESHOLD_MM),
"horizon_hours": horizon_hours,
"target_col": target_col,
"target_mm_col": target_mm_col,
"persistence_context_col": persistence_context_col,
"model_version": args.model_version,
"trained_at": datetime.now(timezone.utc).isoformat(),
"split": report["split"],