#!/usr/bin/env python3 from __future__ import annotations from datetime import datetime from typing import Any import numpy as np import pandas as pd from sklearn.metrics import ( accuracy_score, average_precision_score, brier_score_loss, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, ) BASELINE_FEATURE_COLUMNS = [ "pressure_trend_1h", "humidity", "temperature_c", "wind_avg_m_s", "wind_max_m_s", ] FORECAST_FEATURE_COLUMNS = [ "fc_temp_c", "fc_rh", "fc_pressure_msl_hpa", "fc_wind_m_s", "fc_wind_gust_m_s", "fc_precip_mm", "fc_precip_prob", "fc_cloud_cover", ] CALENDAR_FEATURE_COLUMNS = [ "hour_sin", "hour_cos", "dow_sin", "dow_cos", "month_sin", "month_cos", "is_weekend", ] EXTENDED_FEATURE_COLUMNS = [ "pressure_trend_1h", "temperature_c", "humidity", "wind_avg_m_s", "wind_max_m_s", "wind_dir_sin", "wind_dir_cos", "temp_lag_5m", "temp_roll_1h_mean", "temp_roll_1h_std", "humidity_lag_5m", "humidity_roll_1h_mean", "humidity_roll_1h_std", "wind_avg_lag_5m", "wind_avg_roll_1h_mean", "wind_gust_roll_1h_max", "pressure_lag_5m", "pressure_roll_1h_mean", "pressure_roll_1h_std", "rain_last_1h_mm", *FORECAST_FEATURE_COLUMNS, ] EXTENDED_CALENDAR_FEATURE_COLUMNS = [ *EXTENDED_FEATURE_COLUMNS, *CALENDAR_FEATURE_COLUMNS, ] FEATURE_SETS: dict[str, list[str]] = { "baseline": BASELINE_FEATURE_COLUMNS, "extended": EXTENDED_FEATURE_COLUMNS, "extended_calendar": EXTENDED_CALENDAR_FEATURE_COLUMNS, } AVAILABLE_FEATURE_SETS = tuple(sorted(FEATURE_SETS.keys())) FEATURE_COLUMNS = BASELINE_FEATURE_COLUMNS RAIN_EVENT_THRESHOLD_MM = 0.2 RAIN_SPIKE_THRESHOLD_MM_5M = 5.0 RAIN_HORIZON_BUCKETS = 12 # 12 * 5m = 1h def parse_time(value: str) -> str: if not value: return "" try: datetime.fromisoformat(value.replace("Z", "+00:00")) return value except ValueError as exc: raise ValueError(f"invalid time format: {value}") from exc def feature_columns_for_set(feature_set: str) -> list[str]: out = FEATURE_SETS.get(feature_set.lower()) if out is None: raise ValueError(f"unknown feature set: {feature_set}") return list(out) def feature_columns_need_forecast(feature_cols: list[str]) -> bool: return any(col in FORECAST_FEATURE_COLUMNS for col in feature_cols) def feature_set_needs_forecast(feature_set: str) -> bool: return feature_columns_need_forecast(feature_columns_for_set(feature_set)) def _fetch_df(conn, sql: str, params: tuple[Any, ...], parse_dt_cols: list[str]) -> pd.DataFrame: with conn.cursor() as cur: cur.execute(sql, params) rows = cur.fetchall() cols = [d.name for d in cur.description] df = pd.DataFrame.from_records(rows, columns=cols) if not df.empty: for col in parse_dt_cols: df[col] = pd.to_datetime(df[col], utc=True) return df def fetch_ws90(conn, site: str, start: str, end: str) -> pd.DataFrame: sql = """ SELECT ts, station_id, received_at, temperature_c, humidity, wind_avg_m_s, wind_max_m_s, wind_dir_deg, rain_mm FROM observations_ws90 WHERE site = %s AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz) AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz) ORDER BY ts ASC """ return _fetch_df(conn, sql, (site, start, start, end, end), ["ts", "received_at"]) def fetch_baro(conn, site: str, start: str, end: str) -> pd.DataFrame: sql = """ SELECT ts, source, received_at, pressure_hpa FROM observations_baro WHERE site = %s AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz) AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz) ORDER BY ts ASC """ return _fetch_df(conn, sql, (site, start, start, end, end), ["ts", "received_at"]) def fetch_forecast(conn, site: str, start: str, end: str, model: str = "ecmwf") -> pd.DataFrame: sql = """ SELECT DISTINCT ON (ts) ts, retrieved_at, temp_c, rh, pressure_msl_hpa, wind_m_s, wind_gust_m_s, precip_mm, precip_prob, cloud_cover FROM forecast_openmeteo_hourly WHERE site = %s AND model = %s AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz - INTERVAL '2 hours') AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz + INTERVAL '2 hours') ORDER BY ts ASC, retrieved_at DESC """ return _fetch_df(conn, sql, (site, model, start, start, end, end), ["ts", "retrieved_at"]) def _apply_forecast_features(df: pd.DataFrame, forecast: pd.DataFrame | None) -> pd.DataFrame: out = df.copy() for col in FORECAST_FEATURE_COLUMNS: out[col] = np.nan if forecast is None or forecast.empty: return out fc = forecast.set_index("ts").sort_index().rename( columns={ "temp_c": "fc_temp_c", "rh": "fc_rh", "pressure_msl_hpa": "fc_pressure_msl_hpa", "wind_m_s": "fc_wind_m_s", "wind_gust_m_s": "fc_wind_gust_m_s", "precip_mm": "fc_precip_mm", "precip_prob": "fc_precip_prob", "cloud_cover": "fc_cloud_cover", } ) keep = [c for c in FORECAST_FEATURE_COLUMNS if c in fc.columns] fc = fc[keep] # Bring hourly forecast onto 5m observation grid. fc_5m = fc.resample("5min").ffill(limit=12) out = out.join(fc_5m, how="left", rsuffix="_dup") # Prefer joined forecast values and softly fill small gaps. for col in keep: dup_col = f"{col}_dup" if dup_col in out.columns: out[col] = out[dup_col] out.drop(columns=[dup_col], inplace=True) out[col] = out[col].ffill(limit=12).bfill(limit=2) # Normalize precip probability to 0..1 if source is 0..100. if "fc_precip_prob" in out.columns: mask = out["fc_precip_prob"] > 1.0 out.loc[mask, "fc_precip_prob"] = out.loc[mask, "fc_precip_prob"] / 100.0 out["fc_precip_prob"] = out["fc_precip_prob"].clip(lower=0.0, upper=1.0) # Some forecast sources (or model configs) provide precip amount but no precip probability. # Backfill missing probability to keep feature rows usable for training/inference. if "fc_precip_mm" in out.columns: fallback_prob = (out["fc_precip_mm"].fillna(0.0) > 0.0).astype(float) else: fallback_prob = 0.0 out["fc_precip_prob"] = out["fc_precip_prob"].fillna(fallback_prob) out["fc_precip_prob"] = out["fc_precip_prob"].clip(lower=0.0, upper=1.0) return out def build_dataset( ws90: pd.DataFrame, baro: pd.DataFrame, forecast: pd.DataFrame | None = None, rain_event_threshold_mm: float = RAIN_EVENT_THRESHOLD_MM, ) -> pd.DataFrame: if ws90.empty: raise RuntimeError("no ws90 observations found") if baro.empty: raise RuntimeError("no barometer observations found") ws90 = ws90.set_index("ts").sort_index() baro = baro.set_index("ts").sort_index() ws90_5m = ws90.resample("5min").agg( { "temperature_c": "mean", "humidity": "mean", "wind_avg_m_s": "mean", "wind_max_m_s": "max", "wind_dir_deg": "mean", "rain_mm": "last", } ) baro_5m = baro.resample("5min").agg({"pressure_hpa": "mean"}) df = ws90_5m.join(baro_5m, how="outer") df["pressure_hpa"] = df["pressure_hpa"].interpolate(limit=3) df["rain_inc_raw"] = df["rain_mm"].diff() df["rain_reset"] = df["rain_inc_raw"] < 0 df["rain_inc"] = df["rain_inc_raw"].clip(lower=0) df["rain_spike_5m"] = df["rain_inc"] >= RAIN_SPIKE_THRESHOLD_MM_5M window = RAIN_HORIZON_BUCKETS df["rain_last_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum() df["rain_next_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum().shift(-(window - 1)) df["rain_next_1h"] = df["rain_next_1h_mm"] >= rain_event_threshold_mm df["pressure_trend_1h"] = df["pressure_hpa"] - df["pressure_hpa"].shift(window) # Wind direction cyclical encoding. radians = np.deg2rad(df["wind_dir_deg"] % 360.0) df["wind_dir_sin"] = np.sin(radians) df["wind_dir_cos"] = np.cos(radians) # Lags and rolling features (core sensors). df["temp_lag_5m"] = df["temperature_c"].shift(1) df["humidity_lag_5m"] = df["humidity"].shift(1) df["wind_avg_lag_5m"] = df["wind_avg_m_s"].shift(1) df["pressure_lag_5m"] = df["pressure_hpa"].shift(1) df["temp_roll_1h_mean"] = df["temperature_c"].rolling(window=window, min_periods=3).mean() df["temp_roll_1h_std"] = df["temperature_c"].rolling(window=window, min_periods=3).std() df["humidity_roll_1h_mean"] = df["humidity"].rolling(window=window, min_periods=3).mean() df["humidity_roll_1h_std"] = df["humidity"].rolling(window=window, min_periods=3).std() df["wind_avg_roll_1h_mean"] = df["wind_avg_m_s"].rolling(window=window, min_periods=3).mean() df["wind_gust_roll_1h_max"] = df["wind_max_m_s"].rolling(window=window, min_periods=3).max() df["pressure_roll_1h_mean"] = df["pressure_hpa"].rolling(window=window, min_periods=3).mean() df["pressure_roll_1h_std"] = df["pressure_hpa"].rolling(window=window, min_periods=3).std() # Calendar/seasonality features (UTC based). hour_of_day = df.index.hour + (df.index.minute / 60.0) day_of_week = df.index.dayofweek month_of_year = df.index.month df["hour_sin"] = np.sin(2.0 * np.pi * hour_of_day / 24.0) df["hour_cos"] = np.cos(2.0 * np.pi * hour_of_day / 24.0) df["dow_sin"] = np.sin(2.0 * np.pi * day_of_week / 7.0) df["dow_cos"] = np.cos(2.0 * np.pi * day_of_week / 7.0) df["month_sin"] = np.sin(2.0 * np.pi * (month_of_year - 1.0) / 12.0) df["month_cos"] = np.cos(2.0 * np.pi * (month_of_year - 1.0) / 12.0) df["is_weekend"] = (day_of_week >= 5).astype(float) df = _apply_forecast_features(df, forecast) return df def model_frame(df: pd.DataFrame, feature_cols: list[str] | None = None, require_target: bool = True) -> pd.DataFrame: features = feature_cols or FEATURE_COLUMNS required = list(features) if require_target: required.append("rain_next_1h") out = df.dropna(subset=required).copy() return out.sort_index() def split_time_ordered( df: pd.DataFrame, train_ratio: float = 0.7, val_ratio: float = 0.15, ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: if not (0 < train_ratio < 1): raise ValueError("train_ratio must be between 0 and 1") if not (0 <= val_ratio < 1): raise ValueError("val_ratio must be between 0 and 1") if train_ratio + val_ratio >= 1: raise ValueError("train_ratio + val_ratio must be < 1") n = len(df) if n < 100: raise RuntimeError("not enough rows after filtering (need >= 100)") train_end = int(n * train_ratio) val_end = int(n * (train_ratio + val_ratio)) train_end = min(max(train_end, 1), n - 2) val_end = min(max(val_end, train_end + 1), n - 1) train_df = df.iloc[:train_end] val_df = df.iloc[train_end:val_end] test_df = df.iloc[val_end:] if train_df.empty or val_df.empty or test_df.empty: raise RuntimeError("time split produced empty train/val/test set") return train_df, val_df, test_df def evaluate_probs(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> dict[str, Any]: y_pred = (y_prob >= threshold).astype(int) roc_auc = float("nan") pr_auc = float("nan") if len(np.unique(y_true)) > 1: roc_auc = roc_auc_score(y_true, y_prob) pr_auc = average_precision_score(y_true, y_prob) cm = confusion_matrix(y_true, y_pred, labels=[0, 1]) metrics = { "rows": int(len(y_true)), "positive_rate": float(np.mean(y_true)), "threshold": float(threshold), "accuracy": accuracy_score(y_true, y_pred), "precision": precision_score(y_true, y_pred, zero_division=0), "recall": recall_score(y_true, y_pred, zero_division=0), "f1": f1_score(y_true, y_pred, zero_division=0), "roc_auc": roc_auc, "pr_auc": pr_auc, "brier": brier_score_loss(y_true, y_prob), "confusion_matrix": cm.tolist(), } return to_builtin(metrics) def select_threshold(y_true: np.ndarray, y_prob: np.ndarray, min_precision: float = 0.7) -> tuple[float, dict[str, Any]]: thresholds = np.linspace(0.05, 0.95, 91) best: dict[str, Any] | None = None constrained_best: dict[str, Any] | None = None for threshold in thresholds: y_pred = (y_prob >= threshold).astype(int) precision = precision_score(y_true, y_pred, zero_division=0) recall = recall_score(y_true, y_pred, zero_division=0) f1 = f1_score(y_true, y_pred, zero_division=0) candidate = { "threshold": float(threshold), "precision": float(precision), "recall": float(recall), "f1": float(f1), } if best is None or candidate["f1"] > best["f1"]: best = candidate if precision >= min_precision: if constrained_best is None: constrained_best = candidate elif candidate["recall"] > constrained_best["recall"]: constrained_best = candidate elif candidate["recall"] == constrained_best["recall"] and candidate["f1"] > constrained_best["f1"]: constrained_best = candidate if constrained_best is not None: constrained_best["selection_rule"] = f"max_recall_where_precision>={min_precision:.2f}" return float(constrained_best["threshold"]), constrained_best assert best is not None best["selection_rule"] = "fallback_max_f1" return float(best["threshold"]), best def safe_pr_auc(v: dict[str, Any]) -> float: value = v.get("pr_auc") if value is None: return float("-inf") out = float(value) if np.isnan(out): return float("-inf") return out def safe_roc_auc(v: dict[str, Any]) -> float: value = v.get("roc_auc") if value is None: return float("-inf") out = float(value) if np.isnan(out): return float("-inf") return out def to_builtin(v: Any) -> Any: if isinstance(v, dict): return {k: to_builtin(val) for k, val in v.items()} if isinstance(v, list): return [to_builtin(i) for i in v] if isinstance(v, tuple): return [to_builtin(i) for i in v] if isinstance(v, np.integer): return int(v) if isinstance(v, np.floating): out = float(v) if np.isnan(out): return None return out if isinstance(v, pd.Timestamp): return v.isoformat() if isinstance(v, datetime): return v.isoformat() return v