#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.metrics import (
|
|
accuracy_score,
|
|
average_precision_score,
|
|
brier_score_loss,
|
|
confusion_matrix,
|
|
f1_score,
|
|
precision_score,
|
|
recall_score,
|
|
roc_auc_score,
|
|
)
|
|
|
|
# Model input column groups. Each *_FEATURE_COLUMNS list is an ordered set of
# column names produced by build_dataset(); FEATURE_SETS maps a lowercase set
# name to the list used for training/inference.

# Minimal sensor-only feature set.
BASELINE_FEATURE_COLUMNS = [
    "pressure_trend_1h",
    "humidity",
    "temperature_c",
    "wind_avg_m_s",
    "wind_max_m_s",
]

# Columns joined in from the external hourly forecast; all carry the "fc_" prefix.
FORECAST_FEATURE_COLUMNS = [
    "fc_temp_c",
    "fc_rh",
    "fc_pressure_msl_hpa",
    "fc_wind_m_s",
    "fc_wind_gust_m_s",
    "fc_precip_mm",
    "fc_precip_prob",
    "fc_cloud_cover",
]

# Cyclical calendar encodings (UTC based; computed in build_dataset).
CALENDAR_FEATURE_COLUMNS = [
    "hour_sin",
    "hour_cos",
    "dow_sin",
    "dow_cos",
    "month_sin",
    "month_cos",
    "is_weekend",
]

# Sensor features plus lags/rolling stats plus the forecast columns.
EXTENDED_FEATURE_COLUMNS = [
    "pressure_trend_1h",
    "temperature_c",
    "humidity",
    "wind_avg_m_s",
    "wind_max_m_s",
    "wind_dir_sin",
    "wind_dir_cos",
    "temp_lag_5m",
    "temp_roll_1h_mean",
    "temp_roll_1h_std",
    "humidity_lag_5m",
    "humidity_roll_1h_mean",
    "humidity_roll_1h_std",
    "wind_avg_lag_5m",
    "wind_avg_roll_1h_mean",
    "wind_gust_roll_1h_max",
    "pressure_lag_5m",
    "pressure_roll_1h_mean",
    "pressure_roll_1h_std",
    "rain_last_1h_mm",
    *FORECAST_FEATURE_COLUMNS,
]

# Extended set with calendar/seasonality features appended.
EXTENDED_CALENDAR_FEATURE_COLUMNS = [
    *EXTENDED_FEATURE_COLUMNS,
    *CALENDAR_FEATURE_COLUMNS,
]

FEATURE_SETS: dict[str, list[str]] = {
    "baseline": BASELINE_FEATURE_COLUMNS,
    "extended": EXTENDED_FEATURE_COLUMNS,
    "extended_calendar": EXTENDED_CALENDAR_FEATURE_COLUMNS,
}

AVAILABLE_FEATURE_SETS = tuple(sorted(FEATURE_SETS.keys()))
# Default feature list used when callers do not pass one explicitly.
FEATURE_COLUMNS = BASELINE_FEATURE_COLUMNS

# Rain labeling parameters (see build_dataset).
RAIN_EVENT_THRESHOLD_MM = 0.2  # min accumulated mm over the horizon to label a rain event
RAIN_SPIKE_THRESHOLD_MM_5M = 5.0  # 5-minute rain increment at/above this is flagged as a spike
RAIN_HORIZON_BUCKETS = 12  # 12 * 5m = 1h
|
|
|
|
|
|
def parse_time(value: str) -> str:
    """Validate *value* as an ISO-8601 timestamp and return it unchanged.

    An empty string passes straight through as "". A trailing "Z" is accepted
    by rewriting it to "+00:00" for validation only.

    Raises:
        ValueError: when the string cannot be parsed as a timestamp.
    """
    if not value:
        return ""
    normalized = value.replace("Z", "+00:00")
    try:
        datetime.fromisoformat(normalized)
    except ValueError as exc:
        raise ValueError(f"invalid time format: {value}") from exc
    return value
|
|
|
|
|
|
def feature_columns_for_set(feature_set: str) -> list[str]:
    """Return a fresh copy of the column list for *feature_set* (case-insensitive).

    Raises:
        ValueError: when the name is not a known feature set.
    """
    key = feature_set.lower()
    if key not in FEATURE_SETS:
        raise ValueError(f"unknown feature set: {feature_set}")
    return list(FEATURE_SETS[key])
|
|
|
|
|
|
def feature_columns_need_forecast(feature_cols: list[str]) -> bool:
    """True when any column in *feature_cols* is a forecast-derived (fc_*) column."""
    forecast_cols = set(FORECAST_FEATURE_COLUMNS)
    return not forecast_cols.isdisjoint(feature_cols)
|
|
|
|
|
|
def feature_set_needs_forecast(feature_set: str) -> bool:
    """True when the named feature set includes any forecast-derived column."""
    columns = feature_columns_for_set(feature_set)
    return feature_columns_need_forecast(columns)
|
|
|
|
|
|
def _fetch_df(conn, sql: str, params: tuple[Any, ...], parse_dt_cols: list[str]) -> pd.DataFrame:
    """Execute *sql* with *params* on *conn* and return the rows as a DataFrame.

    Column names come from the cursor description (assumes psycopg-style
    Column objects with a ``.name`` attribute — TODO confirm driver).
    Columns listed in *parse_dt_cols* are converted to tz-aware UTC datetimes
    when the result is non-empty.
    """
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        records = cursor.fetchall()
        column_names = [desc.name for desc in cursor.description]

    frame = pd.DataFrame.from_records(records, columns=column_names)
    if frame.empty:
        return frame
    for dt_col in parse_dt_cols:
        frame[dt_col] = pd.to_datetime(frame[dt_col], utc=True)
    return frame
|
|
|
|
|
|
def fetch_ws90(conn, site: str, start: str, end: str) -> pd.DataFrame:
    """Load WS90 station observations for *site*.

    *start*/*end* are ISO timestamp strings; an empty string leaves that bound
    open (handled in SQL via NULLIF so the parameter count stays fixed).
    """
    query = """
    SELECT ts, station_id, received_at, temperature_c, humidity, wind_avg_m_s, wind_max_m_s, wind_dir_deg, rain_mm
    FROM observations_ws90
    WHERE site = %s
      AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz)
      AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz)
    ORDER BY ts ASC
    """
    bind = (site, start, start, end, end)
    return _fetch_df(conn, query, bind, ["ts", "received_at"])
|
|
|
|
|
|
def fetch_baro(conn, site: str, start: str, end: str) -> pd.DataFrame:
    """Load barometer observations for *site*.

    *start*/*end* are ISO timestamp strings; an empty string leaves that bound
    open (handled in SQL via NULLIF so the parameter count stays fixed).
    """
    query = """
    SELECT ts, source, received_at, pressure_hpa
    FROM observations_baro
    WHERE site = %s
      AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz)
      AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz)
    ORDER BY ts ASC
    """
    bind = (site, start, start, end, end)
    return _fetch_df(conn, query, bind, ["ts", "received_at"])
|
|
|
|
|
|
def fetch_forecast(conn, site: str, start: str, end: str, model: str = "ecmwf") -> pd.DataFrame:
    """Load hourly forecast rows for *site* and *model*.

    DISTINCT ON (ts) with retrieved_at DESC keeps only the freshest retrieval
    per forecast hour. The time window is widened by 2 hours on each side so
    resampling onto the observation grid has data at the edges.
    """
    query = """
    SELECT DISTINCT ON (ts)
        ts,
        retrieved_at,
        temp_c,
        rh,
        pressure_msl_hpa,
        wind_m_s,
        wind_gust_m_s,
        precip_mm,
        precip_prob,
        cloud_cover
    FROM forecast_openmeteo_hourly
    WHERE site = %s
      AND model = %s
      AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz - INTERVAL '2 hours')
      AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz + INTERVAL '2 hours')
    ORDER BY ts ASC, retrieved_at DESC
    """
    bind = (site, model, start, start, end, end)
    return _fetch_df(conn, query, bind, ["ts", "retrieved_at"])
|
|
|
|
|
|
def _apply_forecast_features(df: pd.DataFrame, forecast: pd.DataFrame | None) -> pd.DataFrame:
    """Attach hourly forecast columns (fc_*) to the 5-minute observation frame.

    Every column in FORECAST_FEATURE_COLUMNS is always created (all-NaN when
    no forecast is available) so the output schema is stable for downstream
    feature selection.
    """
    out = df.copy()
    # Pre-create the full fc_* schema before the early return below.
    for col in FORECAST_FEATURE_COLUMNS:
        out[col] = np.nan

    if forecast is None or forecast.empty:
        return out

    # Rename raw forecast columns into the fc_* feature namespace, indexed by ts.
    fc = forecast.set_index("ts").sort_index().rename(
        columns={
            "temp_c": "fc_temp_c",
            "rh": "fc_rh",
            "pressure_msl_hpa": "fc_pressure_msl_hpa",
            "wind_m_s": "fc_wind_m_s",
            "wind_gust_m_s": "fc_wind_gust_m_s",
            "precip_mm": "fc_precip_mm",
            "precip_prob": "fc_precip_prob",
            "cloud_cover": "fc_cloud_cover",
        }
    )

    keep = [c for c in FORECAST_FEATURE_COLUMNS if c in fc.columns]
    fc = fc[keep]
    # Bring hourly forecast onto 5m observation grid.
    # limit=12 caps the forward-fill at one hour (12 * 5min buckets).
    fc_5m = fc.resample("5min").ffill(limit=12)
    out = out.join(fc_5m, how="left", rsuffix="_dup")

    # Prefer joined forecast values and softly fill small gaps.
    # The NaN placeholders created above collide with the joined columns, so
    # the real forecast values arrive under a "_dup" suffix; copy them over
    # the placeholders, then fill up to 1h forward / 10min backward.
    for col in keep:
        dup_col = f"{col}_dup"
        if dup_col in out.columns:
            out[col] = out[dup_col]
            out.drop(columns=[dup_col], inplace=True)
        out[col] = out[col].ffill(limit=12).bfill(limit=2)

    # Normalize precip probability to 0..1 if source is 0..100.
    if "fc_precip_prob" in out.columns:
        mask = out["fc_precip_prob"] > 1.0
        out.loc[mask, "fc_precip_prob"] = out.loc[mask, "fc_precip_prob"] / 100.0
        out["fc_precip_prob"] = out["fc_precip_prob"].clip(lower=0.0, upper=1.0)

    # Some forecast sources (or model configs) provide precip amount but no precip probability.
    # Backfill missing probability to keep feature rows usable for training/inference.
    # Fallback is binary: 1.0 where any precip amount is forecast, else 0.0.
    if "fc_precip_mm" in out.columns:
        fallback_prob = (out["fc_precip_mm"].fillna(0.0) > 0.0).astype(float)
    else:
        fallback_prob = 0.0
    out["fc_precip_prob"] = out["fc_precip_prob"].fillna(fallback_prob)
    out["fc_precip_prob"] = out["fc_precip_prob"].clip(lower=0.0, upper=1.0)

    return out
|
|
|
|
|
|
def build_dataset(
    ws90: pd.DataFrame,
    baro: pd.DataFrame,
    forecast: pd.DataFrame | None = None,
    rain_event_threshold_mm: float = RAIN_EVENT_THRESHOLD_MM,
) -> pd.DataFrame:
    """Build the 5-minute feature/label frame from raw observations.

    Resamples station and barometer data onto a common 5-minute grid, derives
    rain increments and the ``rain_next_1h`` target label, adds lag/rolling
    and calendar features, then joins forecast features.

    Raises:
        RuntimeError: when either observation frame is empty.
    """
    if ws90.empty:
        raise RuntimeError("no ws90 observations found")
    if baro.empty:
        raise RuntimeError("no barometer observations found")

    ws90 = ws90.set_index("ts").sort_index()
    baro = baro.set_index("ts").sort_index()

    # Aggregate onto the 5-minute grid; gusts keep the max, rain keeps the
    # last cumulative counter reading in each bucket.
    ws90_5m = ws90.resample("5min").agg(
        {
            "temperature_c": "mean",
            "humidity": "mean",
            "wind_avg_m_s": "mean",
            "wind_max_m_s": "max",
            "wind_dir_deg": "mean",
            "rain_mm": "last",
        }
    )
    baro_5m = baro.resample("5min").agg({"pressure_hpa": "mean"})

    df = ws90_5m.join(baro_5m, how="outer")
    # Bridge short pressure gaps only (up to 3 buckets = 15 minutes).
    df["pressure_hpa"] = df["pressure_hpa"].interpolate(limit=3)

    # rain_mm looks like a cumulative counter: diff gives per-bucket increments,
    # a negative diff marks a counter reset, and resets are clipped to 0.
    df["rain_inc_raw"] = df["rain_mm"].diff()
    df["rain_reset"] = df["rain_inc_raw"] < 0
    df["rain_inc"] = df["rain_inc_raw"].clip(lower=0)
    df["rain_spike_5m"] = df["rain_inc"] >= RAIN_SPIKE_THRESHOLD_MM_5M

    window = RAIN_HORIZON_BUCKETS
    df["rain_last_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum()
    # NOTE(review): the shift(-(window - 1)) makes the "next 1h" window cover
    # [t, t + window - 1], i.e. it includes the current bucket's increment —
    # confirm this is the intended label definition.
    df["rain_next_1h_mm"] = df["rain_inc"].rolling(window=window, min_periods=1).sum().shift(-(window - 1))
    df["rain_next_1h"] = df["rain_next_1h_mm"] >= rain_event_threshold_mm

    # Pressure change over the last hour (12 buckets).
    df["pressure_trend_1h"] = df["pressure_hpa"] - df["pressure_hpa"].shift(window)

    # Wind direction cyclical encoding.
    radians = np.deg2rad(df["wind_dir_deg"] % 360.0)
    df["wind_dir_sin"] = np.sin(radians)
    df["wind_dir_cos"] = np.cos(radians)

    # Lags and rolling features (core sensors).
    df["temp_lag_5m"] = df["temperature_c"].shift(1)
    df["humidity_lag_5m"] = df["humidity"].shift(1)
    df["wind_avg_lag_5m"] = df["wind_avg_m_s"].shift(1)
    df["pressure_lag_5m"] = df["pressure_hpa"].shift(1)

    # 1h rolling statistics; min_periods=3 avoids stats from 1-2 samples.
    df["temp_roll_1h_mean"] = df["temperature_c"].rolling(window=window, min_periods=3).mean()
    df["temp_roll_1h_std"] = df["temperature_c"].rolling(window=window, min_periods=3).std()
    df["humidity_roll_1h_mean"] = df["humidity"].rolling(window=window, min_periods=3).mean()
    df["humidity_roll_1h_std"] = df["humidity"].rolling(window=window, min_periods=3).std()
    df["wind_avg_roll_1h_mean"] = df["wind_avg_m_s"].rolling(window=window, min_periods=3).mean()
    df["wind_gust_roll_1h_max"] = df["wind_max_m_s"].rolling(window=window, min_periods=3).max()
    df["pressure_roll_1h_mean"] = df["pressure_hpa"].rolling(window=window, min_periods=3).mean()
    df["pressure_roll_1h_std"] = df["pressure_hpa"].rolling(window=window, min_periods=3).std()

    # Calendar/seasonality features (UTC based).
    hour_of_day = df.index.hour + (df.index.minute / 60.0)
    day_of_week = df.index.dayofweek
    month_of_year = df.index.month
    df["hour_sin"] = np.sin(2.0 * np.pi * hour_of_day / 24.0)
    df["hour_cos"] = np.cos(2.0 * np.pi * hour_of_day / 24.0)
    df["dow_sin"] = np.sin(2.0 * np.pi * day_of_week / 7.0)
    df["dow_cos"] = np.cos(2.0 * np.pi * day_of_week / 7.0)
    df["month_sin"] = np.sin(2.0 * np.pi * (month_of_year - 1.0) / 12.0)
    df["month_cos"] = np.cos(2.0 * np.pi * (month_of_year - 1.0) / 12.0)
    df["is_weekend"] = (day_of_week >= 5).astype(float)

    df = _apply_forecast_features(df, forecast)
    return df
|
|
|
|
|
|
def model_frame(df: pd.DataFrame, feature_cols: list[str] | None = None, require_target: bool = True) -> pd.DataFrame:
|
|
features = feature_cols or FEATURE_COLUMNS
|
|
required = list(features)
|
|
if require_target:
|
|
required.append("rain_next_1h")
|
|
out = df.dropna(subset=required).copy()
|
|
return out.sort_index()
|
|
|
|
|
|
def split_time_ordered(
    df: pd.DataFrame,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split an already time-ordered frame into contiguous train/val/test parts.

    Cut points are clamped so each part is guaranteed at least one row.

    Raises:
        ValueError: on out-of-range ratios.
        RuntimeError: when df has fewer than 100 rows or a split comes out empty.
    """
    if not (0 < train_ratio < 1):
        raise ValueError("train_ratio must be between 0 and 1")
    if not (0 <= val_ratio < 1):
        raise ValueError("val_ratio must be between 0 and 1")
    if train_ratio + val_ratio >= 1:
        raise ValueError("train_ratio + val_ratio must be < 1")

    total = len(df)
    if total < 100:
        raise RuntimeError("not enough rows after filtering (need >= 100)")

    # Clamp the cut indices so train/val/test each retain at least one row.
    cut_train = min(max(int(total * train_ratio), 1), total - 2)
    cut_val = min(max(int(total * (train_ratio + val_ratio)), cut_train + 1), total - 1)

    parts = (df.iloc[:cut_train], df.iloc[cut_train:cut_val], df.iloc[cut_val:])
    if any(part.empty for part in parts):
        raise RuntimeError("time split produced empty train/val/test set")
    return parts
|
|
|
|
|
|
def evaluate_probs(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> dict[str, Any]:
    """Score predicted probabilities against binary labels at *threshold*.

    Returns a dict of standard classification metrics (values converted to
    plain builtins via to_builtin). ROC/PR AUC are NaN when only one class
    is present, since they are undefined there.
    """
    predictions = (y_prob >= threshold).astype(int)

    # AUC-style metrics need both classes in y_true.
    single_class = len(np.unique(y_true)) <= 1
    roc_auc = float("nan") if single_class else roc_auc_score(y_true, y_prob)
    pr_auc = float("nan") if single_class else average_precision_score(y_true, y_prob)

    matrix = confusion_matrix(y_true, predictions, labels=[0, 1])
    report = {
        "rows": int(len(y_true)),
        "positive_rate": float(np.mean(y_true)),
        "threshold": float(threshold),
        "accuracy": accuracy_score(y_true, predictions),
        "precision": precision_score(y_true, predictions, zero_division=0),
        "recall": recall_score(y_true, predictions, zero_division=0),
        "f1": f1_score(y_true, predictions, zero_division=0),
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
        "brier": brier_score_loss(y_true, y_prob),
        "confusion_matrix": matrix.tolist(),
    }
    return to_builtin(report)
|
|
|
|
|
|
def select_threshold(y_true: np.ndarray, y_prob: np.ndarray, min_precision: float = 0.7) -> tuple[float, dict[str, Any]]:
    """Choose a decision threshold on a 0.05..0.95 grid (91 steps).

    Preferred rule: among thresholds whose precision reaches *min_precision*,
    maximize recall, breaking ties by F1. If no threshold meets the precision
    floor, fall back to the threshold with the best F1 overall.

    Returns the chosen threshold and a dict describing the winning candidate
    (including a "selection_rule" tag).
    """
    grid = np.linspace(0.05, 0.95, 91)

    best_overall: dict[str, Any] | None = None
    best_constrained: dict[str, Any] | None = None
    for cutoff in grid:
        labels = (y_prob >= cutoff).astype(int)
        p = precision_score(y_true, labels, zero_division=0)
        r = recall_score(y_true, labels, zero_division=0)
        f = f1_score(y_true, labels, zero_division=0)
        candidate = {
            "threshold": float(cutoff),
            "precision": float(p),
            "recall": float(r),
            "f1": float(f),
        }

        # Unconstrained fallback: strictly-better F1 wins.
        if best_overall is None or candidate["f1"] > best_overall["f1"]:
            best_overall = candidate

        if p < min_precision:
            continue
        # Constrained winner: higher recall, then higher F1 on a recall tie.
        if best_constrained is None or (
            (candidate["recall"], candidate["f1"])
            > (best_constrained["recall"], best_constrained["f1"])
        ):
            best_constrained = candidate

    if best_constrained is not None:
        best_constrained["selection_rule"] = f"max_recall_where_precision>={min_precision:.2f}"
        return float(best_constrained["threshold"]), best_constrained

    assert best_overall is not None
    best_overall["selection_rule"] = "fallback_max_f1"
    return float(best_overall["threshold"]), best_overall
|
|
|
|
|
|
def safe_pr_auc(v: dict[str, Any]) -> float:
    """Return v["pr_auc"] as a float, or -inf when it is missing, None, or NaN.

    -inf makes the value safe to use as a sort/max key.
    """
    raw = v.get("pr_auc")
    if raw is None:
        return float("-inf")
    score = float(raw)
    return float("-inf") if np.isnan(score) else score
|
|
|
|
|
|
def safe_roc_auc(v: dict[str, Any]) -> float:
    """Return v["roc_auc"] as a float, or -inf when it is missing, None, or NaN.

    -inf makes the value safe to use as a sort/max key.
    """
    raw = v.get("roc_auc")
    if raw is None:
        return float("-inf")
    score = float(raw)
    return float("-inf") if np.isnan(score) else score
|
|
|
|
|
|
def to_builtin(v: Any) -> Any:
    """Recursively convert numpy/pandas values into JSON-safe builtins.

    Dicts and lists are converted element-wise; tuples become lists; numpy
    integers/floats become int/float with NaN mapped to None; Timestamps and
    datetimes become ISO-8601 strings. Anything else passes through unchanged.
    """
    if isinstance(v, dict):
        return {key: to_builtin(item) for key, item in v.items()}
    if isinstance(v, (list, tuple)):
        return [to_builtin(item) for item in v]
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, np.floating):
        converted = float(v)
        return None if np.isnan(converted) else converted
    # pd.Timestamp subclasses datetime, so one combined check covers both.
    if isinstance(v, (pd.Timestamp, datetime)):
        return v.isoformat()
    return v
|