another bugfix

This commit is contained in:
2026-03-12 20:29:29 +11:00
parent d1237eed44
commit 20316cee91
8 changed files with 293 additions and 23 deletions

View File

@@ -36,6 +36,16 @@ FORECAST_FEATURE_COLUMNS = [
"fc_cloud_cover",
]
# Calendar/seasonality model inputs: a sine/cosine pair for each cyclic time
# unit plus a weekend indicator. Order matters because it fixes column order.
CALENDAR_FEATURE_COLUMNS = [
    # Time of day.
    "hour_sin",
    "hour_cos",
    # Day of week.
    "dow_sin",
    "dow_cos",
    # Month of year.
    "month_sin",
    "month_cos",
    # Weekend flag.
    "is_weekend",
]
EXTENDED_FEATURE_COLUMNS = [
"pressure_trend_1h",
"temperature_c",
@@ -60,9 +70,15 @@ EXTENDED_FEATURE_COLUMNS = [
*FORECAST_FEATURE_COLUMNS,
]
# Extended feature columns followed by the calendar columns, as a new list
# (neither source list is mutated).
EXTENDED_CALENDAR_FEATURE_COLUMNS = (
    EXTENDED_FEATURE_COLUMNS + CALENDAR_FEATURE_COLUMNS
)
# Registry mapping each selectable feature-set name to its ordered column list.
FEATURE_SETS: dict[str, list[str]] = {
    "baseline": BASELINE_FEATURE_COLUMNS,
    "extended": EXTENDED_FEATURE_COLUMNS,
    "extended_calendar": EXTENDED_CALENDAR_FEATURE_COLUMNS,
}
# Registered feature-set names, sorted alphabetically and frozen as a tuple.
# Iterating a dict yields its keys, so no explicit .keys() call is needed.
AVAILABLE_FEATURE_SETS = tuple(sorted(FEATURE_SETS))
@@ -116,8 +132,8 @@ def fetch_ws90(conn, site: str, start: str, end: str) -> pd.DataFrame:
SELECT ts, station_id, received_at, temperature_c, humidity, wind_avg_m_s, wind_max_m_s, wind_dir_deg, rain_mm
FROM observations_ws90
WHERE site = %s
AND (%s = '' OR ts >= %s::timestamptz)
AND (%s = '' OR ts <= %s::timestamptz)
AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz)
AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz)
ORDER BY ts ASC
"""
return _fetch_df(conn, sql, (site, start, start, end, end), ["ts", "received_at"])
@@ -128,8 +144,8 @@ def fetch_baro(conn, site: str, start: str, end: str) -> pd.DataFrame:
SELECT ts, source, received_at, pressure_hpa
FROM observations_baro
WHERE site = %s
AND (%s = '' OR ts >= %s::timestamptz)
AND (%s = '' OR ts <= %s::timestamptz)
AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz)
AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz)
ORDER BY ts ASC
"""
return _fetch_df(conn, sql, (site, start, start, end, end), ["ts", "received_at"])
@@ -151,8 +167,8 @@ def fetch_forecast(conn, site: str, start: str, end: str, model: str = "ecmwf")
FROM forecast_openmeteo_hourly
WHERE site = %s
AND model = %s
AND (%s = '' OR ts >= %s::timestamptz - INTERVAL '2 hours')
AND (%s = '' OR ts <= %s::timestamptz + INTERVAL '2 hours')
AND (%s = '' OR ts >= NULLIF(%s, '')::timestamptz - INTERVAL '2 hours')
AND (%s = '' OR ts <= NULLIF(%s, '')::timestamptz + INTERVAL '2 hours')
ORDER BY ts ASC, retrieved_at DESC
"""
return _fetch_df(conn, sql, (site, model, start, start, end, end), ["ts", "retrieved_at"])
@@ -199,6 +215,15 @@ def _apply_forecast_features(df: pd.DataFrame, forecast: pd.DataFrame | None) ->
out.loc[mask, "fc_precip_prob"] = out.loc[mask, "fc_precip_prob"] / 100.0
out["fc_precip_prob"] = out["fc_precip_prob"].clip(lower=0.0, upper=1.0)
# Some forecast sources (or model configs) provide precip amount but no precip probability.
# Backfill missing probability to keep feature rows usable for training/inference.
if "fc_precip_mm" in out.columns:
fallback_prob = (out["fc_precip_mm"].fillna(0.0) > 0.0).astype(float)
else:
fallback_prob = 0.0
out["fc_precip_prob"] = out["fc_precip_prob"].fillna(fallback_prob)
out["fc_precip_prob"] = out["fc_precip_prob"].clip(lower=0.0, upper=1.0)
return out
@@ -263,6 +288,18 @@ def build_dataset(
df["pressure_roll_1h_mean"] = df["pressure_hpa"].rolling(window=window, min_periods=3).mean()
df["pressure_roll_1h_std"] = df["pressure_hpa"].rolling(window=window, min_periods=3).std()
# Calendar/seasonality features (UTC based).
hour_of_day = df.index.hour + (df.index.minute / 60.0)
day_of_week = df.index.dayofweek
month_of_year = df.index.month
df["hour_sin"] = np.sin(2.0 * np.pi * hour_of_day / 24.0)
df["hour_cos"] = np.cos(2.0 * np.pi * hour_of_day / 24.0)
df["dow_sin"] = np.sin(2.0 * np.pi * day_of_week / 7.0)
df["dow_cos"] = np.cos(2.0 * np.pi * day_of_week / 7.0)
df["month_sin"] = np.sin(2.0 * np.pi * (month_of_year - 1.0) / 12.0)
df["month_cos"] = np.cos(2.0 * np.pi * (month_of_year - 1.0) / 12.0)
df["is_weekend"] = (day_of_week >= 5).astype(float)
df = _apply_forecast_features(df, forecast)
return df