bugfix wunderground reporting

This commit is contained in:
2026-03-09 09:19:45 +11:00
parent 5b8cad905f
commit c796f1324e
12 changed files with 253 additions and 33 deletions

View File

@@ -3,9 +3,11 @@ import argparse
import json
import os
from datetime import datetime, timezone
from typing import Any
import numpy as np
import psycopg2
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
@@ -22,6 +24,8 @@ from rain_model_common import (
feature_columns_need_forecast,
model_frame,
parse_time,
safe_pr_auc,
safe_roc_auc,
select_threshold,
split_time_ordered,
to_builtin,
@@ -33,6 +37,9 @@ except ImportError: # pragma: no cover - optional dependency
joblib = None
MODEL_FAMILIES = ("logreg", "hist_gb", "auto")
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
@@ -60,6 +67,21 @@ def parse_args() -> argparse.Namespace:
default="ecmwf",
help="Forecast model name when feature set requires forecast columns.",
)
parser.add_argument(
"--model-family",
default="logreg",
choices=MODEL_FAMILIES,
help=(
"Estimator family. "
"'auto' compares logreg and hist_gb on validation and selects best by PR-AUC/ROC-AUC/F1."
),
)
parser.add_argument(
"--random-state",
type=int,
default=42,
help="Random seed for stochastic estimators.",
)
parser.add_argument("--out", default="models/rain_model.pkl", help="Path to save model.")
parser.add_argument(
"--report-out",
@@ -82,13 +104,59 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
def make_model() -> Pipeline:
    """Return an unfitted scale-then-classify pipeline.

    The pipeline has two named steps: ``"scaler"`` (a ``StandardScaler``)
    followed by ``"clf"`` (a ``LogisticRegression`` with ``max_iter=1000``
    and ``class_weight="balanced"``).
    """
    scaler = StandardScaler()
    classifier = LogisticRegression(max_iter=1000, class_weight="balanced")
    steps = [("scaler", scaler), ("clf", classifier)]
    return Pipeline(steps)
def make_model(model_family: str, random_state: int):
    """Build an unfitted estimator for the requested model family.

    Args:
        model_family: ``"logreg"`` for a StandardScaler + LogisticRegression
            pipeline, or ``"hist_gb"`` for a HistGradientBoostingClassifier.
        random_state: Seed forwarded to the underlying estimator.

    Returns:
        A new, unfitted estimator object.

    Raises:
        ValueError: If ``model_family`` is anything other than ``"logreg"``
            or ``"hist_gb"`` (this includes ``"auto"``).
    """
    # Reject unsupported families up front so the builders below stay flat.
    if model_family not in ("logreg", "hist_gb"):
        raise ValueError(f"unknown model_family: {model_family}")

    if model_family == "logreg":
        scaler = StandardScaler()
        classifier = LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            random_state=random_state,
        )
        return Pipeline([("scaler", scaler), ("clf", classifier)])

    # Only "hist_gb" can reach this point.
    boosting_params = {
        "max_iter": 300,
        "learning_rate": 0.05,
        "max_depth": 5,
        "min_samples_leaf": 20,
        "random_state": random_state,
    }
    return HistGradientBoostingClassifier(**boosting_params)
def train_candidate(
    model_family: str,
    x_train,
    y_train: np.ndarray,
    x_val,
    y_val: np.ndarray,
    random_state: int,
    min_precision: float,
    fixed_threshold: float | None,
) -> dict[str, Any]:
    """Fit one model family on the training split and score it on validation.

    Args:
        model_family: Family name passed through to ``make_model``.
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.
        random_state: Seed passed through to ``make_model``.
        min_precision: Precision constraint handed to ``select_threshold``;
            unused when ``fixed_threshold`` is given.
        fixed_threshold: When not ``None``, used as the decision threshold
            instead of calling ``select_threshold``.

    Returns:
        A dict with keys ``"model_family"``, ``"threshold"`` (as a float),
        ``"threshold_info"`` and ``"validation_metrics"``. The fitted
        estimator itself is not returned.
    """
    estimator = make_model(model_family=model_family, random_state=random_state)
    estimator.fit(x_train, y_train)
    # Column 1 of predict_proba: probability of the second class
    # (label 1 for a 0/1 target).
    val_prob = estimator.predict_proba(x_val)[:, 1]

    if fixed_threshold is None:
        # No explicit threshold supplied: derive one from validation scores.
        cutoff, cutoff_info = select_threshold(
            y_true=y_val,
            y_prob=val_prob,
            min_precision=min_precision,
        )
    else:
        cutoff = fixed_threshold
        cutoff_info = {
            "selection_rule": "fixed_cli_threshold",
            "threshold": float(fixed_threshold),
        }

    metrics = evaluate_probs(y_true=y_val, y_prob=val_prob, threshold=cutoff)
    return {
        "model_family": model_family,
        "threshold": float(cutoff),
        "threshold_info": cutoff_info,
        "validation_metrics": metrics,
    }
def main() -> int:
@@ -126,30 +194,38 @@ def main() -> int:
x_test = test_df[feature_cols]
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
base_model = make_model()
base_model.fit(x_train, y_train)
y_val_prob = base_model.predict_proba(x_val)[:, 1]
if args.threshold is not None:
chosen_threshold = args.threshold
threshold_info = {
"selection_rule": "fixed_cli_threshold",
"threshold": float(args.threshold),
}
else:
chosen_threshold, threshold_info = select_threshold(
y_true=y_val,
y_prob=y_val_prob,
candidate_families = ["logreg", "hist_gb"] if args.model_family == "auto" else [args.model_family]
candidates = [
train_candidate(
model_family=family,
x_train=x_train,
y_train=y_train,
x_val=x_val,
y_val=y_val,
random_state=args.random_state,
min_precision=args.min_precision,
fixed_threshold=args.threshold,
)
val_metrics = evaluate_probs(y_true=y_val, y_prob=y_val_prob, threshold=chosen_threshold)
for family in candidate_families
]
best_candidate = max(
candidates,
key=lambda c: (
safe_pr_auc(c["validation_metrics"]),
safe_roc_auc(c["validation_metrics"]),
float(c["validation_metrics"]["f1"]),
),
)
selected_model_family = str(best_candidate["model_family"])
chosen_threshold = float(best_candidate["threshold"])
threshold_info = best_candidate["threshold_info"]
val_metrics = best_candidate["validation_metrics"]
train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
x_train_val = train_val_df[feature_cols]
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
final_model = make_model()
final_model = make_model(model_family=selected_model_family, random_state=args.random_state)
final_model.fit(x_train_val, y_train_val)
y_test_prob = final_model.predict_proba(x_test)[:, 1]
test_metrics = evaluate_probs(y_true=y_test, y_prob=y_test_prob, threshold=chosen_threshold)
@@ -158,6 +234,8 @@ def main() -> int:
"generated_at": datetime.now(timezone.utc).isoformat(),
"site": args.site,
"model_version": args.model_version,
"model_family_requested": args.model_family,
"model_family": selected_model_family,
"feature_set": args.feature_set,
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"feature_columns": feature_cols,
@@ -194,6 +272,17 @@ def main() -> int:
**threshold_info,
"min_precision_constraint": args.min_precision,
},
"candidate_models": [
{
"model_family": c["model_family"],
"threshold_selection": {
**c["threshold_info"],
"min_precision_constraint": args.min_precision,
},
"validation_metrics": c["validation_metrics"],
}
for c in candidates
],
"validation_metrics": val_metrics,
"test_metrics": test_metrics,
}
@@ -202,6 +291,7 @@ def main() -> int:
print("Rain model training summary:")
print(f" site: {args.site}")
print(f" model_version: {args.model_version}")
print(f" model_family: {selected_model_family} (requested={args.model_family})")
print(f" feature_set: {args.feature_set} ({len(feature_cols)} features)")
print(f" rows: total={report['data_window']['model_rows']} train={report['split']['train_rows']} val={report['split']['val_rows']} test={report['split']['test_rows']}")
print(
@@ -250,6 +340,7 @@ def main() -> int:
else:
artifact = {
"model": final_model,
"model_family": selected_model_family,
"features": feature_cols,
"feature_set": args.feature_set,
"forecast_model": args.forecast_model if needs_forecast else None,