bugfix wunderground reporting
This commit is contained in:
@@ -9,6 +9,7 @@ MODEL_PATH="${MODEL_PATH:-models/rain_model.pkl}"
|
||||
REPORT_PATH="${REPORT_PATH:-models/rain_model_report.json}"
|
||||
AUDIT_PATH="${AUDIT_PATH:-models/rain_data_audit.json}"
|
||||
FEATURE_SET="${FEATURE_SET:-baseline}"
|
||||
MODEL_FAMILY="${MODEL_FAMILY:-logreg}"
|
||||
FORECAST_MODEL="${FORECAST_MODEL:-ecmwf}"
|
||||
DATASET_PATH="${DATASET_PATH:-models/datasets/rain_dataset_${MODEL_VERSION}_${FEATURE_SET}.csv}"
|
||||
|
||||
@@ -36,6 +37,7 @@ python scripts/train_rain_model.py \
|
||||
--val-ratio 0.15 \
|
||||
--min-precision 0.70 \
|
||||
--feature-set "$FEATURE_SET" \
|
||||
--model-family "$MODEL_FAMILY" \
|
||||
--forecast-model "$FORECAST_MODEL" \
|
||||
--model-version "$MODEL_VERSION" \
|
||||
--out "$MODEL_PATH" \
|
||||
|
||||
@@ -41,6 +41,7 @@ class WorkerConfig:
|
||||
site: str
|
||||
model_name: str
|
||||
model_version_base: str
|
||||
model_family: str
|
||||
feature_set: str
|
||||
forecast_model: str
|
||||
train_interval_hours: float
|
||||
@@ -130,6 +131,8 @@ def run_training_cycle(cfg: WorkerConfig, env: dict[str, str]) -> None:
|
||||
str(cfg.min_precision),
|
||||
"--feature-set",
|
||||
cfg.feature_set,
|
||||
"--model-family",
|
||||
cfg.model_family,
|
||||
"--forecast-model",
|
||||
cfg.forecast_model,
|
||||
"--model-version",
|
||||
@@ -176,6 +179,7 @@ def load_config() -> WorkerConfig:
|
||||
site=read_env("RAIN_SITE", "home"),
|
||||
model_name=read_env("RAIN_MODEL_NAME", "rain_next_1h"),
|
||||
model_version_base=read_env("RAIN_MODEL_VERSION_BASE", "rain-logreg-v1"),
|
||||
model_family=read_env("RAIN_MODEL_FAMILY", "logreg"),
|
||||
feature_set=read_env("RAIN_FEATURE_SET", "baseline"),
|
||||
forecast_model=read_env("RAIN_FORECAST_MODEL", "ecmwf"),
|
||||
train_interval_hours=read_env_float("RAIN_TRAIN_INTERVAL_HOURS", 24.0),
|
||||
@@ -212,6 +216,7 @@ def main() -> int:
|
||||
"[rain-ml] worker start "
|
||||
f"site={cfg.site} "
|
||||
f"model_name={cfg.model_name} "
|
||||
f"model_family={cfg.model_family} "
|
||||
f"feature_set={cfg.feature_set} "
|
||||
f"forecast_model={cfg.forecast_model} "
|
||||
f"train_interval_hours={cfg.train_interval_hours} "
|
||||
|
||||
@@ -3,9 +3,11 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
@@ -22,6 +24,8 @@ from rain_model_common import (
|
||||
feature_columns_need_forecast,
|
||||
model_frame,
|
||||
parse_time,
|
||||
safe_pr_auc,
|
||||
safe_roc_auc,
|
||||
select_threshold,
|
||||
split_time_ordered,
|
||||
to_builtin,
|
||||
@@ -33,6 +37,9 @@ except ImportError: # pragma: no cover - optional dependency
|
||||
joblib = None
|
||||
|
||||
|
||||
# Accepted values for the --model-family CLI option. "logreg" and "hist_gb"
# are the two families make_model() can build; "auto" is not an estimator
# itself -- main() expands it into both families and keeps the best one.
MODEL_FAMILIES = ("logreg", "hist_gb", "auto")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
|
||||
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
|
||||
@@ -60,6 +67,21 @@ def parse_args() -> argparse.Namespace:
|
||||
default="ecmwf",
|
||||
help="Forecast model name when feature set requires forecast columns.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-family",
|
||||
default="logreg",
|
||||
choices=MODEL_FAMILIES,
|
||||
help=(
|
||||
"Estimator family. "
|
||||
"'auto' compares logreg and hist_gb on validation and selects best by PR-AUC/ROC-AUC/F1."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--random-state",
|
||||
type=int,
|
||||
default=42,
|
||||
help="Random seed for stochastic estimators.",
|
||||
)
|
||||
parser.add_argument("--out", default="models/rain_model.pkl", help="Path to save model.")
|
||||
parser.add_argument(
|
||||
"--report-out",
|
||||
@@ -82,13 +104,59 @@ def parse_args() -> argparse.Namespace:
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def make_model() -> Pipeline:
    """Return an unfitted logistic-regression pipeline.

    The pipeline standardises the features with ``StandardScaler`` and then
    fits a ``LogisticRegression`` with balanced class weights and an
    iteration cap of 1000.
    """
    scaler_step = ("scaler", StandardScaler())
    classifier_step = (
        "clf",
        LogisticRegression(class_weight="balanced", max_iter=1000),
    )
    return Pipeline([scaler_step, classifier_step])
|
||||
def make_model(model_family: str, random_state: int):
    """Build an unfitted estimator for the requested model family.

    Args:
        model_family: ``"logreg"`` for a scaled logistic-regression pipeline
            or ``"hist_gb"`` for a histogram gradient-boosting classifier.
        random_state: Seed forwarded to the underlying estimator.

    Returns:
        A ``Pipeline`` (``StandardScaler`` followed by ``LogisticRegression``)
        for ``"logreg"``, or a ``HistGradientBoostingClassifier`` for
        ``"hist_gb"``.

    Raises:
        ValueError: If ``model_family`` is neither of the two names above.
    """
    if model_family == "hist_gb":
        return HistGradientBoostingClassifier(
            learning_rate=0.05,
            max_iter=300,
            max_depth=5,
            min_samples_leaf=20,
            random_state=random_state,
        )

    if model_family != "logreg":
        raise ValueError(f"unknown model_family: {model_family}")

    scaler_step = ("scaler", StandardScaler())
    classifier_step = (
        "clf",
        LogisticRegression(
            class_weight="balanced",
            max_iter=1000,
            random_state=random_state,
        ),
    )
    return Pipeline([scaler_step, classifier_step])
|
||||
|
||||
|
||||
def train_candidate(
    model_family: str,
    x_train,
    y_train: np.ndarray,
    x_val,
    y_val: np.ndarray,
    random_state: int,
    min_precision: float,
    fixed_threshold: float | None,
) -> dict[str, Any]:
    """Fit one candidate family on the training split and score it on validation.

    Args:
        model_family: Family name passed through to ``make_model``.
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels.
        random_state: Seed passed through to ``make_model``.
        min_precision: Precision constraint handed to ``select_threshold``;
            only used when ``fixed_threshold`` is ``None``.
        fixed_threshold: When given, used as the decision threshold as-is and
            no threshold search is run.

    Returns:
        A dict with the keys ``model_family``, ``threshold`` (as ``float``),
        ``threshold_info`` and ``validation_metrics``. The fitted estimator
        itself is not returned.
    """
    estimator = make_model(model_family=model_family, random_state=random_state)
    estimator.fit(x_train, y_train)
    # Keep only column index 1 of the predicted probabilities.
    val_probabilities = estimator.predict_proba(x_val)[:, 1]

    if fixed_threshold is None:
        # No threshold forced by the caller: pick one from validation scores.
        threshold, threshold_info = select_threshold(
            y_true=y_val,
            y_prob=val_probabilities,
            min_precision=min_precision,
        )
    else:
        threshold = fixed_threshold
        threshold_info = {
            "selection_rule": "fixed_cli_threshold",
            "threshold": float(fixed_threshold),
        }

    val_metrics = evaluate_probs(
        y_true=y_val,
        y_prob=val_probabilities,
        threshold=threshold,
    )

    summary: dict[str, Any] = {
        "model_family": model_family,
        "threshold": float(threshold),
        "threshold_info": threshold_info,
        "validation_metrics": val_metrics,
    }
    return summary
|
||||
|
||||
|
||||
def main() -> int:
|
||||
@@ -126,30 +194,38 @@ def main() -> int:
|
||||
x_test = test_df[feature_cols]
|
||||
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
|
||||
|
||||
base_model = make_model()
|
||||
base_model.fit(x_train, y_train)
|
||||
y_val_prob = base_model.predict_proba(x_val)[:, 1]
|
||||
|
||||
if args.threshold is not None:
|
||||
chosen_threshold = args.threshold
|
||||
threshold_info = {
|
||||
"selection_rule": "fixed_cli_threshold",
|
||||
"threshold": float(args.threshold),
|
||||
}
|
||||
else:
|
||||
chosen_threshold, threshold_info = select_threshold(
|
||||
y_true=y_val,
|
||||
y_prob=y_val_prob,
|
||||
candidate_families = ["logreg", "hist_gb"] if args.model_family == "auto" else [args.model_family]
|
||||
candidates = [
|
||||
train_candidate(
|
||||
model_family=family,
|
||||
x_train=x_train,
|
||||
y_train=y_train,
|
||||
x_val=x_val,
|
||||
y_val=y_val,
|
||||
random_state=args.random_state,
|
||||
min_precision=args.min_precision,
|
||||
fixed_threshold=args.threshold,
|
||||
)
|
||||
|
||||
val_metrics = evaluate_probs(y_true=y_val, y_prob=y_val_prob, threshold=chosen_threshold)
|
||||
for family in candidate_families
|
||||
]
|
||||
best_candidate = max(
|
||||
candidates,
|
||||
key=lambda c: (
|
||||
safe_pr_auc(c["validation_metrics"]),
|
||||
safe_roc_auc(c["validation_metrics"]),
|
||||
float(c["validation_metrics"]["f1"]),
|
||||
),
|
||||
)
|
||||
selected_model_family = str(best_candidate["model_family"])
|
||||
chosen_threshold = float(best_candidate["threshold"])
|
||||
threshold_info = best_candidate["threshold_info"]
|
||||
val_metrics = best_candidate["validation_metrics"]
|
||||
|
||||
train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
|
||||
x_train_val = train_val_df[feature_cols]
|
||||
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
|
||||
|
||||
final_model = make_model()
|
||||
final_model = make_model(model_family=selected_model_family, random_state=args.random_state)
|
||||
final_model.fit(x_train_val, y_train_val)
|
||||
y_test_prob = final_model.predict_proba(x_test)[:, 1]
|
||||
test_metrics = evaluate_probs(y_true=y_test, y_prob=y_test_prob, threshold=chosen_threshold)
|
||||
@@ -158,6 +234,8 @@ def main() -> int:
|
||||
"generated_at": datetime.now(timezone.utc).isoformat(),
|
||||
"site": args.site,
|
||||
"model_version": args.model_version,
|
||||
"model_family_requested": args.model_family,
|
||||
"model_family": selected_model_family,
|
||||
"feature_set": args.feature_set,
|
||||
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
"feature_columns": feature_cols,
|
||||
@@ -194,6 +272,17 @@ def main() -> int:
|
||||
**threshold_info,
|
||||
"min_precision_constraint": args.min_precision,
|
||||
},
|
||||
"candidate_models": [
|
||||
{
|
||||
"model_family": c["model_family"],
|
||||
"threshold_selection": {
|
||||
**c["threshold_info"],
|
||||
"min_precision_constraint": args.min_precision,
|
||||
},
|
||||
"validation_metrics": c["validation_metrics"],
|
||||
}
|
||||
for c in candidates
|
||||
],
|
||||
"validation_metrics": val_metrics,
|
||||
"test_metrics": test_metrics,
|
||||
}
|
||||
@@ -202,6 +291,7 @@ def main() -> int:
|
||||
print("Rain model training summary:")
|
||||
print(f" site: {args.site}")
|
||||
print(f" model_version: {args.model_version}")
|
||||
print(f" model_family: {selected_model_family} (requested={args.model_family})")
|
||||
print(f" feature_set: {args.feature_set} ({len(feature_cols)} features)")
|
||||
print(f" rows: total={report['data_window']['model_rows']} train={report['split']['train_rows']} val={report['split']['val_rows']} test={report['split']['test_rows']}")
|
||||
print(
|
||||
@@ -250,6 +340,7 @@ def main() -> int:
|
||||
else:
|
||||
artifact = {
|
||||
"model": final_model,
|
||||
"model_family": selected_model_family,
|
||||
"features": feature_cols,
|
||||
"feature_set": args.feature_set,
|
||||
"forecast_model": args.forecast_model if needs_forecast else None,
|
||||
|
||||
Reference in New Issue
Block a user