bugfix wunderground reporting

This commit is contained in:
2026-03-09 09:19:45 +11:00
parent 5b8cad905f
commit c796f1324e
12 changed files with 253 additions and 33 deletions

View File

@@ -45,6 +45,7 @@ site:
latitude: -33.8688 # WGS84 latitude
longitude: 151.2093 # WGS84 longitude
elevation_m: 50 # Currently informational (not used by Open-Meteo ECMWF endpoint)
timezone: "Australia/Sydney" # IANA timezone used for daily rain boundary (e.g. Wunderground dailyrainin)
pollers:
open_meteo:
@@ -67,6 +68,7 @@ wunderground:
- The Open-Meteo ECMWF endpoint is queried by the poller only. The UI reads forecasts from TimescaleDB.
- Web UI supports Local/UTC toggle and date-aligned ranges (6h, 24h, 72h, 7d).
- `mqtt.topic` is still supported for single-topic configs, but `mqtt.topics` is preferred.
- Set `site.timezone` to your station timezone so Wunderground daily rain resets at local midnight.
## Schema & tables
TimescaleDB schema is initialized from `db/init/001_schema.sql` and includes:

View File

@@ -39,7 +39,17 @@ func main() {
cancel()
}()
latest := &mqttingest.Latest{}
rainDayLoc := time.Local
if cfg.Site.Timezone != "" {
loc, err := time.LoadLocation(cfg.Site.Timezone)
if err != nil {
log.Fatalf("site timezone load: %v", err)
}
rainDayLoc = loc
}
log.Printf("rain day timezone: %s", rainDayLoc)
latest := mqttingest.NewLatest(rainDayLoc)
forecastCache := &ForecastCache{}
d, err := db.Open(ctx, cfg.DB.ConnString)

View File

@@ -18,6 +18,7 @@ site:
latitude: -33.8688
longitude: 151.2093
elevation_m: 50
timezone: "Australia/Sydney"
pollers:
open_meteo:

View File

@@ -36,6 +36,7 @@ services:
RAIN_SITE: "home"
RAIN_MODEL_NAME: "rain_next_1h"
RAIN_MODEL_VERSION_BASE: "rain-logreg-v1"
RAIN_MODEL_FAMILY: "logreg"
RAIN_FEATURE_SET: "baseline"
RAIN_FORECAST_MODEL: "ecmwf"
RAIN_LOOKBACK_DAYS: "30"

View File

@@ -47,6 +47,12 @@ Feature-set options:
- `extended`: adds wind-direction encoding, lag/rolling stats, recent rain accumulation,
and aligned forecast features from `forecast_openmeteo_hourly`.
Model-family options (`train_rain_model.py`):
- `logreg`: logistic regression baseline.
- `hist_gb`: histogram gradient boosting (tree-based baseline).
- `auto`: trains both `logreg` and `hist_gb`, picks the best validation model by
PR-AUC, then ROC-AUC, then F1.
## Usage
### 1) Apply schema update (existing DBs)
`001_schema.sql` now includes `predictions_rain_1h`.
@@ -79,6 +85,7 @@ python scripts/train_rain_model.py \
--val-ratio 0.15 \
--min-precision 0.70 \
--feature-set "baseline" \
--model-family "logreg" \
--model-version "rain-logreg-v1" \
--out "models/rain_model.pkl" \
--report-out "models/rain_model_report.json" \
@@ -92,6 +99,7 @@ python scripts/train_rain_model.py \
--start "2026-02-01T00:00:00Z" \
--end "2026-03-03T23:55:00Z" \
--feature-set "extended" \
--model-family "logreg" \
--forecast-model "ecmwf" \
--model-version "rain-logreg-v1-extended" \
--out "models/rain_model_extended.pkl" \
@@ -99,6 +107,35 @@ python scripts/train_rain_model.py \
--dataset-out "models/datasets/rain_dataset_{model_version}_{feature_set}.csv"
```
### 3c) Train tree-based baseline (P1)
```sh
python scripts/train_rain_model.py \
--site "home" \
--start "2026-02-01T00:00:00Z" \
--end "2026-03-03T23:55:00Z" \
--feature-set "extended" \
--model-family "hist_gb" \
--forecast-model "ecmwf" \
--model-version "rain-hgb-v1-extended" \
--out "models/rain_model_hgb.pkl" \
--report-out "models/rain_model_report_hgb.json" \
--dataset-out "models/datasets/rain_dataset_{model_version}_{feature_set}.csv"
```
### 3d) Auto-compare logistic vs tree baseline
```sh
python scripts/train_rain_model.py \
--site "home" \
--start "2026-02-01T00:00:00Z" \
--end "2026-03-03T23:55:00Z" \
--feature-set "extended" \
--model-family "auto" \
--forecast-model "ecmwf" \
--model-version "rain-auto-v1-extended" \
--out "models/rain_model_auto.pkl" \
--report-out "models/rain_model_report_auto.json"
```
### 4) Run inference and store prediction
```sh
python scripts/predict_rain_model.py \

View File

@@ -2,6 +2,7 @@ package config
import (
"errors"
"fmt"
"os"
"strings"
"time"
@@ -31,6 +32,7 @@ type Config struct {
Latitude float64 `yaml:"latitude"`
Longitude float64 `yaml:"longitude"`
ElevationM float64 `yaml:"elevation_m"`
Timezone string `yaml:"timezone"`
} `yaml:"site"`
Pollers struct {
@@ -105,6 +107,11 @@ func Load(path string) (*Config, error) {
if c.Site.Name == "" {
c.Site.Name = "default"
}
if c.Site.Timezone != "" {
if _, err := time.LoadLocation(c.Site.Timezone); err != nil {
return nil, fmt.Errorf("invalid site timezone %q: %w", c.Site.Timezone, err)
}
}
if c.Pollers.OpenMeteo.Model == "" {
c.Pollers.OpenMeteo.Model = "ecmwf"
}

View File

@@ -27,7 +27,9 @@ type Latest struct {
// rolling sums built from "rain increment" values (mm)
rainIncs []rainIncPoint // last 1h
dailyIncs []rainIncPoint // since midnight (or since start; we'll trim daily by midnight)
dailyIncs []rainIncPoint // since midnight in rainDayLoc (or since start; trimmed each update)
rainDayLoc *time.Location
}
type rainIncPoint struct {
@@ -35,6 +37,15 @@ type rainIncPoint struct {
mm float64 // incremental rainfall at this timestamp (mm)
}
// NewLatest returns a Latest whose daily rain accumulation is bounded by
// midnight in rainDayLoc. A nil location falls back to the process-local
// timezone (time.Local).
func NewLatest(rainDayLoc *time.Location) *Latest {
	loc := rainDayLoc
	if loc == nil {
		loc = time.Local
	}
	return &Latest{rainDayLoc: loc}
}
func (l *Latest) Update(ts time.Time, p *WS90Payload) {
l.mu.Lock()
defer l.mu.Unlock()
@@ -49,9 +60,9 @@ func (l *Latest) Update(ts time.Time, p *WS90Payload) {
cutoff := ts.Add(-1 * time.Hour)
l.rainIncs = trimBefore(l.rainIncs, cutoff)
// Track daily increments: trim before local midnight
// Track daily increments: trim before midnight in configured rain day timezone.
l.dailyIncs = append(l.dailyIncs, rainIncPoint{ts: ts, mm: inc})
midnight := localMidnight(ts)
midnight := l.rainDayMidnight(ts)
l.dailyIncs = trimBefore(l.dailyIncs, midnight)
}
@@ -68,10 +79,13 @@ func trimBefore(a []rainIncPoint, cutoff time.Time) []rainIncPoint {
return a
}
// localMidnight returns midnight in the local timezone of the *process*.
// If you want a specific timezone (e.g. Australia/Sydney) we can wire that in later.
func localMidnight(t time.Time) time.Time {
lt := t.Local()
// rainDayMidnight returns the start of the day containing t, evaluated in the
// configured rain day timezone (falling back to time.Local when unset).
func (l *Latest) rainDayMidnight(t time.Time) time.Time {
	loc := l.rainDayLoc
	if loc == nil {
		loc = time.Local
	}
	year, month, day := t.In(loc).Date()
	return time.Date(year, month, day, 0, 0, 0, 0, loc)
}

View File

@@ -0,0 +1,50 @@
package mqttingest
import (
"testing"
"time"
)
// TestLatestDailyRainUsesConfiguredTimezone checks that the daily rain total
// keeps accumulating across UTC midnight as long as the configured local day
// (Australia/Sydney, UTC+11 during DST) has not rolled over.
func TestLatestDailyRainUsesConfiguredTimezone(t *testing.T) {
	sydney, err := time.LoadLocation("Australia/Sydney")
	if err != nil {
		t.Fatalf("load location: %v", err)
	}
	tracker := NewLatest(sydney)
	// All three samples fall on 15 Jan Sydney time even though the last one
	// is just past midnight UTC.
	base := time.Date(2026, time.January, 14, 22, 0, 0, 0, time.UTC)
	tracker.Update(base, &WS90Payload{RainMM: 0})
	tracker.Update(base.Add(90*time.Minute), &WS90Payload{RainMM: 2})
	tracker.Update(base.Add(125*time.Minute), &WS90Payload{RainMM: 2})
	snap, ok := tracker.Snapshot()
	if !ok {
		t.Fatal("expected snapshot")
	}
	if snap.DailyRainMM != 2 {
		t.Fatalf("expected daily rain 2.0mm, got %.2fmm", snap.DailyRainMM)
	}
}
// TestLatestDailyRainResetsAtConfiguredLocalMidnight checks that rain counted
// before local midnight in the configured timezone is discarded once an update
// arrives on the following local day (00:00 Sydney == 13:00 UTC during DST).
func TestLatestDailyRainResetsAtConfiguredLocalMidnight(t *testing.T) {
	sydney, err := time.LoadLocation("Australia/Sydney")
	if err != nil {
		t.Fatalf("load location: %v", err)
	}
	tracker := NewLatest(sydney)
	// 12:30 and 12:50 UTC are still 14 Jan... rather, 15 Jan local; the
	// 13:10 UTC sample lands after Sydney local midnight (16 Jan 00:10).
	start := time.Date(2026, time.January, 15, 12, 30, 0, 0, time.UTC)
	tracker.Update(start, &WS90Payload{RainMM: 0})
	tracker.Update(start.Add(20*time.Minute), &WS90Payload{RainMM: 1})
	tracker.Update(start.Add(40*time.Minute), &WS90Payload{RainMM: 1})
	snap, ok := tracker.Snapshot()
	if !ok {
		t.Fatal("expected snapshot")
	}
	if snap.DailyRainMM != 0 {
		t.Fatalf("expected daily rain reset after local midnight; got %.2fmm", snap.DailyRainMM)
	}
}

View File

@@ -9,6 +9,7 @@ MODEL_PATH="${MODEL_PATH:-models/rain_model.pkl}"
REPORT_PATH="${REPORT_PATH:-models/rain_model_report.json}"
AUDIT_PATH="${AUDIT_PATH:-models/rain_data_audit.json}"
FEATURE_SET="${FEATURE_SET:-baseline}"
MODEL_FAMILY="${MODEL_FAMILY:-logreg}"
FORECAST_MODEL="${FORECAST_MODEL:-ecmwf}"
DATASET_PATH="${DATASET_PATH:-models/datasets/rain_dataset_${MODEL_VERSION}_${FEATURE_SET}.csv}"
@@ -36,6 +37,7 @@ python scripts/train_rain_model.py \
--val-ratio 0.15 \
--min-precision 0.70 \
--feature-set "$FEATURE_SET" \
--model-family "$MODEL_FAMILY" \
--forecast-model "$FORECAST_MODEL" \
--model-version "$MODEL_VERSION" \
--out "$MODEL_PATH" \

View File

@@ -41,6 +41,7 @@ class WorkerConfig:
site: str
model_name: str
model_version_base: str
model_family: str
feature_set: str
forecast_model: str
train_interval_hours: float
@@ -130,6 +131,8 @@ def run_training_cycle(cfg: WorkerConfig, env: dict[str, str]) -> None:
str(cfg.min_precision),
"--feature-set",
cfg.feature_set,
"--model-family",
cfg.model_family,
"--forecast-model",
cfg.forecast_model,
"--model-version",
@@ -176,6 +179,7 @@ def load_config() -> WorkerConfig:
site=read_env("RAIN_SITE", "home"),
model_name=read_env("RAIN_MODEL_NAME", "rain_next_1h"),
model_version_base=read_env("RAIN_MODEL_VERSION_BASE", "rain-logreg-v1"),
model_family=read_env("RAIN_MODEL_FAMILY", "logreg"),
feature_set=read_env("RAIN_FEATURE_SET", "baseline"),
forecast_model=read_env("RAIN_FORECAST_MODEL", "ecmwf"),
train_interval_hours=read_env_float("RAIN_TRAIN_INTERVAL_HOURS", 24.0),
@@ -212,6 +216,7 @@ def main() -> int:
"[rain-ml] worker start "
f"site={cfg.site} "
f"model_name={cfg.model_name} "
f"model_family={cfg.model_family} "
f"feature_set={cfg.feature_set} "
f"forecast_model={cfg.forecast_model} "
f"train_interval_hours={cfg.train_interval_hours} "

View File

@@ -3,9 +3,11 @@ import argparse
import json
import os
from datetime import datetime, timezone
from typing import Any
import numpy as np
import psycopg2
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
@@ -22,6 +24,8 @@ from rain_model_common import (
feature_columns_need_forecast,
model_frame,
parse_time,
safe_pr_auc,
safe_roc_auc,
select_threshold,
split_time_ordered,
to_builtin,
@@ -33,6 +37,9 @@ except ImportError: # pragma: no cover - optional dependency
joblib = None
MODEL_FAMILIES = ("logreg", "hist_gb", "auto")
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Train a rain prediction model (next 1h >= 0.2mm).")
parser.add_argument("--db-url", default=os.getenv("DATABASE_URL"), help="Postgres connection string.")
@@ -60,6 +67,21 @@ def parse_args() -> argparse.Namespace:
default="ecmwf",
help="Forecast model name when feature set requires forecast columns.",
)
parser.add_argument(
"--model-family",
default="logreg",
choices=MODEL_FAMILIES,
help=(
"Estimator family. "
"'auto' compares logreg and hist_gb on validation and selects best by PR-AUC/ROC-AUC/F1."
),
)
parser.add_argument(
"--random-state",
type=int,
default=42,
help="Random seed for stochastic estimators.",
)
parser.add_argument("--out", default="models/rain_model.pkl", help="Path to save model.")
parser.add_argument(
"--report-out",
@@ -82,13 +104,59 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
def make_model() -> Pipeline:
return Pipeline(
[
("scaler", StandardScaler()),
("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
]
)
def make_model(model_family: str, random_state: int):
    """Build an untrained estimator for the requested model family.

    ``logreg`` returns a scaler + logistic-regression pipeline;
    ``hist_gb`` returns a histogram gradient-boosting classifier.
    Raises ValueError for any other family name.
    """
    if model_family == "hist_gb":
        return HistGradientBoostingClassifier(
            max_iter=300,
            learning_rate=0.05,
            max_depth=5,
            min_samples_leaf=20,
            random_state=random_state,
        )
    if model_family == "logreg":
        classifier = LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            random_state=random_state,
        )
        return Pipeline([("scaler", StandardScaler()), ("clf", classifier)])
    raise ValueError(f"unknown model_family: {model_family}")
def train_candidate(
    model_family: str,
    x_train,
    y_train: np.ndarray,
    x_val,
    y_val: np.ndarray,
    random_state: int,
    min_precision: float,
    fixed_threshold: float | None,
) -> dict[str, Any]:
    """Fit one candidate estimator and score it on the validation split.

    Trains a fresh model for ``model_family``, derives a decision threshold
    (either the fixed CLI value or one selected to satisfy ``min_precision``),
    and returns a summary dict (family, threshold, selection info, validation
    metrics) that callers can rank candidates with.
    """
    candidate = make_model(model_family=model_family, random_state=random_state)
    candidate.fit(x_train, y_train)
    val_probs = candidate.predict_proba(x_val)[:, 1]

    if fixed_threshold is None:
        # Pick a threshold on validation data subject to the precision floor.
        threshold, threshold_info = select_threshold(
            y_true=y_val,
            y_prob=val_probs,
            min_precision=min_precision,
        )
    else:
        threshold = fixed_threshold
        threshold_info = {
            "selection_rule": "fixed_cli_threshold",
            "threshold": float(fixed_threshold),
        }

    return {
        "model_family": model_family,
        "threshold": float(threshold),
        "threshold_info": threshold_info,
        "validation_metrics": evaluate_probs(
            y_true=y_val, y_prob=val_probs, threshold=threshold
        ),
    }
def main() -> int:
@@ -126,30 +194,38 @@ def main() -> int:
x_test = test_df[feature_cols]
y_test = test_df["rain_next_1h"].astype(int).to_numpy()
base_model = make_model()
base_model.fit(x_train, y_train)
y_val_prob = base_model.predict_proba(x_val)[:, 1]
if args.threshold is not None:
chosen_threshold = args.threshold
threshold_info = {
"selection_rule": "fixed_cli_threshold",
"threshold": float(args.threshold),
}
else:
chosen_threshold, threshold_info = select_threshold(
y_true=y_val,
y_prob=y_val_prob,
candidate_families = ["logreg", "hist_gb"] if args.model_family == "auto" else [args.model_family]
candidates = [
train_candidate(
model_family=family,
x_train=x_train,
y_train=y_train,
x_val=x_val,
y_val=y_val,
random_state=args.random_state,
min_precision=args.min_precision,
fixed_threshold=args.threshold,
)
val_metrics = evaluate_probs(y_true=y_val, y_prob=y_val_prob, threshold=chosen_threshold)
for family in candidate_families
]
best_candidate = max(
candidates,
key=lambda c: (
safe_pr_auc(c["validation_metrics"]),
safe_roc_auc(c["validation_metrics"]),
float(c["validation_metrics"]["f1"]),
),
)
selected_model_family = str(best_candidate["model_family"])
chosen_threshold = float(best_candidate["threshold"])
threshold_info = best_candidate["threshold_info"]
val_metrics = best_candidate["validation_metrics"]
train_val_df = model_df.iloc[: len(train_df) + len(val_df)]
x_train_val = train_val_df[feature_cols]
y_train_val = train_val_df["rain_next_1h"].astype(int).to_numpy()
final_model = make_model()
final_model = make_model(model_family=selected_model_family, random_state=args.random_state)
final_model.fit(x_train_val, y_train_val)
y_test_prob = final_model.predict_proba(x_test)[:, 1]
test_metrics = evaluate_probs(y_true=y_test, y_prob=y_test_prob, threshold=chosen_threshold)
@@ -158,6 +234,8 @@ def main() -> int:
"generated_at": datetime.now(timezone.utc).isoformat(),
"site": args.site,
"model_version": args.model_version,
"model_family_requested": args.model_family,
"model_family": selected_model_family,
"feature_set": args.feature_set,
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
"feature_columns": feature_cols,
@@ -194,6 +272,17 @@ def main() -> int:
**threshold_info,
"min_precision_constraint": args.min_precision,
},
"candidate_models": [
{
"model_family": c["model_family"],
"threshold_selection": {
**c["threshold_info"],
"min_precision_constraint": args.min_precision,
},
"validation_metrics": c["validation_metrics"],
}
for c in candidates
],
"validation_metrics": val_metrics,
"test_metrics": test_metrics,
}
@@ -202,6 +291,7 @@ def main() -> int:
print("Rain model training summary:")
print(f" site: {args.site}")
print(f" model_version: {args.model_version}")
print(f" model_family: {selected_model_family} (requested={args.model_family})")
print(f" feature_set: {args.feature_set} ({len(feature_cols)} features)")
print(f" rows: total={report['data_window']['model_rows']} train={report['split']['train_rows']} val={report['split']['val_rows']} test={report['split']['test_rows']}")
print(
@@ -250,6 +340,7 @@ def main() -> int:
else:
artifact = {
"model": final_model,
"model_family": selected_model_family,
"features": feature_cols,
"feature_set": args.feature_set,
"forecast_model": args.forecast_model if needs_forecast else None,

View File

@@ -24,7 +24,7 @@ Priority key: `P0` = critical/blocking, `P1` = important, `P2` = later optimizat
## 4) Modeling and Validation
- [x] [P0] Keep logistic regression as baseline.
- [ ] [P1] Add at least one tree-based baseline (e.g. gradient boosting).
- [x] [P1] Add at least one tree-based baseline (e.g. gradient boosting). (implemented via `hist_gb`; runtime evaluation pending local Python deps)
- [x] [P0] Use strict time-based train/validation/test splits (no random shuffling).
- [ ] [P1] Add walk-forward backtesting across multiple temporal folds.
- [ ] [P1] Tune hyperparameters on validation data only.