work on model training

2026-03-05 11:03:20 +11:00
parent 96e72d7c43
commit c8e38cd597
10 changed files with 534 additions and 30 deletions
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+import time
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+
+def read_env(name: str, default: str) -> str:
+    return os.getenv(name, default).strip()
+
+
+def read_env_float(name: str, default: float) -> float:
+    raw = os.getenv(name)
+    if raw is None or raw.strip() == "":
+        return default
+    return float(raw)
+
+
+def read_env_int(name: str, default: int) -> int:
+    raw = os.getenv(name)
+    if raw is None or raw.strip() == "":
+        return default
+    return int(raw)
+
+
+def read_env_bool(name: str, default: bool) -> bool:
+    raw = os.getenv(name)
+    if raw is None:
+        return default
+    return raw.strip().lower() in {"1", "true", "yes", "on"}
+
+
+@dataclass
+class WorkerConfig:
+    database_url: str
+    site: str
+    model_name: str
+    model_version_base: str
+    train_interval_hours: float
+    predict_interval_minutes: float
+    lookback_days: int
+    train_ratio: float
+    val_ratio: float
+    min_precision: float
+    model_path: Path
+    report_path: Path
+    audit_path: Path
+    run_once: bool
+    retry_delay_seconds: int
+
+
+def now_utc() -> datetime:
+    return datetime.now(timezone.utc).replace(microsecond=0)
+
+
+def iso_utc(v: datetime) -> str:
+    return v.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
+
+
+def run_cmd(cmd: list[str], env: dict[str, str]) -> None:
+    print(f"[rain-ml] running: {' '.join(cmd)}", flush=True)
+    subprocess.run(cmd, env=env, check=True)
+
+
+def ensure_parent(path: Path) -> None:
+    if path.parent and not path.parent.exists():
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+
+def training_window(lookback_days: int) -> tuple[str, str]:
+    end = now_utc()
+    start = end - timedelta(days=lookback_days)
+    return iso_utc(start), iso_utc(end)
+
+
+def run_training_cycle(cfg: WorkerConfig, env: dict[str, str]) -> None:
+    start, end = training_window(cfg.lookback_days)
+    model_version = f"{cfg.model_version_base}-{now_utc().strftime('%Y%m%d%H%M')}"
+
+    ensure_parent(cfg.audit_path)
+    ensure_parent(cfg.report_path)
+    ensure_parent(cfg.model_path)
+
+    run_cmd(
+        [
+            sys.executable,
+            "scripts/audit_rain_data.py",
+            "--site",
+            cfg.site,
+            "--start",
+            start,
+            "--end",
+            end,
+            "--out",
+            str(cfg.audit_path),
+        ],
+        env,
+    )
+
+    run_cmd(
+        [
+            sys.executable,
+            "scripts/train_rain_model.py",
+            "--site",
+            cfg.site,
+            "--start",
+            start,
+            "--end",
+            end,
+            "--train-ratio",
+            str(cfg.train_ratio),
+            "--val-ratio",
+            str(cfg.val_ratio),
+            "--min-precision",
+            str(cfg.min_precision),
+            "--model-version",
+            model_version,
+            "--out",
+            str(cfg.model_path),
+            "--report-out",
+            str(cfg.report_path),
+        ],
+        env,
+    )
+
+
+def run_predict_once(cfg: WorkerConfig, env: dict[str, str]) -> None:
+    if not cfg.model_path.exists():
+        raise RuntimeError(f"model artifact not found: {cfg.model_path}")
+
+    run_cmd(
+        [
+            sys.executable,
+            "scripts/predict_rain_model.py",
+            "--site",
+            cfg.site,
+            "--model-path",
+            str(cfg.model_path),
+            "--model-name",
+            cfg.model_name,
+        ],
+        env,
+    )
+
+
+def load_config() -> WorkerConfig:
+    database_url = read_env("DATABASE_URL", "")
+    if not database_url:
+        raise SystemExit("DATABASE_URL is required")
+
+    return WorkerConfig(
+        database_url=database_url,
+        site=read_env("RAIN_SITE", "home"),
+        model_name=read_env("RAIN_MODEL_NAME", "rain_next_1h"),
+        model_version_base=read_env("RAIN_MODEL_VERSION_BASE", "rain-logreg-v1"),
+        train_interval_hours=read_env_float("RAIN_TRAIN_INTERVAL_HOURS", 24.0),
+        predict_interval_minutes=read_env_float("RAIN_PREDICT_INTERVAL_MINUTES", 10.0),
+        lookback_days=read_env_int("RAIN_LOOKBACK_DAYS", 30),
+        train_ratio=read_env_float("RAIN_TRAIN_RATIO", 0.7),
+        val_ratio=read_env_float("RAIN_VAL_RATIO", 0.15),
+        min_precision=read_env_float("RAIN_MIN_PRECISION", 0.70),
+        model_path=Path(read_env("RAIN_MODEL_PATH", "models/rain_model.pkl")),
+        report_path=Path(read_env("RAIN_REPORT_PATH", "models/rain_model_report.json")),
+        audit_path=Path(read_env("RAIN_AUDIT_PATH", "models/rain_data_audit.json")),
+        run_once=read_env_bool("RAIN_RUN_ONCE", False),
+        retry_delay_seconds=read_env_int("RAIN_RETRY_DELAY_SECONDS", 60),
+    )
+
+
+def main() -> int:
+    cfg = load_config()
+    env = os.environ.copy()
+    env["DATABASE_URL"] = cfg.database_url
+
+    train_every = timedelta(hours=cfg.train_interval_hours)
+    predict_every = timedelta(minutes=cfg.predict_interval_minutes)
+    next_train = now_utc()
+    next_predict = now_utc()
+    trained_once = False
+    predicted_once = False
+
+    print(
+        "[rain-ml] worker start "
+        f"site={cfg.site} "
+        f"model_name={cfg.model_name} "
+        f"train_interval_hours={cfg.train_interval_hours} "
+        f"predict_interval_minutes={cfg.predict_interval_minutes}",
+        flush=True,
+    )
+
+    while True:
+        now = now_utc()
+        try:
+            if now >= next_train:
+                run_training_cycle(cfg, env)
+                next_train = now + train_every
+                trained_once = True
+
+            if now >= next_predict:
+                run_predict_once(cfg, env)
+                next_predict = now + predict_every
+                predicted_once = True
+
+            if cfg.run_once and trained_once and predicted_once:
+                print("[rain-ml] run-once complete", flush=True)
+                return 0
+
+        except subprocess.CalledProcessError as exc:
+            print(f"[rain-ml] command failed exit={exc.returncode}; retrying in {cfg.retry_delay_seconds}s", flush=True)
+            time.sleep(cfg.retry_delay_seconds)
+            continue
+        except Exception as exc:  # pragma: no cover - defensive for runtime worker
+            print(f"[rain-ml] worker error: {exc}; retrying in {cfg.retry_delay_seconds}s", flush=True)
+            time.sleep(cfg.retry_delay_seconds)
+            continue
+
+        sleep_for = min((next_train - now).total_seconds(), (next_predict - now).total_seconds(), 30.0)
+        if sleep_for > 0:
+            time.sleep(sleep_for)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())