more work on model training
This commit is contained in:
@@ -9,10 +9,13 @@ import numpy as np
|
||||
import psycopg2
|
||||
|
||||
from rain_model_common import (
|
||||
FEATURE_COLUMNS,
|
||||
AVAILABLE_FEATURE_SETS,
|
||||
RAIN_EVENT_THRESHOLD_MM,
|
||||
build_dataset,
|
||||
feature_columns_for_set,
|
||||
feature_columns_need_forecast,
|
||||
fetch_baro,
|
||||
fetch_forecast,
|
||||
fetch_ws90,
|
||||
model_frame,
|
||||
parse_time,
|
||||
@@ -26,6 +29,17 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument("--site", required=True, help="Site name (e.g. home).")
|
||||
parser.add_argument("--start", help="Start time (RFC3339 or YYYY-MM-DD).")
|
||||
parser.add_argument("--end", help="End time (RFC3339 or YYYY-MM-DD).")
|
||||
parser.add_argument(
|
||||
"--feature-set",
|
||||
default="baseline",
|
||||
choices=AVAILABLE_FEATURE_SETS,
|
||||
help="Named feature set used for model-readiness auditing.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--forecast-model",
|
||||
default="ecmwf",
|
||||
help="Forecast model name when feature set requires forecast columns.",
|
||||
)
|
||||
parser.add_argument("--out", default="models/rain_data_audit.json", help="Path to save JSON audit report.")
|
||||
return parser.parse_args()
|
||||
|
||||
@@ -65,13 +79,18 @@ def main() -> int:
|
||||
|
||||
start = parse_time(args.start) if args.start else ""
|
||||
end = parse_time(args.end) if args.end else ""
|
||||
feature_cols = feature_columns_for_set(args.feature_set)
|
||||
needs_forecast = feature_columns_need_forecast(feature_cols)
|
||||
|
||||
with psycopg2.connect(args.db_url) as conn:
|
||||
ws90 = fetch_ws90(conn, args.site, start, end)
|
||||
baro = fetch_baro(conn, args.site, start, end)
|
||||
forecast = None
|
||||
if needs_forecast:
|
||||
forecast = fetch_forecast(conn, args.site, start, end, model=args.forecast_model)
|
||||
|
||||
df = build_dataset(ws90, baro, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
|
||||
model_df = model_frame(df, FEATURE_COLUMNS, require_target=True)
|
||||
df = build_dataset(ws90, baro, forecast=forecast, rain_event_threshold_mm=RAIN_EVENT_THRESHOLD_MM)
|
||||
model_df = model_frame(df, feature_cols, require_target=True)
|
||||
|
||||
ws90_dupes = int(ws90.duplicated(subset=["ts", "station_id"]).sum()) if not ws90.empty else 0
|
||||
baro_dupes = int(baro.duplicated(subset=["ts", "source"]).sum()) if not baro.empty else 0
|
||||
@@ -95,7 +114,7 @@ def main() -> int:
|
||||
baro_max_gap_min = longest_zero_run(np.array(baro_counts)) * 5 if len(baro_counts) else 0
|
||||
|
||||
missingness = {}
|
||||
for col in FEATURE_COLUMNS + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
|
||||
for col in feature_cols + ["pressure_hpa", "rain_mm", "rain_inc", "rain_next_1h_mm"]:
|
||||
if col in df.columns:
|
||||
missingness[col] = float(df[col].isna().mean())
|
||||
|
||||
@@ -105,6 +124,9 @@ def main() -> int:
|
||||
|
||||
report = {
|
||||
"site": args.site,
|
||||
"feature_set": args.feature_set,
|
||||
"feature_columns": feature_cols,
|
||||
"forecast_model": args.forecast_model if needs_forecast else None,
|
||||
"target_definition": f"rain_next_1h_mm >= {RAIN_EVENT_THRESHOLD_MM:.2f}",
|
||||
"requested_window": {
|
||||
"start": start or None,
|
||||
@@ -121,6 +143,7 @@ def main() -> int:
|
||||
"row_counts": {
|
||||
"ws90_rows": int(len(ws90)),
|
||||
"baro_rows": int(len(baro)),
|
||||
"forecast_rows": int(len(forecast)) if forecast is not None else 0,
|
||||
"model_rows": int(len(model_df)),
|
||||
},
|
||||
"duplicates": {
|
||||
@@ -152,10 +175,12 @@ def main() -> int:
|
||||
|
||||
print("Rain data audit summary:")
|
||||
print(f" site: {report['site']}")
|
||||
print(f" feature_set: {report['feature_set']}")
|
||||
print(
|
||||
" rows: "
|
||||
f"ws90={report['row_counts']['ws90_rows']} "
|
||||
f"baro={report['row_counts']['baro_rows']} "
|
||||
f"forecast={report['row_counts']['forecast_rows']} "
|
||||
f"model={report['row_counts']['model_rows']}"
|
||||
)
|
||||
print(
|
||||
|
||||
Reference in New Issue
Block a user