#!/usr/bin/env python3
"""Download the Agentic Data Science course dataset — the NYC taxi slice.

One command, no API keys, Python standard library only (works on any Python
3.9+; no pip install needed):

    python3 get_data.py

It fetches the exact ~170 MB slice every lesson is built on into ./data/,
then checks each TLC file's SHA-256 against the pinned course checksums so you
can tell whether you have byte-for-byte the same data the figures were drawn
from (a mismatch is reported as a warning; the live weather pull is not
checksum-verified). It is safe to re-run: files already present and intact
are skipped.

The slice (public NYC TLC data via CloudFront + free Open-Meteo weather):

  yellow_tripdata_2024-02.parquet   the schema-drift pair (Unit C: the
  yellow_tripdata_2024-03.parquet   airport_fee -> Airport_fee casing flip)
  yellow_tripdata_2024-06.parquet   a clean summer month
  taxi_zone_lookup.csv              zone names (the C3 warehouse join)
  taxi_zones.zip                    zone geometry (the Expedition Atlas map)
  weather_hourly_nyc.json           Open-Meteo hourly weather (the demand model)

Options:
  --check     verify the existing ./data/ against the pinned checksums and exit
              (downloads nothing)
  --data DIR  download into DIR instead of ./data

Sources & licence: NYC TLC Trip Record Data (public, redistributable);
weather by Open-Meteo.com (CC-BY 4.0). This script is the canonical copy
served by the course site at /get_data.py — keep the two in sync.
"""

from __future__ import annotations

import argparse
import hashlib
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path

# Base URL that every TLC file URL in this script is built from.
TLC_BASE = "https://d37ci6vzurychx.cloudfront.net"

# Open-Meteo archive request, assembled from the endpoint plus its query
# parameters (joined with "&" in this exact order).
OPEN_METEO_URL = "https://archive-api.open-meteo.com/v1/archive?" + "&".join(
    (
        "latitude=40.71",
        "longitude=-74.01",
        "start_date=2024-01-31",
        "end_date=2024-07-01",
        "hourly=temperature_2m,precipitation,snowfall,wind_speed_10m",
        "timezone=UTC",
    )
)

# Pinned to the bytes the course figures were built on. If TLC ever re-publishes
# a month (it has, historically), a mismatch here is a real signal, not a nuisance.
#
# Each entry is a dict with these keys:
#   name      file name written inside the data directory
#   url       where the file is downloaded from
#   sha256    pinned hex digest the file on disk is compared against
#   bytes     pinned size; only used for the "~N MB" hint printed before a download
#   required  True: a failed download / missing file makes the run exit non-zero;
#             False: the file is optional and its absence is tolerated
#   verify    optional, defaults to True; False skips the SHA-256 comparison
#   used_for  short description printed next to the download line
FILES = [
    {
        "name": "yellow_tripdata_2024-02.parquet",
        "url": f"{TLC_BASE}/trip-data/yellow_tripdata_2024-02.parquet",
        "sha256": "c76c43c18c6c6664080dd920baab4928988d5786a6b65980792ca7cd796f9f20",
        "bytes": 50349284,
        "required": True,
        "used_for": "Unit A cleaning · Unit C schema-drift pair",
    },
    {
        "name": "yellow_tripdata_2024-03.parquet",
        "url": f"{TLC_BASE}/trip-data/yellow_tripdata_2024-03.parquet",
        "sha256": "2d4cdc8fb96726cdd3803b13b02d2e61e71d45720aff0ebc693a8bdd1f249823",
        "bytes": 60078280,
        "required": True,
        "used_for": "Unit C schema-drift pair · the zone-hour panel",
    },
    {
        "name": "yellow_tripdata_2024-06.parquet",
        "url": f"{TLC_BASE}/trip-data/yellow_tripdata_2024-06.parquet",
        "sha256": "677cf14c8347f745b583f012fbdba072334c6c4efa17bfd6a64369f2ba30329c",
        "bytes": 59859922,
        # Optional: the lab still works on the Feb/Mar pair if June won't fetch.
        "required": False,
        "used_for": "a clean summer month (Units D–F)",
    },
    {
        "name": "taxi_zone_lookup.csv",
        "url": f"{TLC_BASE}/misc/taxi_zone_lookup.csv",
        "sha256": "1a99e105092230f8620f301edcca7f80d3080642ff404d28ed957d3fa222c8ed",
        "bytes": 12331,
        "required": True,
        "used_for": "zone names — the C3 warehouse join",
    },
    {
        "name": "taxi_zones.zip",
        "url": f"{TLC_BASE}/misc/taxi_zones.zip",
        "sha256": "f6d711917bb4340f8f644d5366c51665489eb2d426dd1a4a55677721ae5adf17",
        "bytes": 1022574,
        "required": True,
        "used_for": "zone geometry — the Expedition Atlas map",
    },
    {
        "name": "weather_hourly_nyc.json",
        "url": OPEN_METEO_URL,
        "sha256": "9a865f8e4973c66bd09404715c4f74bdbdb2213e3cb4baeba5f3d1b5d80eda68",
        "bytes": 140890,
        # The live Open-Meteo archive can be revised; a mismatch just means a
        # slightly different (still valid) pull, so the checksum comparison is
        # switched off for this file ("verify": False) and the pinned sha256
        # above is kept for reference only.
        "required": True,
        "verify": False,
        "used_for": "hourly weather — the demand model",
    },
]


def sha256_of(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 1 MiB blocks so large downloads are never held in
    memory all at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while block := stream.read(1 << 20):
            digest.update(block)
    return digest.hexdigest()


def human(n: int) -> str:
    """Format a byte count for display: one-decimal MB from 1e6 up, whole KB below."""
    if n < 1e6:
        return f"{n / 1e3:.0f} KB"
    return f"{n / 1e6:.1f} MB"


def fetch(url: str, dest: Path) -> None:
    """Stream *url* to a ``.tmp`` sibling of *dest*, then move it into place.

    A one-line progress bar is drawn when the server sends a Content-Length.
    The final move uses ``Path.replace``, so *dest* is only ever replaced by
    a completely written file.

    Raises:
        urllib.error.ContentTooShortError: the body ended before
            Content-Length bytes arrived. It is a ``URLError`` subclass, so
            callers that retry on ``URLError`` retry truncated downloads too.
        urllib.error.URLError, OSError: propagated unchanged from the
            request, the read, or the local file operations.

    On any failure (including Ctrl-C) the partial ``.tmp`` file is removed so
    nothing half-written is left next to *dest*.
    """
    tmp = dest.with_suffix(dest.suffix + ".tmp")
    req = urllib.request.Request(url, headers={"User-Agent": "agentic-ds-course/1.0"})
    try:
        with urllib.request.urlopen(req, timeout=120) as resp, open(tmp, "wb") as out:
            total = int(resp.headers.get("Content-Length") or 0)
            got = 0
            last_step = -1  # last 5% bucket drawn; -1 means nothing drawn yet
            try:
                while True:
                    chunk = resp.read(1 << 20)
                    if not chunk:
                        break
                    out.write(chunk)
                    got += len(chunk)
                    if total:
                        pct = int(got * 100 / total)
                        # Redraw whenever a new 5% bucket is entered, rather
                        # than only when pct is exactly a multiple of 5 —
                        # chunk sizes rarely line up with those boundaries.
                        step = min(pct // 5, 20)
                        if step != last_step:
                            bar = "#" * step + "-" * (20 - step)
                            print(f"\r        [{bar}] {pct:3d}%  {human(got)}", end="", flush=True)
                            last_step = step
            finally:
                # Finish the progress line even on error, so whatever the
                # caller prints next starts on a fresh line.
                if last_step >= 0:
                    print()
        if total and got < total:
            # A connection that closes early just yields an empty read; do
            # not let a short body be moved into place as if it were whole.
            raise urllib.error.ContentTooShortError(
                f"download truncated: got {got} of {total} bytes", b""
            )
        tmp.replace(dest)
    except BaseException:
        # Best-effort cleanup of the partial file; the original error is what
        # the caller needs to see, so a failed unlink is deliberately ignored.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise


def verify_existing(data: Path) -> int:
    """Check the files already in *data* against FILES without downloading.

    A present file counts as ok when its SHA-256 matches the pinned value or
    when its entry opts out of verification; a missing optional file also
    counts as ok. Prints a summary plus one line per problem file.

    Returns 1 if any file mismatched or any required file is missing, else 0.
    """
    intact: list[str] = []
    mismatched: list[str] = []
    absent: list[str] = []

    for entry in FILES:
        name = entry["name"]
        path = data / name
        if path.exists():
            if entry.get("verify", True) and sha256_of(path) != entry["sha256"]:
                mismatched.append(name)
            else:
                intact.append(name)
        elif entry["required"]:
            absent.append(name)
        else:
            # Optional and not downloaded: tolerated, tallied with the ok files.
            intact.append(name)

    summary = f"\n  verified: {len(intact)} ok"
    if mismatched:
        summary += f", {len(mismatched)} checksum mismatch"
    if absent:
        summary += f", {len(absent)} missing required"
    print(summary)

    for name in mismatched:
        print(f"    [MISMATCH] {name} — re-run without --check to re-download")
    for name in absent:
        print(f"    [MISSING ] {name}")

    if mismatched or absent:
        return 1
    return 0


def _download(entry: dict, dest: Path, attempts: int = 3) -> str:
    """Download one FILES entry to *dest*, retrying transient network errors.

    Returns:
        "ok"        the file was fetched (and matched its pin, if verified);
        "mismatch"  the file was fetched but its SHA-256 differs from the
                    pinned value — warned about, but the file is kept;
        "failed"    no attempt succeeded.
    """
    for attempt in range(1, attempts + 1):
        try:
            print(f"  [get ] {entry['name']}  ~{human(entry['bytes'])}  — {entry['used_for']}")
            fetch(entry["url"], dest)
            status = "ok"
            if entry.get("verify", True) and sha256_of(dest) != entry["sha256"]:
                print("        [warn] checksum differs from the pinned course copy")
                print("               (TLC may have re-published this month; data is still usable)")
                status = "mismatch"
            print(f"        [ok ] {human(dest.stat().st_size)}")
            return status
        except (urllib.error.URLError, TimeoutError, ConnectionError) as e:
            # HTTPError is a URLError subclass; ConnectionError covers resets
            # that happen mid-stream, after the response has been opened.
            if attempt == attempts:
                # Nothing left to retry: report and stop without sleeping.
                print(f"        [fail] {e} — giving up after {attempts} attempts")
                break
            wait = attempt * 6
            print(f"        [retry {attempt}/{attempts}] {e} — CloudFront rate-limits bursts; waiting {wait}s")
            time.sleep(wait)
        except Exception as e:  # noqa: BLE001
            # Anything else (e.g. a local file error) will not be fixed by
            # retrying, so fail this file immediately.
            print(f"        [fail] {e}")
            break
    return "failed"


def main() -> int:
    """Parse the command line, then verify (--check) or download the dataset.

    Returns the process exit code: 0 on success (including a degraded set
    where only optional files are missing), 1 when a required file could not
    be downloaded or, in --check mode, when verification fails.
    """
    ap = argparse.ArgumentParser(description="Download the course NYC taxi dataset.")
    ap.add_argument("--check", action="store_true", help="verify the data directory and exit")
    ap.add_argument("--data", default="data", help="download directory (default: data)")
    args = ap.parse_args()

    data = Path(args.data)
    print("Agentic Data Science — course dataset")
    print(f"  target: {data.resolve()}\n")

    if args.check:
        return verify_existing(data)

    data.mkdir(parents=True, exist_ok=True)
    failures = []
    required_failed = []
    mismatched = []

    for f in FILES:
        dest = data / f["name"]
        if dest.exists():
            if not f.get("verify", True):
                # Unverified files are kept as-is; do not claim a checksum
                # result that was never computed.
                print(f"  [skip] {f['name']}  (already present, not checksum-verified)")
                continue
            if sha256_of(dest) == f["sha256"]:
                print(f"  [skip] {f['name']}  (already present, checksum ok)")
                continue

        status = _download(f, dest)
        if status == "mismatch":
            mismatched.append(f["name"])
        elif status == "failed":
            failures.append(f["name"])
            if f["required"]:
                required_failed.append(f["name"])
            kind = "required" if f["required"] else "optional"
            print(f"        [{'ERROR' if f['required'] else 'degrade'}] {kind} file not downloaded")

    print()
    if required_failed:
        print(f"FAILED — required files missing: {required_failed}")
        print("Re-run `python3 get_data.py` (CloudFront 403s are transient — try again in a minute).")
        return 1

    # Show the directory the user actually chose: "./" only makes sense in
    # front of a relative path.
    where = str(data) if data.is_absolute() else f"./{data}"
    if failures:
        print(f"Done, with a degraded set (optional files skipped: {failures}).")
    else:
        print(f"Done. The full course dataset is in {where}/ — you're ready to follow along.")
    if mismatched:
        print(f"Note: checksum differs from the pinned course copy for: {mismatched}")

    sample = (data / "yellow_tripdata_2024-03.parquet").as_posix()
    print("\nQuick look (no extra install if you have DuckDB):")
    print(f"  duckdb -c \"SELECT count(*) FROM '{sample}'\"")
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
