diff --git a/notebooks/player_bio_download.ipynb b/notebooks/player_bio_download.ipynb new file mode 100644 index 0000000..2715844 --- /dev/null +++ b/notebooks/player_bio_download.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6a2704d3", + "metadata": {}, + "source": [ + "# Player Bio Download\n", + "\n", + "Raw enrichment notebook for player biographical attributes. This workflow is intentionally independent from ranking logic so it can be rerun/versioned as a standalone data layer.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f746271", + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import annotations\n", + "from datetime import date\n", + "from pathlib import Path\n", + "import json\n", + "import urllib.error\n", + "import urllib.parse\n", + "import urllib.request\n", + "import pandas as pd\n", + "from IPython.display import display\n", + "from pybaseball import batting_stats, pitching_stats, cache, chadwick_register\n", + "cache.enable()\n" + ] + }, + { + "cell_type": "markdown", + "id": "5b65f5e1", + "metadata": {}, + "source": [ + "## Configuration\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a7be6bb", + "metadata": {}, + "outputs": [], + "source": [ + "DATA_DIR = Path('data')\n", + "DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "DATE_TAG = date.today().strftime('%Y%m%d')\n", + "CURRENT_YEAR = date.today().year\n", + "YEARS_TO_LOAD = [CURRENT_YEAR - 2, CURRENT_YEAR - 1, CURRENT_YEAR]\n", + "INCLUDE_STATUS_POOL = True\n", + "\n", + "STATS_API_BASE = 'https://statsapi.mlb.com/api/v1'\n", + "PEOPLE_BATCH_SIZE = 100\n", + "\n", + "YEARS_TO_LOAD\n" + ] + }, + { + "cell_type": "markdown", + "id": "f43915cd", + "metadata": {}, + "source": [ + "## 1) Load player universe (leaderboards + optional active status draft pool)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5b0a5a9", + "metadata": {}, + "outputs": [], + 
def fetch_leaderboard_ids(year: int) -> pd.DataFrame:
    """Collect FanGraphs player IDs from one season's batting and pitching leaderboards.

    Args:
        year: Season passed to ``batting_stats`` and ``pitching_stats``.

    Returns:
        A frame with columns ``fangraphs_id`` (nullable ``Int64``),
        ``full_name_lb``, ``source`` (``'hitter_lb'`` or ``'pitcher_lb'``) and
        ``season``. Rows whose ID is not numeric are dropped. A player present
        on both leaderboards yields one row per leaderboard; de-duplication is
        left to the caller.
    """
    leaderboards = [
        ('hitter_lb', batting_stats(year, qual=0)),
        ('pitcher_lb', pitching_stats(year, qual=0)),
    ]

    id_frames = []
    for source_name, frame in leaderboards:
        if frame.empty or 'IDfg' not in frame.columns:
            continue
        # reindex (rather than frame[[...]]) keeps the IDs even when the
        # leaderboard has no 'Name' column; the name is then all-NA instead of
        # the selection raising KeyError.
        tmp = frame.reindex(columns=['IDfg', 'Name'])
        tmp['source'] = source_name
        tmp['season'] = year
        id_frames.append(tmp)

    if not id_frames:
        # Give the empty result the same key dtype as the populated one so
        # concatenating seasons never degrades 'fangraphs_id' to object.
        empty = pd.DataFrame(columns=['fangraphs_id', 'full_name_lb', 'source', 'season'])
        return empty.astype({'fangraphs_id': 'Int64'})

    out = pd.concat(id_frames, ignore_index=True)
    out = out.rename(columns={'IDfg': 'fangraphs_id', 'Name': 'full_name_lb'})
    out['fangraphs_id'] = pd.to_numeric(out['fangraphs_id'], errors='coerce').astype('Int64')
    return out.dropna(subset=['fangraphs_id'])


def load_latest_status_pool_ids(data_dir: Path) -> pd.DataFrame:
    """Read MLBAM IDs from the newest roster status snapshot in ``data_dir``.

    Snapshots are files matching ``player_status_roster_snapshot_*.parquet``.
    "Newest" means the last filename in lexicographic order, which is only the
    most recent snapshot if the suffix sorts chronologically (presumably a
    YYYYMMDD tag -- verify against whatever writes these files).

    Args:
        data_dir: Directory searched (non-recursively) for snapshot files.

    Returns:
        A frame with columns ``mlbam_id`` (nullable ``Int64``, unique, no NA)
        and ``status_snapshot_source`` (the snapshot's filename). The frame is
        empty when no snapshot exists or the snapshot has no ``person.id``
        column.
    """
    empty = pd.DataFrame(columns=['mlbam_id', 'status_snapshot_source'])
    empty = empty.astype({'mlbam_id': 'Int64'})

    candidates = sorted(data_dir.glob('player_status_roster_snapshot_*.parquet'))
    if not candidates:
        return empty

    latest = candidates[-1]
    frame = pd.read_parquet(latest)
    if 'person.id' not in frame.columns:
        return empty

    # Coerce first, then de-duplicate: values that only become equal after
    # numeric coercion (e.g. '123' and 123) must collapse to a single ID.
    ids = pd.to_numeric(frame['person.id'], errors='coerce').astype('Int64')
    out = ids.dropna().drop_duplicates().to_frame('mlbam_id')
    out['status_snapshot_source'] = latest.name
    return out
def fetch_people_batch(mlbam_ids: list[int]) -> pd.DataFrame:
    """Fetch biographical fields for a batch of players from the Stats API.

    Issues one GET to ``{STATS_API_BASE}/people`` with all IDs joined into the
    ``personIds`` query parameter.

    Args:
        mlbam_ids: MLBAM person IDs to look up. An empty list performs no
            request.

    Returns:
        One row per person in the response, always with the columns
        ``mlbam_id``, ``full_name``, ``birth_date``, ``bats``, ``throws``,
        ``birth_country``, ``birth_city`` and ``debut_date`` -- including when
        the input or the response is empty, so batches concatenate to a
        predictable schema. Fields absent from the payload are ``None``.

    Raises:
        urllib.error.HTTPError, urllib.error.URLError: propagated from
            ``urlopen``; handling is left to the caller.
    """
    columns = [
        'mlbam_id', 'full_name', 'birth_date', 'bats', 'throws',
        'birth_country', 'birth_city', 'debut_date',
    ]
    if not mlbam_ids:
        return pd.DataFrame(columns=columns)

    # NOTE: 'currentTeam' is hydrated but no team field is read below.
    params = urllib.parse.urlencode({
        'personIds': ','.join(str(i) for i in mlbam_ids),
        'hydrate': 'currentTeam',
    })
    url = f'{STATS_API_BASE}/people?{params}'
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.loads(response.read().decode('utf-8'))

    # 'batSide' / 'pitchHand' are nested objects that may be missing or null,
    # hence the `or {}` before reading their 'code'.
    rows = [
        {
            'mlbam_id': p.get('id'),
            'full_name': p.get('fullName'),
            'birth_date': p.get('birthDate'),
            'bats': (p.get('batSide') or {}).get('code'),
            'throws': (p.get('pitchHand') or {}).get('code'),
            'birth_country': p.get('birthCountry'),
            'birth_city': p.get('birthCity'),
            'debut_date': p.get('mlbDebutDate'),
        }
        # `or []` also tolerates an explicit null 'people' value.
        for p in (payload.get('people') or [])
    ]
    return pd.DataFrame(rows, columns=columns)
# --- 3) Normalize to a stable schema -----------------------------------------
raw_cols = [
    'mlbam_id', 'full_name', 'birth_date', 'bats', 'throws',
    'birth_country', 'birth_city', 'debut_date',
]
# bio_raw falls back to a column-less DataFrame when no batch was fetched.
# reindex returns a new frame with every expected column present (all-NA when
# missing), so that case produces an empty artifact instead of a KeyError.
player_bio = bio_raw.reindex(columns=raw_cols)
player_bio['mlbam_id'] = pd.to_numeric(player_bio['mlbam_id'], errors='coerce').astype('Int64')

# Attach the FanGraphs ID. The register is reduced to one row per MLBAM ID so
# the left join cannot multiply player rows.
player_bio = player_bio.merge(
    register[['mlbam_id', 'fangraphs_id']].drop_duplicates('mlbam_id'),
    on='mlbam_id',
    how='left',
)

# Unparseable dates become NaT rather than raising.
for date_col in ('birth_date', 'debut_date'):
    player_bio[date_col] = pd.to_datetime(player_bio[date_col], errors='coerce')

# Age in fractional years as of the day the notebook runs.
today = pd.Timestamp(date.today())
player_bio['age'] = ((today - player_bio['birth_date']).dt.days / 365.25).round(2)

stable_cols = [
    'mlbam_id', 'fangraphs_id', 'full_name',
    'birth_date', 'age',
    'bats', 'throws',
    'birth_country', 'birth_city',
    'debut_date',
]

player_bio = (
    player_bio[stable_cols]
    .drop_duplicates(subset=['mlbam_id'])
    .sort_values(['mlbam_id'])
    .reset_index(drop=True)
)

player_bio.head()

# --- 4) Basic quality checks -------------------------------------------------
# Share of missing values per column, worst first.
missing_rates = (
    player_bio.isna()
    .mean()
    .sort_values(ascending=False)
    .rename('missing_rate')
    .to_frame()
)

# Any code outside these sets is reported, not removed.
# NOTE(review): 'S' is accepted for bats but not for throws -- confirm whether
# the API can return 'S' for a switch pitcher.
valid_handedness = {'L', 'R', 'S'}
invalid_bats = sorted(set(player_bio['bats'].dropna()) - valid_handedness)
invalid_throws = sorted(set(player_bio['throws'].dropna()) - {'L', 'R'})

print('Missing rates:')
display(missing_rates)
print('Invalid bats values:', invalid_bats)
print('Invalid throws values:', invalid_throws)

# --- 5) Save artifact --------------------------------------------------------
# One dated parquet file per run.
out_path = DATA_DIR / f'player_bio_{DATE_TAG}.parquet'
player_bio.to_parquet(out_path, index=False)
print(f'Wrote {out_path} ({len(player_bio):,} rows)')