Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
347 changes: 347 additions & 0 deletions notebooks/player_bio_download.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,347 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6a2704d3",
"metadata": {},
"source": [
"# Player Bio Download\n",
"\n",
"Raw enrichment notebook for player biographical attributes. This workflow is intentionally independent from ranking logic so it can be rerun/versioned as a standalone data layer.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f746271",
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"from datetime import date\n",
"from pathlib import Path\n",
"import json\n",
"import urllib.error\n",
"import urllib.parse\n",
"import urllib.request\n",
"import pandas as pd\n",
"from IPython.display import display\n",
"from pybaseball import batting_stats, pitching_stats, cache, chadwick_register\n",
"cache.enable()\n"
]
},
{
"cell_type": "markdown",
"id": "5b65f5e1",
"metadata": {},
"source": [
"## Configuration\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a7be6bb",
"metadata": {},
"outputs": [],
"source": [
"# Directory that holds every artifact this notebook reads or writes.\n",
"DATA_DIR = Path('data')\n",
"DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"# Read the clock once so the file tag and the season window cannot disagree\n",
"# when a run straddles midnight (most visibly on New Year's Eve).\n",
"RUN_DATE = date.today()\n",
"DATE_TAG = RUN_DATE.strftime('%Y%m%d')\n",
"CURRENT_YEAR = RUN_DATE.year\n",
"\n",
"# Current season plus the two before it, oldest first.\n",
"YEARS_TO_LOAD = [CURRENT_YEAR - offset for offset in (2, 1, 0)]\n",
"\n",
"# When True, IDs from the latest roster-status snapshot are added to the universe.\n",
"INCLUDE_STATUS_POOL = True\n",
"\n",
"STATS_API_BASE = 'https://statsapi.mlb.com/api/v1'\n",
"# Number of person IDs sent per /people request.\n",
"PEOPLE_BATCH_SIZE = 100\n",
"\n",
"YEARS_TO_LOAD\n"
]
},
{
"cell_type": "markdown",
"id": "f43915cd",
"metadata": {},
"source": [
"## 1) Load player universe (leaderboards + optional active status draft pool)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5b0a5a9",
"metadata": {},
"outputs": [],
"source": [
"def fetch_leaderboard_ids(year: int) -> pd.DataFrame:\n",
"    \"\"\"Collect FanGraphs player IDs from one season's batting and pitching leaderboards.\n",
"\n",
"    Args:\n",
"        year: Season passed to ``batting_stats`` and ``pitching_stats`` (both with ``qual=0``).\n",
"\n",
"    Returns:\n",
"        One row per leaderboard appearance with columns ``fangraphs_id`` (nullable\n",
"        ``Int64``), ``full_name_lb``, ``source`` ('hitter_lb' or 'pitcher_lb') and\n",
"        ``season``. Rows whose ID is missing or non-numeric are dropped. An empty frame\n",
"        with those four columns is returned when neither leaderboard is usable.\n",
"    \"\"\"\n",
"    leaderboards = {\n",
"        'hitter_lb': batting_stats(year, qual=0),\n",
"        'pitcher_lb': pitching_stats(year, qual=0),\n",
"    }\n",
"\n",
"    # Keep only boards that actually carry an ID column, tagging each with its origin.\n",
"    tagged = [\n",
"        board[['IDfg', 'Name']].assign(source=label, season=year)\n",
"        for label, board in leaderboards.items()\n",
"        if not board.empty and 'IDfg' in board.columns\n",
"    ]\n",
"    if not tagged:\n",
"        return pd.DataFrame(columns=['fangraphs_id', 'full_name_lb', 'source', 'season'])\n",
"\n",
"    combined = pd.concat(tagged, ignore_index=True).rename(\n",
"        columns={'IDfg': 'fangraphs_id', 'Name': 'full_name_lb'}\n",
"    )\n",
"    combined['fangraphs_id'] = pd.to_numeric(combined['fangraphs_id'], errors='coerce').astype('Int64')\n",
"    return combined.dropna(subset=['fangraphs_id'])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5909c5ef",
"metadata": {},
"outputs": [],
"source": [
"# Stack every season's IDs, then keep the first row seen for each player.\n",
"season_frames = [fetch_leaderboard_ids(year) for year in YEARS_TO_LOAD]\n",
"leaderboard_ids = (\n",
"    pd.concat(season_frames, ignore_index=True)\n",
"    .drop_duplicates(subset=['fangraphs_id'])\n",
")\n",
"\n",
"print(f'Unique Fangraphs IDs from leaderboards: {len(leaderboard_ids):,}')\n",
"leaderboard_ids.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "140b69d1",
"metadata": {},
"outputs": [],
"source": [
"def load_latest_status_pool_ids(data_dir: Path) -> pd.DataFrame:\n",
"    \"\"\"Read MLBAM IDs from the newest roster-status snapshot in ``data_dir``.\n",
"\n",
"    Snapshots are files matching ``player_status_roster_snapshot_*.parquet``; the one\n",
"    whose path sorts last is used.\n",
"\n",
"    Args:\n",
"        data_dir: Directory to search for snapshot files.\n",
"\n",
"    Returns:\n",
"        Frame with unique, non-null ``mlbam_id`` values (nullable ``Int64``) and a\n",
"        ``status_snapshot_source`` column holding the snapshot file name. An empty frame\n",
"        with the same two columns is returned when no snapshot exists or the snapshot\n",
"        has no ``person.id`` column.\n",
"    \"\"\"\n",
"    empty = pd.DataFrame(columns=['mlbam_id', 'status_snapshot_source'])\n",
"\n",
"    candidates = sorted(data_dir.glob('player_status_roster_snapshot_*.parquet'))\n",
"    if not candidates:\n",
"        return empty\n",
"\n",
"    latest = candidates[-1]\n",
"    frame = pd.read_parquet(latest)\n",
"    if 'person.id' not in frame.columns:\n",
"        return empty\n",
"\n",
"    # Coerce before de-duplicating so values that differ only by representation\n",
"    # (e.g. '123' vs 123) collapse into one ID and unparseable values are dropped.\n",
"    ids = pd.to_numeric(frame['person.id'], errors='coerce').astype('Int64')\n",
"    out = ids.dropna().drop_duplicates().to_frame('mlbam_id')\n",
"    out['status_snapshot_source'] = latest.name\n",
"    return out\n",
"\n",
"if INCLUDE_STATUS_POOL:\n",
"    status_pool_ids = load_latest_status_pool_ids(DATA_DIR)\n",
"else:\n",
"    # Same columns as the loader's empty result so later cells see a single schema.\n",
"    status_pool_ids = pd.DataFrame(columns=['mlbam_id', 'status_snapshot_source'])\n",
"print(f'Unique MLBAM IDs from status snapshot: {len(status_pool_ids):,}')\n",
"status_pool_ids.head()\n"
]
},
{
"cell_type": "markdown",
"id": "d60c4a6f",
"metadata": {},
"source": [
"## 2) Pull bio attributes\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "395d4416",
"metadata": {},
"outputs": [],
"source": [
"# Chadwick register: the crosswalk between MLBAM and FanGraphs player IDs.\n",
"register = chadwick_register()\n",
"register = register[['key_mlbam', 'key_fangraphs', 'name_first', 'name_last']].copy()\n",
"\n",
"# Real IDs are positive. Non-positive values are treated as missing rather than cast\n",
"# to IDs (pybaseball's register appears to fill missing keys with -1 -- TODO confirm),\n",
"# so a placeholder can never be joined on or written to the output as a genuine ID.\n",
"mlbam_raw = pd.to_numeric(register['key_mlbam'], errors='coerce')\n",
"fangraphs_raw = pd.to_numeric(register['key_fangraphs'], errors='coerce')\n",
"register['mlbam_id'] = mlbam_raw.where(mlbam_raw > 0).astype('Int64')\n",
"register['fangraphs_id'] = fangraphs_raw.where(fangraphs_raw > 0).astype('Int64')\n",
"\n",
"register['full_name_register'] = (register['name_first'].fillna('') + ' ' + register['name_last'].fillna('')).str.strip()\n",
"register = register[['mlbam_id', 'fangraphs_id', 'full_name_register']].dropna(subset=['mlbam_id'])\n",
"\n",
"# Translate leaderboard FanGraphs IDs into MLBAM IDs via the register.\n",
"mlbam_from_lb = register.merge(leaderboard_ids[['fangraphs_id']], how='inner', on='fangraphs_id')\n",
"mlbam_from_lb = mlbam_from_lb[['mlbam_id', 'fangraphs_id', 'full_name_register']]\n",
"\n",
"# Only concatenate the status pool when it has rows; an empty object-dtype frame\n",
"# contributes nothing and would only muddy the resulting dtype.\n",
"id_sources = [mlbam_from_lb[['mlbam_id']]]\n",
"if not status_pool_ids.empty:\n",
"    id_sources.append(status_pool_ids[['mlbam_id']])\n",
"universe_ids = pd.concat(id_sources, ignore_index=True).dropna().drop_duplicates()\n",
"\n",
"universe_ids['mlbam_id'] = pd.to_numeric(universe_ids['mlbam_id'], errors='coerce').astype('Int64')\n",
"universe_ids = universe_ids.dropna(subset=['mlbam_id']).sort_values('mlbam_id').reset_index(drop=True)\n",
"\n",
"print(f'Universe players (unique MLBAM IDs): {len(universe_ids):,}')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6ec3fa83",
"metadata": {},
"outputs": [],
"source": [
"# Columns of the raw bio table. Empty results carry the same columns so later\n",
"# cells can select them without a KeyError when nothing was fetched.\n",
"BIO_RAW_COLUMNS = [\n",
"    'mlbam_id', 'full_name', 'birth_date', 'bats', 'throws',\n",
"    'birth_country', 'birth_city', 'debut_date',\n",
"]\n",
"\n",
"\n",
"def fetch_people_batch(mlbam_ids: list[int]) -> pd.DataFrame:\n",
"    \"\"\"Fetch biographical fields for a batch of players from the ``/people`` endpoint.\n",
"\n",
"    Args:\n",
"        mlbam_ids: MLBAM person IDs, sent as one comma-separated ``personIds`` parameter.\n",
"\n",
"    Returns:\n",
"        One row per person in the response, with the columns in ``BIO_RAW_COLUMNS``.\n",
"        Fields absent from the payload are ``None``. An empty frame with the same\n",
"        columns is returned for an empty ID list or a response without people.\n",
"\n",
"    Network and HTTP errors raised by ``urlopen`` are not caught here; the caller\n",
"    decides how to handle a failed batch.\n",
"    \"\"\"\n",
"    if not mlbam_ids:\n",
"        return pd.DataFrame(columns=BIO_RAW_COLUMNS)\n",
"\n",
"    params = urllib.parse.urlencode({\n",
"        'personIds': ','.join(str(i) for i in mlbam_ids),\n",
"        'hydrate': 'currentTeam',\n",
"    })\n",
"    url = f'{STATS_API_BASE}/people?{params}'\n",
"    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})\n",
"    with urllib.request.urlopen(request, timeout=30) as response:\n",
"        payload = json.loads(response.read().decode('utf-8'))\n",
"\n",
"    rows = [\n",
"        {\n",
"            'mlbam_id': person.get('id'),\n",
"            'full_name': person.get('fullName'),\n",
"            'birth_date': person.get('birthDate'),\n",
"            # batSide / pitchHand are nested objects that may be missing or null.\n",
"            'bats': (person.get('batSide') or {}).get('code'),\n",
"            'throws': (person.get('pitchHand') or {}).get('code'),\n",
"            'birth_country': person.get('birthCountry'),\n",
"            'birth_city': person.get('birthCity'),\n",
"            'debut_date': person.get('mlbDebutDate'),\n",
"        }\n",
"        # 'or []' also covers a payload where 'people' is present but null.\n",
"        for person in (payload.get('people') or [])\n",
"    ]\n",
"    return pd.DataFrame(rows, columns=BIO_RAW_COLUMNS)\n",
"\n",
"people_batches = []\n",
"failed_chunk_starts = []\n",
"ids_list = universe_ids['mlbam_id'].dropna().astype(int).tolist()\n",
"for start in range(0, len(ids_list), PEOPLE_BATCH_SIZE):\n",
"    chunk = ids_list[start:start + PEOPLE_BATCH_SIZE]\n",
"    try:\n",
"        people_batches.append(fetch_people_batch(chunk))\n",
"    except urllib.error.HTTPError as exc:\n",
"        print(f'HTTPError for chunk starting at {start}: {exc.code}')\n",
"        failed_chunk_starts.append(start)\n",
"    except urllib.error.URLError as exc:\n",
"        print(f'URLError for chunk starting at {start}: {exc.reason}')\n",
"        failed_chunk_starts.append(start)\n",
"    except TimeoutError as exc:\n",
"        # A timeout while reading the body is raised directly, not wrapped in URLError.\n",
"        print(f'Timeout for chunk starting at {start}: {exc}')\n",
"        failed_chunk_starts.append(start)\n",
"\n",
"# Make partial pulls visible: players in failed chunks are absent from bio_raw.\n",
"if failed_chunk_starts:\n",
"    print(f'WARNING: {len(failed_chunk_starts)} chunk(s) failed (starts: {failed_chunk_starts}); their players are missing from bio_raw.')\n",
"\n",
"bio_raw = pd.concat(people_batches, ignore_index=True) if people_batches else pd.DataFrame(columns=BIO_RAW_COLUMNS)\n",
"print(f'Pulled bio rows: {len(bio_raw):,}')\n",
"bio_raw.head()\n"
]
},
{
"cell_type": "markdown",
"id": "ea2dfc71",
"metadata": {},
"source": [
"## 3) Normalize to a stable schema\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0aff67be",
"metadata": {},
"outputs": [],
"source": [
"# One FanGraphs ID per MLBAM ID, taken from the register crosswalk.\n",
"register_keys = register[['mlbam_id', 'fangraphs_id']].drop_duplicates('mlbam_id')\n",
"\n",
"player_bio = bio_raw.copy()\n",
"player_bio['mlbam_id'] = pd.to_numeric(player_bio['mlbam_id'], errors='coerce').astype('Int64')\n",
"player_bio = player_bio.merge(register_keys, how='left', on='mlbam_id')\n",
"\n",
"# Unparseable dates become NaT instead of raising.\n",
"for date_col in ('birth_date', 'debut_date'):\n",
"    player_bio[date_col] = pd.to_datetime(player_bio[date_col], errors='coerce')\n",
"\n",
"# Age in years as of the day the notebook runs.\n",
"today = pd.Timestamp(date.today())\n",
"age_days = (today - player_bio['birth_date']).dt.days\n",
"player_bio['age'] = (age_days / 365.25).round(2)\n",
"\n",
"# Column order of the saved artifact.\n",
"stable_cols = [\n",
"    'mlbam_id',\n",
"    'fangraphs_id',\n",
"    'full_name',\n",
"    'birth_date',\n",
"    'age',\n",
"    'bats',\n",
"    'throws',\n",
"    'birth_country',\n",
"    'birth_city',\n",
"    'debut_date',\n",
"]\n",
"\n",
"player_bio = player_bio[stable_cols].drop_duplicates(subset=['mlbam_id'])\n",
"player_bio = player_bio.sort_values(['mlbam_id']).reset_index(drop=True)\n",
"\n",
"player_bio.head()\n"
]
},
{
"cell_type": "markdown",
"id": "88914957",
"metadata": {},
"source": [
"## 4) Basic quality checks\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "731ff762",
"metadata": {},
"outputs": [],
"source": [
"# Share of missing values per column, worst first.\n",
"missing_rates = player_bio.isna().mean().sort_values(ascending=False).to_frame('missing_rate')\n",
"\n",
"# Codes accepted for batting side; throwing hand is checked against L/R only.\n",
"valid_handedness = {'L', 'R', 'S'}\n",
"valid_throws = {'L', 'R'}\n",
"invalid_bats = sorted(set(player_bio['bats'].dropna()).difference(valid_handedness))\n",
"invalid_throws = sorted(set(player_bio['throws'].dropna()).difference(valid_throws))\n",
"\n",
"print('Missing rates:')\n",
"display(missing_rates)\n",
"print('Invalid bats values:', invalid_bats)\n",
"print('Invalid throws values:', invalid_throws)\n"
]
},
{
"cell_type": "markdown",
"id": "95f904b7",
"metadata": {},
"source": [
"## 5) Save artifact\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0c97c3f",
"metadata": {},
"outputs": [],
"source": [
"# Persist the normalized table as a date-stamped parquet snapshot.\n",
"out_path = DATA_DIR.joinpath(f'player_bio_{DATE_TAG}.parquet')\n",
"player_bio.to_parquet(out_path, index=False)\n",
"print(f'Wrote {out_path} ({len(player_bio):,} rows)')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4,
"nbformat_minor": 5
}