-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathutils.py
More file actions
73 lines (58 loc) · 2.35 KB
/
utils.py
File metadata and controls
73 lines (58 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import re
import pandas as pd
import requests
from analytics.entities import DIMENSION_PAGE_PATH
# AnVIL API datasets endpoint; the first page requested by fetch_dataset_title_map.
ANVIL_DATASETS_API_URL = "https://service.explore.anvilproject.org/index/datasets"
# Matches paths that begin with /datasets/<segment>; group 1 captures that segment,
# which add_dataset_titles uses as the lookup key into the ID-to-title map.
DATASETS_PATH_PATTERN = re.compile(r"^/datasets/([^/]+)")
# Column after which add_dataset_titles places the title column.
INSERT_AFTER_COLUMN = DIMENSION_PAGE_PATH["alias"]
# Column from which add_dataset_titles reads the page paths.
PAGE_PATH_COLUMN = DIMENSION_PAGE_PATH["alias"]
# Name of the column that add_dataset_titles adds to the dataframe.
DATASET_TITLE_COLUMN = "Dataset Title"
def fetch_dataset_title_map(*, timeout: float = 60.0) -> dict[str, str]:
    """Fetch all datasets from the AnVIL API and return a mapping of entryId to title.

    Paginates through the full catalog using the API's ``pagination.next`` URL.

    Args:
        timeout: Seconds to wait on each HTTP request before ``requests``
            gives up. Without a timeout a stalled connection would block
            the caller indefinitely.

    Returns:
        A dict mapping each hit's ``entryId`` to the ``title`` of the first
        entry in its ``datasets`` list. Hits without an ID, without any
        datasets, or with an empty title are skipped.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    title_map: dict[str, str] = {}
    url: str | None = ANVIL_DATASETS_API_URL
    params: dict[str, int] | None = {"size": 1000}

    while url:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        for hit in data["hits"]:
            entry_id = hit.get("entryId")
            # ``or []`` also covers an explicit JSON null, not just a missing key.
            datasets = hit.get("datasets") or []
            if not (entry_id and datasets):
                continue
            title = datasets[0].get("title", "")
            if title:
                title_map[entry_id] = title

        # A missing/null ``pagination`` object or ``next`` value ends the loop.
        url = (data.get("pagination") or {}).get("next")
        params = None  # subsequent URLs already include query params

    return title_map
def add_dataset_titles(
    df: pd.DataFrame, title_map: dict[str, str] | None = None
) -> pd.DataFrame:
    """Add a 'Dataset Title' column to a pageviews dataframe.

    For rows where the page path matches /datasets/[id], the title is looked up
    in ``title_map``. All other rows, including rows whose path is not a
    string (e.g. NaN) or whose ID is not in the map, get "N/A".

    Args:
        df: A dataframe containing the column named by PAGE_PATH_COLUMN.
        title_map: Optional pre-fetched ID-to-title mapping. When omitted,
            it is fetched with fetch_dataset_title_map().

    Returns:
        A new dataframe (the input is not modified) with a "Dataset Title"
        column placed immediately after the column named by
        INSERT_AFTER_COLUMN. If the input already has a "Dataset Title"
        column, its values are replaced and it is moved to that position.

    Raises:
        KeyError: If ``df`` has no PAGE_PATH_COLUMN column.
        ValueError: If ``df`` has no INSERT_AFTER_COLUMN column.
    """
    if title_map is None:
        title_map = fetch_dataset_title_map()

    def get_title(path: object) -> str:
        # Missing or non-text cells cannot be regex-matched; treat them like
        # any other non-dataset path instead of raising TypeError.
        if not isinstance(path, str):
            return "N/A"
        match = DATASETS_PATH_PATTERN.match(path)
        if match is None:
            return "N/A"
        return title_map.get(match.group(1), "N/A")

    result = df.copy()
    result[DATASET_TITLE_COLUMN] = result[PAGE_PATH_COLUMN].map(get_title)

    # Take the title column out *before* locating the anchor column, so the
    # insert position is right even when the input already had a title column
    # sitting to the left of the anchor.
    ordered = [col for col in result.columns if col != DATASET_TITLE_COLUMN]
    ordered.insert(ordered.index(INSERT_AFTER_COLUMN) + 1, DATASET_TITLE_COLUMN)
    return result[ordered]