Skip to content

Commit 35c3da1

Browse files
committed
Add new Data for WHO Bacteriologically Confirmed TB
1 parent aaf18fd commit 35c3da1

17 files changed

Lines changed: 10569 additions & 12063 deletions

scripts/us_census/acs5yr/subject_tables/S1251/S1251_spec.json

Lines changed: 0 additions & 489 deletions
This file was deleted.

scripts/us_census/acs5yr/subject_tables/S1251/test/S1251_cleaned.csv

Lines changed: 0 additions & 9289 deletions
This file was deleted.
-1.79 MB
Binary file not shown.

scripts/us_census/acs5yr/subject_tables/S1251/test/S1251_output.mcf

Lines changed: 0 additions & 1072 deletions
This file was deleted.

scripts/us_census/acs5yr/subject_tables/S1251/test/S1251_output.tmcf

Lines changed: 0 additions & 8 deletions
This file was deleted.

scripts/us_census/acs5yr/subject_tables/S1251/test/S1251_summary.json

Lines changed: 0 additions & 17 deletions
This file was deleted.

scripts/us_census/acs5yr/subject_tables/S1251/test/column_map.json

Lines changed: 0 additions & 1188 deletions
This file was deleted.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# WHO Tuberculosis Dataset: Bacteriologically Confirmed Pulmonary TB
2+
3+
## Overview
4+
This dataset provides the number of people diagnosed with a new episode of pulmonary TB whose disease was bacteriologically confirmed, sourced directly from the World Health Organization (WHO).
5+
6+
## Data Source
7+
8+
**Source URL:**
9+
https://data.who.int/indicators/i/1891124/5D51DB1
10+
11+
The data comes from the official WHO reporting database and includes comprehensive, country-level health metrics detailing annual Tuberculosis notifications and case classifications.
12+
13+
## How To Download Input Data
14+
To download the data, run the provided download script `download_tb_data.py`. This script automatically queries the WHO API for the indicator, merges it with the WHO geographical master list to append standard `iso3` country codes, and saves the resulting `TB_Bacteriologically_Confirmed_input.csv` file inside a `source_files` folder.
15+
16+
Type of place: Country.
17+
18+
StatVars: Health / Tuberculosis.
19+
20+
Years: Historical to present.
21+
22+
## Processing Instructions
23+
To process the WHO Tuberculosis data and generate statistical variables, use the following commands from your root `data` directory:
24+
25+
**Download input file**
26+
```bash
27+
python3 statvar_imports/TB_Bacteriologically_Confirmed/download_tb_data.py
28+
```
29+
**For Test Data Run**
30+
```bash
31+
python3 tools/statvar_importer/stat_var_processor.py \
32+
--input_data=statvar_imports/TB_Bacteriologically_Confirmed/test_data/TB_Bacteriologically_Confirmed_input.csv \
33+
--pv_map=statvar_imports/TB_Bacteriologically_Confirmed/pulmonary_tb_bctpb_pvmap.csv \
34+
--output_path=statvar_imports/TB_Bacteriologically_Confirmed/test_data/TB_Bacteriologically_Confirmed_output \
35+
--config_file=statvar_imports/TB_Bacteriologically_Confirmed/pulmonary_tb_bctpb_metadata.csv \
36+
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
37+
```
38+
39+
**For Main Data Run**
40+
```bash
41+
python3 tools/statvar_importer/stat_var_processor.py \
42+
--input_data=statvar_imports/TB_Bacteriologically_Confirmed/source_files/TB_Bacteriologically_Confirmed_input.csv \
43+
--pv_map=statvar_imports/TB_Bacteriologically_Confirmed/pulmonary_tb_bctpb_pvmap.csv \
44+
--output_path=statvar_imports/TB_Bacteriologically_Confirmed/TB_Bacteriologically_Confirmed_output \
45+
--config_file=statvar_imports/TB_Bacteriologically_Confirmed/pulmonary_tb_bctpb_metadata.csv \
46+
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
47+
```
48+
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os
2+
import requests
3+
import io
4+
import pandas as pd
5+
6+
def download_who_tb_data_with_iso3():
    """Download the WHO bacteriologically confirmed pulmonary TB indicator.

    Fetches the indicator rows from the WHO API as CSV, attaches an ``iso3``
    country code to each row by joining on the country name against the WHO
    notifications master CSV, and writes the result to
    ``source_files/TB_Bacteriologically_Confirmed_input.csv`` in the
    directory that contains this script.

    Raises:
        RuntimeError: If the WHO API answers with a non-200 HTTP status.
    """
    # 1. Get the clean data from the API
    api_url = "https://xmart-api-public.who.int/DATA_/RELAY_TB_DATA"
    params = {
        "$filter": "IND_ID eq '18911245D51DB1'",
        "$select": "IND_ID,INDICATOR_NAME,YEAR,COUNTRY,VALUE",
        "$format": "csv",
    }

    print("1. Fetching clean indicator data from WHO API...")
    # Without a timeout a stalled connection would hang a scheduled run
    # indefinitely.
    api_response = requests.get(api_url, params=params, timeout=120)

    if api_response.status_code != 200:
        # Raise instead of returning quietly: a silent return lets the script
        # exit with status 0, so later pipeline steps would run without any
        # freshly downloaded input.
        raise RuntimeError(
            f"Failed to fetch API data. HTTP {api_response.status_code}")

    # Load the clean API data into a pandas table
    api_df = pd.read_csv(io.StringIO(api_response.text))

    # 2. Get ONLY the iso3 code from the master database
    print("2. Fetching country iso3 codes from WHO master database...")
    master_url = "https://extranet.who.int/tme/generateCSV.asp?ds=notifications"

    # We only pull the 'country' (for matching) and 'iso3' columns
    geo_columns = ['country', 'iso3']
    master_df = pd.read_csv(master_url, usecols=geo_columns).drop_duplicates()

    # 3. Merge the two datasets together based on the country name
    print("3. Merging data and formatting...")
    # The API uses uppercase 'COUNTRY', the master uses lowercase 'country'
    merged_df = pd.merge(api_df,
                         master_df,
                         left_on='COUNTRY',
                         right_on='country',
                         how='left')

    # Drop the duplicate lowercase 'country' column used for joining
    merged_df = merged_df.drop(columns=['country'])

    # The left join keeps rows whose country name has no match in the master
    # list, leaving their iso3 empty; report them so the gap is visible.
    unmatched = merged_df.loc[merged_df['iso3'].isna(),
                              'COUNTRY'].dropna().unique()
    if len(unmatched) > 0:
        names = ", ".join(sorted(str(name) for name in unmatched))
        print(f"Warning: no iso3 code found for {len(unmatched)} "
              f"country name(s): {names}")

    # Reorder columns so the iso3 code sits right next to the Country name
    final_columns = [
        'IND_ID', 'INDICATOR_NAME', 'YEAR', 'COUNTRY', 'iso3', 'VALUE'
    ]
    merged_df = merged_df[final_columns]

    # 4. Save to CSV
    # Resolve the output folder against this script's own location rather
    # than the current working directory, so the file lands in the same
    # place whichever directory the script is launched from.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "source_files")
    filename = os.path.join(output_dir,
                            "TB_Bacteriologically_Confirmed_input.csv")

    os.makedirs(output_dir, exist_ok=True)

    # Save without the pandas index column
    merged_df.to_csv(filename, index=False)
    print(f"Success! Data saved locally as '{filename}'")
56+
57+
# Script entry point: perform the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_who_tb_data_with_iso3()
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"import_specifications": [
3+
{
4+
"import_name": "TB_Bacteriologically_Confirmed",
5+
"curator_emails": [
6+
"support@datacommons.org"
7+
],
8+
"provenance_url": "https://data.who.int/indicators/i/1891124/5D51DB1",
9+
"provenance_description": "Number of people diagnosed with a new episode of pulmonary TB whose disease was bacteriologically confirmed",
10+
"scripts": [
11+
"download_tb_data.py",
12+
"../../tools/statvar_importer/stat_var_processor.py --input_data=source_files/*.csv --pv_map=pulmonary_tb_bctpb_pvmap.csv --config_file=pulmonary_tb_bctpb_metadata.csv --output_path=TB_Bacteriologically_Confirmed_output"
13+
],
14+
"source_files": [
15+
"source_files/*.csv"
16+
],
17+
"import_inputs": [
18+
{
19+
"template_mcf": "TB_Bacteriologically_Confirmed_output.tmcf",
20+
"cleaned_csv": "TB_Bacteriologically_Confirmed_output.csv",
21+
"stat_var_mcf": "TB_Bacteriologically_Confirmed_output_stat_vars.mcf"
22+
}
23+
],
24+
"cron_schedule": "0 0 1 1,4,7,10 *"
25+
}
26+
]
27+
}

0 commit comments

Comments
 (0)