Skip to content

Commit 6186ad4

Browse files
committed
tuberculosis_percentage
1 parent 3371e36 commit 6186ad4

8 files changed

Lines changed: 1153 additions & 0 deletions
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# WHO Tuberculosis Percentage Dataset
2+
## Overview
3+
This dataset provides the percentage of people diagnosed with a new episode of pulmonary TB whose disease was bacteriologically confirmed, sourced from the World Health Organization (WHO) Global Tuberculosis Programme.
4+
5+
## Data Source
6+
7+
**Source URL:**
8+
https://data.who.int/indicators/i/1891124/449F55C
9+
10+
The data is fetched from the WHO's official Global Tuberculosis Database via their public API.
11+
12+
## How To Download Input Data
13+
To download the latest data, use the provided download script `download_who_tuberculosis.py`. This script fetches the data from the WHO API and merges it with country ISO3 codes to generate `tuberculosisPercentage_input.csv`.
14+
15+
**Type of place:** Country.
16+
17+
**Statvars:** Tuberculosis - Bacteriologically Confirmed Percentage.
18+
19+
**Years:** 1999 to 2024.
20+
21+
## Processing Instructions
22+
To process the Tuberculosis data and generate statistical variables, use the following commands from the project's root `data` directory:
23+
24+
**Download input file**
25+
```bash
26+
python3 statvar_imports/tuberculosis_percentage/download_who_tuberculosis.py
27+
```
28+
29+
**For Test Data Run**
30+
```bash
31+
python3 tools/statvar_importer/stat_var_processor.py \
32+
--input_data=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_input.csv \
33+
--pv_map=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_pvmap.csv \
34+
--output_path=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_output \
35+
--config_file=statvar_imports/tuberculosis_percentage/test_data/tuberculosisPercentage_metadata.csv \
36+
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
37+
```
38+
39+
**For Main data run**
40+
```bash
41+
python3 tools/statvar_importer/stat_var_processor.py \
42+
--input_data=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_input.csv \
43+
--pv_map=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_pvmap.csv \
44+
--output_path=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_output \
45+
--config_file=statvar_imports/tuberculosis_percentage/tuberculosisPercentage_metadata.csv \
46+
--existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf
47+
```
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os
2+
import requests
3+
import io
4+
import pandas as pd
5+
6+
def download_tb_percentage_data(
        output_dir="statvar_imports/tuberculosis_percentage/input_files",
        timeout=60):
    """Download WHO bacteriologically-confirmed TB percentages as a CSV file.

    Fetches the indicator rows from the WHO xmart API, fetches the
    country -> iso3 mapping from the WHO TB notifications CSV, joins the two
    on the country name, and writes the result to
    ``<output_dir>/TB_Bacteriologically_Confirmed_Percentage.csv``.

    Args:
        output_dir: Directory the CSV is written to (created if missing).
            A relative path is resolved against the current working
            directory.
        timeout: Seconds to wait for the WHO API before giving up.

    Returns:
        None. When the API answers with a non-200 status, a message is
        printed and nothing is written.

    Raises:
        requests.RequestException: If the API request fails or times out.
    """
    # NOTE(review): the import manifest globs `source_files/*.csv`, while the
    # default here writes under `input_files` -- confirm which is intended.

    # 1. Get the clean data from the API using the indicator ID.
    api_url = "https://xmart-api-public.who.int/DATA_/RELAY_TB_DATA"
    params = {
        "$filter": "IND_ID eq '1891124449F55C'",
        "$select": "IND_ID,INDICATOR_NAME,YEAR,COUNTRY,VALUE",
        "$format": "csv",
    }

    print("1. Fetching clean percentage data from WHO API...")
    # A timeout keeps a stalled connection from hanging the import forever.
    api_response = requests.get(api_url, params=params, timeout=timeout)

    if api_response.status_code != 200:
        print(f"Failed to fetch API data. HTTP {api_response.status_code}")
        return

    # Parse the raw bytes so pandas decodes them as UTF-8. `response.text`
    # falls back to ISO-8859-1 when a text/* response declares no charset,
    # which would garble accented country names and break the join below.
    # (Whether the WHO API declares a charset is not known from here.)
    api_df = pd.read_csv(io.BytesIO(api_response.content))

    # 2. Get ONLY the iso3 code from the master database.
    print("2. Fetching country iso3 codes from WHO master database...")
    master_url = "https://extranet.who.int/tme/generateCSV.asp?ds=notifications"

    # Only the 'country' (for matching) and 'iso3' columns are needed. Keep a
    # single row per country name so the join cannot multiply API rows.
    geo_columns = ['country', 'iso3']
    master_df = pd.read_csv(master_url, usecols=geo_columns)
    master_df = master_df.drop_duplicates(subset='country')

    # 3. Merge the two datasets together based on the country name.
    print("3. Merging data and formatting...")
    # The API uses uppercase 'COUNTRY', the master uses lowercase 'country'.
    merged_df = pd.merge(api_df,
                         master_df,
                         left_on='COUNTRY',
                         right_on='country',
                         how='left')

    # Drop the duplicate lowercase 'country' column used for joining.
    merged_df = merged_df.drop(columns=['country'])

    # A left join leaves iso3 empty for names absent from the master list;
    # report them instead of letting them pass unnoticed.
    unmatched = merged_df.loc[merged_df['iso3'].isna(), 'COUNTRY'].unique()
    if len(unmatched) > 0:
        names = ', '.join(str(name) for name in unmatched)
        print(f"Warning: no iso3 code found for {len(unmatched)} "
              f"country name(s): {names}")

    # Reorder columns so the iso3 code sits right next to the country name.
    final_columns = [
        'IND_ID', 'INDICATOR_NAME', 'YEAR', 'COUNTRY', 'iso3', 'VALUE'
    ]
    merged_df = merged_df[final_columns]

    # 4. Save to CSV, creating the output folder if needed.
    filename = os.path.join(output_dir,
                            "TB_Bacteriologically_Confirmed_Percentage.csv")
    os.makedirs(output_dir, exist_ok=True)

    # Save without the pandas index column.
    merged_df.to_csv(filename, index=False)
    print(f"Success! Data saved locally as '{filename}'")
56+
57+
# Run the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    download_tb_percentage_data()
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"import_specifications": [
3+
{
4+
"import_name": "tuberculosis percentage",
5+
"curator_emails": [
6+
"support@datacommons.org"
7+
],
8+
"provenance_url": "https://data.who.int/indicators/i/1891124/449F55C",
9+
"provenance_description": "Percentage of people diagnosed with a new episode of pulmonary TB whose disease was bacteriologically confirmed",
10+
"scripts": [
11+
"download_who_tuberculosis.py",
12+
"../../tools/statvar_importer/stat_var_processor.py --input_data=source_files/*.csv --pv_map=tuberculosisPercentage_pvmap.csv --config_file=tuberculosisPercentage_metadata.csv --output_path=tuberculosisPercentage_output"
13+
],
14+
"source_files": [
15+
"source_files/*.csv"
16+
],
17+
"import_inputs": [
18+
{
19+
"template_mcf": "tuberculosisPercentage_output.tmcf",
20+
"cleaned_csv": "tuberculosisPercentage_output.csv",
21+
"stat_var_mcf": "tuberculosisPercentage_output_stat_vars.mcf"
22+
}
23+
],
24+
"cron_schedule": "0 0 1 1,4,7,10 *"
25+
}
26+
]
27+
}

0 commit comments

Comments
 (0)