Skip to content

Commit d811eae

Browse files
committed
Merge branch 'PulmonaryTuberculosis_Bacteriologically_Confirmed_branch' of https://github.com/abhishekjaisw/data into PulmonaryTuberculosis_Bacteriologically_Confirmed_branch
2 parents 326501d + 5ef26b2 commit d811eae

32 files changed

Lines changed: 1830 additions & 138 deletions

scripts/us_census/pep/population_estimates_by_asr/national_1900_1959.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
'''
1717
import os
1818
import pandas as pd
19+
import requests
20+
from absl import logging
1921

2022

2123
def national1900(output_folder: str):
@@ -41,8 +43,22 @@ def national1900(output_folder: str):
4143
# 8=Female_NonWhiteAlone
4244
cols = ['Age', '0', '1', '2', '3', '4', '5', '6', '7', '8']
4345
# reading the csv format input file and converting it to a dataframe
44-
df = pd.read_csv(url,names=cols,engine='python',skiprows=9,\
45-
skipfooter=15,encoding='ISO-8859-1')
46+
try:
47+
# Check if the URL is accessible
48+
response = requests.head(url, allow_redirects=True)
49+
if response.status_code != 200:
50+
logging.warning(f"Skipping {url} as it is not accessible.")
51+
continue
52+
53+
df = pd.read_csv(url,
54+
names=cols,
55+
engine='python',
56+
skiprows=9,
57+
skipfooter=15,
58+
encoding='ISO-8859-1')
59+
except Exception as e:
60+
logging.error(f"Error reading {url}: {e}")
61+
continue
4662
#Writing raw data to csv
4763
df.to_csv(os.path.join(
4864
os.path.dirname(os.path.abspath(__file__)), "raw_data",

scripts/us_census/pep/population_estimates_by_asr/national_1960_1979.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
'''
1717
import os
1818
import pandas as pd
19+
import requests
20+
from absl import logging
1921

2022

2123
def national1960(output_folder: str):
@@ -38,8 +40,21 @@ def national1960(output_folder: str):
3840
]
3941
# Reading the csv format input file and converting it to a dataframe.
4042
# Skipping unwanted rows from top and bottom.
41-
df = pd.read_csv(url,names=cols,engine='python',skiprows=8,\
42-
skipfooter=15)
43+
try:
44+
# Check if the URL is accessible
45+
response = requests.head(url, allow_redirects=True)
46+
if response.status_code != 200:
47+
logging.warning(f"Skipping {url} as it is not accessible.")
48+
continue
49+
50+
df = pd.read_csv(url,
51+
names=cols,
52+
engine='python',
53+
skiprows=8,
54+
skipfooter=15)
55+
except Exception as e:
56+
logging.error(f"Error reading {url}: {e}")
57+
continue
4358
#Writing raw data to csv
4459
df.to_csv(os.path.join(
4560
os.path.dirname(os.path.abspath(__file__)), "raw_data",

scripts/us_census/pep/population_estimates_by_asr/process.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,14 @@ def add_future_year_urls():
6868
for YEAR in range(2030, 2020, -1):
6969
url_to_check = url.format(YEAR=YEAR)
7070
try:
71-
check_url = requests.head(url_to_check)
71+
check_url = requests.head(url_to_check, allow_redirects=True)
7272
if check_url.status_code == 200:
7373
_FILES_TO_DOWNLOAD.append({"download_path": url_to_check})
7474
break
75-
76-
except:
77-
logging.error(f"URL is not accessable {url_to_check}")
75+
else:
76+
logging.warning(f"URL is not accessible: {url_to_check}")
77+
except Exception as e:
78+
logging.error(f"URL is not accessible {url_to_check}: {e}")
7879

7980

8081
MCF_TEMPLATE = ("Node: dcid:{pv1}\n"
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Tests for facilities_helper.py."""
15+
16+
import sys
17+
import unittest
18+
from pathlib import Path
19+
from unittest import mock
20+
21+
# Kept at scripts/us_epa/ instead of scripts/us_epa/util/ because the repo's
22+
# unittest discovery for scripts/us_epa would import it as
23+
# util.facilities_helper_test and collide with the top-level data/util package.
24+
REPO_ROOT = Path(__file__).resolve().parents[2]
25+
sys.path.insert(0, str(REPO_ROOT))
26+
27+
from scripts.us_epa.util import facilities_helper as fh
28+
29+
30+
class FacilitiesHelperTest(unittest.TestCase):
    """Exercises fh.get_all_statvars() against a mocked Data Commons client."""

    @staticmethod
    def _fake_fetch_result(entity_by_variable):
        """Returns a mock fetch() result wrapping a "byVariable" payload.

        Args:
            entity_by_variable: dict of variable name -> the one entity dcid
                to list under that variable's "byEntity" entry.

        Returns:
            A mock whose to_dict() returns the assembled payload.
        """
        by_variable = {}
        for variable, entity in entity_by_variable.items():
            by_variable[variable] = {"byEntity": {entity: {}}}
        to_dict = mock.Mock(return_value={"byVariable": by_variable})
        return mock.Mock(to_dict=to_dict)

    def test_get_all_statvars_returns_empty_set_for_empty_input(self):
        """An empty facility list yields an empty set without building a client."""
        with mock.patch.object(fh, "get_datacommons_client") as client_factory:
            found = fh.get_all_statvars([])
            self.assertEqual(found, set())

        client_factory.assert_not_called()

    def test_get_all_statvars_fetches_and_unions_variables(self):
        """55 facilities are fetched in two slices and the variables are unioned."""
        facilities = [f"epaGhgrpFacilityId/{i}" for i in range(55)]
        first_batch, second_batch = facilities[:50], facilities[50:]

        dc_client = mock.Mock()
        fetch = dc_client.observation.fetch
        fetch.side_effect = [
            self._fake_fetch_result({
                "Count_Person": facilities[0],
                "Median_Age_Person": facilities[1],
            }),
            self._fake_fetch_result({
                "Count_Person": facilities[50],
                "Count_Household": facilities[54],
            }),
        ]

        with mock.patch.object(fh,
                               "get_datacommons_client",
                               return_value=dc_client):
            found = fh.get_all_statvars(facilities)

        self.assertEqual(found, {
            "Count_Person",
            "Median_Age_Person",
            "Count_Household",
        })
        self.assertEqual(fetch.call_count, 2)
        # One recorded call per slice, in order: first 50 ids, then the rest.
        for recorded_call, batch in zip(fetch.call_args_list,
                                        (first_batch, second_batch)):
            self.assertEqual(
                recorded_call.kwargs, {
                    "entity_dcids": batch,
                    "variable_dcids": [],
                    "select": [
                        fh.ObservationSelect.VARIABLE,
                        fh.ObservationSelect.ENTITY,
                    ],
                })

    def test_get_all_statvars_allows_entities_missing_from_response(self):
        """A requested facility absent from the response is tolerated."""
        dc_client = mock.Mock()
        dc_client.observation.fetch.return_value = self._fake_fetch_result(
            {"Count_Person": "epaGhgrpFacilityId/1"})
        requested = ["epaGhgrpFacilityId/1", "epaGhgrpFacilityId/2"]

        with mock.patch.object(fh,
                               "get_datacommons_client",
                               return_value=dc_client):
            found = fh.get_all_statvars(requested)

        self.assertEqual(found, {"Count_Person"})
127+
128+
129+
if __name__ == "__main__":
130+
unittest.main()
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Tests for generate_svobs_helper()."""
15+
16+
import sys
17+
import unittest
18+
from pathlib import Path
19+
from unittest import mock
20+
21+
REPO_ROOT = Path(__file__).resolve().parents[3]
22+
sys.path.insert(0, str(REPO_ROOT))
23+
24+
from scripts.us_epa.parent_company import process_parent_company
25+
26+
27+
class GenerateSvobsHelperTest(unittest.TestCase):
    """Checks how generate_svobs_helper() hands data between its collaborators."""

    def test_generate_svobs_helper_wires_statvars_and_svobs(self):
        """Facility ids, StatVars and SVObs results are passed along unchanged."""
        ppc = process_parent_company
        ownership = {
            ("epaGhgrpFacilityId/1001", "2018"): {
                "EpaParentCompany/A": 100.0,
            },
            ("epaGhgrpFacilityId/1001", "2019"): {
                "EpaParentCompany/A": 100.0,
            },
            ("epaGhgrpFacilityId/1002", "2019"): {
                "EpaParentCompany/B": 100.0,
            },
        }
        facility_sv_map = {"epaGhgrpFacilityId/1001": {"Count_Person": {}}}
        facets = {"facet-1": {"observationPeriod": "P1Y"}}
        # Facility 1001 appears for two years in `ownership`; comparisons are
        # done on sets, so only the distinct ids matter.
        expected_facilities = {
            "epaGhgrpFacilityId/1001",
            "epaGhgrpFacilityId/1002",
        }

        # Patches are entered left to right, matching the original nesting.
        with mock.patch.object(ppc,
                               "_facility_year_company_percentages",
                               return_value=ownership), \
                mock.patch.object(ppc.fh,
                                  "get_all_statvars",
                                  return_value={"Count_Person"
                                               }) as statvars_patch, \
                mock.patch.object(ppc.fh,
                                  "get_all_svobs",
                                  return_value=(facility_sv_map,
                                                facets)) as svobs_patch, \
                mock.patch.object(ppc, "process_svobs") as process_patch:
            ppc.generate_svobs_helper("ownership.csv", "/tmp/svobs")

        statvars_args = statvars_patch.call_args.args
        svobs_args = svobs_patch.call_args.args
        self.assertEqual(set(statvars_args[0]), expected_facilities)
        self.assertEqual(set(svobs_args[0]), expected_facilities)
        self.assertEqual(svobs_args[1], {"Count_Person"})
        process_patch.assert_called_once_with("/tmp/svobs", ownership,
                                              facility_sv_map, facets)
73+
74+
75+
if __name__ == "__main__":
76+
unittest.main()

scripts/us_epa/parent_company/process_parent_company.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@
4242
flags.DEFINE_string("svobs_output_path", "svobs",
4343
"Output directory for StatVarObs.")
4444

45-
_DC_API_URL = "https://api.datacommons.org/place/stat-vars"
46-
4745
# V_PARENT_COMPANY_INFO table
4846
_TABLE_PREFIX = "D_GHG_B"
4947
_TABLE = "V_PARENT_COMPANY_INFO"
@@ -487,7 +485,7 @@ def generate_svobs_helper(ownership_relationships_filepath, svobs_path_info):
487485

488486
facilities = list(facilities)
489487

490-
statVars = fh.get_all_statvars(_DC_API_URL, facilities)
488+
statVars = fh.get_all_statvars(facilities)
491489
facility_sv_map, facets = fh.get_all_svobs(facilities, statVars)
492490
print("# SVs : %d" % len(statVars))
493491
print("# Facilities : %d" % len(facility_sv_map))

scripts/us_epa/util/facilities_helper.py

Lines changed: 12 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,9 @@
2323

2424
from datacommons_client.models.observation import ObservationDate
2525
from datacommons_client.models.observation import ObservationSelect
26-
import json
2726
import pandas as pd
28-
import requests
2927

3028
from re import sub
31-
from requests.structures import CaseInsensitiveDict
32-
from requests.exceptions import HTTPError
3329

3430
REPO_ROOT = Path(__file__).resolve().parents[3]
3531
sys.path.insert(0, str(REPO_ROOT))
@@ -165,49 +161,27 @@ def get_county_candidates(zcta):
165161
return filtered_lists
166162

167163

168-
def _dc_sv_query(dc_api_url, data_string, svs=set()):
169-
headers = CaseInsensitiveDict()
170-
headers["Content-Type"] = "application/json"
171-
try:
172-
resp = requests.post(dc_api_url, headers=headers, data=data_string)
173-
except HTTPError as http_err:
174-
print(f'HTTP error occurred: {http_err}')
175-
return set()
176-
except Exception as e:
177-
print(f'Some unkonw Exceptionoccurred: {e}')
178-
return set()
179-
180-
d = json.loads(resp.content.decode('utf8').replace("'", '"'))
181-
for p, p_dict in d["places"].items():
182-
if "statVars" in p_dict:
183-
sv_list = d["places"][p]["statVars"]
184-
for sv in sv_list:
185-
svs.add(sv)
186-
return svs
187-
188-
189164
# Returns the union of all StatVars associated with all facilities using the
190165
# Data Commons API.
191-
def get_all_statvars(dc_api_url, facility_ids):
166+
def get_all_statvars(facility_ids):
192167
if not facility_ids:
193168
return set()
194169

170+
client = get_datacommons_client()
195171
statVars = set()
196-
# 500 facilities at a time.
197172
n_facilities = 50
198173
print("****Getting existing StatVars for Facilities.")
199174
for i in range(0, len(facility_ids), n_facilities):
200-
if i % n_facilities == 0:
201-
print(f'**Processing facilities from index {i} to {i+n_facilities}')
202-
# Compose the API query params.
203-
# Need to be of the form:
204-
# '{"dcids":["epaGhgrpFacilityId/1004962","epaGhgrpFacilityId/1010899"]}'
205-
data_string = "{'dcids': ["
206-
for f in facility_ids[i:i + n_facilities]:
207-
data_string += '"%s",' % f
208-
data_string += ']}'
209-
210-
statVars = _dc_sv_query(dc_api_url, data_string, statVars)
175+
print(f'**Processing facilities from index {i} to {i+n_facilities}')
176+
response = client.observation.fetch(
177+
entity_dcids=facility_ids[i:i + n_facilities],
178+
variable_dcids=[],
179+
select=[
180+
ObservationSelect.VARIABLE,
181+
ObservationSelect.ENTITY,
182+
],
183+
).to_dict()
184+
statVars.update(response.get('byVariable', {}).keys())
211185

212186
print("****Done getting existing StatVars.")
213187
print("***********************************.")
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
StatVar,NumPlaces,MinDate,MeasurementMethods,Units
2+
InterestRate_TreasuryNote_3Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
3+
InterestRate_TreasuryBond_20Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
4+
InterestRate_TreasuryNote_5Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
5+
InterestRate_TreasuryNote_10Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
6+
InterestRate_TreasuryBill_1Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
date,1-Month,3-Month,6-Month,1-Year,2-Year,3-Year,5-Year,7-Year,10-Year,20-Year,30-Year
2+
1962-01-02,,,,3.22,,3.70,3.88,,4.06,4.07,
3+
1962-02-01,,,,3.30,,3.81,4.00,,4.09,4.13,
4+
1962-04-19,,,,3.00,,3.37,3.60,,3.82,3.91,

0 commit comments

Comments
 (0)