Skip to content

Commit db24a79

Browse files
modified USCensusPEP_PopulationEstimatebyRace code to skip empty files (#1909)
* modified code to skip urls that have no data
* modified code to skip urls that have no data
* Update scripts/us_census/pep/population_estimate_by_race/preprocess.py
  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
* Update scripts/us_census/pep/population_estimate_by_race/preprocess.py
  Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
* modified code to skip urls that have no data
* modified code to skip urls that have no data
* modified code to skip urls that have no data
* modified code to skip urls that have no data

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 98302ed commit db24a79

1 file changed

Lines changed: 51 additions & 14 deletions

File tree

scripts/us_census/pep/population_estimate_by_race/preprocess.py

Lines changed: 51 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -507,8 +507,20 @@ def _clean_county_2022_csv_file(df: pd.DataFrame,
507507
Returns:
508508
df (DataFrame) : Transformed DataFrame for csv dataset.
509509
'''
510+
510511
try:
511-
# filter by agegrp = 0
512+
# Check if the dataframe is empty or missing the 'YEAR' column
513+
if df is None or df.empty:
514+
logging.info(f"Skipping empty file: {file_path}")
515+
return pd.DataFrame()
516+
517+
if 'YEAR' not in df.columns:
518+
logging.warning(
519+
f"File {file_path} does not contain 'YEAR' column. Likely an error page."
520+
)
521+
return pd.DataFrame()
522+
523+
# Existing logic
512524
df = df.query("YEAR not in [1]")
513525
df = df.query("AGEGRP == 0")
514526
# filter years 3 - 14
@@ -911,6 +923,11 @@ def _load_data(self, file: str) -> pd.DataFrame:
911923
elif "cc-est202" in file:
912924
df = pd.read_csv(file, encoding='ISO-8859-1', low_memory=False)
913925
df = _clean_county_2022_csv_file(df, file)
926+
if df.empty:
927+
logging.warning(
928+
f"Skipping further processing for empty/invalid file: {file}"
929+
)
930+
return df # Returns the empty DF and moves to next file
914931
# aggregating County data to obtain National data for 2020-2022
915932
df_national = df.copy()
916933
df_national['geo_ID'] = "country/USA"
@@ -957,8 +974,12 @@ def _transform_data(self, df: pd.DataFrame, file_path: str) -> None:
957974
file (str) : String of Dataset File Path
958975
959976
Returns:
960-
None
977+
bool: True if transformation is successful or skipped, False otherwise.
961978
"""
979+
if df is None or df.empty:
980+
logging.warning(f"No data to transform for file: {file_path}")
981+
return True
982+
962983
try:
963984
# Finding the Dir Path
964985
file_dir = self.cleaned_csv_file_path
@@ -1191,8 +1212,6 @@ def _generate_tmcf(self, df_cols: list, name: str) -> None:
11911212

11921213

11931214
# The outputs are loaded into
1194-
1195-
11961215
def _resolve_pe_11(file_name: str, url: str) -> pd.DataFrame:
11971216
"""
11981217
This method cleans the dataframe loaded from a csv file format.
@@ -1236,28 +1255,38 @@ def _resolve_pe_11(file_name: str, url: str) -> pd.DataFrame:
12361255
def add_future_yearurls():
12371256
"""
12381257
This method scans the download URLs for future years.
1239-
12401258
"""
12411259
global _FILES_TO_DOWNLOAD
12421260
with open(os.path.join(_MODULE_DIR, 'input_url.json'), 'r') as inpit_file:
12431261
_FILES_TO_DOWNLOAD = json.load(inpit_file)
1262+
12441263
urls_to_scan = [
12451264
"https://www2.census.gov/programs-surveys/popest/datasets/2020-{YEAR}/counties/asrh/cc-est{YEAR}-alldata.csv"
12461265
]
1266+
12471267
# This method will generate URLs for the years 2024 to 2029
12481268
for future_year in range(2024, 2030):
1249-
if dt.now().year > future_year:
1250-
YEAR = future_year
1251-
for url in urls_to_scan:
1252-
url_to_check = url.format(YEAR=YEAR)
1269+
if dt.now().year >= future_year:
1270+
for url_template in urls_to_scan:
1271+
# FIX: Define url_to_check by formatting the template
1272+
url_to_check = url_template.replace("{YEAR}", str(future_year))
1273+
12531274
try:
1254-
checkurl = requests.head(url_to_check)
1255-
if checkurl.status_code == 200:
1275+
checkurl = requests.head(url_to_check,
1276+
timeout=10,
1277+
allow_redirects=True)
1278+
1279+
# If it's 200 OK and NOT an HTML file
1280+
if checkurl.status_code == 200 and 'text/csv' in checkurl.headers.get(
1281+
'Content-Type', ''):
12561282
_FILES_TO_DOWNLOAD.append(
12571283
{"download_path": url_to_check})
1258-
1259-
except:
1260-
logging.error(f"URL is not accessable {url_to_check}")
1284+
else:
1285+
logging.info(
1286+
f"Data for {future_year} not yet available at {url_to_check}"
1287+
)
1288+
except requests.exceptions.RequestException:
1289+
logging.error(f"URL unreachable: {url_to_check}")
12611290

12621291

12631292
def download_files():
@@ -1288,6 +1317,14 @@ def download_files():
12881317
headers = {'User-Agent': 'Mozilla/5.0'}
12891318
response = requests.get(url, headers=headers, timeout=60)
12901319
response.raise_for_status()
1320+
1321+
content_type = response.headers.get('Content-Type', '').lower()
1322+
if 'text/html' in content_type:
1323+
logging.warning(
1324+
f"Server returned HTML for {url}. Skipping download.")
1325+
is_file_downloaded = True # Break the while loop
1326+
continue # Move to the next file in _FILES_TO_DOWNLOAD
1327+
12911328
if ".csv" in url:
12921329
if "st-est" in url or 'SC-EST' in url:
12931330
file_name = file_name.replace(".csv", ".xlsx")

0 commit comments

Comments (0)