@@ -507,8 +507,20 @@ def _clean_county_2022_csv_file(df: pd.DataFrame,
507507 Returns:
508508 df (DataFrame) : Transformed DataFrame for csv dataset.
509509 '''
510+
510511 try :
511- # filter by agegrp = 0
512+ # Check if the dataframe is empty or missing the 'YEAR' column
513+ if df is None or df .empty :
514+ logging .info (f"Skipping empty file: { file_path } " )
515+ return pd .DataFrame ()
516+
517+ if 'YEAR' not in df .columns :
518+ logging .warning (
519+ f"File { file_path } does not contain 'YEAR' column. Likely an error page."
520+ )
521+ return pd .DataFrame ()
522+
523+ # Drop rows where YEAR is 1, then keep only the AGEGRP == 0 rows
512524 df = df .query ("YEAR not in [1]" )
513525 df = df .query ("AGEGRP == 0" )
514526 # filter years 3 - 14
@@ -911,6 +923,11 @@ def _load_data(self, file: str) -> pd.DataFrame:
911923 elif "cc-est202" in file :
912924 df = pd .read_csv (file , encoding = 'ISO-8859-1' , low_memory = False )
913925 df = _clean_county_2022_csv_file (df , file )
926+ if df .empty :
927+ logging .warning (
928+ f"Skipping further processing for empty/invalid file: { file } "
929+ )
930+ return df # Returns the empty DF and moves to next file
914931 # aggregating County data to obtain National data for 2020-2022
915932 df_national = df .copy ()
916933 df_national ['geo_ID' ] = "country/USA"
@@ -957,8 +974,12 @@ def _transform_data(self, df: pd.DataFrame, file_path: str) -> None:
957974 file (str) : String of Dataset File Path
958975
959976 Returns:
960- None
977+ bool: True if transformation is successful or skipped, False otherwise.
961978 """
979+ if df is None or df .empty :
980+ logging .warning (f"No data to transform for file: { file_path } " )
981+ return True
982+
962983 try :
963984 # Finding the Dir Path
964985 file_dir = self .cleaned_csv_file_path
@@ -1191,8 +1212,6 @@ def _generate_tmcf(self, df_cols: list, name: str) -> None:
11911212
11921213
11931214# The outputs are loaded into
1194-
1195-
11961215def _resolve_pe_11 (file_name : str , url : str ) -> pd .DataFrame :
11971216 """
11981217 This method cleans the dataframe loaded from a csv file format.
@@ -1236,28 +1255,38 @@ def _resolve_pe_11(file_name: str, url: str) -> pd.DataFrame:
def add_future_yearurls():
    """
    Load the configured download list and append URLs for newer years.

    Reads 'input_url.json' from _MODULE_DIR into the module-level
    _FILES_TO_DOWNLOAD list. Then, for every year from 2024 through 2029
    that is not later than the current calendar year, each URL template is
    probed with a HEAD request. A URL is appended to _FILES_TO_DOWNLOAD as
    {"download_path": url} only when the response status is 200 and its
    Content-Type contains 'text/csv'.

    Request failures are logged and skipped; they never abort the scan.

    Returns:
        None
    """
    global _FILES_TO_DOWNLOAD
    with open(os.path.join(_MODULE_DIR, 'input_url.json'), 'r') as input_file:
        _FILES_TO_DOWNLOAD = json.load(input_file)

    urls_to_scan = [
        "https://www2.census.gov/programs-surveys/popest/datasets/2020-{YEAR}/counties/asrh/cc-est{YEAR}-alldata.csv"
    ]

    # The current year does not change during the scan, so read it once.
    current_year = dt.now().year

    # Candidate years are 2024 to 2029; years still in the future are skipped.
    for future_year in range(2024, 2030):
        if future_year > current_year:
            continue

        for url_template in urls_to_scan:
            # Substitute every {YEAR} placeholder in the template.
            url_to_check = url_template.replace("{YEAR}", str(future_year))

            # Only the network call can raise RequestException, so keep the
            # try body limited to it.
            try:
                checkurl = requests.head(url_to_check,
                                         timeout=10,
                                         allow_redirects=True)
            except requests.exceptions.RequestException as err:
                logging.error(f"URL unreachable: {url_to_check} ({err})")
                continue

            # Media types are case-insensitive, so normalise before matching.
            # Requiring a CSV content type keeps an HTML page that is served
            # with status 200 out of the download list.
            content_type = checkurl.headers.get('Content-Type', '').lower()
            if checkurl.status_code == 200 and 'text/csv' in content_type:
                _FILES_TO_DOWNLOAD.append({"download_path": url_to_check})
            else:
                logging.info(
                    f"Data for {future_year} not yet available at {url_to_check}"
                )
12611290
12621291
12631292def download_files ():
@@ -1288,6 +1317,14 @@ def download_files():
12881317 headers = {'User-Agent' : 'Mozilla/5.0' }
12891318 response = requests .get (url , headers = headers , timeout = 60 )
12901319 response .raise_for_status ()
1320+
1321+ content_type = response .headers .get ('Content-Type' , '' ).lower ()
1322+ if 'text/html' in content_type :
1323+ logging .warning (
1324+ f"Server returned HTML for { url } . Skipping download." )
1325+ is_file_downloaded = True # Break the while loop
1326+ continue # Move to the next file in _FILES_TO_DOWNLOAD
1327+
12911328 if ".csv" in url :
12921329 if "st-est" in url or 'SC-EST' in url :
12931330 file_name = file_name .replace (".csv" , ".xlsx" )
0 commit comments