Skip to content
86 changes: 52 additions & 34 deletions scripts/us_census/pep/us_pep_sex/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,24 @@ def add_future_year_urls():
# Loop through years in reverse order from 2030 to 2023
for future_year in range(2030, 2022, -1): # From 2030 to 2023

# We check the National CSV first. If it's 404, the whole year is skipped.
gatekeeper_url = urls_to_scan[0].format(YEAR=future_year)
try:
# Use a short 5-second timeout for the check
response = requests.head(gatekeeper_url,
allow_redirects=True,
timeout=5)
if response.status_code != 200:
logging.info(
f"Skipping year {future_year}: National file not found (status code: {response.status_code})."
)
continue
except requests.exceptions.RequestException as e:
logging.warning(
f"Skipping year {future_year} due to an error checking the gatekeeper URL: {e}"
)
continue
Comment thread
niveditasing marked this conversation as resolved.

YEAR = future_year
# Loop through URLs
for url in urls_to_scan:
Expand Down Expand Up @@ -1242,7 +1260,7 @@ def download_files():
global _FILES_TO_DOWNLOAD
session = requests.session()

#Get set of already downloaded files
# Get set of already downloaded files
downloaded_files = set(os.listdir(_GCS_OUTPUT_PERSISTENT_PATH))

for file_to_download in _FILES_TO_DOWNLOAD:
Expand All @@ -1255,6 +1273,12 @@ def download_files():
else:
file_name_to_save = url.split('/')[-1]

                # Skip if the file was already downloaded; checking before the
                # request avoids an unnecessary network call.
if file_name_to_save in downloaded_files:
Comment thread
niveditasing marked this conversation as resolved.
logging.info(
Comment thread
niveditasing marked this conversation as resolved.
f"Skipping already downloaded file: {file_name_to_save}")
continue

headers = {'User-Agent': 'Mozilla/5.0'}
try:
with session.get(url, stream=True, timeout=120,
Expand All @@ -1263,45 +1287,39 @@ def download_files():

content_type = response.headers.get('Content-Type', '')

# Skip if file already exists
if file_name_to_save in downloaded_files:
logging.info(
f"Skipping already downloaded file: {file_name_to_save}"
                    # The server returned an HTML error page instead of data:
                    # log it and skip this file rather than aborting the run.
if 'html' in content_type.lower():
logging.error(
f"Server returned HTML error page for URL: {url}. Skipping."
)
continue
if 'html' in content_type.lower():
logging.fatal(
f"Server returned HTML error page for URL: {url}")
else:
if response.status_code == 200:
with tempfile.NamedTemporaryFile(
delete=False) as tmp_file:
# Stream the response into a temp file
for chunk in response.iter_content(chunk_size=8192):
if chunk:
tmp_file.write(chunk)
tmp_file_path = tmp_file.name

# Copy to local destination
shutil.copy(
tmp_file_path,
os.path.join(_INPUT_FILE_PATH, file_name_to_save))

# Copy to gcs destination
shutil.copy(
tmp_file_path,
os.path.join(_GCS_OUTPUT_PERSISTENT_PATH,
file_name_to_save))

# Optionally delete the temp file
os.remove(tmp_file_path)
file_to_download['is_downloaded'] = True
logging.info(f"Downloaded file: {url}")

if response.status_code == 200:
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
tmp_file.write(chunk)
tmp_file_path = tmp_file.name

# Copy to local destination
shutil.copy(
Comment thread
niveditasing marked this conversation as resolved.
tmp_file_path,
os.path.join(_INPUT_FILE_PATH, file_name_to_save))

# Copy to gcs destination
shutil.copy(
tmp_file_path,
os.path.join(_GCS_OUTPUT_PERSISTENT_PATH,
file_name_to_save))

os.remove(tmp_file_path)
file_to_download['is_downloaded'] = True
logging.info(f"Downloaded file: {url}")

except Exception as e:
file_to_download['is_downloaded'] = False
logging.error(f"Error downloading {url}: {e}")
raise # re-raise to trigger @retry
raise
time.sleep(1)

return True
Expand Down
Loading