Skip to content
86 changes: 52 additions & 34 deletions scripts/us_census/pep/us_pep_sex/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,24 @@ def add_future_year_urls():
# Loop through years in reverse order from 2030 to 2023
for future_year in range(2030, 2022, -1): # From 2030 to 2023

# We check the National CSV first. If it's 404, the whole year is skipped.
gatekeeper_url = urls_to_scan[0].format(YEAR=future_year)
try:
# Use a short 5-second timeout for the check
response = requests.head(gatekeeper_url,
allow_redirects=True,
timeout=5)
if response.status_code != 200:
logging.info(
f"Skipping year {future_year}: National file not found (status code: {response.status_code})."
)
continue
except requests.exceptions.RequestException as e:
logging.warning(
f"Skipping year {future_year} due to an error checking the gatekeeper URL: {e}"
)
continue
Comment thread
niveditasing marked this conversation as resolved.

YEAR = future_year
# Loop through URLs
for url in urls_to_scan:
Expand Down Expand Up @@ -1242,7 +1260,7 @@ def download_files():
global _FILES_TO_DOWNLOAD
session = requests.session()

#Get set of already downloaded files
# Get set of already downloaded files
downloaded_files = set(os.listdir(_GCS_OUTPUT_PERSISTENT_PATH))

for file_to_download in _FILES_TO_DOWNLOAD:
Expand All @@ -1255,6 +1273,12 @@ def download_files():
else:
file_name_to_save = url.split('/')[-1]

                # Skip if the file was already downloaded; checking before the
                # request avoids an unnecessary network call.
if file_name_to_save in downloaded_files:
Comment thread
niveditasing marked this conversation as resolved.
logging.info(
Comment thread
niveditasing marked this conversation as resolved.
f"Skipping already downloaded file: {file_name_to_save}")
continue

headers = {'User-Agent': 'Mozilla/5.0'}
try:
with session.get(url, stream=True, timeout=120,
Expand All @@ -1263,45 +1287,39 @@ def download_files():

content_type = response.headers.get('Content-Type', '')

# Skip if file already exists
if file_name_to_save in downloaded_files:
logging.info(
f"Skipping already downloaded file: {file_name_to_save}"
                    # The server returned an HTML error page instead of data:
                    # log it and skip this file rather than aborting the run.
if 'html' in content_type.lower():
logging.error(
f"Server returned HTML error page for URL: {url}. Skipping."
)
continue
if 'html' in content_type.lower():
logging.fatal(
f"Server returned HTML error page for URL: {url}")
else:
if response.status_code == 200:
with tempfile.NamedTemporaryFile(
delete=False) as tmp_file:
# Stream the response into a temp file
for chunk in response.iter_content(chunk_size=8192):
if chunk:
tmp_file.write(chunk)
tmp_file_path = tmp_file.name

# Copy to local destination
shutil.copy(
tmp_file_path,
os.path.join(_INPUT_FILE_PATH, file_name_to_save))

# Copy to gcs destination
shutil.copy(
tmp_file_path,
os.path.join(_GCS_OUTPUT_PERSISTENT_PATH,
file_name_to_save))

# Optionally delete the temp file
os.remove(tmp_file_path)
file_to_download['is_downloaded'] = True
logging.info(f"Downloaded file: {url}")

if response.status_code == 200:
with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
tmp_file.write(chunk)
tmp_file_path = tmp_file.name

# Copy to local destination
shutil.copy(
Comment thread
niveditasing marked this conversation as resolved.
tmp_file_path,
os.path.join(_INPUT_FILE_PATH, file_name_to_save))

# Copy to gcs destination
shutil.copy(
tmp_file_path,
os.path.join(_GCS_OUTPUT_PERSISTENT_PATH,
file_name_to_save))

os.remove(tmp_file_path)
file_to_download['is_downloaded'] = True
logging.info(f"Downloaded file: {url}")

except Exception as e:
file_to_download['is_downloaded'] = False
logging.error(f"Error downloading {url}: {e}")
raise # re-raise to trigger @retry
raise
time.sleep(1)

return True
Expand Down
Loading