Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import pandas as pd
import os
from absl import logging

_CODEDIR = os.path.dirname(os.path.realpath(__file__))

Expand Down Expand Up @@ -56,6 +57,11 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
index=False)

# providing proper column names
if len(df.columns) != 10:
logging.warning(
f"Skipping {file}: expected 10 cols, got {len(df.columns)}"
)
continue
df.columns = [
"Age", "All race total", "Count_Person_Male",
"Count_Person_Female", "White Total",
Expand Down Expand Up @@ -87,6 +93,11 @@ def process_national_1900_1970(ip_files: list) -> pd.DataFrame:
"nationals_result_1960_1979.csv",
index=False)
# providing proper column names
if len(df2.columns) != 13:
Comment thread
niveditasing marked this conversation as resolved.
logging.warning(
f"Skipping {file}: expected 13 cols, got {len(df2.columns)}"
)
continue
df2.columns = [
"Age", "All race total", "Count_Person_Male",
"Count_Person_Female", "White Total",
Expand Down
101 changes: 58 additions & 43 deletions scripts/us_census/pep/us_pep_sexrace/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,55 +128,70 @@ def downloadFiles(config_files: list, test=False):
os.system("mkdir -p " + os.path.join(_MODULE_DIR, __INPUTFILES))
try:
for config_file in config_files:
files = _get_urls(config_file, "urls", test)
if "national_1980_1990.json" in config_file:
process_national_1980_1990(files)
elif "national_1900_1970.json" in config_file:
process_national_1900_1970(files)
elif "national_1990_2000.json" in config_file:
process_national_1990_2000(files)
elif "national_2000_2010.json" in config_file:
process_national_2000_2010(files)
elif "national_2010_2020.json" in config_file:
process_national_2010_2020(files)
#Added for data refresh from 2020-2023
elif "national_2020_2022.json" in config_file:
process_national_2020_2022(files)
elif "state_1970_1979.json" in config_file:
process_state_1970_1979(files)
elif "state_1980_1990.json" in config_file:
process_state_1980_1990(files)
elif "state_1990_2000.json" in config_file:
process_state_1990_2000(files)
elif "state_2000_2010.json" in config_file:
process_state_2000_2010(files)
elif "state_2010_2020.json" in config_file:
process_state_2010_2020(files)
#Added for 2020-2023 part of data refresh
elif "state_2020_2022.json" in config_file:
process_state_2020_2022(files)
elif "county_1970_1979.json" in config_file:
process_county_1970_1979(files)
elif "county_1980_1989.json" in config_file:
process_county_1980_1989(files)
elif "county_1990_2000.json" in config_file:
process_county_1990_2000(files)
elif "county_2000_2009.json" in config_file:
process_county_2000_2009(files)
elif "county_2010_2020.json" in config_file:
process_county_2010_2020(files)
#added for 2020-2022 data refresh
elif "county_2020_2022.json" in config_file:
process_county_2020_2022(files)
try:
files = _get_urls(config_file, "urls", test)
if "national_1980_1990.json" in config_file:
process_national_1980_1990(files)
elif "national_1900_1970.json" in config_file:
process_national_1900_1970(files)
elif "national_1990_2000.json" in config_file:
process_national_1990_2000(files)
elif "national_2000_2010.json" in config_file:
process_national_2000_2010(files)
elif "national_2010_2020.json" in config_file:
process_national_2010_2020(files)
#Added for data refresh from 2020-2023
elif "national_2020_2022.json" in config_file:
process_national_2020_2022(files)
elif "state_1970_1979.json" in config_file:
process_state_1970_1979(files)
elif "state_1980_1990.json" in config_file:
process_state_1980_1990(files)
elif "state_1990_2000.json" in config_file:
process_state_1990_2000(files)
elif "state_2000_2010.json" in config_file:
process_state_2000_2010(files)
elif "state_2010_2020.json" in config_file:
process_state_2010_2020(files)
#Added for 2020-2023 part of data refresh
elif "state_2020_2022.json" in config_file:
process_state_2020_2022(files)
elif "county_1970_1979.json" in config_file:
process_county_1970_1979(files)
elif "county_1980_1989.json" in config_file:
process_county_1980_1989(files)
elif "county_1990_2000.json" in config_file:
process_county_1990_2000(files)
elif "county_2000_2009.json" in config_file:
process_county_2000_2009(files)
elif "county_2010_2020.json" in config_file:
process_county_2010_2020(files)
#added for 2020-2022 data refresh
elif "county_2020_2022.json" in config_file:
process_county_2020_2022(files)
except Exception as e:
logging.error(f"Failed to process {config_file}: {e}")

global _FILES_TO_DOWNLOAD
for file in _FILES_TO_DOWNLOAD:
file_name_to_save = None
url = file['download_path']
#Calling 2023 onwards methods
process_national_2020_2029(url)
process_county_2020_2029(url)
process_state_2020_2029(url)
try:
process_national_2020_2029(url)
except Exception as e:
logging.error(
f"Failed to process national 2020-2029 for {url}: {e}")
try:
process_county_2020_2029(url)
except Exception as e:
logging.error(
f"Failed to process county 2020-2029 for {url}: {e}")
try:
process_state_2020_2029(url)
except Exception as e:
logging.error(
f"Failed to process state 2020-2029 for {url}: {e}")
Comment thread
niveditasing marked this conversation as resolved.
except Exception as e:
logging.fatal(f"There is an error while downloading the files {e}")
Comment thread
niveditasing marked this conversation as resolved.

Expand Down
Comment thread
niveditasing marked this conversation as resolved.

This file was deleted.