Commit 09be2e7

slugify output file names; make import statements more efficient
1 parent 5295b41 commit 09be2e7

2 files changed: 33 additions & 38 deletions

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ requests==2.25.1
 openpyxl==3.0.6
 dicttoxml==1.7.4
 lxml==4.6.3
+slugify==5.0.3
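
The call sites that actually slugify the output file names are not part of this diff, so the following is only a minimal sketch of the intended effect, assuming the python-slugify API that `from slugify import slugify` provides (the sample name and output path are hypothetical):

    from slugify import slugify

    # slugify() lower-cases, strips punctuation, and joins words with hyphens,
    # so the result is safe as a file name across operating systems.
    raw_name = "St. Mary's Parish: 2020 General Election"
    out_file = f"{slugify(raw_name)}.csv"
    print(out_file)  # st-mary-s-parish-2020-general-election.csv
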

src/electiondata/userinterface/__init__.py

Lines changed: 32 additions & 38 deletions
@@ -13,7 +13,7 @@
 )

 from slugify import slugify
-import pandas as pd
+from pandas import DataFrame, ExcelFile, json_normalize, read_csv, MultiIndex, read_excel, concat
 from pandas.errors import ParserError
 from os import walk, listdir
 from os.path import join, isdir, isfile
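
A note on the import change above: `from pandas import DataFrame, ...` still loads the whole pandas package, so start-up cost is unchanged; the measurable effect, if any, is the avoided `pd.` attribute lookup at each use. A minimal sketch, using only the standard-library `timeit`, of how one could check that difference:

    import timeit

    # pd.DataFrame performs a module-attribute lookup on every reference;
    # a from-import binds the class directly into the using namespace.
    t_attr = timeit.timeit("pd.DataFrame", setup="import pandas as pd", number=1_000_000)
    t_name = timeit.timeit("DataFrame", setup="from pandas import DataFrame", number=1_000_000)
    print(f"attribute lookup: {t_attr:.3f}s   direct name: {t_name:.3f}s")
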
@@ -24,17 +24,11 @@
 from inspect import currentframe
 from xml.etree.ElementTree import parse
 from json import loads
-import shutil
-import xlrd
-
-# may need for certain excel imports: import openpyxl
+from shutil import move, copy
+from xlrd import open_workbook
 from sqlalchemy.orm import Session


-# mapping from internal database reportingunit types to the user-facing contest types
-# (contests are categorized by the reporting unit type of their corresponding districts)
-
-
 def find_dupes(df):
     dupes_df = df[df.duplicated()].drop_duplicates(keep="first")
     deduped = df.drop_duplicates(keep="first")
@@ -60,9 +54,9 @@ def json_kwargs(
         else:
             json_rename[path_list[-1]] = mf
     meta = list(list(t) for t in meta_set)
-    json_kwargs = {"meta": meta, "record_path": record_path, "errors": "ignore"}
+    j_kwargs = {"meta": meta, "record_path": record_path, "errors": "ignore"}

-    return json_kwargs, json_rename
+    return j_kwargs, json_rename


 def tabular_kwargs(
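
The `json_kwargs` to `j_kwargs` rename above removes a local variable that shadowed the enclosing function's own name. A minimal, hypothetical reduction of the hazard (not code from this repo):

    def json_kwargs(meta):
        # Assigning to the function's own name makes it a local variable:
        # referencing json_kwargs before this line raises UnboundLocalError,
        # and calling json_kwargs(...) after it raises TypeError, because the
        # name is now bound to a dict rather than to the function.
        json_kwargs = {"meta": meta, "errors": "ignore"}
        return json_kwargs
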
@@ -134,14 +128,14 @@ def list_desired_excel_sheets(f_path: str, p: dict) -> (Optional[list], Optional
     try:
         # read an xlsx file
         # # nb: the following fails on VT 2020 files
-        xl = pd.ExcelFile(f_path)
+        xl = ExcelFile(f_path)
         all_sheets = xl.sheet_names
         # xlsx = openpyxl.load_workbook(f_path)
         # all_sheets = xlsx.get_sheet_names()
-    except Exception as exc:
+    except Exception:
         try:
             # read xls file
-            xls = xlrd.open_workbook(f_path, on_demand=True)
+            xls = open_workbook(f_path, on_demand=True)
             all_sheets = xls.sheet_names()
         except Exception as exc:
             err = add_new_error(
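
In the first handler above the exception object was never read, so the commit drops the `as exc` binding; the inner handler keeps it because `exc` feeds the error report. A small, self-contained illustration of when each form fits (hypothetical function, not from this repo):

    def parse_count(text: str) -> int:
        try:
            return int(text)
        except ValueError:  # error object unused, so no binding
            pass
        try:
            return int(float(text))
        except ValueError as exc:  # bound because the handler reports it
            print(f"could not parse {text!r}: {exc}")
            return 0
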
@@ -170,7 +164,7 @@ def read_single_datafile(
     aux: bool = False,
     driving_path: Optional[str] = None,
     lookup_id: Optional[str] = None,
-) -> (Dict[str, pd.DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
+) -> (Dict[str, DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
     """Length of returned dictionary is the number of sheets read -- usually 1 except for multi-sheet Excel.
     Auxiliary files have different parameters (e.g., no count locations)"""
     err = None
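
The changed annotation keeps the file's existing parenthesized-tuple style, which Python tolerates at runtime but type checkers do not read as a tuple type. If checker support were wanted, the PEP 484 spelling would be the following (a sketch, not a change this commit makes):

    from typing import Any, Dict, Optional, Tuple
    from pandas import DataFrame

    # PEP 484 equivalent of (Dict[str, DataFrame], Dict[str, Dict[int, Any]], Optional[dict])
    ReadResult = Tuple[Dict[str, DataFrame], Dict[str, Dict[int, Any]], Optional[dict]]
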
@@ -223,7 +217,7 @@ def read_single_datafile(
         # TODO what if json-nested is a lookup?
         with open(f_path, "r") as f:
             data = loads(f.read())
-        df = pd.json_normalize(data, **kwargs)
+        df = json_normalize(data, **kwargs)
         if not fatal_error(err):
             df.rename(columns=rename, inplace=True)
             df_dict = {"Sheet1": df}

@@ -245,7 +239,7 @@
         df_dict = dict()
     elif p["file_type"] == "flat_text":
         try:
-            df = pd.read_csv(f_path, **kwargs)
+            df = read_csv(f_path, **kwargs)
         except ValueError as ve:
             print(
                 f"ValueError (while reading flat text file), possibly from uneven record lengths: {ve}\n "

@@ -256,7 +250,7 @@
             kwargs_pad = kwargs
             kwargs_pad["index_col"] = None
             kwargs_pad["header"] = None
-            df = pd.read_csv(f_path, **kwargs_pad).fillna("")
+            df = read_csv(f_path, **kwargs_pad).fillna("")
             # set headers per munger
             header_int_or_list = tabular_kwargs(p, dict())["header"]
             if isinstance(

@@ -284,7 +278,7 @@
             row_constant_kwargs = get_row_constant_kwargs(
                 kwargs, p["rows_with_constants"]
             )
-            row_df = pd.read_csv(f_path, **row_constant_kwargs)
+            row_df = read_csv(f_path, **row_constant_kwargs)
             row_constants["Sheet1"], new_err = build_row_constants_from_df(
                 row_df, p["rows_with_constants"], file_name, "Sheet1"
             )

@@ -299,7 +293,7 @@
     else:
         # strip whitespace from column names # TODO handle same for multi-index columns
         for k in df_dict.keys():
-            if not isinstance(df_dict[k].columns, pd.MultiIndex):
+            if not isinstance(df_dict[k].columns, MultiIndex):
                 df_dict[k].columns = [
                     (c.strip() if isinstance(c, str) else c)
                     for c in df_dict[k].columns

@@ -349,7 +343,7 @@ def excel_to_dict(
     kwargs: Dict[str, Any],
     sheet_list: Optional[List[str]],
     rows_to_read: List[int],
-) -> (Dict[str, pd.DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
+) -> (Dict[str, DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
     """Returns dictionary of dataframes (one for each sheet), dictionary of dictionaries of constant values
     (one dictionary for each sheet) and error."""
     kwargs["index_col"] = None

@@ -375,15 +369,15 @@
     )
     for sheet in sheet_list:
         try:
-            df_dict[sheet] = pd.read_excel(f_path, **kwargs, sheet_name=sheet)
+            df_dict[sheet] = read_excel(f_path, **kwargs, sheet_name=sheet)
             # ignore any empty sheet
             if df_dict[sheet].empty:
                 df_dict.pop(sheet)
                 err = add_new_error(
                     err, "file", file_name, f"No data read from sheet {sheet}"
                 )
         except Exception as exc:
-            df_dict[sheet] = pd.DataFrame()
+            df_dict[sheet] = DataFrame()
             err = add_new_error(
                 err,
                 "warn-file",

@@ -393,7 +387,7 @@

         try:
             if rows_to_read:
-                row_constant_df = pd.read_excel(
+                row_constant_df = read_excel(
                     f_path, **row_constant_kwargs, sheet_name=sheet
                 )
                 row_constants[sheet], new_err = build_row_constants_from_df(

@@ -415,7 +409,7 @@


 def build_row_constants_from_df(
-    df: pd.DataFrame, rows_to_read: List[int], file_name: str, sheet: str
+    df: DataFrame, rows_to_read: List[int], file_name: str, sheet: str
 ) -> (Dict[int, Any], Optional[dict]):
     """Returns first entries in rows corresponding to row_list
     (as a dictionary with rows in row_list as keys)"""

@@ -460,7 +454,7 @@ def copy_directory_with_backup(
     if backup_suffix:
         # make backup of anything with existing name
         if isdir(copy_path):
-            shutil.move(copy_path, f"{copy_path}{backup_suffix}")
+            move(copy_path, f"{copy_path}{backup_suffix}")
             print(f"Moved {copy_path} to {copy_path}{backup_suffix}")
         elif isfile(copy_path):
             old_stem = Path(copy_path).stem

@@ -501,7 +495,7 @@ def copy_with_err_handling(
             old = join(root, f)
             new = join(new_root, f)
             try:
-                shutil.copy(old, new)
+                copy(old, new)
                 print(f"Copied {old} to {new}")
             except Exception as she:
                 if report_error:

@@ -951,7 +945,7 @@ def get_filtered_input_options(
             "name": elections,
             "type": [None for election in elections],
         }
-        df = pd.DataFrame(data=dropdown_options)
+        df = DataFrame(data=dropdown_options)
         df["year"] = df["name"].str[:4]
         df["election_type"] = df["name"].str[5:]
         df.sort_values(

@@ -973,7 +967,7 @@
             "name": contest_types,
             "type": [None for contest_type in contest_types],
         }
-        df = pd.DataFrame(data=dropdown_options)
+        df = DataFrame(data=dropdown_options)
     elif menu_type == "contest":
         contest_type = list(set(constants.contest_types_model) & set(filters))[0]

@@ -986,7 +980,7 @@
         connection.close()

         # define input option for all contests of the given type
-        contest_type_df = pd.DataFrame(
+        contest_type_df = DataFrame(
             [
                 {
                     "parent": reporting_unit,

@@ -1000,7 +994,7 @@
         contest_df = contest_df[contest_df["type"].isin(filters)].sort_values(
             by=["parent", "type", "name"]
         )
-        df = pd.concat([contest_type_df, contest_df])
+        df = concat([contest_type_df, contest_df])
     elif menu_type == "category":
         election_id = db.list_to_id(session, "Election", filters)
         jurisdiction_id = db.list_to_id(session, "ReportingUnit", filters)

@@ -1043,7 +1037,7 @@
             + [None for count_type in count_types]
             + [None for c in population],
         }
-        df = pd.DataFrame(data=dropdown_options)
+        df = DataFrame(data=dropdown_options)
     # check if it's looking for a count of contests
     elif menu_type == "count" and bool([f for f in filters if f.startswith("Contest")]):
         election_id = db.list_to_id(session, "Election", filters)

@@ -1122,7 +1116,7 @@
     return package_display_results(df)


-def package_display_results(data: pd.DataFrame) -> List[Dict[str, Any]]:
+def package_display_results(data: DataFrame) -> List[Dict[str, Any]]:
     """takes a result set and packages into JSON to return.
     Result set should already be ordered as desired for display
     with display order controlled by "order_by" key"""

@@ -1208,10 +1202,10 @@ def clean_candidate_names(df):


 def disambiguate_empty_cols(
-    df_in: pd.DataFrame,
+    df_in: DataFrame,
     drop_empties: bool,
     start: int = 0,
-) -> pd.DataFrame:
+) -> DataFrame:
     """Returns new df with empties dropped, or kept with non-blank placeholder info"""
     original_number_of_columns = df_in.shape[1]
     # set row index to default

@@ -1234,14 +1228,14 @@


 def set_and_fill_headers(
-    df_in: pd.DataFrame,
+    df_in: DataFrame,
     header_list: Optional[list],
     merged_cells: bool,
     drop_empties: bool = True,
-) -> pd.DataFrame:
+) -> DataFrame:
     # standardize the index to 0, 1, 2, ...
     df = df_in.reset_index(drop=True)
-    # rename all blank header entries to match convention of pd.read_excel
+    # rename all blank header entries to match convention of read_excel
     #
     #
     if header_list:
