Skip to content

Commit 378c775

Browse files
committed
slugify output file names; order import statements in userinterface
1 parent 09be2e7 commit 378c775

2 files changed

Lines changed: 61 additions & 53 deletions

File tree

requirements.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,7 @@ requests==2.25.1
1313
openpyxl==3.0.6
1414
dicttoxml==1.7.4
1515
lxml==4.6.3
16-
slugify=5.0.3
16+
python-slugify~=5.0.2
17+
electiondata~=2.0
18+
configparser~=5.0.2
19+
setuptools~=59.0.1

src/electiondata/userinterface/__init__.py

Lines changed: 57 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -4,30 +4,29 @@
44
DuplicateOptionError,
55
ParsingError,
66
)
7+
from csv import QUOTE_MINIMAL
8+
from inspect import currentframe
9+
import json
10+
from numpy import where
11+
from os import walk, listdir
12+
from os.path import join, isdir, isfile
13+
import pandas as pd
14+
from pathlib import Path
15+
import shutil
16+
from slugify import slugify
17+
from sqlalchemy.orm import Session
18+
from typing import Optional, Dict, Any, List
19+
import xlrd
20+
import xml.etree.ElementTree
721

22+
# local imports
823
from electiondata import (
924
database as db,
1025
munge as m,
1126
nist as nist,
1227
constants,
1328
)
1429

15-
from slugify import slugify
16-
from pandas import DataFrame, ExcelFile, json_normalize, read_csv, MultiIndex, read_excel, concat
17-
from pandas.errors import ParserError
18-
from os import walk, listdir
19-
from os.path import join, isdir, isfile
20-
from pathlib import Path
21-
from typing import Optional, Dict, Any, List
22-
from csv import QUOTE_MINIMAL
23-
from numpy import where
24-
from inspect import currentframe
25-
from xml.etree.ElementTree import parse
26-
from json import loads
27-
from shutil import move, copy
28-
from xlrd import open_workbook
29-
from sqlalchemy.orm import Session
30-
3130

3231
def find_dupes(df):
3332
dupes_df = df[df.duplicated()].drop_duplicates(keep="first")
@@ -128,14 +127,14 @@ def list_desired_excel_sheets(f_path: str, p: dict) -> (Optional[list], Optional
128127
try:
129128
# read an xlsx file
130129
# # nb: the following fails on VT 2020 files
131-
xl = ExcelFile(f_path)
130+
xl = pd.ExcelFile(f_path)
132131
all_sheets = xl.sheet_names
133132
# xlsx = openpyxl.load_workbook(f_path)
134133
# all_sheets = xlsx.get_sheet_names()
135134
except Exception:
136135
try:
137136
# read xls file
138-
xls = open_workbook(f_path, on_demand=True)
137+
xls = xlrd.open_workbook(f_path, on_demand=True)
139138
all_sheets = xls.sheet_names()
140139
except Exception as exc:
141140
err = add_new_error(
@@ -164,7 +163,7 @@ def read_single_datafile(
164163
aux: bool = False,
165164
driving_path: Optional[str] = None,
166165
lookup_id: Optional[str] = None,
167-
) -> (Dict[str, DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
166+
) -> (Dict[str, pd.DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
168167
"""Length of returned dictionary is the number of sheets read -- usually 1 except for multi-sheet Excel.
169168
Auxiliary files have different parameters (e.g., no count locations)"""
170169
err = None
@@ -202,7 +201,7 @@ def read_single_datafile(
202201
else:
203202
driver = nist.xml_count_parse_info(p, ignore_namespace=True)
204203
xml_path_info = nist.xml_string_path_info(p["munge_fields"], p["namespace"])
205-
tree = parse(f_path)
204+
tree = xml.etree.ElementTree.parse(f_path)
206205
df, err = nist.df_from_tree(
207206
tree,
208207
xml_path_info=xml_path_info,
@@ -216,8 +215,8 @@ def read_single_datafile(
216215
elif p["file_type"] in ["json-nested"]:
217216
# TODO what if json-nested is a lookup?
218217
with open(f_path, "r") as f:
219-
data = loads(f.read())
220-
df = json_normalize(data, **kwargs)
218+
data = json.loads(f.read())
219+
df = pd.json_normalize(data, **kwargs)
221220
if not fatal_error(err):
222221
df.rename(columns=rename, inplace=True)
223222
df_dict = {"Sheet1": df}
@@ -239,7 +238,7 @@ def read_single_datafile(
239238
df_dict = dict()
240239
elif p["file_type"] == "flat_text":
241240
try:
242-
df = read_csv(f_path, **kwargs)
241+
df = pd.read_csv(f_path, **kwargs)
243242
except ValueError as ve:
244243
print(
245244
f"ValueError (while reading flat text file), possibly from uneven record lengths: {ve}\n "
@@ -250,7 +249,7 @@ def read_single_datafile(
250249
kwargs_pad = kwargs
251250
kwargs_pad["index_col"] = None
252251
kwargs_pad["header"] = None
253-
df = read_csv(f_path, **kwargs_pad).fillna("")
252+
df = pd.read_csv(f_path, **kwargs_pad).fillna("")
254253
# set headers per munger
255254
header_int_or_list = tabular_kwargs(p, dict())["header"]
256255
if isinstance(
@@ -278,7 +277,7 @@ def read_single_datafile(
278277
row_constant_kwargs = get_row_constant_kwargs(
279278
kwargs, p["rows_with_constants"]
280279
)
281-
row_df = read_csv(f_path, **row_constant_kwargs)
280+
row_df = pd.read_csv(f_path, **row_constant_kwargs)
282281
row_constants["Sheet1"], new_err = build_row_constants_from_df(
283282
row_df, p["rows_with_constants"], file_name, "Sheet1"
284283
)
@@ -293,7 +292,7 @@ def read_single_datafile(
293292
else:
294293
# strip whitespace from column names # TODO handle same for multi-index columns
295294
for k in df_dict.keys():
296-
if not isinstance(df_dict[k].columns, MultiIndex):
295+
if not isinstance(df_dict[k].columns, pd.MultiIndex):
297296
df_dict[k].columns = [
298297
(c.strip() if isinstance(c, str) else c)
299298
for c in df_dict[k].columns
@@ -305,7 +304,7 @@ def read_single_datafile(
305304
except UnicodeDecodeError as ude:
306305
err_str = f"Encoding error. Datafile not read completely.\n\t{ude}"
307306
err = add_new_error(err, "file", file_name, err_str)
308-
except ParserError as pe:
307+
except pd.errors.ParserError as pe:
309308
# DFs have trouble comparing against None. So we return an empty DF and
310309
# check for emptiness below as an indication of an error.
311310
err_str = f"Error parsing results file.\n{pe}"
@@ -343,7 +342,7 @@ def excel_to_dict(
343342
kwargs: Dict[str, Any],
344343
sheet_list: Optional[List[str]],
345344
rows_to_read: List[int],
346-
) -> (Dict[str, DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
345+
) -> (Dict[str, pd.DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
347346
"""Returns dictionary of dataframes (one for each sheet), dictionary of dictionaries of constant values
348347
(one dictionary for each sheet) and error."""
349348
kwargs["index_col"] = None
@@ -369,15 +368,15 @@ def excel_to_dict(
369368
)
370369
for sheet in sheet_list:
371370
try:
372-
df_dict[sheet] = read_excel(f_path, **kwargs, sheet_name=sheet)
371+
df_dict[sheet] = pd.read_excel(f_path, **kwargs, sheet_name=sheet)
373372
# ignore any empty sheet
374373
if df_dict[sheet].empty:
375374
df_dict.pop(sheet)
376375
err = add_new_error(
377376
err, "file", file_name, f"No data read from sheet {sheet}"
378377
)
379378
except Exception as exc:
380-
df_dict[sheet] = DataFrame()
379+
df_dict[sheet] = pd.DataFrame()
381380
err = add_new_error(
382381
err,
383382
"warn-file",
@@ -387,7 +386,7 @@ def excel_to_dict(
387386

388387
try:
389388
if rows_to_read:
390-
row_constant_df = read_excel(
389+
row_constant_df = pd.read_excel(
391390
f_path, **row_constant_kwargs, sheet_name=sheet
392391
)
393392
row_constants[sheet], new_err = build_row_constants_from_df(
@@ -409,7 +408,7 @@ def excel_to_dict(
409408

410409

411410
def build_row_constants_from_df(
412-
df: DataFrame, rows_to_read: List[int], file_name: str, sheet: str
411+
df: pd.DataFrame, rows_to_read: List[int], file_name: str, sheet: str
413412
) -> (Dict[int, Any], Optional[dict]):
414413
"""Returns first entries in rows corresponding to row_list
415414
(as a dictionary with rows in row_list as keys)"""
@@ -454,7 +453,7 @@ def copy_directory_with_backup(
454453
if backup_suffix:
455454
# make backup of anything with existing name
456455
if isdir(copy_path):
457-
move(copy_path, f"{copy_path}{backup_suffix}")
456+
shutil.move(copy_path, f"{copy_path}{backup_suffix}")
458457
print(f"Moved {copy_path} to {copy_path}{backup_suffix}")
459458
elif isfile(copy_path):
460459
old_stem = Path(copy_path).stem
@@ -495,7 +494,7 @@ def copy_with_err_handling(
495494
old = join(root, f)
496495
new = join(new_root, f)
497496
try:
498-
copy(old, new)
497+
shutil.copy(old, new)
499498
print(f"Copied {old} to {new}")
500499
except Exception as she:
501500
if report_error:
@@ -696,9 +695,12 @@ def report(
696695

697696
# write info to a .errors file named for the name_key <nk>
698697
out_path = join(
699-
output_location, slugify(
700-
f"{file_prefix}_{et}_{nk_name}.errors",regex_pattern=r'[^ A-z0-9-_]+', lowercase=False
701-
)
698+
output_location,
699+
slugify(
700+
f"{file_prefix}_{et}_{nk_name}.errors",
701+
regex_pattern=r"[^ A-z0-9-_]+",
702+
lowercase=False,
703+
),
702704
)
703705
with open(out_path, "a") as f:
704706
f.write(out_str)
@@ -720,9 +722,12 @@ def report(
720722
# write info to a .warnings file named for the error-type and name_key
721723

722724
out_path = join(
723-
output_location, slugify(
724-
f"{file_prefix}_{et}_{nk_name}.warnings",regex_pattern=r'[^ A-z0-9-_]+', lowercase=False
725-
)
725+
output_location,
726+
slugify(
727+
f"{file_prefix}_{et}_{nk_name}.warnings",
728+
regex_pattern=r"[^ A-z0-9-_]+",
729+
lowercase=False,
730+
),
726731
)
727732
with open(out_path, "a") as f:
728733
f.write(out_str)
@@ -945,7 +950,7 @@ def get_filtered_input_options(
945950
"name": elections,
946951
"type": [None for election in elections],
947952
}
948-
df = DataFrame(data=dropdown_options)
953+
df = pd.DataFrame(data=dropdown_options)
949954
df["year"] = df["name"].str[:4]
950955
df["election_type"] = df["name"].str[5:]
951956
df.sort_values(
@@ -967,7 +972,7 @@ def get_filtered_input_options(
967972
"name": contest_types,
968973
"type": [None for contest_type in contest_types],
969974
}
970-
df = DataFrame(data=dropdown_options)
975+
df = pd.DataFrame(data=dropdown_options)
971976
elif menu_type == "contest":
972977
contest_type = list(set(constants.contest_types_model) & set(filters))[0]
973978

@@ -980,7 +985,7 @@ def get_filtered_input_options(
980985
connection.close()
981986

982987
# define input option for all contests of the given type
983-
contest_type_df = DataFrame(
988+
contest_type_df = pd.DataFrame(
984989
[
985990
{
986991
"parent": reporting_unit,
@@ -994,7 +999,7 @@ def get_filtered_input_options(
994999
contest_df = contest_df[contest_df["type"].isin(filters)].sort_values(
9951000
by=["parent", "type", "name"]
9961001
)
997-
df = concat([contest_type_df, contest_df])
1002+
df = pd.concat([contest_type_df, contest_df])
9981003
elif menu_type == "category":
9991004
election_id = db.list_to_id(session, "Election", filters)
10001005
jurisdiction_id = db.list_to_id(session, "ReportingUnit", filters)
@@ -1037,7 +1042,7 @@ def get_filtered_input_options(
10371042
+ [None for count_type in count_types]
10381043
+ [None for c in population],
10391044
}
1040-
df = DataFrame(data=dropdown_options)
1045+
df = pd.DataFrame(data=dropdown_options)
10411046
# check if it's looking for a count of contests
10421047
elif menu_type == "count" and bool([f for f in filters if f.startswith("Contest")]):
10431048
election_id = db.list_to_id(session, "Election", filters)
@@ -1116,7 +1121,7 @@ def get_filtered_input_options(
11161121
return package_display_results(df)
11171122

11181123

1119-
def package_display_results(data: DataFrame) -> List[Dict[str, Any]]:
1124+
def package_display_results(data: pd.DataFrame) -> List[Dict[str, Any]]:
11201125
"""takes a result set and packages into JSON to return.
11211126
Result set should already be ordered as desired for display
11221127
with display order controlled by "order_by" key"""
@@ -1202,17 +1207,17 @@ def clean_candidate_names(df):
12021207

12031208

12041209
def disambiguate_empty_cols(
1205-
df_in: DataFrame,
1210+
df_in: pd.DataFrame,
12061211
drop_empties: bool,
12071212
start: int = 0,
1208-
) -> DataFrame:
1213+
) -> pd.DataFrame:
12091214
"""Returns new df with empties dropped, or kept with non-blank placeholder info"""
12101215
original_number_of_columns = df_in.shape[1]
12111216
# set row index to default
12121217
df = df_in.reset_index(drop=True)
12131218

12141219
# put dummy info into the tops of the bad columns
1215-
# in order to meet MultiIndex uniqueness criteria
1220+
# in order to meet pd.MultiIndex uniqueness criteria
12161221
mask = df.eq("").loc[start:].all()
12171222
bad_column_numbers = [j for j in range(original_number_of_columns) if mask[j]]
12181223
for j in bad_column_numbers:
@@ -1228,14 +1233,14 @@ def disambiguate_empty_cols(
12281233

12291234

12301235
def set_and_fill_headers(
1231-
df_in: DataFrame,
1236+
df_in: pd.DataFrame,
12321237
header_list: Optional[list],
12331238
merged_cells: bool,
12341239
drop_empties: bool = True,
1235-
) -> DataFrame:
1240+
) -> pd.DataFrame:
12361241
# standardize the index to 0, 1, 2, ...
12371242
df = df_in.reset_index(drop=True)
1238-
# rename all blank header entries to match convention of read_excel
1243+
# rename all blank header entries to match convention of pd.read_excel
12391244
#
12401245
#
12411246
if header_list:

0 commit comments

Comments
 (0)