     DuplicateOptionError,
     ParsingError,
 )
+from csv import QUOTE_MINIMAL
+from inspect import currentframe
+import json
+from numpy import where
+from os import walk, listdir
+from os.path import join, isdir, isfile
+import pandas as pd
+from pathlib import Path
+import shutil
+from slugify import slugify
+from sqlalchemy.orm import Session
+from typing import Optional, Dict, Any, List
+import xlrd
+import xml.etree.ElementTree
 
+# local imports
 from electiondata import (
     database as db,
     munge as m,
     nist as nist,
     constants,
 )
 
-from slugify import slugify
-from pandas import DataFrame, ExcelFile, json_normalize, read_csv, MultiIndex, read_excel, concat
-from pandas.errors import ParserError
-from os import walk, listdir
-from os.path import join, isdir, isfile
-from pathlib import Path
-from typing import Optional, Dict, Any, List
-from csv import QUOTE_MINIMAL
-from numpy import where
-from inspect import currentframe
-from xml.etree.ElementTree import parse
-from json import loads
-from shutil import move, copy
-from xlrd import open_workbook
-from sqlalchemy.orm import Session
-
 
 def find_dupes(df):
     dupes_df = df[df.duplicated()].drop_duplicates(keep="first")
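
Note: the hunk above replaces name-by-name imports with module-level imports, so every
call site names its module. A minimal sketch of the convention (illustrative, not from
the diff):

    import pandas as pd

    # before: from pandas import DataFrame; df = DataFrame()
    df = pd.DataFrame()  # after: the pandas namespace is explicit at the call site
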
@@ -128,14 +127,14 @@ def list_desired_excel_sheets(f_path: str, p: dict) -> (Optional[list], Optional
     try:
         # read an xlsx file
         # # nb: the following fails on VT 2020 files
-        xl = ExcelFile(f_path)
+        xl = pd.ExcelFile(f_path)
         all_sheets = xl.sheet_names
         # xlsx = openpyxl.load_workbook(f_path)
         # all_sheets = xlsx.get_sheet_names()
     except Exception:
         try:
             # read xls file
-            xls = open_workbook(f_path, on_demand=True)
+            xls = xlrd.open_workbook(f_path, on_demand=True)
             all_sheets = xls.sheet_names()
         except Exception as exc:
             err = add_new_error(
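
Note: a minimal sketch of the fallback above; pd.ExcelFile handles .xlsx, and
xlrd.open_workbook is tried second for legacy .xls (f_path as in the function):

    import pandas as pd
    import xlrd

    try:
        all_sheets = pd.ExcelFile(f_path).sheet_names  # modern reader first
    except Exception:
        # legacy .xls fallback; on_demand avoids loading every sheet eagerly
        all_sheets = xlrd.open_workbook(f_path, on_demand=True).sheet_names()
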
@@ -164,7 +163,7 @@ def read_single_datafile(
     aux: bool = False,
     driving_path: Optional[str] = None,
     lookup_id: Optional[str] = None,
-) -> (Dict[str, DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
+) -> (Dict[str, pd.DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
     """Length of returned dictionary is the number of sheets read -- usually 1 except for multi-sheet Excel.
     Auxiliary files have different parameters (e.g., no count locations)"""
     err = None
@@ -202,7 +201,7 @@ def read_single_datafile(
         else:
             driver = nist.xml_count_parse_info(p, ignore_namespace=True)
             xml_path_info = nist.xml_string_path_info(p["munge_fields"], p["namespace"])
-            tree = parse(f_path)
+            tree = xml.etree.ElementTree.parse(f_path)
             df, err = nist.df_from_tree(
                 tree,
                 xml_path_info=xml_path_info,
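
Note: xml.etree.ElementTree.parse reads the whole file into a tree before
nist.df_from_tree walks it. A hedged sketch of the parse step, with a hypothetical
results file and element name:

    import xml.etree.ElementTree

    tree = xml.etree.ElementTree.parse("results.xml")  # hypothetical path
    for elt in tree.getroot().iter("VoteCount"):  # element name is an assumption
        print(elt.attrib)
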
@@ -216,8 +215,8 @@ def read_single_datafile(
         elif p["file_type"] in ["json-nested"]:
             # TODO what if json-nested is a lookup?
             with open(f_path, "r") as f:
-                data = loads(f.read())
-                df = json_normalize(data, **kwargs)
+                data = json.loads(f.read())
+                df = pd.json_normalize(data, **kwargs)
             if not fatal_error(err):
                 df.rename(columns=rename, inplace=True)
             df_dict = {"Sheet1": df}
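
Note: pd.json_normalize flattens nested JSON objects into dot-separated columns.
A minimal sketch with toy data (not from the diff):

    import json
    import pandas as pd

    data = json.loads('[{"county": "Fulton", "votes": {"early": 10, "day_of": 25}}]')
    df = pd.json_normalize(data)  # columns: county, votes.early, votes.day_of
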
@@ -239,7 +238,7 @@ def read_single_datafile(
             df_dict = dict()
         elif p["file_type"] == "flat_text":
             try:
-                df = read_csv(f_path, **kwargs)
+                df = pd.read_csv(f_path, **kwargs)
             except ValueError as ve:
                 print(
                     f"ValueError (while reading flat text file), possibly from uneven record lengths: {ve}\n"
@@ -250,7 +249,7 @@ def read_single_datafile(
                 kwargs_pad = kwargs
                 kwargs_pad["index_col"] = None
                 kwargs_pad["header"] = None
-                df = read_csv(f_path, **kwargs_pad).fillna("")
+                df = pd.read_csv(f_path, **kwargs_pad).fillna("")
                 # set headers per munger
                 header_int_or_list = tabular_kwargs(p, dict())["header"]
                 if isinstance(
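
Note: the fallback above re-reads with header=None and index_col=None so pandas pads
short records rather than rejecting them, then fillna("") blanks the padding.
A sketch of that retry as a hypothetical helper:

    import pandas as pd

    def read_padded(f_path, **kwargs):
        kwargs.update(index_col=None, header=None)  # drop header/index assumptions
        return pd.read_csv(f_path, **kwargs).fillna("")
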
@@ -278,7 +277,7 @@ def read_single_datafile(
                 row_constant_kwargs = get_row_constant_kwargs(
                     kwargs, p["rows_with_constants"]
                 )
-                row_df = read_csv(f_path, **row_constant_kwargs)
+                row_df = pd.read_csv(f_path, **row_constant_kwargs)
                 row_constants["Sheet1"], new_err = build_row_constants_from_df(
                     row_df, p["rows_with_constants"], file_name, "Sheet1"
                 )
@@ -293,7 +292,7 @@ def read_single_datafile(
         else:
             # strip whitespace from column names  # TODO handle same for multi-index columns
             for k in df_dict.keys():
-                if not isinstance(df_dict[k].columns, MultiIndex):
+                if not isinstance(df_dict[k].columns, pd.MultiIndex):
                     df_dict[k].columns = [
                         (c.strip() if isinstance(c, str) else c)
                         for c in df_dict[k].columns
@@ -305,7 +304,7 @@ def read_single_datafile(
     except UnicodeDecodeError as ude:
         err_str = f"Encoding error. Datafile not read completely.\n\t{ude}"
         err = add_new_error(err, "file", file_name, err_str)
-    except ParserError as pe:
+    except pd.errors.ParserError as pe:
         # DFs have trouble comparing against None. So we return an empty DF and
         # check for emptiness below as an indication of an error.
         err_str = f"Error parsing results file.\n{pe}"
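
Note: ParserError is now caught through the pd.errors namespace. A minimal sketch of
the pattern (hypothetical path):

    import pandas as pd

    try:
        df = pd.read_csv("results.txt", sep="\t")
    except pd.errors.ParserError:
        df = pd.DataFrame()  # empty frame signals failure to the caller
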
@@ -343,7 +342,7 @@ def excel_to_dict(
     kwargs: Dict[str, Any],
     sheet_list: Optional[List[str]],
     rows_to_read: List[int],
-) -> (Dict[str, DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
+) -> (Dict[str, pd.DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
     """Returns dictionary of dataframes (one for each sheet), dictionary of dictionaries of constant values
     (one dictionary for each sheet) and error."""
     kwargs["index_col"] = None
@@ -369,15 +368,15 @@ def excel_to_dict(
     )
     for sheet in sheet_list:
         try:
-            df_dict[sheet] = read_excel(f_path, **kwargs, sheet_name=sheet)
+            df_dict[sheet] = pd.read_excel(f_path, **kwargs, sheet_name=sheet)
             # ignore any empty sheet
             if df_dict[sheet].empty:
                 df_dict.pop(sheet)
                 err = add_new_error(
                     err, "file", file_name, f"No data read from sheet {sheet}"
                 )
         except Exception as exc:
-            df_dict[sheet] = DataFrame()
+            df_dict[sheet] = pd.DataFrame()
             err = add_new_error(
                 err,
                 "warn-file",
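
Note: pd.read_excel(f_path, sheet_name=None) would return every sheet at once as a
dict of DataFrames; the explicit loop above trades that one-liner for per-sheet error
handling and empty-sheet warnings. The one-call form, for comparison:

    import pandas as pd

    all_sheets = pd.read_excel(f_path, sheet_name=None)  # {sheet name: DataFrame}
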
@@ -387,7 +386,7 @@ def excel_to_dict(
 
         try:
             if rows_to_read:
-                row_constant_df = read_excel(
+                row_constant_df = pd.read_excel(
                     f_path, **row_constant_kwargs, sheet_name=sheet
                 )
                 row_constants[sheet], new_err = build_row_constants_from_df(
@@ -409,7 +408,7 @@ def excel_to_dict(
 
 
 def build_row_constants_from_df(
-    df: DataFrame, rows_to_read: List[int], file_name: str, sheet: str
+    df: pd.DataFrame, rows_to_read: List[int], file_name: str, sheet: str
 ) -> (Dict[int, Any], Optional[dict]):
     """Returns first entries in rows corresponding to row_list
     (as a dictionary with rows in row_list as keys)"""
@@ -454,7 +453,7 @@ def copy_directory_with_backup(
     if backup_suffix:
         # make backup of anything with existing name
         if isdir(copy_path):
-            move(copy_path, f"{copy_path}{backup_suffix}")
+            shutil.move(copy_path, f"{copy_path}{backup_suffix}")
             print(f"Moved {copy_path} to {copy_path}{backup_suffix}")
         elif isfile(copy_path):
             old_stem = Path(copy_path).stem
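
Note: the backup pattern above moves any existing directory aside before copying.
A self-contained sketch with hypothetical values:

    import shutil
    from os.path import isdir

    copy_path, backup_suffix = "/tmp/results", ".bak"  # hypothetical values
    if isdir(copy_path):
        shutil.move(copy_path, f"{copy_path}{backup_suffix}")  # keep the old copy
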
@@ -495,7 +494,7 @@ def copy_with_err_handling(
             old = join(root, f)
             new = join(new_root, f)
             try:
-                copy(old, new)
+                shutil.copy(old, new)
                 print(f"Copied {old} to {new}")
             except Exception as she:
                 if report_error:
@@ -696,9 +695,12 @@ def report(
 
         # write info to a .errors or .errors file named for the name_key <nk>
         out_path = join(
-            output_location, slugify(
-                f"{file_prefix}_{et}_{nk_name}.errors",regex_pattern=r'[^ A-z0-9-_]+', lowercase=False
-            )
+            output_location,
+            slugify(
+                f"{file_prefix}_{et}_{nk_name}.errors",
+                regex_pattern=r"[^ A-z0-9-_]+",
+                lowercase=False,
+            ),
         )
         with open(out_path, "a") as f:
             f.write(out_str)
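
Note: slugify treats characters matching regex_pattern as disallowed, and
lowercase=False preserves the case of the name key. Beware that the class A-z also
admits the ASCII characters between Z and a (backslash, brackets, caret, underscore,
backtick); A-Za-z would be stricter. A hedged sketch (output not verified):

    from slugify import slugify

    name = slugify(
        "GA_general .errors",  # hypothetical name_key-derived string
        regex_pattern=r"[^ A-z0-9-_]+",  # runs of disallowed chars get replaced
        lowercase=False,
    )
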
@@ -720,9 +722,12 @@ def report(
     # write info to a .warnings file named for the error-type and name_key
 
     out_path = join(
-        output_location, slugify(
-            f"{file_prefix}_{et}_{nk_name}.warnings",regex_pattern=r'[^ A-z0-9-_]+', lowercase=False
-        )
+        output_location,
+        slugify(
+            f"{file_prefix}_{et}_{nk_name}.warnings",
+            regex_pattern=r"[^ A-z0-9-_]+",
+            lowercase=False,
+        ),
     )
     with open(out_path, "a") as f:
         f.write(out_str)
@@ -945,7 +950,7 @@ def get_filtered_input_options(
             "name": elections,
             "type": [None for election in elections],
         }
-        df = DataFrame(data=dropdown_options)
+        df = pd.DataFrame(data=dropdown_options)
         df["year"] = df["name"].str[:4]
         df["election_type"] = df["name"].str[5:]
         df.sort_values(
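
Note: a minimal sketch of this dict-of-lists construction and the string slicing that
follows it, with toy data:

    import pandas as pd

    df = pd.DataFrame(data={"name": ["2020 General"], "type": [None]})
    df["year"] = df["name"].str[:4]  # "2020"
    df["election_type"] = df["name"].str[5:]  # "General"
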
@@ -967,7 +972,7 @@ def get_filtered_input_options(
             "name": contest_types,
             "type": [None for contest_type in contest_types],
         }
-        df = DataFrame(data=dropdown_options)
+        df = pd.DataFrame(data=dropdown_options)
     elif menu_type == "contest":
         contest_type = list(set(constants.contest_types_model) & set(filters))[0]
 
@@ -980,7 +985,7 @@ def get_filtered_input_options(
         connection.close()
 
         # define input option for all contests of the given type
-        contest_type_df = DataFrame(
+        contest_type_df = pd.DataFrame(
             [
                 {
                     "parent": reporting_unit,
@@ -994,7 +999,7 @@ def get_filtered_input_options(
994999 contest_df = contest_df [contest_df ["type" ].isin (filters )].sort_values (
9951000 by = ["parent" , "type" , "name" ]
9961001 )
997- df = concat ([contest_type_df , contest_df ])
1002+ df = pd . concat ([contest_type_df , contest_df ])
9981003 elif menu_type == "category" :
9991004 election_id = db .list_to_id (session , "Election" , filters )
10001005 jurisdiction_id = db .list_to_id (session , "ReportingUnit" , filters )
@@ -1037,7 +1042,7 @@ def get_filtered_input_options(
             + [None for count_type in count_types]
             + [None for c in population],
         }
-        df = DataFrame(data=dropdown_options)
+        df = pd.DataFrame(data=dropdown_options)
     # check if it's looking for a count of contests
     elif menu_type == "count" and bool([f for f in filters if f.startswith("Contest")]):
         election_id = db.list_to_id(session, "Election", filters)
@@ -1116,7 +1121,7 @@ def get_filtered_input_options(
     return package_display_results(df)
 
 
-def package_display_results(data: DataFrame) -> List[Dict[str, Any]]:
+def package_display_results(data: pd.DataFrame) -> List[Dict[str, Any]]:
     """takes a result set and packages into JSON to return.
     Result set should already be ordered as desired for display
     with display order controlled by "order_by" key"""
@@ -1202,17 +1207,17 @@ def clean_candidate_names(df):
 
 
 def disambiguate_empty_cols(
-    df_in: DataFrame,
+    df_in: pd.DataFrame,
     drop_empties: bool,
     start: int = 0,
-) -> DataFrame:
+) -> pd.DataFrame:
     """Returns new df with empties dropped, or kept with non-blank placeholder info"""
     original_number_of_columns = df_in.shape[1]
     # set row index to default
     df = df_in.reset_index(drop=True)
 
     # put dummy info into the tops of the bad columns
-    # in order to meet MultiIndex uniqueness criteria
+    # in order to meet pd.MultiIndex uniqueness criteria
     mask = df.eq("").loc[start:].all()
     bad_column_numbers = [j for j in range(original_number_of_columns) if mask[j]]
     for j in bad_column_numbers:
@@ -1228,14 +1233,14 @@ def disambiguate_empty_cols(
 
 
 def set_and_fill_headers(
-    df_in: DataFrame,
+    df_in: pd.DataFrame,
     header_list: Optional[list],
     merged_cells: bool,
     drop_empties: bool = True,
-) -> DataFrame:
+) -> pd.DataFrame:
     # standardize the index to 0, 1, 2, ...
     df = df_in.reset_index(drop=True)
-    # rename all blank header entries to match convention of read_excel
+    # rename all blank header entries to match convention of pd.read_excel
     #
     #
     if header_list:
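
Note: the "convention of pd.read_excel" mentioned in the comment is the "Unnamed: <n>"
label pandas gives blank header cells. A sketch of matching it by hand:

    header = ["county", "", "votes"]
    filled = [h if h else f"Unnamed: {j}" for j, h in enumerate(header)]
    # filled == ["county", "Unnamed: 1", "votes"]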