1313)
1414
1515from slugify import slugify
16- import pandas as pd
16+ from pandas import DataFrame , ExcelFile , json_normalize , read_csv , MultiIndex , read_excel , concat
1717from pandas .errors import ParserError
1818from os import walk , listdir
1919from os .path import join , isdir , isfile
2424from inspect import currentframe
2525from xml .etree .ElementTree import parse
2626from json import loads
27- import shutil
28- import xlrd
29-
30- # may need for certain excel imports: import openpyxl
27+ from shutil import move , copy
28+ from xlrd import open_workbook
3129from sqlalchemy .orm import Session
3230
3331
34- # mapping from internal database reportingunit types to the user-facing contest types
35- # (contests are categorized by the reporting unit type of their corresponding districts)
36-
37-
3832def find_dupes (df ):
3933 dupes_df = df [df .duplicated ()].drop_duplicates (keep = "first" )
4034 deduped = df .drop_duplicates (keep = "first" )
@@ -60,9 +54,9 @@ def json_kwargs(
6054 else :
6155 json_rename [path_list [- 1 ]] = mf
6256 meta = list (list (t ) for t in meta_set )
63- json_kwargs = {"meta" : meta , "record_path" : record_path , "errors" : "ignore" }
57+ j_kwargs = {"meta" : meta , "record_path" : record_path , "errors" : "ignore" }
6458
65- return json_kwargs , json_rename
59+ return j_kwargs , json_rename
6660
6761
6862def tabular_kwargs (
@@ -134,14 +128,14 @@ def list_desired_excel_sheets(f_path: str, p: dict) -> (Optional[list], Optional
134128 try :
135129 # read an xlsx file
136130 # # nb: the following fails on VT 2020 files
137- xl = pd . ExcelFile (f_path )
131+ xl = ExcelFile (f_path )
138132 all_sheets = xl .sheet_names
139133 # xlsx = openpyxl.load_workbook(f_path)
140134 # all_sheets = xlsx.get_sheet_names()
141- except Exception as exc :
135+ except Exception :
142136 try :
143137 # read xls file
144- xls = xlrd . open_workbook (f_path , on_demand = True )
138+ xls = open_workbook (f_path , on_demand = True )
145139 all_sheets = xls .sheet_names ()
146140 except Exception as exc :
147141 err = add_new_error (
@@ -170,7 +164,7 @@ def read_single_datafile(
170164 aux : bool = False ,
171165 driving_path : Optional [str ] = None ,
172166 lookup_id : Optional [str ] = None ,
173- ) -> (Dict [str , pd . DataFrame ], Dict [str , Dict [int , Any ]], Optional [dict ]):
167+ ) -> (Dict [str , DataFrame ], Dict [str , Dict [int , Any ]], Optional [dict ]):
174168 """Length of returned dictionary is the number of sheets read -- usually 1 except for multi-sheet Excel.
175169 Auxiliary files have different parameters (e.g., no count locations)"""
176170 err = None
@@ -223,7 +217,7 @@ def read_single_datafile(
223217 # TODO what if json-nested is a lookup?
224218 with open (f_path , "r" ) as f :
225219 data = loads (f .read ())
226- df = pd . json_normalize (data , ** kwargs )
220+ df = json_normalize (data , ** kwargs )
227221 if not fatal_error (err ):
228222 df .rename (columns = rename , inplace = True )
229223 df_dict = {"Sheet1" : df }
@@ -245,7 +239,7 @@ def read_single_datafile(
245239 df_dict = dict ()
246240 elif p ["file_type" ] == "flat_text" :
247241 try :
248- df = pd . read_csv (f_path , ** kwargs )
242+ df = read_csv (f_path , ** kwargs )
249243 except ValueError as ve :
250244 print (
251245 f"ValueError (while reading flat text file), possibly from uneven record lengths: { ve } \n "
@@ -256,7 +250,7 @@ def read_single_datafile(
256250 kwargs_pad = kwargs
257251 kwargs_pad ["index_col" ] = None
258252 kwargs_pad ["header" ] = None
259- df = pd . read_csv (f_path , ** kwargs_pad ).fillna ("" )
253+ df = read_csv (f_path , ** kwargs_pad ).fillna ("" )
260254 # set headers per munger
261255 header_int_or_list = tabular_kwargs (p , dict ())["header" ]
262256 if isinstance (
@@ -284,7 +278,7 @@ def read_single_datafile(
284278 row_constant_kwargs = get_row_constant_kwargs (
285279 kwargs , p ["rows_with_constants" ]
286280 )
287- row_df = pd . read_csv (f_path , ** row_constant_kwargs )
281+ row_df = read_csv (f_path , ** row_constant_kwargs )
288282 row_constants ["Sheet1" ], new_err = build_row_constants_from_df (
289283 row_df , p ["rows_with_constants" ], file_name , "Sheet1"
290284 )
@@ -299,7 +293,7 @@ def read_single_datafile(
299293 else :
300294 # strip whitespace from column names # TODO handle same for multi-index columns
301295 for k in df_dict .keys ():
302- if not isinstance (df_dict [k ].columns , pd . MultiIndex ):
296+ if not isinstance (df_dict [k ].columns , MultiIndex ):
303297 df_dict [k ].columns = [
304298 (c .strip () if isinstance (c , str ) else c )
305299 for c in df_dict [k ].columns
@@ -349,7 +343,7 @@ def excel_to_dict(
349343 kwargs : Dict [str , Any ],
350344 sheet_list : Optional [List [str ]],
351345 rows_to_read : List [int ],
352- ) -> (Dict [str , pd . DataFrame ], Dict [str , Dict [str , Any ]], Optional [dict ]):
346+ ) -> (Dict [str , DataFrame ], Dict [str , Dict [str , Any ]], Optional [dict ]):
353347 """Returns dictionary of dataframes (one for each sheet), dictionary of dictionaries of constant values
354348 (one dictionary for each sheet) and error."""
355349 kwargs ["index_col" ] = None
@@ -375,15 +369,15 @@ def excel_to_dict(
375369 )
376370 for sheet in sheet_list :
377371 try :
378- df_dict [sheet ] = pd . read_excel (f_path , ** kwargs , sheet_name = sheet )
372+ df_dict [sheet ] = read_excel (f_path , ** kwargs , sheet_name = sheet )
379373 # ignore any empty sheet
380374 if df_dict [sheet ].empty :
381375 df_dict .pop (sheet )
382376 err = add_new_error (
383377 err , "file" , file_name , f"No data read from sheet { sheet } "
384378 )
385379 except Exception as exc :
386- df_dict [sheet ] = pd . DataFrame ()
380+ df_dict [sheet ] = DataFrame ()
387381 err = add_new_error (
388382 err ,
389383 "warn-file" ,
@@ -393,7 +387,7 @@ def excel_to_dict(
393387
394388 try :
395389 if rows_to_read :
396- row_constant_df = pd . read_excel (
390+ row_constant_df = read_excel (
397391 f_path , ** row_constant_kwargs , sheet_name = sheet
398392 )
399393 row_constants [sheet ], new_err = build_row_constants_from_df (
@@ -415,7 +409,7 @@ def excel_to_dict(
415409
416410
417411def build_row_constants_from_df (
418- df : pd . DataFrame , rows_to_read : List [int ], file_name : str , sheet : str
412+ df : DataFrame , rows_to_read : List [int ], file_name : str , sheet : str
419413) -> (Dict [int , Any ], Optional [dict ]):
420414 """Returns first entries in rows corresponding to row_list
421415 (as a dictionary with rows in row_list as keys)"""
@@ -460,7 +454,7 @@ def copy_directory_with_backup(
460454 if backup_suffix :
461455 # make backup of anything with existing name
462456 if isdir (copy_path ):
463- shutil . move (copy_path , f"{ copy_path } { backup_suffix } " )
457+ move (copy_path , f"{ copy_path } { backup_suffix } " )
464458 print (f"Moved { copy_path } to { copy_path } { backup_suffix } " )
465459 elif isfile (copy_path ):
466460 old_stem = Path (copy_path ).stem
@@ -501,7 +495,7 @@ def copy_with_err_handling(
501495 old = join (root , f )
502496 new = join (new_root , f )
503497 try :
504- shutil . copy (old , new )
498+ copy (old , new )
505499 print (f"Copied { old } to { new } " )
506500 except Exception as she :
507501 if report_error :
@@ -951,7 +945,7 @@ def get_filtered_input_options(
951945 "name" : elections ,
952946 "type" : [None for election in elections ],
953947 }
954- df = pd . DataFrame (data = dropdown_options )
948+ df = DataFrame (data = dropdown_options )
955949 df ["year" ] = df ["name" ].str [:4 ]
956950 df ["election_type" ] = df ["name" ].str [5 :]
957951 df .sort_values (
@@ -973,7 +967,7 @@ def get_filtered_input_options(
973967 "name" : contest_types ,
974968 "type" : [None for contest_type in contest_types ],
975969 }
976- df = pd . DataFrame (data = dropdown_options )
970+ df = DataFrame (data = dropdown_options )
977971 elif menu_type == "contest" :
978972 contest_type = list (set (constants .contest_types_model ) & set (filters ))[0 ]
979973
@@ -986,7 +980,7 @@ def get_filtered_input_options(
986980 connection .close ()
987981
988982 # define input option for all contests of the given type
989- contest_type_df = pd . DataFrame (
983+ contest_type_df = DataFrame (
990984 [
991985 {
992986 "parent" : reporting_unit ,
@@ -1000,7 +994,7 @@ def get_filtered_input_options(
1000994 contest_df = contest_df [contest_df ["type" ].isin (filters )].sort_values (
1001995 by = ["parent" , "type" , "name" ]
1002996 )
1003- df = pd . concat ([contest_type_df , contest_df ])
997+ df = concat ([contest_type_df , contest_df ])
1004998 elif menu_type == "category" :
1005999 election_id = db .list_to_id (session , "Election" , filters )
10061000 jurisdiction_id = db .list_to_id (session , "ReportingUnit" , filters )
@@ -1043,7 +1037,7 @@ def get_filtered_input_options(
10431037 + [None for count_type in count_types ]
10441038 + [None for c in population ],
10451039 }
1046- df = pd . DataFrame (data = dropdown_options )
1040+ df = DataFrame (data = dropdown_options )
10471041 # check if it's looking for a count of contests
10481042 elif menu_type == "count" and bool ([f for f in filters if f .startswith ("Contest" )]):
10491043 election_id = db .list_to_id (session , "Election" , filters )
@@ -1122,7 +1116,7 @@ def get_filtered_input_options(
11221116 return package_display_results (df )
11231117
11241118
1125- def package_display_results (data : pd . DataFrame ) -> List [Dict [str , Any ]]:
1119+ def package_display_results (data : DataFrame ) -> List [Dict [str , Any ]]:
11261120 """takes a result set and packages into JSON to return.
11271121 Result set should already be ordered as desired for display
11281122 with display order controlled by "order_by" key"""
@@ -1208,10 +1202,10 @@ def clean_candidate_names(df):
12081202
12091203
12101204def disambiguate_empty_cols (
1211- df_in : pd . DataFrame ,
1205+ df_in : DataFrame ,
12121206 drop_empties : bool ,
12131207 start : int = 0 ,
1214- ) -> pd . DataFrame :
1208+ ) -> DataFrame :
12151209 """Returns new df with empties dropped, or kept with non-blank placeholder info"""
12161210 original_number_of_columns = df_in .shape [1 ]
12171211 # set row index to default
@@ -1234,14 +1228,14 @@ def disambiguate_empty_cols(
12341228
12351229
12361230def set_and_fill_headers (
1237- df_in : pd . DataFrame ,
1231+ df_in : DataFrame ,
12381232 header_list : Optional [list ],
12391233 merged_cells : bool ,
12401234 drop_empties : bool = True ,
1241- ) -> pd . DataFrame :
1235+ ) -> DataFrame :
12421236 # standardize the index to 0, 1, 2, ...
12431237 df = df_in .reset_index (drop = True )
1244- # rename all blank header entries to match convention of pd. read_excel
1238+ # rename all blank header entries to match convention of read_excel
12451239 #
12461240 #
12471241 if header_list :
0 commit comments