     DuplicateOptionError,
     ParsingError,
 )
+from csv import QUOTE_MINIMAL
+from inspect import currentframe
+import json
+from numpy import where
+from os import walk, listdir
+from os.path import join, isdir, isfile
+import pandas as pd
+from pathlib import Path
+import shutil
+from slugify import slugify
+from sqlalchemy.orm import Session
+from typing import Optional, Dict, Any, List
+import xlrd
+import xml.etree.ElementTree
 
+# local imports
 from electiondata import (
     database as db,
     munge as m,
     nist as nist,
     constants,
 )
 
-from slugify import slugify
-from pandas import DataFrame, ExcelFile, json_normalize, read_csv, MultiIndex, read_excel, concat
-from pandas.errors import ParserError
-from os import walk, listdir
-from os.path import join, isdir, isfile
-from pathlib import Path
-from typing import Optional, Dict, Any, List
-from csv import QUOTE_MINIMAL
-from numpy import where
-from inspect import currentframe
-from xml.etree.ElementTree import parse
-from json import loads
-from shutil import move, copy
-from xlrd import open_workbook
-from sqlalchemy.orm import Session
-
 
 def find_dupes(df):
     dupes_df = df[df.duplicated()].drop_duplicates(keep="first")
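
Note: the hunk above replaces name-by-name imports with module-level imports, so every
call site names its module. A minimal sketch of the convention (illustrative, not from
the diff):

    import pandas as pd

    # before: from pandas import DataFrame; df = DataFrame()
    df = pd.DataFrame()  # after: the pandas namespace is explicit at the call site
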
@@ -128,14 +127,14 @@ def list_desired_excel_sheets(f_path: str, p: dict) -> (Optional[list], Optional
     try:
         # read an xlsx file
         # # nb: the following fails on VT 2020 files
-        xl = ExcelFile(f_path)
+        xl = pd.ExcelFile(f_path)
         all_sheets = xl.sheet_names
         # xlsx = openpyxl.load_workbook(f_path)
         # all_sheets = xlsx.get_sheet_names()
     except Exception:
         try:
             # read xls file
-            xls = open_workbook(f_path, on_demand=True)
+            xls = xlrd.open_workbook(f_path, on_demand=True)
             all_sheets = xls.sheet_names()
         except Exception as exc:
             err = add_new_error(
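
Note: a minimal sketch of the fallback above; pd.ExcelFile handles .xlsx, and
xlrd.open_workbook is tried second for legacy .xls (f_path as in the function):

    import pandas as pd
    import xlrd

    try:
        all_sheets = pd.ExcelFile(f_path).sheet_names  # modern reader first
    except Exception:
        # legacy .xls fallback; on_demand avoids loading every sheet eagerly
        all_sheets = xlrd.open_workbook(f_path, on_demand=True).sheet_names()
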
@@ -164,7 +163,7 @@ def read_single_datafile(
     aux: bool = False,
     driving_path: Optional[str] = None,
     lookup_id: Optional[str] = None,
-) -> (Dict[str, DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
+) -> (Dict[str, pd.DataFrame], Dict[str, Dict[int, Any]], Optional[dict]):
     """Length of returned dictionary is the number of sheets read -- usually 1 except for multi-sheet Excel.
     Auxiliary files have different parameters (e.g., no count locations)"""
     err = None
@@ -202,7 +201,7 @@ def read_single_datafile(
         else:
             driver = nist.xml_count_parse_info(p, ignore_namespace=True)
             xml_path_info = nist.xml_string_path_info(p["munge_fields"], p["namespace"])
-            tree = parse(f_path)
+            tree = xml.etree.ElementTree.parse(f_path)
             df, err = nist.df_from_tree(
                 tree,
                 xml_path_info=xml_path_info,
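
Note: xml.etree.ElementTree.parse reads the whole file into a tree before
nist.df_from_tree walks it. A hedged sketch of the parse step, with a hypothetical
results file and element name:

    import xml.etree.ElementTree

    tree = xml.etree.ElementTree.parse("results.xml")  # hypothetical path
    for elt in tree.getroot().iter("VoteCount"):  # element name is an assumption
        print(elt.attrib)
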
@@ -216,8 +215,8 @@ def read_single_datafile(
         elif p["file_type"] in ["json-nested"]:
             # TODO what if json-nested is a lookup?
             with open(f_path, "r") as f:
-                data = loads(f.read())
-                df = json_normalize(data, **kwargs)
+                data = json.loads(f.read())
+                df = pd.json_normalize(data, **kwargs)
             if not fatal_error(err):
                 df.rename(columns=rename, inplace=True)
             df_dict = {"Sheet1": df}
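
Note: pd.json_normalize flattens nested JSON objects into dot-separated columns.
A minimal sketch with toy data (not from the diff):

    import json
    import pandas as pd

    data = json.loads('[{"county": "Fulton", "votes": {"early": 10, "day_of": 25}}]')
    df = pd.json_normalize(data)  # columns: county, votes.early, votes.day_of
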
@@ -239,7 +238,7 @@ def read_single_datafile(
             df_dict = dict()
         elif p["file_type"] == "flat_text":
             try:
-                df = read_csv(f_path, **kwargs)
+                df = pd.read_csv(f_path, **kwargs)
             except ValueError as ve:
                 print(
                     f"ValueError (while reading flat text file), possibly from uneven record lengths: {ve}\n"
@@ -250,7 +249,7 @@ def read_single_datafile(
                 kwargs_pad = kwargs
                 kwargs_pad["index_col"] = None
                 kwargs_pad["header"] = None
-                df = read_csv(f_path, **kwargs_pad).fillna("")
+                df = pd.read_csv(f_path, **kwargs_pad).fillna("")
                 # set headers per munger
                 header_int_or_list = tabular_kwargs(p, dict())["header"]
                 if isinstance(
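
Note: the fallback above re-reads with header=None and index_col=None so pandas pads
short records rather than rejecting them, then fillna("") blanks the padding.
A sketch of that retry as a hypothetical helper:

    import pandas as pd

    def read_padded(f_path, **kwargs):
        kwargs.update(index_col=None, header=None)  # drop header/index assumptions
        return pd.read_csv(f_path, **kwargs).fillna("")
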
@@ -278,7 +277,7 @@ def read_single_datafile(
                 row_constant_kwargs = get_row_constant_kwargs(
                     kwargs, p["rows_with_constants"]
                 )
-                row_df = read_csv(f_path, **row_constant_kwargs)
+                row_df = pd.read_csv(f_path, **row_constant_kwargs)
                 row_constants["Sheet1"], new_err = build_row_constants_from_df(
                     row_df, p["rows_with_constants"], file_name, "Sheet1"
                 )
@@ -293,7 +292,7 @@ def read_single_datafile(
         else:
             # strip whitespace from column names  # TODO handle same for multi-index columns
             for k in df_dict.keys():
-                if not isinstance(df_dict[k].columns, MultiIndex):
+                if not isinstance(df_dict[k].columns, pd.MultiIndex):
                     df_dict[k].columns = [
                         (c.strip() if isinstance(c, str) else c)
                         for c in df_dict[k].columns
@@ -305,7 +304,7 @@ def read_single_datafile(
     except UnicodeDecodeError as ude:
         err_str = f"Encoding error. Datafile not read completely.\n\t{ude}"
         err = add_new_error(err, "file", file_name, err_str)
-    except ParserError as pe:
+    except pd.errors.ParserError as pe:
         # DFs have trouble comparing against None. So we return an empty DF and
         # check for emptiness below as an indication of an error.
         err_str = f"Error parsing results file.\n{pe}"
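
Note: ParserError is now caught through the pd.errors namespace. A minimal sketch of
the pattern (hypothetical path):

    import pandas as pd

    try:
        df = pd.read_csv("results.txt", sep="\t")
    except pd.errors.ParserError:
        df = pd.DataFrame()  # empty frame signals failure to the caller
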
@@ -343,7 +342,7 @@ def excel_to_dict(
     kwargs: Dict[str, Any],
     sheet_list: Optional[List[str]],
     rows_to_read: List[int],
-) -> (Dict[str, DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
+) -> (Dict[str, pd.DataFrame], Dict[str, Dict[str, Any]], Optional[dict]):
     """Returns dictionary of dataframes (one for each sheet), dictionary of dictionaries of constant values
     (one dictionary for each sheet) and error."""
     kwargs["index_col"] = None
@@ -369,15 +368,15 @@ def excel_to_dict(
     )
     for sheet in sheet_list:
         try:
-            df_dict[sheet] = read_excel(f_path, **kwargs, sheet_name=sheet)
+            df_dict[sheet] = pd.read_excel(f_path, **kwargs, sheet_name=sheet)
             # ignore any empty sheet
             if df_dict[sheet].empty:
                 df_dict.pop(sheet)
                 err = add_new_error(
                     err, "file", file_name, f"No data read from sheet {sheet}"
                 )
         except Exception as exc:
-            df_dict[sheet] = DataFrame()
+            df_dict[sheet] = pd.DataFrame()
             err = add_new_error(
                 err,
                 "warn-file",
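
Note: pd.read_excel(f_path, sheet_name=None) would return every sheet at once as a
dict of DataFrames; the explicit loop above trades that one-liner for per-sheet error
handling and empty-sheet warnings. The one-call form, for comparison:

    import pandas as pd

    all_sheets = pd.read_excel(f_path, sheet_name=None)  # {sheet name: DataFrame}
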
@@ -387,7 +386,7 @@ def excel_to_dict(
 
         try:
             if rows_to_read:
-                row_constant_df = read_excel(
+                row_constant_df = pd.read_excel(
                     f_path, **row_constant_kwargs, sheet_name=sheet
                 )
                 row_constants[sheet], new_err = build_row_constants_from_df(
@@ -409,7 +408,7 @@ def excel_to_dict(
 
 
 def build_row_constants_from_df(
-    df: DataFrame, rows_to_read: List[int], file_name: str, sheet: str
+    df: pd.DataFrame, rows_to_read: List[int], file_name: str, sheet: str
 ) -> (Dict[int, Any], Optional[dict]):
     """Returns first entries in rows corresponding to row_list
     (as a dictionary with rows in row_list as keys)"""
@@ -454,7 +453,7 @@ def copy_directory_with_backup(
     if backup_suffix:
         # make backup of anything with existing name
         if isdir(copy_path):
-            move(copy_path, f"{copy_path}{backup_suffix}")
+            shutil.move(copy_path, f"{copy_path}{backup_suffix}")
             print(f"Moved {copy_path} to {copy_path}{backup_suffix}")
         elif isfile(copy_path):
             old_stem = Path(copy_path).stem
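
Note: the backup pattern above moves any existing directory aside before copying.
A self-contained sketch with hypothetical values:

    import shutil
    from os.path import isdir

    copy_path, backup_suffix = "/tmp/results", ".bak"  # hypothetical values
    if isdir(copy_path):
        shutil.move(copy_path, f"{copy_path}{backup_suffix}")  # keep the old copy
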
@@ -495,7 +494,7 @@ def copy_with_err_handling(
             old = join(root, f)
             new = join(new_root, f)
             try:
-                copy(old, new)
+                shutil.copy(old, new)
                 print(f"Copied {old} to {new}")
             except Exception as she:
                 if report_error:
@@ -696,9 +695,12 @@ def report(
 
         # write info to a .errors or .errors file named for the name_key <nk>
         out_path = join(
-            output_location, slugify(
-                f"{file_prefix}_{et}_{nk_name}.errors",regex_pattern=r'[^ A-z0-9-_]+', lowercase=False
-            )
+            output_location,
+            slugify(
+                f"{file_prefix}_{et}_{nk_name}.errors",
+                regex_pattern=r"[^ A-z0-9-_]+",
+                lowercase=False,
+            ),
         )
         with open(out_path, "a") as f:
             f.write(out_str)
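
Note: slugify treats characters matching regex_pattern as disallowed, and
lowercase=False preserves the case of the name key. Beware that the class A-z also
admits the ASCII characters between Z and a (backslash, brackets, caret, underscore,
backtick); A-Za-z would be stricter. A hedged sketch (output not verified):

    from slugify import slugify

    name = slugify(
        "GA_general .errors",  # hypothetical name_key-derived string
        regex_pattern=r"[^ A-z0-9-_]+",  # runs of disallowed chars get replaced
        lowercase=False,
    )
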
@@ -720,9 +722,12 @@ def report(
     # write info to a .warnings file named for the error-type and name_key
 
     out_path = join(
-        output_location, slugify(
-            f"{file_prefix}_{et}_{nk_name}.warnings",regex_pattern=r'[^ A-z0-9-_]+', lowercase=False
-        )
+        output_location,
+        slugify(
+            f"{file_prefix}_{et}_{nk_name}.warnings",
+            regex_pattern=r"[^ A-z0-9-_]+",
+            lowercase=False,
+        ),
     )
     with open(out_path, "a") as f:
         f.write(out_str)
@@ -945,7 +950,7 @@ def get_filtered_input_options(
             "name": elections,
             "type": [None for election in elections],
         }
-        df = DataFrame(data=dropdown_options)
+        df = pd.DataFrame(data=dropdown_options)
         df["year"] = df["name"].str[:4]
         df["election_type"] = df["name"].str[5:]
         df.sort_values(
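
Note: a minimal sketch of this dict-of-lists construction and the string slicing that
follows it, with toy data:

    import pandas as pd

    df = pd.DataFrame(data={"name": ["2020 General"], "type": [None]})
    df["year"] = df["name"].str[:4]  # "2020"
    df["election_type"] = df["name"].str[5:]  # "General"
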
@@ -967,7 +972,7 @@ def get_filtered_input_options(
             "name": contest_types,
             "type": [None for contest_type in contest_types],
         }
-        df = DataFrame(data=dropdown_options)
+        df = pd.DataFrame(data=dropdown_options)
     elif menu_type == "contest":
         contest_type = list(set(constants.contest_types_model) & set(filters))[0]
 
@@ -980,7 +985,7 @@ def get_filtered_input_options(
         connection.close()
 
         # define input option for all contests of the given type
-        contest_type_df = DataFrame(
+        contest_type_df = pd.DataFrame(
             [
                 {
                     "parent": reporting_unit,
@@ -994,7 +999,7 @@ def get_filtered_input_options(
994999 contest_df = contest_df [contest_df ["type" ].isin (filters )].sort_values (
9951000 by = ["parent" , "type" , "name" ]
9961001 )
997- df = concat ([contest_type_df , contest_df ])
1002+ df = pd . concat ([contest_type_df , contest_df ])
9981003 elif menu_type == "category" :
9991004 election_id = db .list_to_id (session , "Election" , filters )
10001005 jurisdiction_id = db .list_to_id (session , "ReportingUnit" , filters )
@@ -1037,7 +1042,7 @@ def get_filtered_input_options(
             + [None for count_type in count_types]
             + [None for c in population],
         }
-        df = DataFrame(data=dropdown_options)
+        df = pd.DataFrame(data=dropdown_options)
     # check if it's looking for a count of contests
     elif menu_type == "count" and bool([f for f in filters if f.startswith("Contest")]):
         election_id = db.list_to_id(session, "Election", filters)
@@ -1116,7 +1121,7 @@ def get_filtered_input_options(
     return package_display_results(df)
 
 
-def package_display_results(data: DataFrame) -> List[Dict[str, Any]]:
+def package_display_results(data: pd.DataFrame) -> List[Dict[str, Any]]:
     """takes a result set and packages into JSON to return.
     Result set should already be ordered as desired for display
     with display order controlled by "order_by" key"""
@@ -1202,17 +1207,17 @@ def clean_candidate_names(df):
 
 
 def disambiguate_empty_cols(
-    df_in: DataFrame,
+    df_in: pd.DataFrame,
     drop_empties: bool,
     start: int = 0,
-) -> DataFrame:
+) -> pd.DataFrame:
     """Returns new df with empties dropped, or kept with non-blank placeholder info"""
     original_number_of_columns = df_in.shape[1]
     # set row index to default
     df = df_in.reset_index(drop=True)
 
     # put dummy info into the tops of the bad columns
-    # in order to meet MultiIndex uniqueness criteria
+    # in order to meet pd.MultiIndex uniqueness criteria
     mask = df.eq("").loc[start:].all()
     bad_column_numbers = [j for j in range(original_number_of_columns) if mask[j]]
     for j in bad_column_numbers:
@@ -1228,14 +1233,14 @@ def disambiguate_empty_cols(
 
 
 def set_and_fill_headers(
-    df_in: DataFrame,
+    df_in: pd.DataFrame,
     header_list: Optional[list],
     merged_cells: bool,
     drop_empties: bool = True,
-) -> DataFrame:
+) -> pd.DataFrame:
     # standardize the index to 0, 1, 2, ...
     df = df_in.reset_index(drop=True)
-    # rename all blank header entries to match convention of read_excel
+    # rename all blank header entries to match convention of pd.read_excel
     #
     #
     if header_list:
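
Note: the "convention of pd.read_excel" mentioned in the comment is the "Unnamed: <n>"
label pandas gives blank header cells. A sketch of matching it by hand:

    header = ["county", "", "votes"]
    filled = [h if h else f"Unnamed: {j}" for j, h in enumerate(header)]
    # filled == ["county", "Unnamed: 1", "votes"]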