@@ -238,20 +238,25 @@ def get_data_for_scatter(
238238 subdivision_type : str ,
239239 election_id : int ,
240240 count_item_type : str ,
241- filter_str ,
242- count_type ,
243- is_runoff ,
244- ):
241+ filter_str : str ,
242+ count_type : str ,
243+ is_runoff : bool ,
244+ ) -> pd . DataFrame :
245245 """
246246 :param session: sqlalchemy database session
247247 :param jurisdiction_id: integer Id of jurisdiction in database ReportingUnit table
248- :param subdivision_type: string ReportingUnitType characterizing subdivisions for points in scatter
248+ :param subdivision_type: string ReportingUnitType
249249 :param election_id: integer Id of election in database Election table
250250 :param count_item_type: CountItemType characterizing vote counts
251- :param filter_str:
252- :param count_type:
253- :param is_runoff:
254- :return:
251+ :param filter_str: string used to filter data
252+ for vote counts: "All contests" or "All candidates" or a value for Contest.Name or Candidate.BallotName
253+ for other datasets: value for ExternalDataSet.Label
254+ :param count_type: string used to filter data:
255+ for vote counts: "candidates" or "parties" or "contests"
256+ for other datasets: value for ExternalDataSet.Category
257+ :param is_runoff: True for a run-off contest associated with the election, otherwise False
258+ :return: dataframe of results rolled up to ReportingUnits of <subdivision_type> ReportingUnitType,
259+ restricted by <count_item_type>, <filter_str> and <count_type>
255260 """
256261 if count_type .startswith ("Population" ):
257262 return get_external_data (
@@ -282,7 +287,38 @@ def get_external_data(
282287 category ,
283288 label ,
284289 subdivision_type ,
285- ):
290+ ) -> pd .DataFrame :
291+ """
292+ :param session: sqlalchemy database session
293+ :param jurisdiction_id: integer Id of jurisdiction in database ReportingUnit table
294+ :param subdivision_type: string ReportingUnitType characterizing subdivisions for points in scatter
295+ :param category: value for ExternalDataSet.Category
296+ :param label: value for ExternalDataSet.Label
297+ :param subdivision_type: value for ReportingUnit.ReportingUnitType
298+ :return: dataframe of results rolled up to ReportingUnits of <subdivision_type> ReportingUnitType
299+ restricted by <category> and <label>. Format is designed to match votecount format
300+ Columns are:
301+ "Election_Id",
302+ "Name", (name of ReportingUnit)
303+ "Selection", (set to <label>)
304+ "Contest_Id", dummy set to 0
305+ "Candidate_Id", dummy set to 0
306+ "Contest", (set to <category>)
307+ "CountItemType", (set to "total")
308+ "Count",
309+ """
310+ # specify output columns
311+ cols = [
312+ "Election_Id" ,
313+ "Name" ,
314+ "Selection" ,
315+ "Contest_Id" ,
316+ "Candidate_Id" ,
317+ "Contest" ,
318+ "CountItemType" ,
319+ "Count" ,
320+ ]
321+
286322 # get the census data
287323 census_df = db .read_external (
288324 session ,
@@ -305,20 +341,9 @@ def get_external_data(
305341 columns = {"Label" : "Selection" , "Value" : "Count" },
306342 inplace = True ,
307343 )
308- census_df = census_df [
309- [
310- "Election_Id" ,
311- "Name" ,
312- "Selection" ,
313- "Contest_Id" ,
314- "Candidate_Id" ,
315- "Contest" ,
316- "CountItemType" ,
317- "Count" ,
318- ]
319- ]
344+ census_df = census_df [cols ]
320345 return census_df
321- return pd .DataFrame ()
346+ return pd .DataFrame (columns = cols )
322347
323348
324349def get_votecount_data (
@@ -340,7 +365,8 @@ def get_votecount_data(
340365 :param filter_str: string, "All contests" or "All candidates" or a contest name or a candidate name
341366 :param count_type: "candidates" or "parties" or "contests"
342367 :param is_runoff: True if contest is a run-off; otherwise False
343- :return: dataframe of vote counts by subdivision, with columns specifying ReportingUnit, Contest,
368+ :return: dataframe of vote counts rolled up to ReportingUnits of type <subdivision_type>
369+ (or larger, if that's all that's available), Contest,
344370 Selection, VoteCountType along with various database Ids
345371 """
346372 unsummed = db .unsummed_vote_counts_with_rollup_subdivision_id (
@@ -586,34 +612,84 @@ def create_bar(
586612
587613
588614def assign_anomaly_score (data : pd .DataFrame ) -> pd .DataFrame :
589- """adds a new column called score between 0 and 1; 1 is more anomalous.
590- Also adds a `unit_id` column which assigns a score to each unit of analysis
591- that is considered. For example, we may decide to look at anomalies across each
592- distinct combination of contest, reporting unit type, and vote type. Each
593- combination of those would get assigned an ID. This means rows may get added
594- to the dataframe if needed."""
615+ """
616+
617+ :param data: dataframe with required columns:
618+ "ReportingUnitType",
619+ "ParentReportingUnit_Id",
620+ "ParentName",
621+ "ParentReportingUnitType",
622+ "Candidate_Id",
623+ "CountItemType",
624+ "Contest_Id",
625+ "Contest",
626+ "Selection",
627+ "Selection_Id",
628+ "contest_type",
629+ "contest_district_type",
630+ "Count",
631+
632+ and possibly other columns, such as:
633+ "Party",
634+
635+
636+ :return: dataframe obtained by appending columns to <data>:
637+ "score": value between 0 and 1; 1 is more anomalous
638+ "unit_id": identifies the set of vote counts within which the anomaly score of the single vote count was
639+ calculated. A single vote count's anomaly score depends on the set of vote counts within which it is
640+ considered.
641+ "reporting_unit_total": total votes for all selections in given contest for given reporting unit
642+ "ind_total": total votes for given selection in given contest in entire jurisdiction
643+ "rank": 1 for contest winner, 2 for second place, etc.
644+ "contest_total": total votes cast in the contest in entire jurisdiction
645+ "index": artifact from calculation
646+ "unit_id_tmp": artifact from calculation
647+
648+ and preserving columns
649+ "ReportingUnit_Id" (renamed from "ParentReportingUnit_Id")
650+ "Name", (renamed from "ParentName", the name of the ReportingUnit)
651+ "ReportingUnitType", (renamed from "ParentReportingUnitType")
652+ "Candidate_Id",
653+ "CountItemType",
654+ "Contest_Id",
655+ "Contest",
656+ "Selection",
657+ "contest_type",
658+ "contest_district_type",
659+ "Count",
660+ "Selection_Id",
661+ """
595662
596663 # Assign a ranking for each candidate by votes for each contest
664+
665+ # # create <total_data> dataframe with "total" CountItemType only
597666 if "total" not in data ["CountItemType" ].unique ():
598667 groupby_cols = list (data .columns )
599668 groupby_cols .remove ("Count" )
600669 total_data = data .groupby (groupby_cols ).sum ().reset_index ()
601670 else :
602671 total_data = data [data ["CountItemType" ] == "total" ]
603672
673+ # # create <ranked_df> of contest-candidate pairs,
674+ # column "rank": winners 1, second-place 2, etc. (tied candidates get same rank);
675+ # column "ind_total" with total votes for candidate;
676+ # column "contest_total" with total votes in contest
677+
678+ # # # Append total votes for selection
604679 ranked_df = (
605680 total_data .groupby (["Contest_Id" , "Selection" , "Selection_Id" ], as_index = False )[
606681 "Count"
607682 ]
608683 .sum ()
609684 .sort_values (["Contest_Id" , "Count" ], ascending = False )
610685 )
686+ # # # Append rank
611687 ranked_df ["rank" ] = ranked_df .groupby ("Contest_Id" )["Count" ].rank (
612688 "dense" , ascending = False
613689 )
614690 ranked_df .rename (columns = {"Count" : "ind_total" }, inplace = True )
615691
616- # Now get the total votes for the entire contest
692+ # # # Append total votes for the entire contest
617693 contest_df = ranked_df .groupby ("Contest_Id" )["ind_total" ].sum ().reset_index ()
618694 contest_df .rename (columns = {"ind_total" : "contest_total" }, inplace = True )
619695 ranked_df = ranked_df .merge (contest_df , how = "inner" , on = "Contest_Id" )
@@ -667,8 +743,8 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
667743 # loop through each unit ID and assign anomaly scores
668744 # also update the "real" unit_id which takes into account pairing of candidates
669745 unit_ids_tmp = df_with_units ["unit_id_tmp" ].unique ()
670- unit_id = 0
671- df = pd .DataFrame ()
746+ unit_id = 0 # increments on each pass through for loop
747+ df = pd .DataFrame () # collects records on each pass through for loop
672748 # for each unit ID
673749 for unit_id_tmp in unit_ids_tmp :
674750 # grab all the data there
@@ -722,7 +798,7 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
722798
723799
724800def get_most_anomalous (data : pd .DataFrame , n : int ) -> pd .DataFrame :
725- """Gets n contest , with 2 from largest votes at stake ratio
801+ """Gets n contests , with 2 from largest votes at stake ratio
726802 and 1 with largest score. If 2 from votes at stake cannot be found
727803 (bc of threshold for score) then we fill in the top n from scores"""
728804 # filter out very small votes at stake margins
@@ -1163,7 +1239,7 @@ def nist_office(session, election_id, jurisdiction_id):
11631239 return json .loads (result )
11641240
11651241
1166- def nist_candidate (session , election_id , jurisdiction_id ):
1242+ def nist_candidate (session : Session , election_id : int , jurisdiction_id : int ):
11671243 df = db .read_vote_count (
11681244 session ,
11691245 election_id = election_id ,
@@ -1176,25 +1252,37 @@ def nist_candidate(session, election_id, jurisdiction_id):
11761252
11771253
11781254def rollup_dataframe (
1179- session ,
1255+ session : Session ,
11801256 df : pd .DataFrame ,
11811257 count_col : str ,
11821258 ru_id_column : str ,
11831259 new_ru_id_column : str ,
11841260 rollup_rut : str = constants .default_subdivision_type ,
1185- ignore : Optional [List ] = None ,
1261+ ignore : Optional [List [ str ] ] = None ,
11861262) -> (pd .DataFrame (), Optional [dict ]):
1187- """Returns datafrome of results rolled up to the reporting unit type <rollup_rut>.
1188- For reporting units without parents of the given type (e.g., sometimes absentee votes
1189- are reported by state), preserve the record"""
1263+ """
1264+ :param session: sqlalchemy database session
1265+ :param df: dataframe of results
1266+ :param count_col: string, name of column with counts
1267+ :param ru_id_column: string, name of column with database Ids of ReportingUnits
1268+ :param new_ru_id_column: string, name of column in returned dataframe with database Ids
1269+ of newly rolled-up ReportingUnits
1270+ :param rollup_rut: string, ReportingUnitType to roll up to (e.g., "county")
1271+ :param ignore: (optional) list of names of columns to drop from <df>
1272+ :return:
1273+ dataframe of results rolled up to the given ReportingUnitType (NB: for reporting units without
1274+ parents of the given type — e.g., sometimes absentee votes are reported by state — the record is preserved)
1275+ dictionary of errors and warnings
1276+ """
11901277
1191- err = None # TODO error handling
1278+ err = None
11921279
11931280 # drop from dataframe any columns in <ignore>
11941281 if ignore :
11951282 working = df .copy ().drop (ignore , axis = 1 )
11961283 else :
11971284 working = df .copy ()
1285+
11981286 group_cols = [c for c in working .columns if (c not in (ru_id_column , count_col ))]
11991287 parents , err_str = db .parents (
12001288 session , df [ru_id_column ].unique (), subunit_type = rollup_rut
@@ -1206,6 +1294,7 @@ def rollup_dataframe(
12061294 f"{ Path (__file__ ).absolute ().parents [0 ].name } .{ inspect .currentframe ().f_code .co_name } " ,
12071295 f"Unable to read parents reporting unit info from column { ru_id_column } " ,
12081296 )
1297+ return pd .DataFrame (), err
12091298 try :
12101299 new_working = (
12111300 working .reset_index ()
0 commit comments