@@ -162,8 +162,8 @@ def create_scatter(
162162 dfv .iloc [0 ]["Contest" ],
163163 jurisdiction_id ,
164164 )
165- h_preliminary = db .is_preliminary (session ,h_election_id ,jurisdiction_id )
166- v_preliminary = db .is_preliminary (session ,v_election_id ,jurisdiction_id )
165+ h_preliminary = db .is_preliminary (session , h_election_id , jurisdiction_id )
166+ v_preliminary = db .is_preliminary (session , v_election_id , jurisdiction_id )
167167 results ["preliminary" ] = h_preliminary or v_preliminary
168168
169169 # only keep the ones where there are an (x, y) to graph
@@ -181,12 +181,12 @@ def create_scatter(
181181
182182
183183def package_results (
184- data : pd .DataFrame ,
185- jurisdiction : str ,
186- x : str ,
187- y : str ,
188- restrict : Optional [int ] = None ,
189- ) -> Dict [str ,Any ]:
184+ data : pd .DataFrame ,
185+ jurisdiction : str ,
186+ x : str ,
187+ y : str ,
188+ restrict : Optional [int ] = None ,
189+ ) -> Dict [str , Any ]:
190190 """
191191 :param data: dataframe
192192 if "x" not equal "y", columns are "Name" (values are reporting units within the
@@ -317,15 +317,15 @@ def get_external_data(
317317 """
318318 # specify output columns
319319 cols = [
320- "Election_Id" ,
321- "Name" ,
322- "Selection" ,
323- "Contest_Id" ,
324- "Candidate_Id" ,
325- "Contest" ,
326- "CountItemType" ,
327- "Count" ,
328- ]
320+ "Election_Id" ,
321+ "Name" ,
322+ "Selection" ,
323+ "Contest_Id" ,
324+ "Candidate_Id" ,
325+ "Contest" ,
326+ "CountItemType" ,
327+ "Count" ,
328+ ]
329329
330330 # get the census data
331331 census_df = db .read_external (
@@ -464,8 +464,8 @@ def create_bar(
464464 :param contest_district_type: (optional string)
465465 :param contest_or_contest_group: (optional string) from user-facing menu, either the name of a contest or of a
466466 group of contests, e.g., "All congressional"
467- :param for_export: (optional)
468- :return: List of dictionaries, where each dictionary contains information to create a bar
467+ :param for_export: (optional) if True, returns data for all bar charts, not just the "most interesting" ones
468+ :return: Nothing (if no interesting anomalous bar charts found) ,or List of dictionaries, where each dictionary contains information to create a bar
469469 chart. The bar charts in the list are chosen via an algorithm favoring charts with a single outlier
470470 county whose impact on the margin is large. Bar charts are restricted to results for the
471471 <contest_or_contest_group> , if given,and also from the contests with districts of type
@@ -510,10 +510,16 @@ def create_bar(
510510 ranked ["margins_pct" ] = ranked ["Count" ] / ranked ["reporting_unit_total" ]
511511 ranked_margin = ranked
512512 votes_at_stake = calculate_votes_at_stake (ranked_margin )
513- if not for_export :
514- top_ranked = get_most_anomalous ( votes_at_stake , 3 )
515- else :
513+ # if for export
514+ if for_export :
515+ # return all data
516516 top_ranked = votes_at_stake
517+ else :
518+ # otherwise return the "most interesting"
519+ top_ranked = get_most_interesting (
520+ votes_at_stake , constants .number_of_charts
521+ )
522+
517523 except Exception :
518524 return None
519525 if top_ranked .empty :
@@ -570,7 +576,8 @@ def create_bar(
570576 results = package_results (pivot_df , jurisdiction , x , y )
571577 else :
572578 results = package_results (
573- pivot_df , jurisdiction , x , y , restrict = constants .max_rus_per_bar_chart )
579+ pivot_df , jurisdiction , x , y , restrict = constants .max_rus_per_bar_chart
580+ )
574581 results ["election" ] = db .name_from_id (session , "Election" , election_id )
575582 results ["contest" ] = db .name_from_id (
576583 session , "Contest" , int (temp_df .iloc [0 ]["Contest_Id" ])
@@ -592,7 +599,9 @@ def create_bar(
592599 acted = "widened"
593600 results ["votes_at_stake" ] = f"Outlier { acted } margin by ~ { votes_at_stake } "
594601 results ["margin" ] = human_readable_numbers (results ["margin_raw" ])
595- results ["preliminary" ] = db .is_preliminary (session ,election_id ,jurisdiction_id )
602+ results ["preliminary" ] = db .is_preliminary (
603+ session , election_id , jurisdiction_id
604+ )
596605
597606 # display ballot info
598607 if multiple_ballot_types :
@@ -610,8 +619,8 @@ def create_bar(
610619 results [
611620 "title"
612621 ] = f"""{ results ["count_item_type" ].replace ("-" , " " ).title ()} Ballots Reported"""
613- download_date = db .data_file_download (session ,election_id ,jurisdiction_id )
614- if db .is_preliminary (session ,election_id ,jurisdiction_id ) and download_date :
622+ download_date = db .data_file_download (session , election_id , jurisdiction_id )
623+ if db .is_preliminary (session , election_id , jurisdiction_id ) and download_date :
615624 results [
616625 "title"
617626 ] = f"""{ results ["title" ]} as of { download_date } (preliminary)"""
@@ -665,7 +674,7 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
665674 "contest_district_type",
666675 "Count",
667676 "Selection_Id",
668- """
677+ """
669678
670679 # Assign a ranking for each candidate by votes for each contest
671680
@@ -750,8 +759,8 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
750759 # loop through each unit ID and assign anomaly scores
751760 # also update the "real" bar_chart_id which takes into account pairing of candidates
752761 bar_chart_ids_tmp = df_with_units ["bar_chart_id_tmp" ].unique ()
753- bar_chart_id = 0 # increments on each pass through for loop
754- df = pd .DataFrame () # collects records on each pass through for loop
762+ bar_chart_id = 0 # increments on each pass through for loop
763+ df = pd .DataFrame () # collects records on each pass through for loop
755764 # for each unit ID
756765 for bar_chart_id_tmp in bar_chart_ids_tmp :
757766 # grab all the data there
@@ -804,9 +813,13 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
804813 return df
805814
806815
807- def get_most_anomalous (data : pd .DataFrame , n : int ) -> pd .DataFrame :
816+ def get_most_interesting (data : pd .DataFrame , n : int ) -> pd .DataFrame :
808817 """
818+ Returns data for <n> bar charts, with <n>-1 from largest votes at stake ratio
819+ and 1 with largest score. If <n>-1 from votes at stake cannot be found
820+ (because of outlier_zscore_cutoff) then we fill in the top n from scores
809821 :param data: dataframe with required columns:
822+ "bar_chart_id": integer id in code identifying the set of points within which the anomaly score was calculated
810823 "margin_ratio": number of votes at stake divided by overall contest margin between the two candidates
811824 "score": anomaly z-score for the given ReportingUnit_Id within the given bar chart
812825 (specified by bar_chart_id)
@@ -815,42 +828,24 @@ def get_most_anomalous(data: pd.DataFrame, n: int) -> pd.DataFrame:
815828 "ReportingUnitType":
816829 "CountItemType":
817830 "Count":
818-
819-
820- "Name": (name of reporting unit)
821- "Candidate_Id",
822- "Contest":
823- "Selection":
824- "contest_type": "BallotMeasure" or "Candidate"
825- "contest_district_type": ReportingUnitType for contest district
826- "Selection_Id":
827- "selection_total": total votes for given selection in given contest in entire jurisdiction
828831 "rank": candidate rank within contest
829- "contest_total": number of votes for all candidates in the given contest, over entire district
830- "index":
831- "bar_chart_id_tmp": artifact from calculation
832- "bar_chart_id": internal integer id identifying the set of points within which the anomaly score was calculated
833- "reporting_unit_total": number of votes for all candidates in the given contest and reporting unit
834- "margins_pct":
835- "votes_at_stake": number of votes that would change if anomaly were brought in line with nearest point (see http://digitaleditions.walsworthprintgroup.com/publication/?m=7656&i=694516&p=10&ver=html5)
832+ "Name": (name of reporting unit)
833+ "Selection":
836834
837835 :param n: integer, number of anomalous datasets to return
838836
839- :return: dataframe
837+ :return: dataframe with data for only the <n> "most interesting" bar charts
840838 """
841839
842- """Gets n contests, with <n>-1 from largest votes at stake ratio
843- and 1 with largest score. If <n>-1 from votes at stake cannot be found
844- (bc of threshold for score) then we fill in the top n from scores"""
845840 # filter out very small votes at stake (relative to total contest margin)
846841 data = data [(data ["margin_ratio" ] > 0.01 ) | (data ["margin_ratio" ] < - 0.01 )]
847842
848843 # identify bar charts with significant outliers (z-score above constants.outlier_zscore_cutoff)
849844 # get ordering of sufficiently anomalous bar charts (descending by votes-at-stake-to-margin ratio)
850845 # and ordering by descending z-score
851846 margin_data = data [data ["score" ] > constants .outlier_zscore_cutoff ]
852- bar_charts_by_margin = bar_chart_ids_by_column_value (margin_data ,"margin_ratio" )
853- bar_charts_by_score = bar_chart_ids_by_column_value (data ,"score" )
847+ bar_charts_by_margin = bar_chart_ids_by_column_value (margin_data , "margin_ratio" )
848+ bar_charts_by_score = bar_chart_ids_by_column_value (data , "score" )
854849
855850 # pick top n bar charts: up to n-1 from margin data if there are enough, and the rest
856851 # from z-score
@@ -887,14 +882,19 @@ def get_most_anomalous(data: pd.DataFrame, n: int) -> pd.DataFrame:
887882 inplace = True ,
888883 )
889884
890- # now we get the top reporting unit IDs, in terms of anomaly score, of the winner and most anomalous (number of reportingunit IDs is constants.max_rus_per_bar_chart
885+ # now we get the top reporting unit IDs, in terms of anomaly score, of the winner and most anomalous
891886 ids = data ["bar_chart_id" ].unique ()
892887 df = pd .DataFrame ()
893888 for idx in ids :
894889 temp_df = data [data ["bar_chart_id" ] == idx ]
895890 max_score = temp_df ["score" ].max ()
896891 if max_score > 0 :
892+ # calculate contest rank of candidate with highest anomaly score
897893 rank = temp_df [temp_df ["score" ] == max_score ].iloc [0 ]["rank" ]
894+ # TODO what if two candidates have matching max anomaly scores?
895+
896+ # throw away information from any lower-ranked candidates in the contest
897+ # TODO why?
898898 temp_df = temp_df [temp_df ["rank" ].isin ([1 , rank ])]
899899 df = pd .concat ([df , temp_df ])
900900 return df
@@ -914,9 +914,9 @@ def euclidean_zscore(li: List[List[float]]) -> List[float]:
914914def calculate_votes_at_stake (data : pd .DataFrame ) -> pd .DataFrame :
915915 """
916916 :param data: dataframe with required columns
917- "ReportingUnit_Id":
918- "Count"
919- "selection_total"
917+ "ReportingUnit_Id":
918+ "Count"
919+ "selection_total"
920920 "bar_chart_id" (records with same bar_chart_id belong to a single bar chart plot, i.e., one pair of
921921 candidates and one vote type)
922922 "score"
@@ -955,7 +955,7 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
955955 (one_chart_df ["ReportingUnit_Id" ] == reporting_unit_id )
956956 & (
957957 (one_chart_df ["score" ] == max_score )
958- | (one_chart_df ["rank" ] == 1 ) # note OR here
958+ | (one_chart_df ["rank" ] == 1 ) # note OR here
959959 & (one_chart_df ["reporting_unit_total" ] == reporting_unit_total )
960960 )
961961 ]
@@ -976,14 +976,19 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
976976 ].index [0 ]
977977 next_reporting_unit_id = one_chart_df .loc [next_index , "ReportingUnit_Id" ]
978978 next_margin_pct = one_chart_df .loc [next_index , "margins_pct" ]
979- next_reporting_unit_total = one_chart_df .loc [next_index , "reporting_unit_total" ]
979+ next_reporting_unit_total = one_chart_df .loc [
980+ next_index , "reporting_unit_total"
981+ ]
980982 next_anomalous_df = (
981983 one_chart_df [
982984 (one_chart_df ["ReportingUnit_Id" ] == next_reporting_unit_id )
983985 & (
984986 (one_chart_df ["margins_pct" ] == next_margin_pct )
985987 | (one_chart_df ["rank" ] == 1 )
986- & (one_chart_df ["reporting_unit_total" ] == next_reporting_unit_total )
988+ & (
989+ one_chart_df ["reporting_unit_total" ]
990+ == next_reporting_unit_total
991+ )
987992 )
988993 ]
989994 .sort_values ("rank" , ascending = False )
@@ -996,9 +1001,7 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
9961001 # store that change in a new column called "votes_at_stake"
9971002 # and store the ratio of votes at stake to the margin in new "margin_ratio" column
9981003 winner_bucket_total = int (outlier_df [outlier_df ["rank" ] == 1 ]["Count" ])
999- not_winner_bucket_total = int (
1000- outlier_df [outlier_df ["rank" ] != 1 ]["Count" ]
1001- )
1004+ not_winner_bucket_total = int (outlier_df [outlier_df ["rank" ] != 1 ]["Count" ])
10021005 reported_bucket_total = int (outlier_df ["Count" ].sum ())
10031006 next_bucket_total = int (next_anomalous_df ["Count" ].sum ())
10041007 adj_margin = (
@@ -1020,7 +1023,9 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
10201023 - outlier_df [outlier_df ["rank" ] != 1 ].iloc [0 ]["selection_total" ]
10211024 )
10221025 one_chart_df ["votes_at_stake" ] = contest_margin - adj_contest_margin
1023- one_chart_df ["margin_ratio" ] = one_chart_df ["votes_at_stake" ] / contest_margin_ttl
1026+ one_chart_df ["margin_ratio" ] = (
1027+ one_chart_df ["votes_at_stake" ] / contest_margin_ttl
1028+ )
10241029 except Exception :
10251030 one_chart_df ["margin_ratio" ] = 0
10261031 one_chart_df ["votes_at_stake" ] = 0
@@ -1083,7 +1088,7 @@ def create_ballot_measure_contests(
10831088 return ballotmeasure_df
10841089
10851090
1086- def bar_chart_ids_by_column_value (data : pd .DataFrame ,column : str ) -> List [int ]:
1091+ def bar_chart_ids_by_column_value (data : pd .DataFrame , column : str ) -> List [int ]:
10871092 """
10881093 Given a dataframe of results, return a list of unique bar_chart_ids
10891094 that are sorted in desc order by the column's value
@@ -1097,7 +1102,9 @@ def bar_chart_ids_by_column_value(data: pd.DataFrame,column: str) -> List[int]:
10971102 """
10981103
10991104 data = data [["bar_chart_id" , column ]]
1100- data = data .groupby ("bar_chart_id" ).max (column ).sort_values (by = column , ascending = False )
1105+ data = (
1106+ data .groupby ("bar_chart_id" ).max (column ).sort_values (by = column , ascending = False )
1107+ )
11011108 data = data .reset_index ()
11021109 return list (data ["bar_chart_id" ].unique ())
11031110
@@ -1172,7 +1179,7 @@ def create_party_abbreviation(party):
11721179 return (party .strip ())[0 ].upper ()
11731180
11741181
1175- def dedupe_scatter_title (category : str , election : str , contest :str ):
1182+ def dedupe_scatter_title (category : str , election : str , contest : str ):
11761183 """
11771184 :param category:
11781185 :param election:
0 commit comments