Skip to content

Commit 223bd35

Browse files
committed
document
1 parent 11bef54 commit 223bd35

4 files changed

Lines changed: 84 additions & 76 deletions

File tree

src/electiondata/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3804,7 +3804,7 @@ def external_data_exists(
38043804

38053805
connection = an.session.bind.raw_connection()
38063806
cursor = connection.cursor()
3807-
df = db.read_external_cursor(cursor,election_id,jurisdiction_id,["Label"])
3807+
df = db.read_external_cursor(cursor, election_id, jurisdiction_id, ["Label"])
38083808
cursor.close()
38093809

38103810
# if no data found

src/electiondata/analyze/__init__.py

Lines changed: 74 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,8 @@ def create_scatter(
162162
dfv.iloc[0]["Contest"],
163163
jurisdiction_id,
164164
)
165-
h_preliminary = db.is_preliminary(session,h_election_id,jurisdiction_id)
166-
v_preliminary = db.is_preliminary(session,v_election_id,jurisdiction_id)
165+
h_preliminary = db.is_preliminary(session, h_election_id, jurisdiction_id)
166+
v_preliminary = db.is_preliminary(session, v_election_id, jurisdiction_id)
167167
results["preliminary"] = h_preliminary or v_preliminary
168168

169169
# only keep the ones where there are an (x, y) to graph
@@ -181,12 +181,12 @@ def create_scatter(
181181

182182

183183
def package_results(
184-
data: pd.DataFrame,
185-
jurisdiction: str,
186-
x: str,
187-
y: str,
188-
restrict: Optional[int] = None,
189-
) -> Dict[str,Any]:
184+
data: pd.DataFrame,
185+
jurisdiction: str,
186+
x: str,
187+
y: str,
188+
restrict: Optional[int] = None,
189+
) -> Dict[str, Any]:
190190
"""
191191
:param data: dataframe
192192
if "x" not equal "y", columns are "Name" (values are reporting units within the
@@ -317,15 +317,15 @@ def get_external_data(
317317
"""
318318
# specify output columns
319319
cols = [
320-
"Election_Id",
321-
"Name",
322-
"Selection",
323-
"Contest_Id",
324-
"Candidate_Id",
325-
"Contest",
326-
"CountItemType",
327-
"Count",
328-
]
320+
"Election_Id",
321+
"Name",
322+
"Selection",
323+
"Contest_Id",
324+
"Candidate_Id",
325+
"Contest",
326+
"CountItemType",
327+
"Count",
328+
]
329329

330330
# get the census data
331331
census_df = db.read_external(
@@ -464,8 +464,8 @@ def create_bar(
464464
:param contest_district_type: (optional string)
465465
:param contest_or_contest_group: (optional string) from user-facing menu, either the name of a contest or of a
466466
group of contests, e.g., "All congressional"
467-
:param for_export: (optional)
468-
:return: List of dictionaries, where each dictionary contains information to create a bar
467+
:param for_export: (optional) if True, returns data for all bar charts, not just the "most interesting" ones
468+
:return: Nothing (if no interesting anomalous bar charts found), or List of dictionaries, where each dictionary contains information to create a bar
469469
chart. The bar charts in the list are chosen via an algorithm favoring charts with a single outlier
470470
county whose impact on the margin is large. Bar charts are restricted to results for the
471471
<contest_or_contest_group>, if given, and also from the contests with districts of type
@@ -510,10 +510,16 @@ def create_bar(
510510
ranked["margins_pct"] = ranked["Count"] / ranked["reporting_unit_total"]
511511
ranked_margin = ranked
512512
votes_at_stake = calculate_votes_at_stake(ranked_margin)
513-
if not for_export:
514-
top_ranked = get_most_anomalous(votes_at_stake, 3)
515-
else:
513+
# if for export
514+
if for_export:
515+
# return all data
516516
top_ranked = votes_at_stake
517+
else:
518+
# otherwise return the "most interesting"
519+
top_ranked = get_most_interesting(
520+
votes_at_stake, constants.number_of_charts
521+
)
522+
517523
except Exception:
518524
return None
519525
if top_ranked.empty:
@@ -570,7 +576,8 @@ def create_bar(
570576
results = package_results(pivot_df, jurisdiction, x, y)
571577
else:
572578
results = package_results(
573-
pivot_df, jurisdiction, x, y, restrict=constants.max_rus_per_bar_chart)
579+
pivot_df, jurisdiction, x, y, restrict=constants.max_rus_per_bar_chart
580+
)
574581
results["election"] = db.name_from_id(session, "Election", election_id)
575582
results["contest"] = db.name_from_id(
576583
session, "Contest", int(temp_df.iloc[0]["Contest_Id"])
@@ -592,7 +599,9 @@ def create_bar(
592599
acted = "widened"
593600
results["votes_at_stake"] = f"Outlier {acted} margin by ~ {votes_at_stake}"
594601
results["margin"] = human_readable_numbers(results["margin_raw"])
595-
results["preliminary"] = db.is_preliminary(session,election_id,jurisdiction_id)
602+
results["preliminary"] = db.is_preliminary(
603+
session, election_id, jurisdiction_id
604+
)
596605

597606
# display ballot info
598607
if multiple_ballot_types:
@@ -610,8 +619,8 @@ def create_bar(
610619
results[
611620
"title"
612621
] = f"""{results["count_item_type"].replace("-", " ").title()} Ballots Reported"""
613-
download_date = db.data_file_download(session,election_id,jurisdiction_id)
614-
if db.is_preliminary(session,election_id,jurisdiction_id) and download_date:
622+
download_date = db.data_file_download(session, election_id, jurisdiction_id)
623+
if db.is_preliminary(session, election_id, jurisdiction_id) and download_date:
615624
results[
616625
"title"
617626
] = f"""{results["title"]} as of {download_date} (preliminary)"""
@@ -665,7 +674,7 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
665674
"contest_district_type",
666675
"Count",
667676
"Selection_Id",
668-
"""
677+
"""
669678

670679
# Assign a ranking for each candidate by votes for each contest
671680

@@ -750,8 +759,8 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
750759
# loop through each unit ID and assign anomaly scores
751760
# also update the "real" bar_chart_id which takes into account pairing of candidates
752761
bar_chart_ids_tmp = df_with_units["bar_chart_id_tmp"].unique()
753-
bar_chart_id = 0 # increments on each pass through for loop
754-
df = pd.DataFrame() # collects records on each pass through for loop
762+
bar_chart_id = 0 # increments on each pass through for loop
763+
df = pd.DataFrame() # collects records on each pass through for loop
755764
# for each unit ID
756765
for bar_chart_id_tmp in bar_chart_ids_tmp:
757766
# grab all the data there
@@ -804,9 +813,13 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
804813
return df
805814

806815

807-
def get_most_anomalous(data: pd.DataFrame, n: int) -> pd.DataFrame:
816+
def get_most_interesting(data: pd.DataFrame, n: int) -> pd.DataFrame:
808817
"""
818+
Returns data for <n> bar charts, with <n>-1 from largest votes at stake ratio
819+
and 1 with largest score. If <n>-1 from votes at stake cannot be found
820+
(because of outlier_zscore_cutoff) then we fill in the top n from scores
809821
:param data: dataframe with required columns:
822+
"bar_chart_id": integer id in code identifying the set of points within which the anomaly score was calculated
810823
"margin_ratio": number of votes at stake divided by overall contest margin between the two candidates
811824
"score": anomaly z-score for the given ReportingUnit_Id within the given bar chart
812825
(specified by bar_chart_id)
@@ -815,42 +828,24 @@ def get_most_anomalous(data: pd.DataFrame, n: int) -> pd.DataFrame:
815828
"ReportingUnitType":
816829
"CountItemType":
817830
"Count":
818-
819-
820-
"Name": (name of reporting unit)
821-
"Candidate_Id",
822-
"Contest":
823-
"Selection":
824-
"contest_type": "BallotMeasure" or "Candidate"
825-
"contest_district_type": ReportingUnitType for contest district
826-
"Selection_Id":
827-
"selection_total": total votes for given selection in given contest in entire jurisdiction
828831
"rank": candidate rank within contest
829-
"contest_total": number of votes for all candidates in the given contest, over entire district
830-
"index":
831-
"bar_chart_id_tmp": artifact from calculation
832-
"bar_chart_id": internal integer id identifying the set of points within which the anomaly score was calculated
833-
"reporting_unit_total": number of votes for all candidates in the given contest and reporting unit
834-
"margins_pct":
835-
"votes_at_stake": number of votes that would change if anomaly were brought in line with nearest point (see http://digitaleditions.walsworthprintgroup.com/publication/?m=7656&i=694516&p=10&ver=html5)
832+
"Name": (name of reporting unit)
833+
"Selection":
836834
837835
:param n: integer, number of anomalous datasets to return
838836
839-
:return: dataframe
837+
:return: dataframe with data for only the <n> "most interesting" bar charts
840838
"""
841839

842-
"""Gets n contests, with <n>-1 from largest votes at stake ratio
843-
and 1 with largest score. If <n>-1 from votes at stake cannot be found
844-
(bc of threshold for score) then we fill in the top n from scores"""
845840
# filter out very small votes at stake (relative to total contest margin)
846841
data = data[(data["margin_ratio"] > 0.01) | (data["margin_ratio"] < -0.01)]
847842

848843
# identify bar charts with significant outliers (z-score above constants.outlier_zscore_cutoff)
849844
# get ordering of sufficiently anomalous bar charts (descending by votes-at-stake-to-margin ratio)
850845
# and ordering by descending z-score
851846
margin_data = data[data["score"] > constants.outlier_zscore_cutoff]
852-
bar_charts_by_margin = bar_chart_ids_by_column_value(margin_data,"margin_ratio")
853-
bar_charts_by_score = bar_chart_ids_by_column_value(data,"score")
847+
bar_charts_by_margin = bar_chart_ids_by_column_value(margin_data, "margin_ratio")
848+
bar_charts_by_score = bar_chart_ids_by_column_value(data, "score")
854849

855850
# pick top n bar charts: up to n-1 from margin data if there are enough, and the rest
856851
# from z-score
@@ -887,14 +882,19 @@ def get_most_anomalous(data: pd.DataFrame, n: int) -> pd.DataFrame:
887882
inplace=True,
888883
)
889884

890-
# now we get the top reporting unit IDs, in terms of anomaly score, of the winner and most anomalous (number of reportingunit IDs is constants.max_rus_per_bar_chart
885+
# now we get the top reporting unit IDs, in terms of anomaly score, of the winner and most anomalous
891886
ids = data["bar_chart_id"].unique()
892887
df = pd.DataFrame()
893888
for idx in ids:
894889
temp_df = data[data["bar_chart_id"] == idx]
895890
max_score = temp_df["score"].max()
896891
if max_score > 0:
892+
# calculate contest rank of candidate with highest anomaly score
897893
rank = temp_df[temp_df["score"] == max_score].iloc[0]["rank"]
894+
# TODO what if two candidates have matching max anomaly scores?
895+
896+
# throw away information from any lower-ranked candidates in the contest
897+
# TODO why?
898898
temp_df = temp_df[temp_df["rank"].isin([1, rank])]
899899
df = pd.concat([df, temp_df])
900900
return df
@@ -914,9 +914,9 @@ def euclidean_zscore(li: List[List[float]]) -> List[float]:
914914
def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
915915
"""
916916
:param data: dataframe with required columns
917-
"ReportingUnit_Id":
918-
"Count"
919-
"selection_total"
917+
"ReportingUnit_Id":
918+
"Count"
919+
"selection_total"
920920
"bar_chart_id" (records with same bar_chart_id belong to a single bar chart plot, i.e., one pair of
921921
candidates and one vote type)
922922
"score"
@@ -955,7 +955,7 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
955955
(one_chart_df["ReportingUnit_Id"] == reporting_unit_id)
956956
& (
957957
(one_chart_df["score"] == max_score)
958-
| (one_chart_df["rank"] == 1) # note OR here
958+
| (one_chart_df["rank"] == 1) # note OR here
959959
& (one_chart_df["reporting_unit_total"] == reporting_unit_total)
960960
)
961961
]
@@ -976,14 +976,19 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
976976
].index[0]
977977
next_reporting_unit_id = one_chart_df.loc[next_index, "ReportingUnit_Id"]
978978
next_margin_pct = one_chart_df.loc[next_index, "margins_pct"]
979-
next_reporting_unit_total = one_chart_df.loc[next_index, "reporting_unit_total"]
979+
next_reporting_unit_total = one_chart_df.loc[
980+
next_index, "reporting_unit_total"
981+
]
980982
next_anomalous_df = (
981983
one_chart_df[
982984
(one_chart_df["ReportingUnit_Id"] == next_reporting_unit_id)
983985
& (
984986
(one_chart_df["margins_pct"] == next_margin_pct)
985987
| (one_chart_df["rank"] == 1)
986-
& (one_chart_df["reporting_unit_total"] == next_reporting_unit_total)
988+
& (
989+
one_chart_df["reporting_unit_total"]
990+
== next_reporting_unit_total
991+
)
987992
)
988993
]
989994
.sort_values("rank", ascending=False)
@@ -996,9 +1001,7 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
9961001
# store that change in a new column called "votes_at_stake"
9971002
# and store the ratio of votes at stake to the margin in new "margin_ratio" column
9981003
winner_bucket_total = int(outlier_df[outlier_df["rank"] == 1]["Count"])
999-
not_winner_bucket_total = int(
1000-
outlier_df[outlier_df["rank"] != 1]["Count"]
1001-
)
1004+
not_winner_bucket_total = int(outlier_df[outlier_df["rank"] != 1]["Count"])
10021005
reported_bucket_total = int(outlier_df["Count"].sum())
10031006
next_bucket_total = int(next_anomalous_df["Count"].sum())
10041007
adj_margin = (
@@ -1020,7 +1023,9 @@ def calculate_votes_at_stake(data: pd.DataFrame) -> pd.DataFrame:
10201023
- outlier_df[outlier_df["rank"] != 1].iloc[0]["selection_total"]
10211024
)
10221025
one_chart_df["votes_at_stake"] = contest_margin - adj_contest_margin
1023-
one_chart_df["margin_ratio"] = one_chart_df["votes_at_stake"] / contest_margin_ttl
1026+
one_chart_df["margin_ratio"] = (
1027+
one_chart_df["votes_at_stake"] / contest_margin_ttl
1028+
)
10241029
except Exception:
10251030
one_chart_df["margin_ratio"] = 0
10261031
one_chart_df["votes_at_stake"] = 0
@@ -1083,7 +1088,7 @@ def create_ballot_measure_contests(
10831088
return ballotmeasure_df
10841089

10851090

1086-
def bar_chart_ids_by_column_value(data: pd.DataFrame,column: str) -> List[int]:
1091+
def bar_chart_ids_by_column_value(data: pd.DataFrame, column: str) -> List[int]:
10871092
"""
10881093
Given a dataframe of results, return a list of unique bar_chart_ids
10891094
that are sorted in desc order by the column's value
@@ -1097,7 +1102,9 @@ def bar_chart_ids_by_column_value(data: pd.DataFrame,column: str) -> List[int]:
10971102
"""
10981103

10991104
data = data[["bar_chart_id", column]]
1100-
data = data.groupby("bar_chart_id").max(column).sort_values(by=column, ascending=False)
1105+
data = (
1106+
data.groupby("bar_chart_id").max(column).sort_values(by=column, ascending=False)
1107+
)
11011108
data = data.reset_index()
11021109
return list(data["bar_chart_id"].unique())
11031110

@@ -1172,7 +1179,7 @@ def create_party_abbreviation(party):
11721179
return (party.strip())[0].upper()
11731180

11741181

1175-
def dedupe_scatter_title(category: str, election: str, contest:str):
1182+
def dedupe_scatter_title(category: str, election: str, contest: str):
11761183
"""
11771184
:param category:
11781185
:param election:

src/electiondata/constants/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
"000_for_all_jurisdictions",
131131
"major_subjurisdiction_types.txt",
132132
)
133+
133134
def jurisdiction_wide_contests(abbr: str) -> List[str]:
134135
"""
135136
Inputs:
@@ -148,6 +149,7 @@ def jurisdiction_wide_contests(abbr: str) -> List[str]:
148149
f"{abbr} Secretary of State",
149150
]
150151

152+
151153
# analysis parameters
152154
if 1:
153155
# z-scores below this value are considered not particularly anomalous by
@@ -157,6 +159,8 @@ def jurisdiction_wide_contests(abbr: str) -> List[str]:
157159
# If there are more in the contest district, least interesting ones
158160
# will be averaged into the last set of bars in the bar chart.
159161
max_rus_per_bar_chart = 8
162+
# max number of bar charts to return when curating bar charts
163+
number_of_charts = 3
160164

161165
# display information
162166
if 1:

src/electiondata/database/__init__.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1660,7 +1660,9 @@ def list_to_id(session: Session, element: str, names: List[str]) -> Optional[int
16601660

16611661

16621662
def data_file_download(
1663-
session: Session, election_id: int, jurisdiction_id: int,
1663+
session: Session,
1664+
election_id: int,
1665+
jurisdiction_id: int,
16641666
) -> Optional[str]:
16651667
"""
16661668
:param session: sqlalchemy database session
@@ -1682,7 +1684,6 @@ def data_file_download(
16821684
return date_str
16831685

16841686

1685-
16861687
def data_file_download_cursor(
16871688
cursor: psycopg2.extensions.cursor, election_id: int, jurisdiction_id: int
16881689
) -> Optional[str]:
@@ -1703,7 +1704,7 @@ def data_file_download_cursor(
17031704
"""
17041705
)
17051706
try:
1706-
cursor.execute(q,[election_id,jurisdiction_id])
1707+
cursor.execute(q, [election_id, jurisdiction_id])
17071708
return cursor.fetchall()[0][0]
17081709
except Exception as exc:
17091710
return None
@@ -1736,11 +1737,7 @@ def is_preliminary_cursor(
17361737
return True
17371738

17381739

1739-
def is_preliminary(
1740-
session: Session,
1741-
election_id: int,
1742-
jurisdiction_id: int
1743-
) -> bool:
1740+
def is_preliminary(session: Session, election_id: int, jurisdiction_id: int) -> bool:
17441741
"""
17451742
:param session: sqlalchemy database session
17461743
:param election_id: integer, database Id for election

0 commit comments

Comments
 (0)