Skip to content

Commit 132c7d9

Browse files
committed
document inline
1 parent e5783ce commit 132c7d9

1 file changed

Lines changed: 130 additions & 41 deletions

File tree

src/electiondata/analyze/__init__.py

Lines changed: 130 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -238,20 +238,25 @@ def get_data_for_scatter(
238238
subdivision_type: str,
239239
election_id: int,
240240
count_item_type: str,
241-
filter_str,
242-
count_type,
243-
is_runoff,
244-
):
241+
filter_str: str,
242+
count_type: str,
243+
is_runoff: bool,
244+
) -> pd.DataFrame:
245245
"""
246246
:param session: sqlalchemy database session
247247
:param jurisdiction_id: integer Id of jurisdiction in database ReportingUnit table
248-
:param subdivision_type: string ReportingUnitType characterizing subdivisions for points in scatter
248+
:param subdivision_type: string ReportingUnitType
249249
:param election_id: integer Id of election in database Election table
250250
:param count_item_type: CountItemType characterizing vote counts
251-
:param filter_str:
252-
:param count_type:
253-
:param is_runoff:
254-
:return:
251+
:param filter_str: string used to filter data
252+
for vote counts: "All contests" or "All candidates" or a value for Contest.Name or Candidate.BallotName
253+
for other datasets: value for ExternalDataSet.Label
254+
:param count_type: string used to filter data:
255+
for vote counts: "candidates" or "parties" or "contests"
256+
for other datasets: value for ExternalDataSet.Category
257+
:param is_runoff: True for a run-off contest associated to the election, otherwise False
258+
:return: dataframe of results rolled up to ReportingUnits of <subdivision_type> ReportingUnitType,
259+
restricted by <count_item_type>, <filter_str> and <count_type>
255260
"""
256261
if count_type.startswith("Population"):
257262
return get_external_data(
@@ -282,7 +287,38 @@ def get_external_data(
282287
category,
283288
label,
284289
subdivision_type,
285-
):
290+
) -> pd.DataFrame:
291+
"""
292+
:param session: sqlalchemy database session
293+
:param jurisdiction_id: integer Id of jurisdiction in database ReportingUnit table
294+
:param category: value for ExternalDataSet.Category
295+
:param label: value for ExternalDataSet.Label
296+
:param subdivision_type: string ReportingUnitType (value for ReportingUnit.ReportingUnitType)
297+
characterizing subdivisions for points in scatter
298+
:return: dataframe of results rolled up to ReportingUnits of <subdivision_type> ReportingUnitType
299+
restricted by <category> and <label>. Format is designed to match votecount format
300+
Columns are:
301+
"Election_Id",
302+
"Name", (name of ReportingUnit)
303+
"Selection", (set to <label>)
304+
"Contest_Id", dummy set to 0
305+
"Candidate_Id", dummy set to 0
306+
"Contest", (set to <category>)
307+
"CountItemType", (set to "total")
308+
"Count",
309+
"""
310+
# specify output columns
311+
cols = [
312+
"Election_Id",
313+
"Name",
314+
"Selection",
315+
"Contest_Id",
316+
"Candidate_Id",
317+
"Contest",
318+
"CountItemType",
319+
"Count",
320+
]
321+
286322
# get the census data
287323
census_df = db.read_external(
288324
session,
@@ -305,20 +341,9 @@ def get_external_data(
305341
columns={"Label": "Selection", "Value": "Count"},
306342
inplace=True,
307343
)
308-
census_df = census_df[
309-
[
310-
"Election_Id",
311-
"Name",
312-
"Selection",
313-
"Contest_Id",
314-
"Candidate_Id",
315-
"Contest",
316-
"CountItemType",
317-
"Count",
318-
]
319-
]
344+
census_df = census_df[cols]
320345
return census_df
321-
return pd.DataFrame()
346+
return pd.DataFrame(columns=cols)
322347

323348

324349
def get_votecount_data(
@@ -340,7 +365,8 @@ def get_votecount_data(
340365
:param filter_str: string, "All contests" or "All candidates" or a contest name or a candidate name
341366
:param count_type: "candidates" or "parties" or "contests"
342367
:param is_runoff: True if contest is a run-off; otherwise False
343-
:return: dataframe of vote counts by subdivision, with columns specifying ReportingUnit, Contest,
368+
:return: dataframe of vote counts rolled up to ReportingUnits of type <subdivision>
369+
(or larger, if that's all that's available), Contest,
344370
Selection, VoteCountType along with various database Ids
345371
"""
346372
unsummed = db.unsummed_vote_counts_with_rollup_subdivision_id(
@@ -586,34 +612,84 @@ def create_bar(
586612

587613

588614
def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
589-
"""adds a new column called score between 0 and 1; 1 is more anomalous.
590-
Also adds a `unit_id` column which assigns a score to each unit of analysis
591-
that is considered. For example, we may decide to look at anomalies across each
592-
distinct combination of contest, reporting unit type, and vote type. Each
593-
combination of those would get assigned an ID. This means rows may get added
594-
to the dataframe if needed."""
615+
"""
616+
617+
:param data: dataframe with required columns:
618+
"ReportingUnitType",
619+
"ParentReportingUnit_Id",
620+
"ParentName",
621+
"ParentReportingUnitType",
622+
"Candidate_Id",
623+
"CountItemType",
624+
"Contest_Id",
625+
"Contest",
626+
"Selection",
627+
"Selection_Id",
628+
"contest_type",
629+
"contest_district_type",
630+
"Count",
631+
632+
and possibly other columns, such as:
633+
"Party",
634+
635+
636+
:return: dataframe obtained by appending columns to <data>:
637+
"score": value between 0 and 1; 1 is more anomalous
638+
"unit_id": identifies the set of vote counts within which the anomaly score of the single vote count was
639+
calculated. A single vote count's anomaly score depends on the set of vote counts within which it is
640+
considered.
641+
"reporting_unit_total": total votes for all selections in given contest for given reporting unit
642+
"ind_total": total votes for given selection in given contest in entire jurisdiction
643+
"rank": 1 for contest winner, 2 for second place, etc.
644+
"contest_total": total votes cast in the contest in entire jurisdiction
645+
"index": artifact from calculation
646+
"unit_id_tmp": artifact from calculation
647+
648+
and preserving columns
649+
"ReportingUnit_Id" (renamed from "ParentReportingUnit_Id")
650+
"Name", (renamed from "ParentName", the name of the ReportingUnit)
651+
"ReportingUnitType", (renamed from "ParentReportingUnitType")
652+
"Candidate_Id",
653+
"CountItemType",
654+
"Contest_Id",
655+
"Contest",
656+
"Selection",
657+
"contest_type",
658+
"contest_district_type",
659+
"Count",
660+
"Selection_Id",
661+
"""
595662

596663
# Assign a ranking for each candidate by votes for each contest
664+
665+
# # create <total_data> dataframe with "total" CountItemType only
597666
if "total" not in data["CountItemType"].unique():
598667
groupby_cols = list(data.columns)
599668
groupby_cols.remove("Count")
600669
total_data = data.groupby(groupby_cols).sum().reset_index()
601670
else:
602671
total_data = data[data["CountItemType"] == "total"]
603672

673+
# # create <ranked_df> of contest-candidate pairs,
674+
# column "rank": winners 1, second-place 2, etc. (tied candidates get same rank);
675+
# column "ind_total" with total votes for candidate;
676+
# column "contest_total" with total votes in contest
677+
678+
# # # Append total votes for selection
604679
ranked_df = (
605680
total_data.groupby(["Contest_Id", "Selection", "Selection_Id"], as_index=False)[
606681
"Count"
607682
]
608683
.sum()
609684
.sort_values(["Contest_Id", "Count"], ascending=False)
610685
)
686+
# # # Append rank
611687
ranked_df["rank"] = ranked_df.groupby("Contest_Id")["Count"].rank(
612688
"dense", ascending=False
613689
)
614690
ranked_df.rename(columns={"Count": "ind_total"}, inplace=True)
615691

616-
# Now get the total votes for the entire contest
692+
# # # Append total votes for the entire contest
617693
contest_df = ranked_df.groupby("Contest_Id")["ind_total"].sum().reset_index()
618694
contest_df.rename(columns={"ind_total": "contest_total"}, inplace=True)
619695
ranked_df = ranked_df.merge(contest_df, how="inner", on="Contest_Id")
@@ -667,8 +743,8 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
667743
# loop through each unit ID and assign anomaly scores
668744
# also update the "real" unit_id which takes into account pairing of candidates
669745
unit_ids_tmp = df_with_units["unit_id_tmp"].unique()
670-
unit_id = 0
671-
df = pd.DataFrame()
746+
unit_id = 0 # increments on each pass through for loop
747+
df = pd.DataFrame() # collects records on each pass through for loop
672748
# for each unit ID
673749
for unit_id_tmp in unit_ids_tmp:
674750
# grab all the data there
@@ -722,7 +798,7 @@ def assign_anomaly_score(data: pd.DataFrame) -> pd.DataFrame:
722798

723799

724800
def get_most_anomalous(data: pd.DataFrame, n: int) -> pd.DataFrame:
725-
"""Gets n contest, with 2 from largest votes at stake ratio
801+
"""Gets n contests, with 2 from largest votes at stake ratio
726802
and 1 with largest score. If 2 from votes at stake cannot be found
727803
(bc of threshold for score) then we fill in the top n from scores"""
728804
# filter out very small votes at stake margins
@@ -1163,7 +1239,7 @@ def nist_office(session, election_id, jurisdiction_id):
11631239
return json.loads(result)
11641240

11651241

1166-
def nist_candidate(session, election_id, jurisdiction_id):
1242+
def nist_candidate(session: Session, election_id: int, jurisdiction_id: int):
11671243
df = db.read_vote_count(
11681244
session,
11691245
election_id=election_id,
@@ -1176,25 +1252,37 @@ def nist_candidate(session, election_id, jurisdiction_id):
11761252

11771253

11781254
def rollup_dataframe(
1179-
session,
1255+
session: Session,
11801256
df: pd.DataFrame,
11811257
count_col: str,
11821258
ru_id_column: str,
11831259
new_ru_id_column: str,
11841260
rollup_rut: str = constants.default_subdivision_type,
1185-
ignore: Optional[List] = None,
1261+
ignore: Optional[List[str]] = None,
11861262
) -> (pd.DataFrame(), Optional[dict]):
1187-
"""Returns datafrome of results rolled up to the reporting unit type <rollup_rut>.
1188-
For reporting units without parents of the given type (e.g., sometimes absentee votes
1189-
are reported by state), preserve the record"""
1263+
"""
1264+
:param session: sqlalchemy database session
1265+
:param df: dataframe of results
1266+
:param count_col: string, name of column with counts
1267+
:param ru_id_column: string, name of column with database Ids of ReportingUnits
1268+
:param new_ru_id_column: string, name of column in returned dataframe with database Ids
1269+
of newly rolled-up ReportingUnits
1270+
:param rollup_rut: string, ReportingUnitType to roll up to (e.g., "county")
1271+
:param ignore: (optional) list of names of columns to drop from <df>
1272+
:return:
1273+
dataframe of results rolled up to the given ReportingUnitType (NB: for reporting units without
1274+
parents of the given type (e.g., sometimes absentee votes are reported by state), the record is preserved)
1275+
dictionary of errors and warnings
1276+
"""
11901277

1191-
err = None # TODO error handling
1278+
err = None
11921279

11931280
# drop from dataframe any columns in <ignore>
11941281
if ignore:
11951282
working = df.copy().drop(ignore, axis=1)
11961283
else:
11971284
working = df.copy()
1285+
11981286
group_cols = [c for c in working.columns if (c not in (ru_id_column, count_col))]
11991287
parents, err_str = db.parents(
12001288
session, df[ru_id_column].unique(), subunit_type=rollup_rut
@@ -1206,6 +1294,7 @@ def rollup_dataframe(
12061294
f"{Path(__file__).absolute().parents[0].name}.{inspect.currentframe().f_code.co_name}",
12071295
f"Unable to read parents reporting unit info from column {ru_id_column}",
12081296
)
1297+
return pd.DataFrame(), err
12091298
try:
12101299
new_working = (
12111300
working.reset_index()

0 commit comments

Comments
 (0)