From ffb2d5b0572c953c57e10cf4463b5dc1e70c7aee Mon Sep 17 00:00:00 2001 From: Sean Moss Date: Tue, 19 May 2026 21:54:36 -0400 Subject: [PATCH 1/2] Link legislator profiles to district pages --- .serena/.gitignore | 2 + .serena/project.yml | 133 +++++++++++++++++ .../LegislatorProfile/DistrictTab.test.tsx | 77 ++++++++++ components/LegislatorProfile/DistrictTab.tsx | 100 +++++++++++++ .../LegislatorProfilePage.tsx | 91 +++++++++++ components/LegislatorProfile/index.ts | 2 + components/ProfilePage/ProfileLegislators.tsx | 20 ++- components/db/districts.ts | 51 +++++++ components/db/index.ts | 1 + components/links.tsx | 2 + functions/src/districts/index.ts | 3 + functions/src/districts/normalize.ts | 116 ++++++++++++++ .../src/districts/parseSecDistricts.test.ts | 141 ++++++++++++++++++ functions/src/districts/parseSecDistricts.ts | 103 +++++++++++++ functions/src/districts/types.ts | 32 ++++ pages/legislators/[court]/[memberCode].tsx | 39 +++++ .../scrapeLegislativeDistricts.ts | 89 +++++++++++ 17 files changed, 997 insertions(+), 5 deletions(-) create mode 100644 .serena/.gitignore create mode 100644 .serena/project.yml create mode 100644 components/LegislatorProfile/DistrictTab.test.tsx create mode 100644 components/LegislatorProfile/DistrictTab.tsx create mode 100644 components/LegislatorProfile/LegislatorProfilePage.tsx create mode 100644 components/LegislatorProfile/index.ts create mode 100644 components/db/districts.ts create mode 100644 functions/src/districts/index.ts create mode 100644 functions/src/districts/normalize.ts create mode 100644 functions/src/districts/parseSecDistricts.test.ts create mode 100644 functions/src/districts/parseSecDistricts.ts create mode 100644 functions/src/districts/types.ts create mode 100644 pages/legislators/[court]/[memberCode].tsx create mode 100644 scripts/firebase-admin/scrapeLegislativeDistricts.ts diff --git a/.serena/.gitignore b/.serena/.gitignore new file mode 100644 index 000000000..2e510aff5 --- /dev/null +++ b/.serena/.gitignore @@ -0,0 +1,2 @@ +/cache +/project.local.yml diff --git a/.serena/project.yml b/.serena/project.yml new file mode 100644 index 000000000..75e8c0642 --- /dev/null +++ b/.serena/project.yml @@ -0,0 +1,133 @@ +# the name by which the project can be referenced within Serena +project_name: "maple" + + +# list of languages for which language servers are started; choose from: +# al angular ansible bash clojure +# cpp cpp_ccls crystal csharp csharp_omnisharp +# dart elixir elm erlang fortran +# fsharp go groovy haskell haxe +# hlsl html java json julia +# kotlin lean4 lua luau markdown +# matlab msl nix ocaml pascal +# perl php php_phpactor powershell python +# python_jedi python_ty r rego ruby +# ruby_solargraph rust scala scss solidity +# svelte swift systemverilog terraform toml +# typescript typescript_vts vue yaml zig +# (This list may be outdated. For the current list, see values of Language enum here: +# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py +# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.) +# Note: +# - For C, use cpp +# - For JavaScript, use typescript +# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root) +# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm) +# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three) +# - For Free Pascal/Lazarus, use pascal +# Special requirements: +# Some languages require additional setup/installations. +# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers +# When using multiple languages, the first language server that supports a given file will be used for that file. +# The first language is the default language and the respective language server will be used as a fallback. +# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored. +languages: +- typescript + +# the encoding used by text files in the project +# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings +encoding: "utf-8" + +# line ending convention to use when writing source files. +# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default) +# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings. +line_ending: + +# The language backend to use for this project. +# If not set, the global setting from serena_config.yml is used. +# Valid values: LSP, JetBrains +# Note: the backend is fixed at startup. If a project with a different backend +# is activated post-init, an error will be returned. +language_backend: + +# whether to use project's .gitignore files to ignore files +ignore_all_files_in_gitignore: true + +# advanced configuration option allowing to configure language server-specific options. +# Maps the language key to the options. +# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available. +# No documentation on options means no options are available. +ls_specific_settings: {} + +# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos). +# Paths can be absolute or relative to the project root. +# Each folder is registered as an LSP workspace folder, enabling language servers to discover +# symbols and references across package boundaries. +# Currently supported for: TypeScript. +# Example: +# additional_workspace_folders: +# - ../sibling-package +# - ../shared-lib +additional_workspace_folders: [] + +# list of additional paths to ignore in this project. +# Same syntax as gitignore, so you can use * and **. +# Note: global ignored_paths from serena_config.yml are also applied additively. +ignored_paths: [] + +# whether the project is in read-only mode +# If set to true, all editing tools will be disabled and attempts to use them will result in an error +# Added on 2025-04-18 +read_only: false + +# list of tool names to exclude. +# This extends the existing exclusions (e.g. from the global configuration) +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +excluded_tools: [] + +# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default). +# This extends the existing inclusions (e.g. from the global configuration). +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +included_optional_tools: [] + +# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools. +# This cannot be combined with non-empty excluded_tools or included_optional_tools. +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +fixed_tools: [] + +# list of mode names that are to be activated by default, overriding the setting in the global configuration. +# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes. +# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply. +# Otherwise, this overrides the setting from the global configuration (serena_config.yml). +# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply +# for this project. +# This setting can, in turn, be overridden by CLI parameters (--mode). +# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes +default_modes: + +# list of mode names to be activated additionally for this project, e.g. ["query-projects"] +# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes. +# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes +added_modes: + +# initial prompt for the project. It will always be given to the LLM upon activating the project +# (contrary to the memories, which are loaded on demand). +initial_prompt: "" + +# time budget (seconds) per tool call for the retrieval of additional symbol information +# such as docstrings or parameter information. +# This overrides the corresponding setting in the global configuration; see the documentation there. +# If null or missing, use the setting from the global configuration. +symbol_info_budget: + +# list of regex patterns which, when matched, mark a memory entry as read‑only. +# Extends the list from the global configuration, merging the two lists. +read_only_memory_patterns: [] + +# list of regex patterns for memories to completely ignore. +# Matching memories will not appear in list_memories or activate_project output +# and cannot be accessed via read_memory or write_memory. +# To access ignored memory files, use the read_file tool on the raw file path. +# Extends the list from the global configuration, merging the two lists. +# Example: ["_archive/.*", "_episodes/.*"] +ignored_memory_patterns: [] diff --git a/components/LegislatorProfile/DistrictTab.test.tsx b/components/LegislatorProfile/DistrictTab.test.tsx new file mode 100644 index 000000000..3f82e8983 --- /dev/null +++ b/components/LegislatorProfile/DistrictTab.test.tsx @@ -0,0 +1,77 @@ +import "@testing-library/jest-dom" +import { render, screen } from "@testing-library/react" +import { DistrictTab } from "./DistrictTab" +import type { District } from "components/db" + +const baseDistrict: District = { + id: "house-9-hampden", + branch: "House", + district: "9th Hampden", + sourceDistrict: "Ninth Hampden", + sourceUrl: "https://example.test/districts", + fetchedAt: {} as any, + municipalities: [] +} + +describe("DistrictTab", () => { + it("renders full-town-only districts without subdivision chips", () => { + render( + + ) + + expect(screen.getAllByText("9th Hampden District")).toHaveLength(2) + expect( + screen.getByText("West Springfield & Springfield") + ).toBeInTheDocument() + expect(screen.queryByText(/Ward/)).not.toBeInTheDocument() + expect( + screen.getByText("Source: MA Secretary of the Commonwealth district data") + ).toBeInTheDocument() + }) + + it("renders subdivision chips and prefixes them when multiple municipalities are split", () => { + render( + + ) + + expect( + screen.getByText("West Springfield: Ward 1 Precinct C1") + ).toBeInTheDocument() + expect( + screen.getByText("Springfield: Ward 8 Precinct A") + ).toBeInTheDocument() + expect( + screen.getByText("Springfield: Ward 8 Precinct B") + ).toBeInTheDocument() + }) + + it("renders a fallback when district data is missing", () => { + render() + + expect( + screen.getByText("District details are not available yet.") + ).toBeInTheDocument() + }) +}) diff --git a/components/LegislatorProfile/DistrictTab.tsx b/components/LegislatorProfile/DistrictTab.tsx new file mode 100644 index 000000000..a00f706b4 --- /dev/null +++ b/components/LegislatorProfile/DistrictTab.tsx @@ -0,0 +1,100 @@ +import { FontAwesomeIcon } from "@fortawesome/react-fontawesome" +import { faLocationDot } from "@fortawesome/free-solid-svg-icons" +import styled from "styled-components" +import type { District } from "components/db" + +const MapPreview = styled.div` + background: #dfe8fb; + border: 1px solid #d4deef; + border-radius: 8px 8px 0 0; + color: #18358f; + min-height: 300px; +` + +const DistrictCard = styled.section` + border: 1px solid #d9dee5; + border-radius: 8px; + overflow: hidden; +` + +const Chip = styled.span` + border: 1px solid #d9dee5; + border-radius: 999px; + color: #4a5564; + display: inline-flex; + font-size: 1rem; + font-weight: 600; + line-height: 1.2; + padding: 0.55rem 1rem; +` + +function municipalitySummary(district: District) { + return district.municipalities + .map(municipality => municipality.name) + .join(" & ") +} + +function subdivisionChips(district: District) { + const municipalitiesWithSubdivisions = district.municipalities.filter( + municipality => municipality.subdivisions.length > 0 + ) + const shouldPrefix = municipalitiesWithSubdivisions.length > 1 + + return municipalitiesWithSubdivisions.flatMap(municipality => + municipality.subdivisions.map(subdivision => + shouldPrefix ? `${municipality.name}: ${subdivision}` : subdivision + ) + ) +} + +export function DistrictTab({ + district, + loading +}: { + district?: District + loading?: boolean +}) { + if (loading) { + return
Loading district...
+ } + + if (!district) { + return ( +
+ District details are not available yet. +
+ ) + } + + const chips = subdivisionChips(district) + + return ( + <> + + + +

{district.district} District

+

+ Interactive map · Coming soon +

+
+
+

{district.district} District

+

+ {municipalitySummary(district)} +

+ {chips.length > 0 && ( +
+ {chips.map(chip => ( + {chip} + ))} +
+ )} +
+
+

+ Source: MA Secretary of the Commonwealth district data +

+ + ) +} diff --git a/components/LegislatorProfile/LegislatorProfilePage.tsx b/components/LegislatorProfile/LegislatorProfilePage.tsx new file mode 100644 index 000000000..34a0d4ed8 --- /dev/null +++ b/components/LegislatorProfile/LegislatorProfilePage.tsx @@ -0,0 +1,91 @@ +import ErrorPage from "next/error" +import styled from "styled-components" +import { Col, Container, Row, Spinner } from "components/bootstrap" +import { useDistrict, useMember } from "components/db" +import { DistrictTab } from "./DistrictTab" + +const tabs = [ + "Priorities", + "Bills", + "Elections", + "Finance", + "District", + "Her testimony", + "Votes" +] + +const TabButton = styled.button` + background: transparent; + border: 0; + border-bottom: 5px solid transparent; + color: #68707a; + font-size: 1.35rem; + font-weight: 700; + padding: 1.35rem 1.9rem 1.15rem; + + &.active { + border-bottom-color: #18358f; + color: #18358f; + } +` + +export function LegislatorProfilePage({ + court, + memberCode +}: { + court: number + memberCode: string +}) { + const { member, loading: memberLoading } = useMember(court, memberCode) + const { district, loading: districtLoading } = useDistrict( + court, + member?.Branch, + member?.District + ) + + if (memberLoading) { + return ( + + + + ) + } + + if (!member) { + return + } + + return ( + + + +

{member.Name}

+

+ {member.Branch} - {member.District} +

+ +
+
+ {tabs.map(label => ( + + {label} + + ))} +
+
+ +
+
+ ) +} diff --git a/components/LegislatorProfile/index.ts b/components/LegislatorProfile/index.ts new file mode 100644 index 000000000..056729577 --- /dev/null +++ b/components/LegislatorProfile/index.ts @@ -0,0 +1,2 @@ +export * from "./DistrictTab" +export * from "./LegislatorProfilePage" diff --git a/components/ProfilePage/ProfileLegislators.tsx b/components/ProfilePage/ProfileLegislators.tsx index 6f1cc6bd2..3fbacd6e3 100644 --- a/components/ProfilePage/ProfileLegislators.tsx +++ b/components/ProfilePage/ProfileLegislators.tsx @@ -3,6 +3,8 @@ import { ProfileMember } from "../db" import { LabeledIcon, TitledSectionCard } from "../shared" import { Card as MapleCard } from "components/Card" import { useTranslation } from "next-i18next" +import { currentGeneralCourt } from "functions/src/shared" +import { Internal, maple } from "components/links" type ProfileMemberPlus = (ProfileMember & { title: string }) | undefined @@ -23,11 +25,19 @@ const DisplayLegislator = ({ return ( <> {legislator ? ( - + + + ) : (
{t("content.noLegislatorInfo")}
)} diff --git a/components/db/districts.ts b/components/db/districts.ts new file mode 100644 index 000000000..f5b9f35b1 --- /dev/null +++ b/components/db/districts.ts @@ -0,0 +1,51 @@ +import type { Timestamp } from "firebase/firestore" +import { useMemo } from "react" +import { useAsync } from "react-async-hook" +import { districtId } from "functions/src/districts/normalize" +import { loadDoc } from "./common" + +export type DistrictBranch = "House" | "Senate" + +export type DistrictMunicipality = { + name: string + subdivisions: string[] +} + +export type District = { + id: string + branch: DistrictBranch + district: string + sourceDistrict: string + sourceUrl: string + municipalities: DistrictMunicipality[] + fetchedAt: Timestamp +} + +async function getDistrict( + court: number, + branch?: string | null, + district?: string | null +): Promise { + if (branch !== "House" && branch !== "Senate") return undefined + if (!district) return undefined + + return loadDoc( + `/generalCourts/${court}/districts/${districtId(branch, district)}` + ) as Promise +} + +export function useDistrict( + court: number, + branch?: string | null, + district?: string | null +) { + const { loading, result } = useAsync(getDistrict, [court, branch, district]) + + return useMemo( + () => ({ + district: result, + loading + }), + [loading, result] + ) +} diff --git a/components/db/index.ts b/components/db/index.ts index 3fc6a012f..913c78537 100644 --- a/components/db/index.ts +++ b/components/db/index.ts @@ -1,6 +1,7 @@ export * from "./api" export * from "./bills" export * from "./createTableHook" +export * from "./districts" export * from "./members" export * from "./news" export * from "./profile" diff --git a/components/links.tsx b/components/links.tsx index b6ac2cdd1..0482a64fc 100644 --- a/components/links.tsx +++ b/components/links.tsx @@ -75,6 +75,8 @@ export const maple = { ballotQuestion: ({ id }: { id: string }) => `/ballotQuestions/${id}`, bill: ({ court, id }: { court: number; id: string }) => `/bills/${court}/${id}`, + legislator: ({ court, memberCode }: { court: number; memberCode: string }) => + `/legislators/${court}/${memberCode}`, testimony: ({ publishedId }: { publishedId: string }) => `/testimony/${publishedId}`, userTestimony: ({ diff --git a/functions/src/districts/index.ts b/functions/src/districts/index.ts new file mode 100644 index 000000000..5f2634ed3 --- /dev/null +++ b/functions/src/districts/index.ts @@ -0,0 +1,3 @@ +export * from "./normalize" +export * from "./parseSecDistricts" +export * from "./types" diff --git a/functions/src/districts/normalize.ts b/functions/src/districts/normalize.ts new file mode 100644 index 000000000..867c26ed9 --- /dev/null +++ b/functions/src/districts/normalize.ts @@ -0,0 +1,116 @@ +const ordinalWords: Record = { + first: 1, + second: 2, + third: 3, + fourth: 4, + fifth: 5, + sixth: 6, + seventh: 7, + eighth: 8, + ninth: 9, + tenth: 10, + eleventh: 11, + twelfth: 12, + thirteenth: 13, + fourteenth: 14, + fifteenth: 15, + sixteenth: 16, + seventeenth: 17, + eighteenth: 18, + nineteenth: 19 +} + +const tensOrdinalWords: Record = { + twentieth: 20, + thirtieth: 30 +} + +const tensWords: Record = { + twenty: 20, + thirty: 30 +} + +const ordinalSuffix = /^(\d+)(st|nd|rd|th)?$/i + +function parseOrdinalToken(word: string) { + const numericOrdinal = word.match(ordinalSuffix) + if (numericOrdinal) return Number(numericOrdinal[1]) + + return ordinalWords[word] ?? tensOrdinalWords[word] +} + +function normalizeLeadingOrdinal(words: string[]) { + const [firstWord, secondWord] = words + + if (!firstWord) return words + + const ordinal = parseOrdinalToken(firstWord) + if (ordinal) return [String(ordinal), ...words.slice(1)] + + const tens = tensWords[firstWord] + const ones = secondWord ? ordinalWords[secondWord] : undefined + if (tens && ones) return [String(tens + ones), ...words.slice(2)] + + return words +} + +export function normalizeDistrictName(district: string) { + const words = district + .toLowerCase() + .replace(/&/g, " and ") + .replace(/[-–—]/g, " ") + .replace(/[^\w\s]/g, " ") + .replace(/\band\b/g, " ") + .replace(/\s+/g, " ") + .trim() + .split(" ") + .filter(Boolean) + + return normalizeLeadingOrdinal(words).join(" ") +} + +export function districtId(branch: "House" | "Senate", district: string) { + return `${branch.toLowerCase()}-${normalizeDistrictName(district).replace( + /\s+/g, + "-" + )}` +} + +const ordinalSuffixes = ["th", "st", "nd", "rd"] + +function formatOrdinal(number: number) { + const suffix = + number % 100 >= 11 && number % 100 <= 13 + ? "th" + : ordinalSuffixes[number % 10] ?? "th" + + return `${number}${suffix}` +} + +export function displayDistrictName(sourceDistrict: string) { + const words = sourceDistrict.split(/\s+/) + const firstParts = (words[0] ?? "").toLowerCase().split(/[-–—]/) + + if (firstParts.length === 2) { + const normalized = normalizeLeadingOrdinal(firstParts) + if (normalized.length === 1 && /^\d+$/.test(normalized[0])) { + return [formatOrdinal(Number(normalized[0])), ...words.slice(1)].join(" ") + } + } + + const firstOrdinal = parseOrdinalToken( + (words[0] ?? "").toLowerCase().replace(/[^\w]/g, "") + ) + if (firstOrdinal) { + return [formatOrdinal(firstOrdinal), ...words.slice(1)].join(" ") + } + + const secondOrdinal = (words[1] ?? "").toLowerCase().replace(/[^\w]/g, "") + const tens = tensWords[(words[0] ?? "").toLowerCase().replace(/[^\w]/g, "")] + const ones = ordinalWords[secondOrdinal] + if (tens && ones) { + return [formatOrdinal(tens + ones), ...words.slice(2)].join(" ") + } + + return sourceDistrict +} diff --git a/functions/src/districts/parseSecDistricts.test.ts b/functions/src/districts/parseSecDistricts.test.ts new file mode 100644 index 000000000..34bd4c8e2 --- /dev/null +++ b/functions/src/districts/parseSecDistricts.test.ts @@ -0,0 +1,141 @@ +import { + displayDistrictName, + districtId, + normalizeDistrictName +} from "./normalize" +import { parseSecDistricts } from "./parseSecDistricts" + +const sourceUrl = "https://example.test/districts" + +describe("district normalization", () => { + it("normalizes district names across ordinal and punctuation variants", () => { + expect(normalizeDistrictName("Ninth Hampden")).toBe("9 hampden") + expect(normalizeDistrictName("9th Hampden")).toBe("9 hampden") + expect(normalizeDistrictName("Thirty-Seventh Middlesex")).toBe( + "37 middlesex" + ) + expect( + normalizeDistrictName("Berkshire, Hampden, Franklin, and Hampshire") + ).toBe("berkshire hampden franklin hampshire") + }) + + it("builds firestore-safe district ids with the branch", () => { + expect(districtId("House", "9th Hampden")).toBe("house-9-hampden") + expect(districtId("Senate", "Third Bristol and Plymouth")).toBe( + "senate-3-bristol-plymouth" + ) + }) + + it("converts House source headings to member-facing ordinal names", () => { + expect(displayDistrictName("Ninth Hampden")).toBe("9th Hampden") + expect(displayDistrictName("Thirty-Seventh Middlesex")).toBe( + "37th Middlesex" + ) + expect(displayDistrictName("Barnstable, Dukes, and Nantucket")).toBe( + "Barnstable, Dukes, and Nantucket" + ) + }) +}) + +describe("parseSecDistricts", () => { + it("parses Senate h2 district sections and preserves split municipalities", () => { + const html = ` +

Massachusetts Senatorial Districts

+

Third Bristol and Plymouth

+
    +
  • Berkley
  • +
  • Taunton:
    Ward 1 Precincts A, B;
    Ward 2;
  • +
+
+

Plymouth and Barnstable

+
    +
  • Bourne
  • +
  • Falmouth
  • +
+ ` + + expect(parseSecDistricts(html, { branch: "Senate", sourceUrl })).toEqual([ + { + id: "senate-3-bristol-plymouth", + branch: "Senate", + district: "Third Bristol and Plymouth", + sourceDistrict: "Third Bristol and Plymouth", + sourceUrl, + municipalities: [ + { name: "Berkley", subdivisions: [] }, + { + name: "Taunton", + subdivisions: ["Ward 1 Precincts A, B", "Ward 2"] + } + ] + }, + { + id: "senate-plymouth-barnstable", + branch: "Senate", + district: "Plymouth and Barnstable", + sourceDistrict: "Plymouth and Barnstable", + sourceUrl, + municipalities: [ + { name: "Bourne", subdivisions: [] }, + { name: "Falmouth", subdivisions: [] } + ] + } + ]) + }) + + it("parses House county h2 wrappers, district h3s, and cross-county h2 districts", () => { + const html = ` +

Massachusetts Representative Districts

+

Barnstable County

+

First Barnstable

+
    +
  • Brewster
  • +
  • Yarmouth:
    Precincts 1, 2, 3;
  • +
+
+

Barnstable, Dukes, and Nantucket

+
    +
  • Aquinnah
  • +
  • Falmouth:
    Precincts 1, 2, 6;
  • +
+

Middlesex County

+

Thirty-Seventh Middlesex

+
    +
  • Acton:
    Precinct 6A;
  • +
+ ` + + expect(parseSecDistricts(html, { branch: "House", sourceUrl })).toEqual([ + { + id: "house-1-barnstable", + branch: "House", + district: "1st Barnstable", + sourceDistrict: "First Barnstable", + sourceUrl, + municipalities: [ + { name: "Brewster", subdivisions: [] }, + { name: "Yarmouth", subdivisions: ["Precincts 1, 2, 3"] } + ] + }, + { + id: "house-barnstable-dukes-nantucket", + branch: "House", + district: "Barnstable, Dukes, and Nantucket", + sourceDistrict: "Barnstable, Dukes, and Nantucket", + sourceUrl, + municipalities: [ + { name: "Aquinnah", subdivisions: [] }, + { name: "Falmouth", subdivisions: ["Precincts 1, 2, 6"] } + ] + }, + { + id: "house-37-middlesex", + branch: "House", + district: "37th Middlesex", + sourceDistrict: "Thirty-Seventh Middlesex", + sourceUrl, + municipalities: [{ name: "Acton", subdivisions: ["Precinct 6A"] }] + } + ]) + }) +}) diff --git a/functions/src/districts/parseSecDistricts.ts b/functions/src/districts/parseSecDistricts.ts new file mode 100644 index 000000000..ff2b45c99 --- /dev/null +++ b/functions/src/districts/parseSecDistricts.ts @@ -0,0 +1,103 @@ +import { JSDOM } from "jsdom" +import { compact } from "lodash" +import { districtId, displayDistrictName } from "./normalize" +import type { DistrictBranch, ParsedDistrict } from "./types" + +type ParseOptions = { + branch: DistrictBranch + sourceUrl: string +} + +function cleanText(text: string) { + return text + .replace(/\u00a0/g, " ") + .replace(/\s+/g, " ") + .trim() +} + +function isHouseCountyHeading(element: Element) { + return ( + element.tagName.toLowerCase() === "h2" && + / county$/i.test(cleanText(element.textContent ?? "")) + ) +} + +function isDistrictHeading(element: Element, branch: DistrictBranch) { + const tagName = element.tagName.toLowerCase() + if (branch === "Senate") return tagName === "h2" + return ( + (tagName === "h2" || tagName === "h3") && !isHouseCountyHeading(element) + ) +} + +function followingDistrictItems(heading: Element, branch: DistrictBranch) { + const items: HTMLLIElement[] = [] + let element = heading.nextElementSibling + + while (element) { + if (isDistrictHeading(element, branch)) break + if (branch === "House" && isHouseCountyHeading(element)) break + items.push( + ...Array.from(element.querySelectorAll("li")).filter( + (item): item is HTMLLIElement => + item instanceof heading.ownerDocument.defaultView!.HTMLLIElement + ) + ) + element = element.nextElementSibling + } + + return items +} + +function parseMunicipality(text: string) { + const [rawName, ...detailParts] = text.split(":") + const name = cleanText(rawName) + const details = cleanText(detailParts.join(":")) + const subdivisions = compact( + details + .split(";") + .map(detail => detail.replace(/[.;]+$/g, "")) + .map(cleanText) + ) + + return { name, subdivisions } +} + +export function parseSecDistricts( + html: string, + options: ParseOptions +): ParsedDistrict[] { + const dom = new JSDOM(html) + const document = dom.window.document + const title = Array.from(document.querySelectorAll("h1")).find(heading => + /massachusetts .* districts/i.test(heading.textContent ?? "") + ) + const headings = Array.from(document.querySelectorAll("h2, h3")).filter( + heading => + (!title || + Boolean( + title.compareDocumentPosition(heading) & + dom.window.Node.DOCUMENT_POSITION_FOLLOWING + )) && + isDistrictHeading(heading, options.branch) + ) + + return headings.map(heading => { + const sourceDistrict = cleanText(heading.textContent ?? "") + const district = + options.branch === "House" + ? displayDistrictName(sourceDistrict) + : sourceDistrict + + return { + id: districtId(options.branch, district), + branch: options.branch, + district, + sourceDistrict, + sourceUrl: options.sourceUrl, + municipalities: followingDistrictItems(heading, options.branch) + .map(item => parseMunicipality(item.textContent ?? "")) + .filter(municipality => municipality.name.length > 0) + } + }) +} diff --git a/functions/src/districts/types.ts b/functions/src/districts/types.ts new file mode 100644 index 000000000..30804f877 --- /dev/null +++ b/functions/src/districts/types.ts @@ -0,0 +1,32 @@ +import { + Array as RtArray, + InstanceOf, + Literal, + Record, + Static, + String, + Union +} from "runtypes" +import { Timestamp } from "../firebase" + +export const DistrictBranch = Union(Literal("House"), Literal("Senate")) +export type DistrictBranch = Static + +export const DistrictMunicipality = Record({ + name: String, + subdivisions: RtArray(String) +}) +export type DistrictMunicipality = Static + +export const District = Record({ + id: String, + branch: DistrictBranch, + district: String, + sourceDistrict: String, + sourceUrl: String, + municipalities: RtArray(DistrictMunicipality), + fetchedAt: InstanceOf(Timestamp) +}) +export type District = Static + +export type ParsedDistrict = Omit diff --git a/pages/legislators/[court]/[memberCode].tsx b/pages/legislators/[court]/[memberCode].tsx new file mode 100644 index 000000000..4c3084166 --- /dev/null +++ b/pages/legislators/[court]/[memberCode].tsx @@ -0,0 +1,39 @@ +import { GetServerSideProps } from "next" +import { serverSideTranslations } from "next-i18next/serverSideTranslations" +import { z } from "zod" +import { LegislatorProfilePage } from "components/LegislatorProfile" +import { createPage } from "components/page" + +const Query = z.object({ + court: z.coerce.number(), + memberCode: z.string() +}) + +export default createPage<{ + court: number + memberCode: string +}>({ + titleI18nKey: "titles.legislatorProfile", + Page: ({ court, memberCode }) => ( + + ) +}) + +export const getServerSideProps: GetServerSideProps = async ctx => { + ctx.res.setHeader( + "Cache-Control", + "public, s-maxage=60, stale-while-revalidate=300" + ) + + const query = Query.safeParse(ctx.query) + if (!query.success) return { notFound: true } + + const locale = ctx.locale ?? ctx.defaultLocale ?? "en" + + return { + props: { + ...query.data, + ...(await serverSideTranslations(locale, ["auth", "common", "footer"])) + } + } +} diff --git a/scripts/firebase-admin/scrapeLegislativeDistricts.ts b/scripts/firebase-admin/scrapeLegislativeDistricts.ts new file mode 100644 index 000000000..de3bf3598 --- /dev/null +++ b/scripts/firebase-admin/scrapeLegislativeDistricts.ts @@ -0,0 +1,89 @@ +import axios from "axios" +import { Timestamp } from "../../functions/src/firebase" +import { parseSecDistricts } from "../../functions/src/districts" +import type { ParsedDistrict } from "../../functions/src/districts" +import { currentGeneralCourt } from "../../functions/src/shared" +import type { Script } from "./types" + +const sources = [ + { + branch: "Senate" as const, + expectedCount: 40, + url: "https://www.sec.state.ma.us/divisions/elections/voting-information/district/2022-senatorial.htm" + }, + { + branch: "House" as const, + expectedCount: 160, + url: "https://www.sec.state.ma.us/divisions/elections/voting-information/district/2022-representative.htm" + } +] + +async function fetchHtml(url: string) { + const response = await axios.get(url, { + headers: { + "User-Agent": + "Mozilla/5.0 (compatible; MAPLE district backfill; +https://mapletestimony.org)" + }, + responseType: "text", + timeout: 60_000 + }) + return response.data +} + +function logCollisions(districts: ParsedDistrict[]) { + const byId = new Map() + districts.forEach(district => { + byId.set(district.id, [...(byId.get(district.id) ?? []), district]) + }) + + Array.from(byId.entries()) + .filter(([, matchingDistricts]) => matchingDistricts.length > 1) + .forEach(([id, matchingDistricts]) => { + console.warn( + `District normalization collision for ${id}: ${matchingDistricts + .map(district => district.sourceDistrict) + .join(", ")}` + ) + }) +} + +export const script: Script = async ({ db }) => { + const districts = ( + await Promise.all( + sources.map(async source => { + const html = await fetchHtml(source.url) + const parsed = parseSecDistricts(html, { + branch: source.branch, + sourceUrl: source.url + }) + + if (parsed.length !== source.expectedCount) { + throw Error( + `Expected ${source.expectedCount} ${source.branch} districts, parsed ${parsed.length}` + ) + } + + console.log(`Parsed ${parsed.length} ${source.branch} districts`) + return parsed + }) + ) + ).flat() + + logCollisions(districts) + + const writer = db.bulkWriter() + const fetchedAt = Timestamp.now() + + districts.forEach(district => { + writer.set( + db.doc(`/generalCourts/${currentGeneralCourt}/districts/${district.id}`), + { ...district, fetchedAt }, + { merge: true } + ) + }) + + await writer.close() + console.log( + `Upserted ${districts.length} districts for general court ${currentGeneralCourt}` + ) +} From 43d66207426862ebc0ff0542b0afb68620a50a22 Mon Sep 17 00:00:00 2001 From: Sean Moss Date: Tue, 19 May 2026 22:24:06 -0400 Subject: [PATCH 2/2] Enhance SEC district parsing and testing - Added functions to read district snapshots from files and strip ignored HTML tags. - Updated `parseSecDistricts` to utilize the new function for cleaning HTML input. - Introduced tests for parsing SEC snapshots in `parseSecDistricts.test.ts` to ensure correct district counts for Senate and House. - Modified the scraping script to read HTML from local files if not fetching from the web. --- .../src/districts/parseSecDistricts.test.ts | 24 +++++++++++++++++ functions/src/districts/parseSecDistricts.ts | 7 ++++- .../scrapeLegislativeDistricts.ts | 27 ++++++++++++++++--- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/functions/src/districts/parseSecDistricts.test.ts b/functions/src/districts/parseSecDistricts.test.ts index 34bd4c8e2..e684bf648 100644 --- a/functions/src/districts/parseSecDistricts.test.ts +++ b/functions/src/districts/parseSecDistricts.test.ts @@ -1,3 +1,5 @@ +import { readFileSync } from "fs" +import path from "path" import { displayDistrictName, districtId, @@ -7,6 +9,13 @@ import { parseSecDistricts } from "./parseSecDistricts" const sourceUrl = "https://example.test/districts" +function readDistrictSnapshot(fileName: string) { + return readFileSync( + path.resolve(__dirname, "../../../districts", fileName), + "utf8" + ) +} + describe("district normalization", () => { it("normalizes district names across ordinal and punctuation variants", () => { expect(normalizeDistrictName("Ninth Hampden")).toBe("9 hampden") @@ -38,6 +47,21 @@ describe("district normalization", () => { }) describe("parseSecDistricts", () => { + it("parses the checked-in SEC snapshots", () => { + expect( + parseSecDistricts(readDistrictSnapshot("senatorial.html"), { + branch: "Senate", + sourceUrl + }) + ).toHaveLength(40) + expect( + parseSecDistricts(readDistrictSnapshot("representative.html"), { + branch: "House", + sourceUrl + }) + ).toHaveLength(160) + }) + it("parses Senate h2 district sections and preserves split municipalities", () => { const html = `

Massachusetts Senatorial Districts

diff --git a/functions/src/districts/parseSecDistricts.ts b/functions/src/districts/parseSecDistricts.ts index ff2b45c99..52c945a25 100644 --- a/functions/src/districts/parseSecDistricts.ts +++ b/functions/src/districts/parseSecDistricts.ts @@ -11,10 +11,15 @@ type ParseOptions = { function cleanText(text: string) { return text .replace(/\u00a0/g, " ") + .replace(/\s*▲?\s*Top of page\s*/gi, "") .replace(/\s+/g, " ") .trim() } +function stripIgnoredTags(html: string) { + return html.replace(//gi, "") +} + function isHouseCountyHeading(element: Element) { return ( element.tagName.toLowerCase() === "h2" && @@ -67,7 +72,7 @@ export function parseSecDistricts( html: string, options: ParseOptions ): ParsedDistrict[] { - const dom = new JSDOM(html) + const dom = new JSDOM(stripIgnoredTags(html)) const document = dom.window.document const title = Array.from(document.querySelectorAll("h1")).find(heading => /massachusetts .* districts/i.test(heading.textContent ?? "") diff --git a/scripts/firebase-admin/scrapeLegislativeDistricts.ts b/scripts/firebase-admin/scrapeLegislativeDistricts.ts index de3bf3598..97a11765a 100644 --- a/scripts/firebase-admin/scrapeLegislativeDistricts.ts +++ b/scripts/firebase-admin/scrapeLegislativeDistricts.ts @@ -1,23 +1,33 @@ import axios from "axios" +import { promises as fs } from "fs" +import path from "path" import { Timestamp } from "../../functions/src/firebase" import { parseSecDistricts } from "../../functions/src/districts" import type { ParsedDistrict } from "../../functions/src/districts" import { currentGeneralCourt } from "../../functions/src/shared" -import type { Script } from "./types" +import type { Script, ScriptContext } from "./types" const sources = [ { branch: "Senate" as const, expectedCount: 40, + fileName: "senatorial.html", url: "https://www.sec.state.ma.us/divisions/elections/voting-information/district/2022-senatorial.htm" }, { branch: "House" as const, expectedCount: 160, + fileName: "representative.html", url: "https://www.sec.state.ma.us/divisions/elections/voting-information/district/2022-representative.htm" } ] +type Source = (typeof sources)[number] + +function shouldFetch(args: ScriptContext["args"]) { + return args.fetch === true || args.argv.includes("--fetch") +} + async function fetchHtml(url: string) { const response = await axios.get(url, { headers: { @@ -30,6 +40,17 @@ async function fetchHtml(url: string) { return response.data } +async function readHtml(source: Source, args: ScriptContext["args"]) { + if (shouldFetch(args)) return fetchHtml(source.url) + + const dir = + typeof args["district-dir"] === "string" + ? args["district-dir"] + : path.resolve(process.cwd(), "districts") + + return fs.readFile(path.join(dir, source.fileName), "utf8") +} + function logCollisions(districts: ParsedDistrict[]) { const byId = new Map() districts.forEach(district => { @@ -47,11 +68,11 @@ function logCollisions(districts: ParsedDistrict[]) { }) } -export const script: Script = async ({ db }) => { +export const script: Script = async ({ db, args }) => { const districts = ( await Promise.all( sources.map(async source => { - const html = await fetchHtml(source.url) + const html = await readHtml(source, args) const parsed = parseSecDistricts(html, { branch: source.branch, sourceUrl: source.url