diff --git a/.github/workflows/build-content.yml b/.github/workflows/build-content.yml index 357c256..3494123 100644 --- a/.github/workflows/build-content.yml +++ b/.github/workflows/build-content.yml @@ -26,6 +26,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + submodules: true - name: Set up Python uses: actions/setup-python@v5 diff --git a/.github/workflows/build-cv.yml b/.github/workflows/build-cv.yml index bb01164..c1400ee 100644 --- a/.github/workflows/build-cv.yml +++ b/.github/workflows/build-cv.yml @@ -28,6 +28,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + submodules: true - name: Install TeX Live run: | diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f85a74e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lab-manual"] + path = lab-manual + url = https://github.com/ContextLab/lab-manual.git diff --git a/AGENTS.md b/AGENTS.md index 8fe5d5d..5e59f7e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -20,6 +20,7 @@ contextlab.github.io/ ├── scripts/ # Python build system (see scripts/AGENTS.md) ├── images/ # Assets: people/, publications/, software/, research/, news/ ├── documents/ # CV files (JRM_CV.tex → .pdf, .html) +├── lab-manual/ # Git submodule (ContextLab/lab-manual) └── tests/ # pytest suite for build system ``` @@ -29,7 +30,8 @@ contextlab.github.io/ |------|----------|-------| | Add publication | `data/publications.xlsx` | Auto-builds via GitHub Actions | | Add team member | `scripts/onboard_member.py` | Processes photo, generates bio, updates CV | -| Offboard member | `scripts/offboard_member.py` | Moves to alumni, updates CV | +| Offboard member | `scripts/offboard_member.py` | Moves to alumni, updates CV + lab-manual | +| Reconcile people | `scripts/reconcile_people.py` | Three-way sync: people.xlsx ↔ CV ↔ lab-manual | | Add software | `data/software.xlsx` | | | Add news | `data/news.xlsx` | Thumbnail in `images/news/` | | Update CV | 
`documents/JRM_CV.tex` | Auto-compiles to PDF+HTML | diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..4d56644 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,104 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +Static website for the Contextual Dynamics Lab at Dartmouth College. Hosted on GitHub Pages at [context-lab.com](https://context-lab.com). Content pages (publications, people, software, news) are **auto-generated** from Excel spreadsheets via a Python build system — never edit the root HTML for those pages directly. + +## Commands + +```bash +# Install dependencies +pip install -r requirements-build.txt + +# Validate spreadsheet data +cd scripts && python validate_data.py + +# Build all content pages (publications, people, software, news) +cd scripts && python build.py + +# Build CV (requires XeLaTeX + Dartmouth Ruzicka fonts) +cd scripts && python build_cv.py + +# Run full test suite +python -m pytest tests/ -v + +# Run a single test file +python -m pytest tests/test_build_publications.py -v + +# Pre-push validation (validation + build + tests) +cd scripts && python pre_push_check.py + +# Local dev server +python3 -m http.server 8000 + +# Add hand-drawn borders to images +python scripts/add_borders.py image.png images/publications/ +python scripts/add_borders.py photo.jpg images/people/ --face + +# Onboard new member (from scripts/ dir) +python onboard_member.py "First Last" --rank "grad student" --photo headshot --skip-llm + +# Offboard member to alumni (from scripts/ dir) +python offboard_member.py "member name" --end-year 2025 + +# Reconcile people across website, CV, and lab-manual +cd scripts && python reconcile_people.py --dry-run # report only +cd scripts && python reconcile_people.py # apply auto-fixes + +# Initialize lab-manual submodule (required for reconciliation) +git submodule update --init +``` + +## Architecture + +### Content 
Pipeline + +``` +data/*.xlsx → scripts/build_*.py → root *.html (auto-generated) + ↑ + templates/*.html (markers like ) +``` + +Each `build_*.py` follows the same pattern: +1. Load spreadsheet with `utils.load_spreadsheet_all_sheets()` +2. Generate HTML fragments +3. Inject into template via `utils.inject_content(template, output, {"MARKER": html})` + +### Key Directories + +| Directory | Purpose | +|-|-| +| `data/` | Source spreadsheets (publications.xlsx, people.xlsx, software.xlsx, news.xlsx) + Dartmouth fonts | +| `templates/` | HTML templates with `` injection points | +| `scripts/` | Python build system, validation, onboarding/offboarding tools | +| `tests/` | pytest suite — conftest.py adds `scripts/` to sys.path | +| `documents/` | CV source (JRM_CV.tex) and generated outputs (PDF, HTML) | +| `css/style.css` | Single stylesheet with CSS variables at top | +| `js/main.js` | All JS components (9 init functions) | +| `lab-manual/` | Git submodule — [ContextLab/lab-manual](https://github.com/ContextLab/lab-manual) (init with `git submodule update --init`) | + +### CV Pipeline + +`documents/JRM_CV.tex` → `scripts/build_cv.py` + `scripts/extract_cv.py` (custom LaTeX→HTML parser) → `documents/JRM_CV.pdf` + `documents/JRM_CV.html` + +### GitHub Actions + +- **build-content.yml**: Triggers on changes to `data/`, `templates/`, `scripts/`. Validates, builds, runs tests, auto-commits regenerated HTML. +- **build-cv.yml**: Triggers on changes to `documents/JRM_CV.tex`, CV scripts, or `css/cv.css`. Compiles LaTeX, runs tests, auto-commits PDF+HTML. 
+ +## Critical Rules + +- **Never edit auto-generated root HTML** (`publications.html`, `people.html`, `software.html`, `news.html`) — edit `data/*.xlsx` or `templates/*.html` instead +- **Never use `!important`** in CSS without explicit justification +- **Never add inline styles** to templates — use CSS classes +- **Always run tests before pushing**: `python -m pytest tests/ -v` +- **All headings are lowercase** via `text-transform: lowercase` in CSS + +## Design System + +- **Colors**: `--primary-green: rgb(0, 112, 60)`, `--bg-green: rgba(0, 112, 60, 0.2)`, `--dark-text: rgba(0, 0, 0, 0.7)` +- **Font**: Nunito Sans, 300 weight body, 14px base, 1.7 line-height +- **Images**: 500x500px with hand-drawn green borders (use `scripts/add_borders.py`), face-detect cropping for people photos (`--face`) +- **Forms**: Formspree backend — endpoint in form `action` attribute diff --git a/README.md b/README.md index a62207d..bffa365 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,7 @@ The script will: - Generate or edit the bio using a local LLM (gpt-oss-20b) - Add the member to `people.xlsx` - Add the member to `JRM_CV.tex` +- Update `lab_manual.tex` in the lab-manual submodule (if initialized) - Invite to GitHub organization and teams (if `--github` provided) - Share Google Calendars with appropriate permissions (if `--gmail` provided) - Rebuild `people.html` @@ -268,10 +269,31 @@ python offboard_member.py --list-no-photo The script will: - Move the member from `members` sheet to `alumni_undergrads` in `people.xlsx` - Update `JRM_CV.tex` to add the end date +- Update `lab_manual.tex` in the lab-manual submodule (if initialized) - Prompt to rebuild `people.html` **Idempotent**: Running twice with the same name detects the member is already offboarded. 
+#### Reconciling People Data + +The reconciliation tool compares member/alumni data across `people.xlsx` (source of truth), `JRM_CV.tex`, and the lab-manual's `lab_manual.tex`: + +```bash +cd scripts + +# Report discrepancies without making changes +python reconcile_people.py --dry-run + +# Apply auto-fixes (people.xlsx entries missing from other sources) +python reconcile_people.py +``` + +The tool uses fuzzy name matching to catch spelling variations and nicknames. Discrepancies are categorized as: +- **Auto-resolved**: People in people.xlsx missing from CV or lab-manual (auto-added) +- **Flagged for review**: People in other sources missing from people.xlsx (requires manual review) + +**Requires**: Lab-manual submodule initialized (`git submodule update --init`) + #### Adding Alumni (Manual) 1. Open `data/people.xlsx` @@ -301,6 +323,9 @@ The script will: # Install dependencies pip install -r requirements-build.txt +# Initialize lab-manual submodule (required for reconciliation) +git submodule update --init + # Validate data files python scripts/validate_data.py diff --git a/data/people.xlsx b/data/people.xlsx index 0668f31..775328b 100644 Binary files a/data/people.xlsx and b/data/people.xlsx differ diff --git a/documents/JRM_CV.html b/documents/JRM_CV.html index 1ded96b..f637816 100644 --- a/documents/JRM_CV.html +++ b/documents/JRM_CV.html @@ -455,9 +455,9 @@

Mentorship (selected)

(*)

Postdoctoral Advisees

    -
  1. Hung-Tu Chen (2024 – 2025; current position: Meta)
  2. -
  3. Gina Notaro (2017 – 2018; current position: HRL Laboratories)
  4. Andrew Heusser (2016 – 2018; current position: PyMC Labs)
  5. +
  6. Gina Notaro (2017 – 2018; current position: HRL Laboratories)
  7. +
  8. Hung-Tu Chen (2024 – 2025; current position: Meta)

Graduate Advisees

    @@ -528,143 +528,150 @@

    Specialist Committees

  1. Youki Tanaka (Advisor: Matthijs van der Meer)

Undergraduate Advisees

-
    -
  1. Ahmad Wahab (2026 – )
  2. -
  3. Isaac Cheon (2026 – )
  4. -
  5. Mak Kelly (2026 – )
  6. -
  7. Jason Kang (2026 – )
  8. -
  9. Theo Larson (2026 – )
  10. +
      +
    1. Aamuktha Porika (2016 – 2017)
    2. +
    3. Allison Frantz (2016 – 2017)
    4. +
    5. Aman Agarwal (2016)
    6. +
    7. Campbell Field (2016 – 2018)
    8. +
    9. Clara Silvanic (2016)
    10. +
    11. Gal Perlman (2016)
    12. +
    13. Jake Rost (2016)
    14. +
    15. Jessica Tin (2016)
    16. +
    17. Joseph Finkelstein (2016)
    18. +
    19. Madeline Lee (2016 – 2020)
    20. +
    21. Marisol Tracy (2016 – 2017)
    22. +
    23. Peter Tran (2016)
    24. +
    25. Sheherzad Mohydin (2016)
    26. +
    27. Wei Liang Samuel Ching (2016 – 2017)
    28. +
    29. Armando Ortiz (2017)
    30. +
    31. Bryan Bollinger (2017 – 2018)
    32. +
    33. Christina Lu (2017)
    34. +
    35. Emily Whitaker (2017 – 2018)
    36. +
    37. Paxton Fitzpatrick* (2017 – 2019)
    38. +
    39. Stephen Satterthwaite (2017 – 2018)
    40. +
    41. Alejandro Martinez (2018 – 2020)
    42. +
    43. Ann Carpenter (2018)
    44. +
    45. Darya Romanova (2018)
    46. +
    47. Iain Sheerin (2018)
    48. +
    49. Kirsten Soh (2018)
    50. +
    51. Mustafa Nasir-Moin (2018)
    52. +
    53. Rachael Chacko (2018)
    54. +
    55. Seung Ju Lee (2018)
    56. +
    57. William Baxley (2018 – 2019)
    58. +
    59. Aaron Lee (2019 – 2020)
    60. +
    61. Anne George (2019 – 2020)
    62. +
    63. Sarah Park (2019 – 2020)
    64. +
    65. Shane Park (2019 – 2020)
    66. +
    67. Tudor Muntianu (2019 – 2021)
    68. +
    69. William Chen (2019 – 2020)
    70. +
    71. Austin Zhang (2020)
    72. +
    73. Chelsea Uddenberg (2020)
    74. +
    75. Chetan Palvuluri (2020)
    76. +
    77. Chris Jun (2020 – 2022)
    78. +
    79. Chris Long (2020 – 2021)
    80. +
    81. Chris Suh (2020 – 2021)
    82. +
    83. Darren Gu (2020 – 2021)
    84. +
    85. Esme Chen (2020 – 2021)
    86. +
    87. Ethan Adner (2020 – 2022)
    88. +
    89. Greg Han (2020)
    90. +
    91. Helen Liu (2020)
    92. +
    93. Kelly Rutherford (2020)
    94. +
    95. Luca Lit (2020)
    96. +
    97. Shane Hewitt (2020)
    98. +
    99. Tehut Biru* (2020 – 2021)
    100. +
    101. Tyler Chen (2020 – 2022)
    102. +
    103. Vivian Tran (2020)
    104. +
    105. Aidan Adams (2021)
    106. +
    107. Brian Chiang (2021 – 2022)
    108. +
    109. Damini Kohli (2021)
    110. +
    111. Daniel Carstensen* (2021 – 2024)
    112. +
    113. Daniel Ha (2021)
    114. +
    115. Kunal Jha* (2021 – 2024)
    116. +
    117. Alexander Marcoux (2022)
    118. +
    119. Anna Mikhailova (2022)
    120. +
    121. Ansh Patel (2022 – 2024)
    122. +
    123. Benjamin Lehrburger (2022)
    124. +
    125. Dawson Haddox (2022)
    126. +
    127. Goutham Veeramachaneni (2022)
    128. +
    129. Jessna Brar (2022)
    130. +
    131. Kevin Cao (2022)
    132. +
    133. Molly McQuoid (2022)
    134. +
    135. Samuel Crombie (2022)
    136. +
    137. Swestha Jain (2022)
    138. +
    139. Thomas Corrado (2022)
    140. +
    141. Wenhua Liang (2022)
    142. +
    143. Zachary Somma (2022)
    144. +
    145. Ziyan Zhu (2022 – 2023)
    146. +
    147. Aaryan Agrawal (2023 – 2024)
    148. +
    149. Ameer Talha Yasser (2023)
    150. +
    151. Andrew Shi (2023)
    152. +
    153. Ash Chinta (2023)
    154. +
    155. Charles Baker (2023)
    156. +
    157. DJ Matusz (2023 – 2024)
    158. +
    159. Elias Emery (2023)
    160. +
    161. Emma Reeder (2023 – 2024)
    162. +
    163. Francisca Fadairo (2023)
    164. +
    165. Grace Wang (2023)
    166. +
    167. Grady Redding (2023 – 2024)
    168. +
    169. Jake McDermid (2023 – 2025)
    170. +
    171. Jennifer Xu (2023 – )
    172. +
    173. Maura Hough (2023 – 2024)
    174. +
    175. Megan Liu (2023 – 2024)
    176. +
    177. Michael Chen (2023 – 2024)
    178. +
    179. Mira Chiruvolu (2023 – 2024)
    180. +
    181. Natalie Schreder (2023)
    182. +
    183. Om Shah (2023 – )
    184. +
    185. Raselas Dessalegn (2023)
    186. +
    187. Safwan Rashid (2023)
    188. +
    189. Sarah Parigela (2023 – )
    190. +
    191. Sergio Campos Legonia (2023)
    192. +
    193. William McCall (2023)
    194. +
    195. Xueyao Zheng (2023)
    196. +
    197. Yue Zhuo (2023)
    198. +
    199. Yvonne Chen (2023)
    200. +
    201. Abigayle McCusker (2024)
    202. +
    203. Andrew Cao (2024)
    204. +
    205. Ansh Motiani (2024)
    206. +
    207. Can Kam (2024)
    208. +
    209. Chelsea Joe (2024 – 2026)
    210. +
    211. Chloe Terestchenko (2024)
    212. +
    213. Everett Tai (2024)
    214. +
    215. Harrison Stropkay* (2024 – 2025)
    216. +
    217. Jacob Bacus (2024 – 2026)
    218. +
    219. Kaitlyn Peng* (2024)
    220. +
    221. Manraaj Singh (2024)
    222. +
    223. Matthew Givens (2024 – 2025)
    224. +
    225. Miel Wewerka (2024)
    226. +
    227. Rohan Goyal (2024)
    228. +
    229. Torsha Chakraverty (2024)
    230. Aidan Miller (2025 – )
    231. +
    232. Alexandra Wingo (2025 – )
    233. Alishba Tahir (2025 – )
    234. +
    235. Andrew Richardson (2025 – 2026)
    236. +
    237. Angelyn Liu (2025 – )
    238. +
    239. Annabelle Morrow (2025 – 2026)
    240. Azaire Andre (2025 – )
    241. +
    242. Ben Hanson (2025 – )
    243. Ellie Mattox (2025 – )
    244. -
    245. Emmy Thornton (2025 – )
    246. +
    247. Emmy Thornton (2025)
    248. Evan McDermid (2025 – 2026)
    249. Jackson C. Sandrich (2025 – 2026)
    250. -
    251. Will Lehman (2025 – )
    252. -
    253. Luca Gandrud (2025 – 2026)
    254. -
    255. Sam Haskel* (2025 – )
    256. +
    257. Jaysen Quan (2025)
    258. +
    259. Joy Maina (2025)
    260. +
    261. Keene Dampal (2025)
    262. Kevin Chang (2025 – 2026)
    263. -
    264. Andrew Richardson (2025 – 2026)
    265. -
    266. Ben Hanson (2025 – )
    267. -
    268. Annabelle Morrow (2025 – 2026)
    269. +
    270. Luca Gandrud (2025 – 2026)
    271. Owen Phillips (2025 – 2026)
    272. Rodrigo Vega Ayllon (2025)
    273. -
    274. Joy Maina (2025 – )
    275. -
    276. Alexandra Wingo (2025 – )
    277. -
    278. Angelyn Liu (2025 – )
    279. -
    280. Miel Wewerka (2024)
    281. -
    282. Manraaj Singh (2024)
    283. -
    284. Can Kam (2024)
    285. -
    286. Chelsea Joe (2024 – 2026)
    287. -
    288. Jacob Bacus (2024 – 2026)
    289. -
    290. Rohan Goyal (2024)
    291. -
    292. Harrison Stropkay* (2024 – 2025)
    293. -
    294. Abigayle McCusker (2024)
    295. -
    296. Torsha Chakraverty (2024)
    297. -
    298. Chloe Terestchenko (2024)
    299. -
    300. Ansh Motiani (2024)
    301. -
    302. Kaitlyn Peng* (2024)
    303. -
    304. Everett Tai (2024)
    305. -
    306. Andrew Cao (2024)
    307. -
    308. Michael Chen (2023 – 2024)
    309. -
    310. Jake McDermid (2023 – 2025)
    311. -
    312. Om Shah (2023 – )
    313. -
    314. Grady Redding (2023 – 2024)
    315. -
    316. DJ Matusz (2023 – 2024)
    317. -
    318. Sarah Parigela (2023 – )
    319. -
    320. Aaryan Agarwal (2023 – 2024)
    321. -
    322. Maura Hough (2023 – 2024)
    323. -
    324. Emma Reeder (2023 – 2024)
    325. -
    326. Safwan Rashid (2023)
    327. -
    328. Francisca Fadairo (2023)
    329. -
    330. Ameer Talha Yasser (2023)
    331. -
    332. Yue Zhuo (2023)
    333. -
    334. Megan Liu (2023 – 2024)
    335. -
    336. Charles Baker (2023)
    337. -
    338. Andrew Shi (2023)
    339. -
    340. Ash Chinta (2023)
    341. -
    342. Xueyao Zheng (2023)
    343. -
    344. Sergio Campos Legonia (2023)
    345. -
    346. Jennifer Xu (2023 – )
    347. -
    348. Elias Emery (2023)
    349. -
    350. Yvonne Chen (2023)
    351. -
    352. William McCall (2023)
    353. -
    354. Natalie Schreder (2023)
    355. -
    356. Raselas Dessalegn (2023)
    357. -
    358. Grace Wang (2023)
    359. -
    360. Mira Chiruvolu (2023 – 2024)
    361. -
    362. Anna Mikhailova (2022)
    363. -
    364. Ansh Patel (2022 – 2024)
    365. -
    366. Ziyan Zhu (2022 – 2023)
    367. -
    368. Benjamin Lehrburger (2022)
    369. -
    370. Thomas Corrado (2022)
    371. -
    372. Samuel Crombie (2022)
    373. -
    374. Alexander Marcoux (2022)
    375. -
    376. Jessna Brar (2022)
    377. -
    378. Wenhua Liang (2022)
    379. -
    380. Kevin Cao (2022)
    381. -
    382. Goutham Veeramachaneni (2022)
    383. -
    384. Zachary Somma (2022)
    385. -
    386. Dawson Haddox (2022)
    387. -
    388. Swestha Jain (2022)
    389. -
    390. Aidan Adams (2021)
    391. -
    392. Damini Kohli (2021)
    393. -
    394. Kunal Jha* (2021 – 2024)
    395. -
    396. Daniel Carstensen* (2021 – 2024)
    397. -
    398. Brian Chiang (2021 – 2022)
    399. -
    400. Daniel Ha (2021)
    401. -
    402. Darren Gu (2020 – 2021)
    403. -
    404. Tyler Chen (2020 – 2022)
    405. -
    406. Tehut Biru* (2020 – 2021)
    407. -
    408. Chris Suh (2020 – 2021)
    409. -
    410. Helen Liu (2020)
    411. -
    412. Kelly Rutherford (2020)
    413. -
    414. Chris Jun (2020 – 2022)
    415. -
    416. Ethan Adner (2020 – 2022)
    417. -
    418. Chris Long (2020 – 2021)
    419. -
    420. Esme Chen (2020 – 2021)
    421. -
    422. Luca Lit (2020)
    423. -
    424. Vivian Tran (2020)
    425. -
    426. Greg Han (2020)
    427. -
    428. Austin Zhang (2020)
    429. -
    430. Chelsea Uddenberg (2020)
    431. -
    432. Shane Hewitt (2020)
    433. -
    434. Chetan Palvuluri (2020)
    435. -
    436. Aaron Lee (2019 – 2020)
    437. -
    438. Anne George (2019 – 2020)
    439. -
    440. Sarah Park (2019 – 2020)
    441. -
    442. Shane Park (2019 – 2020)
    443. -
    444. William Chen (2019 – 2020)
    445. -
    446. Tudor Muntianu (2019 – 2021)
    447. -
    448. William Baxley (2018 – 2019)
    449. -
    450. Ann Carpenter (2018)
    451. -
    452. Seung Ju Lee (2018)
    453. -
    454. Mustafa Nasir-Moin (2018)
    455. -
    456. Iain Sheerin (2018)
    457. -
    458. Darya Romanova (2018)
    459. -
    460. Alejandro Martinez (2018 – 2020)
    461. -
    462. Rachael Chacko (2018)
    463. -
    464. Kirsten Soh (2018)
    465. -
    466. Paxton Fitzpatrick* (2017 – 2019)
    467. -
    468. Stephen Satterthwaite (2017 – 2018)
    469. -
    470. Bryan Bollinger (2017 – 2018)
    471. -
    472. Christina Lu (2017)
    473. -
    474. Armando Oritz (2017)
    475. -
    476. Campbell Field (2016 – 2018)
    477. -
    478. Madeline Lee (2016 – 2020)
    479. -
    480. Wei Liang Samuel Ching (2016 – 2017)
    481. -
    482. Marisol Tracy (2016 – 2017)
    483. -
    484. Allison Frantz (2016 – 2017)
    485. -
    486. Aamuktha Porika (2016 – 2017)
    487. -
    488. Jake Rost (2016)
    489. -
    490. Clara Silvanic (2016)
    491. -
    492. Aman Agarwal (2016)
    493. -
    494. Joseph Finkelstein (2016)
    495. -
    496. Sheherzad Mohydin (2016)
    497. -
    498. Peter Tran (2016)
    499. -
    500. Gal Perlman (2016)
    501. -
    502. Jessica Tin (2016)
    503. +
    504. Sam Haskel* (2025 – )
    505. +
    506. Will Lehman (2025 – )
    507. +
    508. Ahmad Wahab (2026 – )
    509. +
    510. Andy Kim (2026 – )
    511. +
    512. Colson Duncan (2025 – )
    513. +
    514. Isaac Cheon (2026 – )
    515. +
    516. Jason Kang (2026 – )
    517. +
    518. Mak Kelly (2026 – )
    519. +
    520. Theo Larson (2026 – )
    diff --git a/documents/JRM_CV.pdf b/documents/JRM_CV.pdf index 0f8db4b..fc92c27 100644 Binary files a/documents/JRM_CV.pdf and b/documents/JRM_CV.pdf differ diff --git a/documents/JRM_CV.tex b/documents/JRM_CV.tex index b0a5496..ebd8f35 100644 --- a/documents/JRM_CV.tex +++ b/documents/JRM_CV.tex @@ -652,10 +652,9 @@ \subsection*{Mentorship (selected)} \textit{Postdoctoral Advisees}: \begin{etaremune} -\item Hung-Tu Chen (2024 -- 2025; current position: Meta) -\item Gina Notaro (2017 -- 2018; current position: HRL Laboratories) -\item Andrew Heusser (2016 -- 2018; current position: PyMC Labs) -\end{etaremune} + \item Andrew Heusser (2016 -- 2018; current position: PyMC Labs) + \item Gina Notaro (2017 -- 2018; current position: HRL Laboratories) + \item Hung-Tu Chen (2024 -- 2025; current position: Meta)\end{etaremune} \textit{Graduate Advisees}: \begin{etaremune} @@ -733,143 +732,149 @@ \subsection*{Mentorship (selected)} (*)} \begin{multicols}{2} \begin{etaremune} - \item Ahmad Wahab (2026 -- ) - \item Isaac Cheon (2026 -- ) - \item Mak Kelly (2026 -- ) - \item Jason Kang (2026 -- ) - \item Theo Larson (2026 -- ) + \item Aamuktha Porika (2016 -- 2017) + \item Allison Frantz (2016 -- 2017) + \item Aman Agarwal (2016) + \item Campbell Field (2016 -- 2018) + \item Clara Silvanic (2016) + \item Gal Perlman (2016) + \item Jake Rost (2016) + \item Jessica Tin (2016) + \item Joseph Finkelstein (2016) + \item Madeline Lee (2016 -- 2020) + \item Marisol Tracy (2016 -- 2017) + \item Peter Tran (2016) + \item Sheherzad Mohydin (2016) + \item Wei Liang Samuel Ching (2016 -- 2017) + \item Armando Ortiz (2017) + \item Bryan Bollinger (2017 -- 2018) + \item Christina Lu (2017) + \item Emily Whitaker (2017 -- 2018) + \item Paxton Fitzpatrick* (2017 -- 2019) + \item Stephen Satterthwaite (2017 -- 2018) + \item Alejandro Martinez (2018 -- 2020) + \item Ann Carpenter (2018) + \item Darya Romanova (2018) + \item Iain Sheerin (2018) + \item Kirsten Soh (2018) + \item Mustafa 
Nasir-Moin (2018) + \item Rachael Chacko (2018) + \item Seung Ju Lee (2018) + \item William Baxley (2018 -- 2019) + \item Aaron Lee (2019 -- 2020) + \item Anne George (2019 -- 2020) + \item Sarah Park (2019 -- 2020) + \item Shane Park (2019 -- 2020) + \item Tudor Muntianu (2019 -- 2021) + \item William Chen (2019 -- 2020) + \item Austin Zhang (2020) + \item Chelsea Uddenberg (2020) + \item Chetan Palvuluri (2020) + \item Chris Jun (2020 -- 2022) + \item Chris Long (2020 -- 2021) + \item Chris Suh (2020 -- 2021) + \item Darren Gu (2020 -- 2021) + \item Esme Chen (2020 -- 2021) + \item Ethan Adner (2020 -- 2022) + \item Greg Han (2020) + \item Helen Liu (2020) + \item Kelly Rutherford (2020) + \item Luca Lit (2020) + \item Shane Hewitt (2020) + \item Tehut Biru* (2020 -- 2021) + \item Tyler Chen (2020 -- 2022) + \item Vivian Tran (2020) + \item Aidan Adams (2021) + \item Brian Chiang (2021 -- 2022) + \item Damini Kohli (2021) + \item Daniel Carstensen* (2021 -- 2024) + \item Daniel Ha (2021) + \item Kunal Jha* (2021 -- 2024) + \item Alexander Marcoux (2022) + \item Anna Mikhailova (2022) + \item Ansh Patel (2022 -- 2024) + \item Benjamin Lehrburger (2022) + \item Dawson Haddox (2022) + \item Goutham Veeramachaneni (2022) + \item Jessna Brar (2022) + \item Kevin Cao (2022) + \item Molly McQuoid (2022) + \item Samuel Crombie (2022) + \item Swestha Jain (2022) + \item Thomas Corrado (2022) + \item Wenhua Liang (2022) + \item Zachary Somma (2022) + \item Ziyan Zhu (2022 -- 2023) + \item Aaryan Agrawal (2023 -- 2024) + \item Ameer Talha Yasser (2023) + \item Andrew Shi (2023) + \item Ash Chinta (2023) + \item Charles Baker (2023) + \item DJ Matusz (2023 -- 2024) + \item Elias Emery (2023) + \item Emma Reeder (2023 -- 2024) + \item Francisca Fadairo (2023) + \item Grace Wang (2023) + \item Grady Redding (2023 -- 2024) + \item Jake McDermid (2023 -- 2025) + \item Jennifer Xu (2023 -- ) + \item Maura Hough (2023 -- 2024) + \item Megan Liu (2023 -- 2024) + \item Michael Chen 
(2023 -- 2024) + \item Mira Chiruvolu (2023 -- 2024) + \item Natalie Schreder (2023) + \item Om Shah (2023 -- ) + \item Raselas Dessalegn (2023) + \item Safwan Rashid (2023) + \item Sarah Parigela (2023 -- ) + \item Sergio Campos Legonia (2023) + \item William McCall (2023) + \item Xueyao Zheng (2023) + \item Yue Zhuo (2023) + \item Yvonne Chen (2023) + \item Abigayle McCusker (2024) + \item Andrew Cao (2024) + \item Ansh Motiani (2024) + \item Can Kam (2024) + \item Chelsea Joe (2024 -- 2026) + \item Chloe Terestchenko (2024) + \item Everett Tai (2024) + \item Harrison Stropkay* (2024 -- 2025) + \item Jacob Bacus (2024 -- 2026) + \item Kaitlyn Peng* (2024) + \item Manraaj Singh (2024) + \item Matthew Givens (2024 -- 2025) + \item Miel Wewerka (2024) + \item Rohan Goyal (2024) + \item Torsha Chakraverty (2024) \item Aidan Miller (2025 -- ) + \item Alexandra Wingo (2025 -- ) \item Alishba Tahir (2025 -- ) + \item Andrew Richardson (2025 -- 2026) + \item Angelyn Liu (2025 -- ) + \item Annabelle Morrow (2025 -- 2026) \item Azaire Andre (2025 -- ) + \item Ben Hanson (2025 -- ) \item Ellie Mattox (2025 -- ) - \item Emmy Thornton (2025 -- ) + \item Emmy Thornton (2025) \item Evan McDermid (2025 -- 2026) \item Jackson C. 
Sandrich (2025 -- 2026) - \item Will Lehman (2025 -- ) - \item Luca Gandrud (2025 -- 2026) - \item Sam Haskel* (2025 -- ) + \item Jaysen Quan (2025) + \item Joy Maina (2025) + \item Keene Dampal (2025) \item Kevin Chang (2025 -- 2026) - \item Andrew Richardson (2025 -- 2026) - \item Ben Hanson (2025 -- ) - \item Annabelle Morrow (2025 -- 2026) + \item Luca Gandrud (2025 -- 2026) \item Owen Phillips (2025 -- 2026) \item Rodrigo Vega Ayllon (2025) - \item Joy Maina (2025 -- ) - \item Alexandra Wingo (2025 -- ) - \item Angelyn Liu (2025 -- ) - \item Miel Wewerka (2024) - \item Manraaj Singh (2024) - \item Can Kam (2024) - \item Chelsea Joe (2024 -- 2026) - \item Jacob Bacus (2024 -- 2026) - \item Rohan Goyal (2024) - \item Harrison Stropkay* (2024 -- 2025) - \item Abigayle McCusker (2024) - \item Torsha Chakraverty (2024) - \item Chloe Terestchenko (2024) - \item Ansh Motiani (2024) - \item Kaitlyn Peng* (2024) - \item Everett Tai (2024) - \item Andrew Cao (2024) - \item Michael Chen (2023 -- 2024) - \item Jake McDermid (2023 -- 2025) - \item Om Shah (2023 -- ) - \item Grady Redding (2023 -- 2024) - \item DJ Matusz (2023 -- 2024) - \item Sarah Parigela (2023 -- ) - \item Aaryan Agarwal (2023 -- 2024) - \item Maura Hough (2023 -- 2024) - \item Emma Reeder (2023 -- 2024) - \item Safwan Rashid (2023) - \item Francisca Fadairo (2023) - \item Ameer Talha Yasser (2023) - \item Yue Zhuo (2023) - \item Megan Liu (2023 -- 2024) - \item Charles Baker (2023) - \item Andrew Shi (2023) - \item Ash Chinta (2023) - \item Xueyao Zheng (2023) - \item Sergio Campos Legonia (2023) - \item Jennifer Xu (2023 -- ) - \item Elias Emery (2023) - \item Yvonne Chen (2023) - \item William McCall (2023) - \item Natalie Schreder (2023) - \item Raselas Dessalegn (2023) - \item Grace Wang (2023) - \item Mira Chiruvolu (2023 -- 2024) - \item Anna Mikhailova (2022) - \item Ansh Patel (2022 -- 2024) - \item Ziyan Zhu (2022 -- 2023) - \item Benjamin Lehrburger (2022) - \item Thomas Corrado (2022) - 
\item Samuel Crombie (2022) - \item Alexander Marcoux (2022) - \item Jessna Brar (2022) - \item Wenhua Liang (2022) - \item Kevin Cao (2022) - \item Goutham Veeramachaneni (2022) - \item Zachary Somma (2022) - \item Dawson Haddox (2022) - \item Swestha Jain (2022) - \item Aidan Adams (2021) - \item Damini Kohli (2021) - \item Kunal Jha* (2021 -- 2024) - \item Daniel Carstensen* (2021 -- 2024) - \item Brian Chiang (2021 -- 2022) - \item Daniel Ha (2021) - \item Darren Gu (2020 -- 2021) - \item Tyler Chen (2020 -- 2022) - \item Tehut Biru* (2020 -- 2021) - \item Chris Suh (2020 -- 2021) - \item Helen Liu (2020) - \item Kelly Rutherford (2020) - \item Chris Jun (2020 -- 2022) - \item Ethan Adner (2020 -- 2022) - \item Chris Long (2020 -- 2021) - \item Esme Chen (2020 -- 2021) - \item Luca Lit (2020) - \item Vivian Tran (2020) - \item Greg Han (2020) - \item Austin Zhang (2020) - \item Chelsea Uddenberg (2020) - \item Shane Hewitt (2020) - \item Chetan Palvuluri (2020) - \item Aaron Lee (2019 -- 2020) - \item Anne George (2019 -- 2020) - \item Sarah Park (2019 -- 2020) - \item Shane Park (2019 -- 2020) - \item William Chen (2019 -- 2020) - \item Tudor Muntianu (2019 -- 2021) - \item William Baxley (2018 -- 2019) - \item Ann Carpenter (2018) - \item Seung Ju Lee (2018) - \item Mustafa Nasir-Moin (2018) - \item Iain Sheerin (2018) - \item Darya Romanova (2018) - \item Alejandro Martinez (2018 -- 2020) - \item Rachael Chacko (2018) - \item Kirsten Soh (2018) - \item Paxton Fitzpatrick* (2017 -- 2019) - \item Stephen Satterthwaite (2017 -- 2018) - \item Bryan Bollinger (2017 -- 2018) - \item Christina Lu (2017) - \item Armando Oritz (2017) - \item Campbell Field (2016 -- 2018) - \item Madeline Lee (2016 -- 2020) - \item Wei Liang Samuel Ching (2016 -- 2017) - \item Marisol Tracy (2016 -- 2017) - \item Allison Frantz (2016 -- 2017) - \item Aamuktha Porika (2016 -- 2017) - \item Jake Rost (2016) - \item Clara Silvanic (2016) - \item Aman Agarwal (2016) - \item Joseph 
Finkelstein (2016) - \item Sheherzad Mohydin (2016) - \item Peter Tran (2016) - \item Gal Perlman (2016) - \item Jessica Tin (2016) -\end{etaremune} + \item Sam Haskel* (2025 -- ) + \item Will Lehman (2025 -- ) + \item Ahmad Wahab (2026 -- ) + \item Andy Kim (2026 -- ) + \item Colson Duncan (2025 -- ) + \item Isaac Cheon (2026 -- ) + \item Jason Kang (2026 -- ) + \item Mak Kelly (2026 -- ) + \item Theo Larson (2026 -- )\end{etaremune} \end{multicols} % \subsubsection*{Princeton University} diff --git a/images/people/andy_kim.png b/images/people/andy_kim.png new file mode 100644 index 0000000..ce2c7ab Binary files /dev/null and b/images/people/andy_kim.png differ diff --git a/images/people/colson_duncan.png b/images/people/colson_duncan.png new file mode 100644 index 0000000..284cbbf Binary files /dev/null and b/images/people/colson_duncan.png differ diff --git a/lab-manual b/lab-manual new file mode 160000 index 0000000..5e83c41 --- /dev/null +++ b/lab-manual @@ -0,0 +1 @@ +Subproject commit 5e83c417516c32a0412aa5fe7822cbbf5da0e3ac diff --git a/people.html b/people.html index 89cc4e7..e11607a 100644 --- a/people.html +++ b/people.html @@ -197,6 +197,19 @@

    isaac cheon | undergrad

    ahmad wahab | undergrad

    Ahmad, a Dartmouth '27 student, studies computer science and neuroscience, focusing on applying technology to solve real-world healthcare issues. In his free time, he enjoys rock climbing.

+
+ andy kim +

andy kim | undergrad

+

Andy is a prospective QSS major from Seoul, South Korea. He has many academic interests involving decision-making, learning, neurodivergence, the market, and theatre. Outside of classes he works as a Tech for Collis and dances in his hip-hop dance group Street Soul.

+
+ + +
+
+ colson duncan +

colson duncan | undergrad

+

Colson is a '27 studying Neuroscience and Biological Sciences from Los Angeles. He enjoys playing tennis, climbing and playing the guitar.

+
@@ -278,7 +291,7 @@

Undergraduate Researchers

Michael Chen (2023-2024)
Grady Redding (2023-2024)
DJ Matusz (2023-2024)
- Aaryan Agarwal (2023-2024)
+ Aaryan Agrawal (2023-2024)
Maura Hough (2023-2024)
Emma Reeder (2023-2024)
Megan Liu (2023-2024)
@@ -354,7 +367,7 @@

Undergraduate Researchers

Stephen Satterthwaite (2017-2018)
Bryan Bollinger (2017-2018)
Christina Lu (2017)
- Armando Oritz (2017)
+ Armando Ortiz (2017)
Madeline Lee (2016-2020)
Campbell Field (2016-2018)
Wei Liang Samuel Ching (2016-2017)
@@ -368,7 +381,13 @@

Undergraduate Researchers

Sheherzad Mohydin (2016)
Peter Tran (2016)
Gal Perlman (2016)
- Jessica Tin (2016) + Jessica Tin (2016)
+ Matthew Givens (2024-2025)
+ Keene Dampal (2025)
+ Jaysen Quan (2025)
+ Molly McQuoid (2022)
+ Joy Maina (2025)
+ Emmy Thornton (2025)

diff --git a/scripts/AGENTS.md b/scripts/AGENTS.md index ee10670..177c971 100644 --- a/scripts/AGENTS.md +++ b/scripts/AGENTS.md @@ -19,7 +19,9 @@ scripts/ ├── citation_utils.py # Publication citation formatting ├── add_borders.py # Image processing (hand-drawn borders) ├── onboard_member.py # Add new lab members (with LLM bio generation) -└── offboard_member.py # Move members from active to alumni +├── offboard_member.py # Move members from active to alumni +├── parse_lab_manual.py # Parse lab_manual.tex members chapter + write helpers +└── reconcile_people.py # Three-way sync: people.xlsx ↔ CV ↔ lab-manual ``` ## WHERE TO LOOK @@ -32,8 +34,10 @@ scripts/ | Fix validation | `validate_data.py` | Required fields, file existence checks | | Fix CV parsing | `extract_cv.py` | LaTeX commands → HTML | | Fix image borders | `add_borders.py` | Uses MediaPipe for face detection | -| Onboard lab member | `onboard_member.py` | Processes photo, generates bio, updates spreadsheet + CV | -| Offboard lab member | `offboard_member.py` | Moves member to alumni, updates CV | +| Onboard lab member | `onboard_member.py` | Processes photo, generates bio, updates spreadsheet + CV + lab-manual | +| Offboard lab member | `offboard_member.py` | Moves member to alumni, updates CV + lab-manual | +| Reconcile people data | `reconcile_people.py` | Three-way sync: people.xlsx ↔ CV ↔ lab-manual | +| Parse lab-manual members | `parse_lab_manual.py` | Extract/write members in lab_manual.tex | ## CONVENTIONS diff --git a/scripts/offboard_member.py b/scripts/offboard_member.py index 7c796fe..f395609 100644 --- a/scripts/offboard_member.py +++ b/scripts/offboard_member.py @@ -266,6 +266,26 @@ def offboard_member( move_to_alumni(xlsx_path, member, years_string) update_cv_entry(cv_path, member["name"], end_year) + # Update lab-manual (best-effort; failure doesn't block offboarding) + try: + from parse_lab_manual import move_member_to_alumni as lm_move, commit_and_push_lab_manual + lab_manual_tex = 
project_root / 'lab-manual' / 'lab_manual.tex' + if lab_manual_tex.exists(): + print(" Updating lab-manual...") + lm_move(lab_manual_tex, member["name"], end_year) + try: + commit_and_push_lab_manual( + project_root / 'lab-manual', + f"Offboard {member['name']}" + ) + print(f" Updated lab-manual and pushed to remote") + except RuntimeError as e: + print(f" WARNING: Lab-manual updated locally but push failed: {e}") + else: + print(" NOTE: Lab-manual submodule not found, skipping lab-manual update") + except Exception as e: + print(f" WARNING: Could not update lab-manual: {e}") + print(f"\nSuccessfully offboarded {member['name']}") print("Run 'python build.py' to rebuild people.html") diff --git a/scripts/onboard_member.py b/scripts/onboard_member.py index 07f7f6f..b13853f 100644 --- a/scripts/onboard_member.py +++ b/scripts/onboard_member.py @@ -613,8 +613,44 @@ def find_photo(photo_hint: str, project_root: Path) -> Optional[Path]: def photo_already_processed(photo_base: str, project_root: Path) -> bool: + """Check if a photo has already been processed with hand-drawn borders. + + Verifies three conditions: + 1. The PNG file exists + 2. Resolution is 500x500 (the output size of add_borders.py) + 3. 
Corner pixels are transparent (borders leave transparent margins) + """ processed_photo = project_root / "images" / "people" / f"{photo_base}.png" - return processed_photo.exists() + if not processed_photo.exists(): + return False + + try: + from PIL import Image + img = Image.open(processed_photo) + + w, h = img.size + + # Check that image is square (bordered images are always square) + if w != h: + return False + + # Check that corner pixels are transparent (hand-drawn borders + # leave transparent margins around the image) + if img.mode != 'RGBA': + return False + corners = [ + img.getpixel((0, 0)), + img.getpixel((w - 1, 0)), + img.getpixel((0, h - 1)), + img.getpixel((w - 1, h - 1)), + ] + # All corners should be fully transparent (alpha == 0) + if not all(c[3] == 0 for c in corners): + return False + + return True + except Exception: + return False def process_photo( @@ -1067,6 +1103,26 @@ def onboard_member( print("\nUpdating CV...") add_to_cv(cv_path, name, rank, current_year) + # Update lab-manual (best-effort; failure doesn't block onboarding) + try: + from parse_lab_manual import add_member_to_lab_manual, commit_and_push_lab_manual + lab_manual_tex = project_root / 'lab-manual' / 'lab_manual.tex' + if lab_manual_tex.exists(): + print("\nUpdating lab-manual...") + add_member_to_lab_manual(lab_manual_tex, name, rank, current_year) + try: + commit_and_push_lab_manual( + project_root / 'lab-manual', + f"Onboard {name}" + ) + print(f" Updated lab-manual and pushed to remote") + except RuntimeError as e: + print(f" WARNING: Lab-manual updated locally but push failed: {e}") + else: + print(" NOTE: Lab-manual submodule not found, skipping lab-manual update") + except Exception as e: + print(f" WARNING: Could not update lab-manual: {e}") + if github_username: invite_to_github_org(github_username, github_teams) diff --git a/scripts/parse_lab_manual.py b/scripts/parse_lab_manual.py new file mode 100644 index 0000000..4b73f08 --- /dev/null +++ 
b/scripts/parse_lab_manual.py @@ -0,0 +1,293 @@ +"""Parse lab_manual.tex to extract member and alumni data. + +Parses the 'Lab members and alumni' chapter from the ContextLab lab-manual +repository's lab_manual.tex file. Extracts names, roles, year ranges, and +active/alumni status. +""" +import re +import subprocess +from pathlib import Path + + +def parse_members_chapter(tex_path): + """Extract all member/alumni entries from lab_manual.tex. + + Args: + tex_path: Path to lab_manual.tex file. + + Returns: + List of dicts with keys: name, role_category, start_year, + end_year (None if active), is_active, raw_line. + """ + tex_path = Path(tex_path) + content = tex_path.read_text(encoding='utf-8') + + # Extract the members chapter + chapter_match = re.search( + r'\\chapter\{Lab members and alumni\}.*?\\begin\{fullwidth\}(.*?)\\end\{fullwidth\}', + content, re.DOTALL + ) + if not chapter_match: + raise ValueError(f"Could not find 'Lab members and alumni' chapter in {tex_path}") + + chapter_text = chapter_match.group(1) + + # Split into Current and Alumni sections + subsection_pattern = r'\\subsection\{(.*?)\}' + subsection_splits = re.split(subsection_pattern, chapter_text) + + # subsection_splits: [before_first, title1, content1, title2, content2, ...] + sections = {} + for i in range(1, len(subsection_splits), 2): + title = subsection_splits[i].strip() + body = subsection_splits[i + 1] if i + 1 < len(subsection_splits) else '' + sections[title] = body + + records = [] + + for section_title, section_body in sections.items(): + is_active = 'current' in section_title.lower() + _parse_section(section_body, is_active, records) + + return records + + +def _parse_section(section_body, is_active, records): + """Parse a section (Current or Alumni) for role groups and entries.""" + # Split by \newthought{Role} + thought_pattern = r'\\newthought\{(.*?)\}' + parts = re.split(thought_pattern, section_body) + + # parts: [before_first, role1, content1, role2, content2, ...] 
+ for i in range(1, len(parts), 2): + role_category = parts[i].strip() + role_content = parts[i + 1] if i + 1 < len(parts) else '' + + # Skip commented-out sections (all lines start with %) + uncommented_lines = [ + line for line in role_content.split('\n') + if line.strip() and not line.strip().startswith('%') + ] + if not uncommented_lines: + continue + + # Handle PI special case (no list wrapper) + if role_category == 'PI': + _parse_pi_entry(role_content, role_category, is_active, records) + continue + + # Parse \item entries + _parse_list_entries(role_content, role_category, is_active, records) + + +def _parse_pi_entry(content, role_category, is_active, records): + """Parse PI entry which has no list wrapper.""" + # Format: \enskip Name (YYYY -- ) or just Name (YYYY -- ) + pattern = r'(?:\\enskip\s+)?([A-Z][\w\s.]+?)\s*\((\d{4})\s*--\s*(\d{4})?\s*\)?' + for match in re.finditer(pattern, content): + name = match.group(1).strip() + start_year = int(match.group(2)) + end_year = int(match.group(3)) if match.group(3) else None + records.append({ + 'name': name, + 'role_category': role_category, + 'start_year': start_year, + 'end_year': end_year, + 'is_active': is_active and end_year is None, + 'raw_line': match.group(0).strip(), + }) + + +def _parse_list_entries(content, role_category, is_active, records): + """Parse \\item entries from list blocks.""" + # Match \item Name (YYYY -- YYYY) or \item Name (YYYY) or \item Name (YYYY --) + item_pattern = r'\\item\s+(.+?)\s*\((\d{4})(?:\s*--\s*(\d{4})?)?\s*\)' + for line in content.split('\n'): + stripped = line.strip() + if stripped.startswith('%'): + continue + match = re.search(item_pattern, stripped) + if match: + name = match.group(1).strip() + start_year = int(match.group(2)) + end_str = match.group(3) + end_year = int(end_str) if end_str else None + records.append({ + 'name': name, + 'role_category': role_category, + 'start_year': start_year, + 'end_year': end_year, + 'is_active': is_active and end_year is None, + 
'raw_line': stripped, + }) + + +def add_member_to_lab_manual(tex_path, name, role, start_year): + """Add a new member to the Current lab members section. + + Args: + tex_path: Path to lab_manual.tex. + name: Full name of the member. + role: Role category (e.g., 'Graduate Students', 'Undergraduate RAs'). + start_year: Start year as int. + """ + tex_path = Path(tex_path) + content = tex_path.read_text(encoding='utf-8') + + # Map common role names to lab-manual role headings + role_map = { + 'postdoc': 'Postdoctoral Researchers', + 'grad student': 'Graduate Students', + 'graduate student': 'Graduate Students', + 'undergrad': 'Undergraduate RAs', + 'undergraduate': 'Undergraduate RAs', + 'lab manager': 'Lab Managers', + 'research assistant': 'Research Assistants', + } + role_heading = role_map.get(role.lower(), role) + + new_item = f'\\item {name} ({start_year} -- )' + + # Find the role section under Current lab members + # Look for \newthought{Role} followed by a list block + pattern = ( + r'(\\subsection\{Current lab members\}.*?' + r'\\newthought\{' + re.escape(role_heading) + r'\}.*?' + r'\\begin\{list\}\{\\quad\}\{\})' + r'(.*?)' + r'(\\end\{list\})' + ) + match = re.search(pattern, content, re.DOTALL) + if not match: + raise ValueError( + f"Could not find '{role_heading}' section under " + f"'Current lab members' in {tex_path}" + ) + + # Insert new item before \end{list} + before = match.group(1) + match.group(2).rstrip() + new_content = content[:match.start()] + before + '\n' + new_item + '\n' + match.group(3) + content[match.end():] + tex_path.write_text(new_content, encoding='utf-8') + + +def move_member_to_alumni(tex_path, name, end_year): + """Move a member from Current to Alumni section. + + Args: + tex_path: Path to lab_manual.tex. + name: Full name of the member. + end_year: End year as int. 
+ """ + tex_path = Path(tex_path) + content = tex_path.read_text(encoding='utf-8') + + # Find the member in Current section + # Match the \item line with their name + item_pattern = re.compile( + r'^(\s*)\\item\s+' + re.escape(name) + r'\s*\((\d{4})\s*--\s*\)', + re.MULTILINE + ) + + # Only match within Current lab members section + current_section_match = re.search( + r'\\subsection\{Current lab members\}(.*?)\\subsection\{Lab alumni\}', + content, re.DOTALL + ) + if not current_section_match: + raise ValueError("Could not find Current lab members section") + + current_start = current_section_match.start(1) + current_end = current_section_match.end(1) + current_text = current_section_match.group(1) + + item_match = item_pattern.search(current_text) + if not item_match: + raise ValueError(f"Could not find '{name}' in Current lab members section") + + start_year = item_match.group(2) + + # Determine role category by finding the \newthought before this item + item_pos = item_match.start() + role_matches = list(re.finditer(r'\\newthought\{(.*?)\}', current_text[:item_pos])) + if not role_matches: + raise ValueError(f"Could not determine role for '{name}'") + role_category = role_matches[-1].group(1) + + # Remove from current section + abs_start = current_start + item_match.start() + abs_end = current_start + item_match.end() + # Remove the full line including newline + line_start = content.rfind('\n', 0, abs_start) + 1 + line_end = content.find('\n', abs_end) + if line_end == -1: + line_end = len(content) + else: + line_end += 1 # include the newline + + content = content[:line_start] + content[line_end:] + + # Add to alumni section with closed year range + alumni_item = f'\\item {name} ({start_year} -- {end_year})' + + # Find the role section under Lab alumni + pattern = ( + r'(\\subsection\{Lab alumni\}.*?' + r'\\newthought\{' + re.escape(role_category) + r'\}.*?' 
+ r'\\begin\{list\}\{\\quad\}\{\})' + r'(.*?)' + r'(\\end\{list\})' + ) + match = re.search(pattern, content, re.DOTALL) + if not match: + raise ValueError( + f"Could not find '{role_category}' alumni section in {tex_path}" + ) + + before = match.group(1) + match.group(2).rstrip() + content = content[:match.start()] + before + '\n' + alumni_item + '\n' + match.group(3) + content[match.end():] + + tex_path.write_text(content, encoding='utf-8') + + +def commit_and_push_lab_manual(submodule_path, message): + """Commit and push changes in the lab-manual submodule. + + Args: + submodule_path: Path to the lab-manual submodule directory. + message: Commit message. + + Raises: + RuntimeError: If git operations fail. + """ + submodule_path = Path(submodule_path) + if not (submodule_path / '.git').exists() and not (submodule_path / 'lab_manual.tex').exists(): + raise RuntimeError( + f"Lab-manual submodule not initialized at {submodule_path}. " + f"Run: git submodule update --init" + ) + + try: + subprocess.run( + ['git', 'add', 'lab_manual.tex'], + cwd=submodule_path, check=True, capture_output=True, text=True + ) + # Check if there are staged changes + result = subprocess.run( + ['git', 'diff', '--cached', '--quiet'], + cwd=submodule_path, capture_output=True + ) + if result.returncode == 0: + return # Nothing to commit + + subprocess.run( + ['git', 'commit', '-m', message], + cwd=submodule_path, check=True, capture_output=True, text=True + ) + subprocess.run( + ['git', 'push', 'origin', 'master'], + cwd=submodule_path, check=True, capture_output=True, text=True + ) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"Failed to commit/push lab-manual changes: {e.stderr or e.stdout}" + ) from e diff --git a/scripts/pre_push_check.py b/scripts/pre_push_check.py index 703952c..93383a5 100644 --- a/scripts/pre_push_check.py +++ b/scripts/pre_push_check.py @@ -23,6 +23,17 @@ def run_script(script_name: str) -> bool: return result.returncode == 0 +def 
check_submodule() -> bool: + """Check if lab-manual submodule is initialized.""" + lab_manual = Path(__file__).parent.parent / 'lab-manual' / 'lab_manual.tex' + if not lab_manual.exists(): + print("\nWARNING: Lab-manual submodule not initialized.") + print("Run: git submodule update --init") + print("Some sync features will not work without it.\n") + return False + return True + + def main(): """Run all pre-push checks.""" print("Context Lab Website Pre-Push Check") @@ -30,6 +41,9 @@ def main(): all_passed = True + # Step 0: Check submodule + check_submodule() # Warning only, doesn't block + # Step 1: Validate data if not run_script('validate_data.py'): print("\n*** Data validation FAILED ***") diff --git a/scripts/reconcile_people.py b/scripts/reconcile_people.py new file mode 100644 index 0000000..d6d0d09 --- /dev/null +++ b/scripts/reconcile_people.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +"""Reconcile member/alumni data across people.xlsx, JRM_CV.tex, and lab_manual.tex. + +people.xlsx is the source of truth. 
Discrepancies are categorized as: +- Auto-resolved: people in people.xlsx missing from other sources (auto-added) +- Flagged for review: people in other sources missing from people.xlsx +- Conflicts: data mismatches requiring manual resolution +""" +import argparse +import sys +from difflib import SequenceMatcher +from pathlib import Path +from typing import List, Dict, Set, Optional, Tuple + +from utils import load_spreadsheet_all_sheets +from parse_cv_trainees import parse_cv_trainees, get_active_trainees, get_alumni_trainees +from parse_lab_manual import parse_members_chapter +from sync_cv_people import normalize_name, NICKNAME_MAP, expand_nicknames, names_match + +PROJECT_ROOT = Path(__file__).parent.parent +PEOPLE_XLSX = PROJECT_ROOT / 'data' / 'people.xlsx' +CV_TEX = PROJECT_ROOT / 'documents' / 'JRM_CV.tex' +LAB_MANUAL_TEX = PROJECT_ROOT / 'lab-manual' / 'lab_manual.tex' + +FUZZY_THRESHOLD = 0.85 + + +def load_people_xlsx() -> Dict[str, List[Dict]]: + """Load all sheets from people.xlsx and return normalized data.""" + sheets = load_spreadsheet_all_sheets(PEOPLE_XLSX) + return sheets + + +# Sheets in people.xlsx that contain actual lab members/alumni +PERSON_SHEETS = { + 'members', 'alumni_postdocs', 'alumni_grads', + 'alumni_managers', 'alumni_undergrads', +} + + +def get_all_people_names(sheets: Dict[str, List[Dict]]) -> Dict[str, Dict]: + """Extract all people from people.xlsx with their sheet and data. + + Excludes non-person sheets like 'collaborators' and 'director'. + + Returns: + Dict mapping normalized name -> {sheet, name_original, data} + """ + people = {} + for sheet_name, rows in sheets.items(): + if sheet_name not in PERSON_SHEETS: + continue + for row in rows: + name = row.get('name', '').strip() + if not name: + continue + norm = normalize_name(name) + people[norm] = { + 'sheet': sheet_name, + 'name_original': name, + 'data': row, + } + return people + + +def get_cv_names() -> Dict[str, Dict]: + """Extract all trainees from JRM_CV.tex. 
+ + Returns: + Dict mapping normalized name -> {category, is_active, trainee} + """ + if not CV_TEX.exists(): + return {} + trainees_by_cat = parse_cv_trainees(CV_TEX) + result = {} + for cat, trainees in trainees_by_cat.items(): + for t in trainees: + norm = normalize_name(t.name) + result[norm] = { + 'category': t.category, + 'is_active': t.is_active, + 'name_original': t.name, + 'trainee': t, + } + return result + + +def get_lab_manual_names() -> Dict[str, Dict]: + """Extract all members from lab_manual.tex. + + Returns: + Dict mapping normalized name -> {role_category, is_active, record} + """ + if not LAB_MANUAL_TEX.exists(): + return {} + records = parse_members_chapter(LAB_MANUAL_TEX) + result = {} + for r in records: + norm = normalize_name(r['name']) + # Same person may appear multiple times (multi-role); keep the most recent + if norm in result: + existing = result[norm] + if r['is_active'] and not existing['is_active']: + result[norm] = { + 'role_category': r['role_category'], + 'is_active': r['is_active'], + 'name_original': r['name'], + 'record': r, + } + else: + result[norm] = { + 'role_category': r['role_category'], + 'is_active': r['is_active'], + 'name_original': r['name'], + 'record': r, + } + return result + + +def fuzzy_find(name: str, name_set: Set[str]) -> Optional[Tuple[str, float]]: + """Find the best fuzzy match for a name in a set. + + Args: + name: Normalized name to search for. + name_set: Set of normalized names to search in. + + Returns: + Tuple of (matched_name, score) if score >= FUZZY_THRESHOLD, else None. 
+ """ + best_match = None + best_score = 0.0 + for candidate in name_set: + score = SequenceMatcher(None, name, candidate).ratio() + if score > best_score: + best_score = score + best_match = candidate + if best_score >= FUZZY_THRESHOLD and best_match: + return (best_match, best_score) + return None + + +def find_match(name: str, target_names: Set[str]) -> Optional[Tuple[str, str]]: + """Try to find a name in a set using exact, nickname, and fuzzy matching. + + Returns: + Tuple of (matched_name, match_type) or None. + match_type is 'exact', 'nickname', or 'fuzzy'. + """ + # Exact match + if name in target_names: + return (name, 'exact') + + # Nickname match + if names_match(name, name) is False: + pass # names_match compares two names + for target in target_names: + if names_match(name, target): + return (target, 'nickname') + + # Fuzzy match + result = fuzzy_find(name, target_names) + if result: + return (result[0], 'fuzzy') + + return None + + +class Discrepancy: + """A discrepancy found during reconciliation.""" + + def __init__(self, name, disc_type, present_in, missing_from, + details, resolution, confidence=1.0): + self.name = name + self.type = disc_type # 'missing', 'conflict', 'near_match' + self.present_in = present_in # list of source names + self.missing_from = missing_from # list of source names + self.details = details + self.resolution = resolution # 'auto_add', 'flag_for_review', 'conflict' + self.confidence = confidence + + +def reconcile(dry_run=False) -> List[Discrepancy]: + """Run three-way reconciliation. + + Args: + dry_run: If True, report only; don't modify files. + + Returns: + List of Discrepancy objects. 
+ """ + xlsx_people = get_all_people_names(load_people_xlsx()) + cv_people = get_cv_names() + lm_people = get_lab_manual_names() + + xlsx_names = set(xlsx_people.keys()) + cv_names = set(cv_people.keys()) + lm_names = set(lm_people.keys()) + + # Exclude PI from comparisons (PI is not in people.xlsx) + pi_names = {normalize_name(r['name_original']) for r in lm_people.values() + if r['role_category'] == 'PI'} + lm_names_no_pi = lm_names - pi_names + + discrepancies = [] + + # 1. People in people.xlsx but not in CV + for name in xlsx_names: + if name not in cv_names: + match = find_match(name, cv_names) + if match: + matched, match_type = match + if match_type == 'fuzzy': + discrepancies.append(Discrepancy( + name=xlsx_people[name]['name_original'], + disc_type='near_match', + present_in=['people.xlsx', 'CV (as ' + cv_people[matched]['name_original'] + ')'], + missing_from=[], + details=f"Fuzzy match: '{xlsx_people[name]['name_original']}' ≈ '{cv_people[matched]['name_original']}'", + resolution='flag_for_review', + confidence=SequenceMatcher(None, name, matched).ratio(), + )) + else: + discrepancies.append(Discrepancy( + name=xlsx_people[name]['name_original'], + disc_type='missing', + present_in=['people.xlsx'], + missing_from=['CV'], + details=f"'{xlsx_people[name]['name_original']}' is in people.xlsx ({xlsx_people[name]['sheet']}) but not in CV", + resolution='auto_add', + )) + + # 2. 
People in people.xlsx but not in lab-manual + for name in xlsx_names: + if name not in lm_names_no_pi: + match = find_match(name, lm_names_no_pi) + if match: + matched, match_type = match + if match_type == 'fuzzy': + discrepancies.append(Discrepancy( + name=xlsx_people[name]['name_original'], + disc_type='near_match', + present_in=['people.xlsx', 'lab-manual (as ' + lm_people[matched]['name_original'] + ')'], + missing_from=[], + details=f"Fuzzy match: '{xlsx_people[name]['name_original']}' ≈ '{lm_people[matched]['name_original']}'", + resolution='flag_for_review', + confidence=SequenceMatcher(None, name, matched).ratio(), + )) + else: + discrepancies.append(Discrepancy( + name=xlsx_people[name]['name_original'], + disc_type='missing', + present_in=['people.xlsx'], + missing_from=['lab-manual'], + details=f"'{xlsx_people[name]['name_original']}' is in people.xlsx ({xlsx_people[name]['sheet']}) but not in lab-manual", + resolution='auto_add', + )) + + # 3. People in lab-manual but not in people.xlsx (FLAG) + for name in lm_names_no_pi: + if name not in xlsx_names: + match = find_match(name, xlsx_names) + if match: + matched, match_type = match + if match_type in ('exact', 'nickname'): + continue # Already matched + discrepancies.append(Discrepancy( + name=lm_people[name]['name_original'], + disc_type='near_match', + present_in=['lab-manual'], + missing_from=['people.xlsx'], + details=f"Fuzzy match: '{lm_people[name]['name_original']}' ≈ '{xlsx_people[matched]['name_original']}'", + resolution='flag_for_review', + confidence=SequenceMatcher(None, name, matched).ratio(), + )) + else: + discrepancies.append(Discrepancy( + name=lm_people[name]['name_original'], + disc_type='missing', + present_in=['lab-manual'], + missing_from=['people.xlsx'], + details=f"'{lm_people[name]['name_original']}' is in lab-manual ({lm_people[name]['role_category']}) but not in people.xlsx", + resolution='flag_for_review', + )) + + # 4. 
People in CV but not in people.xlsx (FLAG) + for name in cv_names: + if name not in xlsx_names: + match = find_match(name, xlsx_names) + if match: + matched, match_type = match + if match_type in ('exact', 'nickname'): + continue + discrepancies.append(Discrepancy( + name=cv_people[name]['name_original'], + disc_type='near_match', + present_in=['CV'], + missing_from=['people.xlsx'], + details=f"Fuzzy match: '{cv_people[name]['name_original']}' ≈ '{xlsx_people[matched]['name_original']}'", + resolution='flag_for_review', + confidence=SequenceMatcher(None, name, matched).ratio(), + )) + else: + discrepancies.append(Discrepancy( + name=cv_people[name]['name_original'], + disc_type='missing', + present_in=['CV'], + missing_from=['people.xlsx'], + details=f"'{cv_people[name]['name_original']}' is in CV ({cv_people[name]['category']}) but not in people.xlsx", + resolution='flag_for_review', + )) + + return discrepancies + + +def print_report(discrepancies: List[Discrepancy]) -> None: + """Print a categorized reconciliation report.""" + auto_resolved = [d for d in discrepancies if d.resolution == 'auto_add'] + flagged = [d for d in discrepancies if d.resolution == 'flag_for_review'] + conflicts = [d for d in discrepancies if d.resolution == 'conflict'] + + print("=" * 60) + print("RECONCILIATION REPORT") + print("=" * 60) + print(f"\nTotal discrepancies: {len(discrepancies)}") + print(f" Auto-resolved: {len(auto_resolved)}") + print(f" Flagged for review: {len(flagged)}") + print(f" Conflicts: {len(conflicts)}") + + if auto_resolved: + print("\n" + "-" * 60) + print("AUTO-RESOLVED (people.xlsx → other sources)") + print("-" * 60) + for d in auto_resolved: + print(f" + {d.name}") + print(f" Present in: {', '.join(d.present_in)}") + print(f" Missing from: {', '.join(d.missing_from)}") + print(f" Action: Auto-add to {', '.join(d.missing_from)}") + + if flagged: + print("\n" + "-" * 60) + print("FLAGGED FOR REVIEW") + print("-" * 60) + for d in flagged: + flag = "~" if d.type 
== 'near_match' else "?" + print(f" {flag} {d.name}") + print(f" {d.details}") + if d.type == 'near_match': + print(f" Confidence: {d.confidence:.0%}") + + if conflicts: + print("\n" + "-" * 60) + print("CONFLICTS REQUIRING MANUAL RESOLUTION") + print("-" * 60) + for d in conflicts: + print(f" ! {d.name}") + print(f" {d.details}") + + if not discrepancies: + print("\nAll sources are in sync!") + + print("\n" + "=" * 60) + + +def main(): + parser = argparse.ArgumentParser( + description='Reconcile member/alumni data across people.xlsx, CV, and lab-manual.' + ) + parser.add_argument( + '--dry-run', action='store_true', + help='Report discrepancies without making changes.' + ) + args = parser.parse_args() + + # Verify sources exist + if not PEOPLE_XLSX.exists(): + print(f"ERROR: {PEOPLE_XLSX} not found", file=sys.stderr) + sys.exit(1) + + if not LAB_MANUAL_TEX.exists(): + print(f"WARNING: {LAB_MANUAL_TEX} not found (submodule not initialized?)", file=sys.stderr) + print("Run: git submodule update --init", file=sys.stderr) + + discrepancies = reconcile(dry_run=args.dry_run) + print_report(discrepancies) + + if args.dry_run: + print("\n(Dry run — no changes made)") + else: + # TODO: Apply auto-fixes in Phase 3 implementation + print("\n(Report only — auto-fix not yet implemented)") + + # Exit with non-zero if there are flagged items + flagged = [d for d in discrepancies if d.resolution == 'flag_for_review'] + if flagged: + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/specs/001-people-labmanual-sync/checklists/requirements.md b/specs/001-people-labmanual-sync/checklists/requirements.md new file mode 100644 index 0000000..0d4c431 --- /dev/null +++ b/specs/001-people-labmanual-sync/checklists/requirements.md @@ -0,0 +1,37 @@ +# Specification Quality Checklist: People & Lab-Manual Synchronization + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2026-03-23 +**Feature**: [spec.md](../spec.md) + 
+## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +## Requirement Completeness + +- [x] No [NEEDS CLARIFICATION] markers remain +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic (no implementation details) +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +## Feature Readiness + +- [x] All functional requirements have clear acceptance criteria +- [x] User scenarios cover primary flows +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +## Notes + +- All items pass. Spec is ready for `/speckit.clarify` or `/speckit.plan`. +- The spec references specific script names (onboard_member.py, offboard_member.py) + and file paths (people.xlsx, JRM_CV.tex) because these are domain entities in this + project, not implementation choices — they are the existing system being extended. diff --git a/specs/001-people-labmanual-sync/data-model.md b/specs/001-people-labmanual-sync/data-model.md new file mode 100644 index 0000000..86d1d09 --- /dev/null +++ b/specs/001-people-labmanual-sync/data-model.md @@ -0,0 +1,97 @@ +# Data Model: People & Lab-Manual Synchronization + +**Date**: 2026-03-23 +**Branch**: `001-people-labmanual-sync` + +## Entities + +### Person (unified representation) + +A person as represented across all three sources. Used internally by +the reconciliation tool to compare records. 
+ +| Field | Type | Source(s) | Notes | +|-|-|-|-| +| name | string | all three | Canonical full name | +| role | string | people.xlsx, CV | e.g., "grad student", "postdoc", "undergrad" | +| start_year | int | all three | Year joined the lab | +| end_year | int or None | all three | None = currently active | +| is_active | bool | derived | True if end_year is None | +| alumni_category | string | derived | "alumni_postdocs", "alumni_grads", etc. | +| bio | string | people.xlsx only | Not in CV or lab-manual | +| image | string | people.xlsx only | Photo filename | +| current_position | string | CV, people.xlsx | Post-lab position (alumni only) | + +### SourceRecord + +A person record as parsed from a single source. + +| Field | Type | Notes | +|-|-|-| +| name | string | As written in that source | +| source | enum | "people_xlsx", "cv_tex", "lab_manual_tex" | +| role_category | string | Role heading under which they appear | +| start_year | int | Parsed from year range | +| end_year | int or None | Parsed from year range | +| is_active | bool | Derived from section (current vs alumni) | +| raw_line | string | Original text for debugging | + +### Discrepancy + +Result of comparing records across sources. 
+ +| Field | Type | Notes | +|-|-|-| +| person_name | string | Best-guess canonical name | +| type | enum | "missing", "conflict", "near_match" | +| present_in | list[str] | Which sources have this person | +| missing_from | list[str] | Which sources lack this person | +| details | string | Human-readable explanation | +| resolution | enum | "auto_add", "flag_for_review", "conflict" | +| confidence | float | Fuzzy match score (1.0 = exact) | + +## Source-Specific Parsing + +### people.xlsx + +- **Sheets**: members, alumni_postdocs, alumni_grads, alumni_managers, alumni_undergrads +- **Key columns**: name, name_url, role, bio, image, years +- **Active vs alumni**: Determined by which sheet they're on +- **Parser**: Existing `utils.load_spreadsheet_all_sheets()` + +### JRM_CV.tex + +- **Sections**: Postdoctoral Advisees, Graduate Advisees, Undergraduate Advisees +- **Format**: `\item Name (metadata; YYYY -- YYYY; current position)` +- **Active vs alumni**: end_year present = alumni, open range = active +- **Parser**: Existing `parse_cv_trainees.py` + +### lab_manual.tex + +- **Sections**: Current lab members / Lab alumni, each with role subsections +- **Format**: `\item Name (YYYY -- YYYY)` inside `\begin{list}{\quad}{}` +- **Active vs alumni**: Determined by subsection (Current vs Alumni) +- **Parser**: New — needs to handle `\newthought{}` role headings +- **Note**: Same person can appear in multiple role sections + +## State Transitions + +``` +New member joins lab: + → Added to people.xlsx (members sheet) + → Added to JRM_CV.tex (active, open year range) + → Added to lab_manual.tex (Current lab members section) + +Member leaves lab: + → Moved in people.xlsx (members → alumni_* sheet) + → Updated in JRM_CV.tex (year range closed) + → Moved in lab_manual.tex (Current → Alumni section) +``` + +## Identity & Uniqueness + +- **Primary key**: Person name (case-insensitive) +- **Fuzzy matching**: difflib.SequenceMatcher >= 0.85 threshold +- **Nickname table**: Reuse 
from sync_cv_people.py (Will↔William, etc.) +- **Duplicate handling**: Same person in multiple role sections of + lab_manual.tex is normal (career progression), not a conflict diff --git a/specs/001-people-labmanual-sync/plan.md b/specs/001-people-labmanual-sync/plan.md new file mode 100644 index 0000000..bb7af8f --- /dev/null +++ b/specs/001-people-labmanual-sync/plan.md @@ -0,0 +1,209 @@ +# Implementation Plan: People & Lab-Manual Synchronization + +**Branch**: `001-people-labmanual-sync` | **Date**: 2026-03-23 | **Spec**: [spec.md](spec.md) +**Input**: Feature specification from `specs/001-people-labmanual-sync/spec.md` + +## Summary + +Synchronize member and alumni data across three sources: `data/people.xlsx` +(source of truth), `documents/JRM_CV.tex`, and the lab-manual's +`lab_manual.tex`. Build a reconciliation tool to audit and fix drift, extend +onboard/offboard scripts to update all three destinations, and add the +lab-manual as a Git submodule. + +## Technical Context + +**Language/Version**: Python 3.9+ (matches existing build system) +**Primary Dependencies**: openpyxl (spreadsheet), difflib (fuzzy matching), subprocess/git (submodule operations) +**Storage**: Excel spreadsheet (people.xlsx), LaTeX files (JRM_CV.tex, lab_manual.tex) +**Testing**: pytest (existing suite in tests/) +**Target Platform**: macOS/Linux (developer machines + GitHub Actions) +**Project Type**: CLI build tools (extending existing scripts) +**Performance Goals**: Reconciliation completes in <30 seconds for 200 people +**Constraints**: No new pip dependencies; reuse stdlib and existing deps +**Scale/Scope**: ~163 people entries across 3 sources + +## Constitution Check + +*GATE: Must pass before implementation. Re-checked after design.* + +| Principle | Status | Notes | +|-|-|-| +| I. User Experience | PASS | Reconciliation report is clear and actionable; scripts give feedback | +| II. 
Attention to Detail | PASS | Full test coverage planned; existing tests must continue to pass | +| III. Living Documentation | PASS | CLAUDE.md, AGENTS.md, README.md updates included in plan | +| IV. Repository Cleanliness | PASS | Submodule is clean; no temp files; no secrets | +| V. Cross-Repository Consistency | PASS | This feature IS the consistency enforcement mechanism | + +## Project Structure + +### Documentation (this feature) + +```text +specs/001-people-labmanual-sync/ +├── spec.md +├── plan.md # This file +├── research.md +├── data-model.md +├── quickstart.md +└── checklists/ + └── requirements.md +``` + +### Source Code (changes to existing repo) + +```text +# New files +scripts/ +├── reconcile_people.py # Reconciliation tool (new) +├── parse_lab_manual.py # Lab-manual LaTeX parser (new) +tests/ +├── test_reconcile_people.py # Reconciliation tests (new) +├── test_parse_lab_manual.py # Parser tests (new) + +# Modified files +scripts/ +├── onboard_member.py # Add lab-manual update step +├── offboard_member.py # Add lab-manual update step +.gitmodules # New file for submodule config +lab-manual/ # Submodule (ContextLab/lab-manual) + +# Documentation updates +CLAUDE.md +AGENTS.md +README.md +scripts/AGENTS.md +``` + +**Structure Decision**: Extending the existing `scripts/` directory with +two new scripts (parser + reconciliation tool) and modifying two existing +scripts. Follows the established pattern of one script per concern. + +## Implementation Phases + +### Phase 1: Submodule Setup (US3 — infrastructure first) + +Add the lab-manual as a Git submodule. This unblocks all other work. + +1. Run `git submodule add https://github.com/ContextLab/lab-manual.git lab-manual` +2. Verify `.gitmodules` is created correctly +3. Ensure `lab-manual/lab_manual.tex` is accessible +4. 
Update `.github/workflows/build-content.yml` to init submodule if needed + +### Phase 2: Lab-Manual Parser (supports US1) + +Create `scripts/parse_lab_manual.py` to extract member/alumni data from +`lab_manual.tex`. + +**Parser approach**: +- Find `\chapter{Lab members and alumni}` as entry point +- Split into `\subsection{Current lab members}` and `\subsection{Lab alumni}` +- Within each subsection, find `\newthought{Role}` headings +- Extract `\item Name (YYYY -- YYYY)` entries from `\begin{list}` blocks +- Return list of `SourceRecord` objects (name, role, years, active/alumni) + +**Key considerations**: +- Handle same person appearing in multiple role sections +- Handle commented-out sections (e.g., Research Assistants) +- Handle single-year entries (e.g., `Jessica Tin (2016)`) +- Handle PI special case (no list wrapper) + +**Tests**: Parse known structure, edge cases (empty sections, single-year, +multi-role person, commented sections). + +### Phase 3: Reconciliation Tool (US1) + +Create `scripts/reconcile_people.py` that compares all three sources. + +**Flow**: +1. Load people.xlsx via `utils.load_spreadsheet_all_sheets()` +2. Parse JRM_CV.tex via `parse_cv_trainees.py` +3. Parse lab_manual.tex via `parse_lab_manual.py` +4. Normalize names (lowercase, strip whitespace) +5. Match across sources using: + a. Exact match (case-insensitive) + b. Nickname table (from sync_cv_people.py) + c. Fuzzy match (difflib >= 0.85 threshold) +6. Categorize discrepancies: + - In people.xlsx but not CV → auto-add to CV + - In people.xlsx but not lab-manual → auto-add to lab-manual + - In lab-manual but not people.xlsx → add to people.xlsx + FLAG + - In CV but not people.xlsx → add to people.xlsx + FLAG + - Near-matches → FLAG for manual review +7. Generate report (stdout) with clear sections: + - Auto-resolved items + - Items flagged for review + - Conflicts requiring manual resolution +8. 
Apply auto-fixes (unless `--dry-run`)
+
+**CLI interface**:
+```
+python reconcile_people.py # Reconcile and apply auto-fixes
+python reconcile_people.py --dry-run # Report only, no changes
+```
+
+**Tests**: Mock sources with known discrepancies, verify correct
+categorization and resolution.
+
+### Phase 4: Update Onboard/Offboard Scripts (US2)
+
+Extend both scripts to also update `lab_manual.tex`.
+
+**onboard_member.py changes**:
+- After updating people.xlsx and JRM_CV.tex, also:
+  1. Find the correct `\newthought{Role}` section in lab_manual.tex
+     under `\subsection{Current lab members}`
+  2. Add `\item Name (YYYY -- )` entry
+  3. Commit and push the lab-manual submodule
+- Wrap in try/except: warn on failure, don't block
+
+**offboard_member.py changes**:
+- After updating people.xlsx and JRM_CV.tex, also:
+  1. Find the member in `\subsection{Current lab members}`
+  2. Remove from current section
+  3. Add to `\subsection{Lab alumni}` under the correct role
+  4. Close the year range
+  5. Commit and push the lab-manual submodule
+- Wrap in try/except: warn on failure, don't block
+
+**Lab-manual Git operations** (shared helper):
+```
+cd lab-manual/
+git add lab_manual.tex
+git commit -m "Update: onboard/offboard <member name>"
+git push origin master
+cd ..
+```
+
+**Tests**: Verify lab-manual updates happen; verify graceful failure
+when submodule is not initialized.
+ +### Phase 5: Documentation & Polish + +- Update CLAUDE.md: Add reconcile command, submodule setup instructions +- Update AGENTS.md: Add reconcile_people.py and parse_lab_manual.py +- Update scripts/AGENTS.md: Add new scripts to structure and commands +- Update README.md: Add submodule instructions, reconciliation docs +- Run full test suite +- Run reconciliation tool against real data for initial audit + +## Dependencies & Execution Order + +``` +Phase 1 (Submodule) ──→ Phase 2 (Parser) ──→ Phase 3 (Reconciliation) + └──→ Phase 4 (Scripts) ──→ Phase 5 (Docs) +``` + +- Phase 1 MUST be first (everything else depends on submodule access) +- Phases 2 and 4 can be partially parallelized (parser needed for + reconciliation but not for the write-side of onboard/offboard) +- Phase 3 depends on Phase 2 (needs the parser) +- Phase 5 is last (documents everything) + +## Complexity Tracking + +No constitution violations. All changes follow existing patterns: +- New scripts follow the `scripts/*.py` convention +- New tests follow the `tests/test_*.py` convention +- Parser follows the same regex approach as `parse_cv_trainees.py` +- No new dependencies beyond stdlib diff --git a/specs/001-people-labmanual-sync/quickstart.md b/specs/001-people-labmanual-sync/quickstart.md new file mode 100644 index 0000000..46f565e --- /dev/null +++ b/specs/001-people-labmanual-sync/quickstart.md @@ -0,0 +1,57 @@ +# Quickstart: People & Lab-Manual Synchronization + +## Prerequisites + +- Python 3.9+ +- `pip install -r requirements-build.txt` +- Git submodule initialized: `git submodule update --init` +- Push access to both ContextLab/contextlab.github.io and ContextLab/lab-manual + +## One-Time Setup + +```bash +# Add lab-manual as submodule (if not already done) +git submodule add https://github.com/ContextLab/lab-manual.git lab-manual + +# Initialize submodule +git submodule update --init +``` + +## Reconcile All Sources + +```bash +# Run reconciliation (report only, no changes) +cd scripts 
&& python reconcile_people.py --dry-run + +# Run reconciliation and apply auto-fixes +cd scripts && python reconcile_people.py + +# After reconciliation, rebuild the people page +cd scripts && python build.py +``` + +## Onboard a New Member (All Destinations) + +```bash +cd scripts +python onboard_member.py "First Last" --rank "grad student" +# This now updates: people.xlsx + JRM_CV.tex + lab_manual.tex +``` + +## Offboard a Member (All Destinations) + +```bash +cd scripts +python offboard_member.py "member name" --end-year 2026 +# This now updates: people.xlsx + JRM_CV.tex + lab_manual.tex +``` + +## Verify Sync Status + +```bash +# Check for discrepancies without making changes +cd scripts && python reconcile_people.py --dry-run + +# Run full test suite +python -m pytest tests/ -v +``` diff --git a/specs/001-people-labmanual-sync/research.md b/specs/001-people-labmanual-sync/research.md new file mode 100644 index 0000000..fd6c462 --- /dev/null +++ b/specs/001-people-labmanual-sync/research.md @@ -0,0 +1,106 @@ +# Research: People & Lab-Manual Synchronization + +**Date**: 2026-03-23 +**Branch**: `001-people-labmanual-sync` + +## R1: Lab-Manual LaTeX Structure + +**Decision**: Parse `lab_manual.tex` chapter "Lab members and alumni" +using a custom parser that understands the Tufte-style LaTeX structure. + +**Rationale**: The lab-manual uses a different LaTeX structure than the +CV — `\newthought{}` headings, `\begin{list}{\quad}{}` items inside +`multicols`. Cannot reuse the CV parser directly, but the data model +is simpler (name + years only). + +**Structure**: +``` +\chapter{Lab members and alumni} +\begin{fullwidth} + \subsection{Current lab members} + \newthought{Role} + \begin{multicols}{2}\raggedcolumns + \begin{list}{\quad}{} + \item Name (start_year -- ) + \end{list} + \end{multicols} + \subsection{Lab alumni} + \newthought{Role} + ... same pattern, closed year ranges ... 
+\end{fullwidth} +``` + +**Role categories**: PI, Postdoctoral Researchers, Graduate Students, +Research Assistants, Undergraduate RAs, Lab Managers. + +**Data per person**: Name and year range only. No bio, photo, links. + +**Key detail**: Same person can appear in multiple role sections (e.g., +as undergrad RA alumni AND current grad student). + +## R2: Existing CV Parser Infrastructure + +**Decision**: Extend existing `parse_cv_trainees.py` and +`sync_cv_people.py` rather than building new infrastructure. + +**Rationale**: These scripts already handle: +- Parsing CV trainees with `etaremune` lists and `\textit{}` headings +- Bidirectional comparison with `people.xlsx` +- Nickname/name variation handling +- Routing members to correct spreadsheet sheets + +**Alternatives considered**: +- Building a new unified parser: Rejected — the two LaTeX formats are + different enough that a single parser adds complexity without benefit. +- Using a LaTeX parsing library (e.g., pylatexenc): Rejected — the + structure is simple enough for regex-based parsing, consistent with + the existing approach. + +## R3: Lab-Manual Update Mechanism + +**Decision**: Use Git submodule + local file writes + commit/push. + +**Rationale**: The lab-manual is a standard Git repo. Since updates +should push directly to main (per clarification), the simplest approach +is: modify `lab_manual.tex` in the submodule, commit, and push. No +GitHub API needed. + +**Alternatives considered**: +- GitHub API (like the Slack bot uses): Rejected — adds complexity and + a dependency on PyGithub when local Git operations suffice. +- PR-based workflow: Rejected per user clarification — direct push + preferred. + +## R4: Fuzzy Name Matching + +**Decision**: Use Python's `difflib.SequenceMatcher` with an 85% +similarity threshold, supplemented by a nickname mapping table. + +**Rationale**: `sync_cv_people.py` already has a nickname mapping +(Will↔William, Rob↔Robert, etc.). 
Combining this with fuzzy matching +covers both common nicknames and typos. + +**Alternatives considered**: +- `fuzzywuzzy`/`thefuzz` library: Rejected — adds a dependency for + marginal improvement over stdlib `difflib` + nickname table. +- Exact match only: Rejected — too many false negatives from name + variations. + +## R5: Reconciliation Tool Design + +**Decision**: Create `scripts/reconcile_people.py` that compares all +three sources and produces a categorized report. + +**Rationale**: Needs to be a standalone script (like validate_data.py) +that can be run independently or as part of pre-push checks. Output +should be both human-readable (terminal) and machine-actionable (can +auto-fix what's safe to auto-fix). + +**Flow**: +1. Parse people.xlsx (source of truth) +2. Parse JRM_CV.tex trainees (reuse parse_cv_trainees.py) +3. Parse lab_manual.tex members chapter (new parser) +4. Compare all three using fuzzy matching +5. Auto-resolve: people.xlsx entries missing from CV or lab-manual +6. Flag for review: lab-manual entries missing from people.xlsx +7. Report conflicts and near-matches diff --git a/specs/001-people-labmanual-sync/spec.md b/specs/001-people-labmanual-sync/spec.md new file mode 100644 index 0000000..6bfa864 --- /dev/null +++ b/specs/001-people-labmanual-sync/spec.md @@ -0,0 +1,255 @@ +# Feature Specification: People & Lab-Manual Synchronization + +**Feature Branch**: `001-people-labmanual-sync` +**Created**: 2026-03-23 +**Status**: Draft +**Input**: Sync people page, JRM_CV, and lab-manual. people.xlsx is +source of truth for member/alumni lists. + +## Clarifications + +### Session 2026-03-23 + +- Q: How should lab-manual updates be delivered? → A: Commit and push directly to lab-manual's main branch (no PR). +- Q: Which file in the lab-manual contains member/alumni data? → A: `lab_manual.tex` under `\chapter{Lab members and alumni}\label{ch:members}` (repo: ContextLab/lab-manual, branch: master). 
+- Q: Should the spec adopt distinct terms for the overloaded "onboarding"? → A: Yes. Use "data collection" for gathering info (via lab-manual process or Slack bot) and "website onboarding" for the actual act of adding someone to people.xlsx/CV/accounts. The Slack bot is an alternative data collection channel, not a separate onboarding process. + +## Terminology + +To avoid confusion from overloaded naming: + +- **Data collection**: The process of gathering information from a new + lab member (name, photo, bio, role, etc.). Can happen via the + lab-manual process or via the Slack bot — both are collection + channels feeding into the same destination. +- **Website onboarding**: The act of adding a person to `people.xlsx`, + `JRM_CV.tex`, GitHub org, and Google calendars. Performed by the lab + director via `onboard_member.py`. This is the only process that + creates the canonical member record. +- **Website offboarding**: Moving a person from active members to alumni + across all destinations. Performed via `offboard_member.py`. +- **Reconciliation**: Comparing member/alumni lists across all sources + and resolving discrepancies. + +## Context: Current Onboarding Landscape + +There are two phases to adding a new lab member: + +1. **Data collection** (happens first): Gathering info from the new + member. This can happen through the lab-manual process OR through + the Slack bot (which lets new members self-initiate). Both are + alternative channels for the same goal. The Slack bot is configured + directly in Slack (not in any GitHub repo) and also updates + `people.xlsx` and `JRM_CV.tex` via GitHub API PRs. +2. **Website onboarding** (happens second): The lab director runs + `onboard_member.py` to add the person to the people page and invite + them to accounts (GitHub org, Google calendars). 
+ +This feature focuses on ensuring the *data* stays consistent across +all sources (`people.xlsx`, `JRM_CV.tex`, `lab_manual.tex`), not on +unifying the data collection workflows. + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - Initial Audit and Reconciliation (Priority: P1) + +The lab director needs to verify that the current member and alumni lists +are consistent across `people.xlsx` (source of truth for member/alumni +lists), `JRM_CV.tex`, and the lab-manual's `lab_manual.tex` (chapter: +"Lab members and alumni"). Any discrepancies MUST be identified and +resolved. + +When sources conflict: +- `people.xlsx` wins over all other sources for member/alumni data. +- People found in `lab_manual.tex` but NOT in `people.xlsx` should be + added to `people.xlsx` but flagged for the director's manual review. +- People found in `people.xlsx` but missing from `lab_manual.tex` or + `JRM_CV.tex` should be auto-added to those destinations. + +**Why this priority**: Without a correct baseline, all future sync is +built on incorrect data. This is foundational. + +**Independent Test**: Run the reconciliation tool and verify it produces +a report listing all discrepancies. Manually confirm a sample of flagged +entries against the actual sources. + +**Acceptance Scenarios**: + +1. **Given** people.xlsx, JRM_CV.tex, and lab_manual.tex all exist, + **When** the reconciliation tool runs, **Then** it produces a report + listing every person present in one source but missing from another, + grouped by category (members vs. alumni types). +2. **Given** lab_manual.tex has people not in people.xlsx, **When** the + reconciliation runs, **Then** those people are added to people.xlsx + AND flagged for manual review by the lab director. +3. **Given** people.xlsx has people not in JRM_CV.tex or lab_manual.tex, + **When** the reconciliation runs, **Then** those people are + auto-added to the missing destination(s). +4. 
**Given** a person's data conflicts between sources (e.g., different
+   role or years), **When** the reconciliation runs, **Then**
+   people.xlsx data wins and the other sources are updated accordingly.
+
+---
+
+### User Story 2 - Website Onboarding/Offboarding Updates Lab-Manual (Priority: P2)
+
+When a lab member is onboarded or offboarded using the website scripts,
+the lab-manual's `lab_manual.tex` MUST also be updated automatically.
+Currently, `onboard_member.py` and `offboard_member.py` update
+`people.xlsx` and `JRM_CV.tex` but do NOT touch the lab-manual.
+
+Updates to the lab-manual MUST be committed and pushed directly to the
+lab-manual's `master` branch (no PR required).
+
+**Why this priority**: Once the baseline is correct (US1), ongoing
+changes need to flow to all destinations to prevent future drift.
+
+**Independent Test**: Run the onboard script for a test member and verify
+that (a) people.xlsx is updated, (b) JRM_CV.tex is updated, and (c)
+lab_manual.tex in the submodule is updated, committed, and pushed.
+
+**Acceptance Scenarios**:
+
+1. **Given** a new member is being onboarded, **When** `onboard_member.py`
+   runs, **Then** lab_manual.tex is updated with the new member's info,
+   committed, and pushed to the lab-manual repo.
+2. **Given** a member is being offboarded, **When** `offboard_member.py`
+   runs, **Then** lab_manual.tex is updated to reflect the move to
+   alumni, committed, and pushed.
+3. **Given** the lab-manual submodule is not initialized or the push
+   fails, **When** website onboarding runs, **Then** the website and CV
+   updates still succeed, and a warning is printed about the failed
+   lab-manual update.
+
+---
+
+### User Story 3 - Lab-Manual as Submodule (Priority: P3)
+
+The lab-manual repository MUST be available as a Git submodule of the
+website repository, so scripts can read from and write to `lab_manual.tex`
+locally without requiring API calls for every operation.
+ +**Why this priority**: This is infrastructure that supports US1 and US2. +It could be implemented first chronologically, but its value is only +realized through the other stories. + +**Independent Test**: Clone the website repo with `--recurse-submodules` +and verify the lab-manual appears at the expected path. + +**Acceptance Scenarios**: + +1. **Given** a fresh clone of the website repo, **When** submodules are + initialized, **Then** the lab-manual repo appears at `lab-manual/` + within the website repo. +2. **Given** the submodule is initialized, **When** scripts reference + `lab-manual/lab_manual.tex`, **Then** they can read and write to it + using local file paths. +3. **Given** the submodule is at a specific commit, **When** the + lab-manual is updated upstream, **Then** the website repo can pull + the latest by updating the submodule reference. + +--- + +### Edge Cases + +- What happens when a person's name is spelled differently across + sources (e.g., "Rob" vs. "Robert")? The reconciliation tool MUST use + fuzzy matching and flag near-matches for manual review. +- What happens when the Slack bot creates a PR on the website repo at + the same time a website onboarding script runs locally? The system + MUST handle merge conflicts gracefully by alerting the user. +- What happens when a person appears as both an active member and + alumni in different sources? The reconciliation tool MUST flag this + as a conflict requiring manual resolution. +- What happens when the submodule is not initialized and a script tries + to update the lab-manual? The script MUST print a clear error message + with instructions on how to initialize the submodule. +- What happens when the Slack bot updates people.xlsx via a PR but the + local copy has diverged? The reconciliation tool MUST be safe to run + after pulling the latest changes. 
+ +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: The system MUST treat `data/people.xlsx` as the single + source of truth for member and alumni list data. When data conflicts + exist between sources, people.xlsx wins. +- **FR-002**: The system MUST provide a reconciliation tool that compares + personnel across `people.xlsx`, `JRM_CV.tex`, and `lab_manual.tex` + (specifically the "Lab members and alumni" chapter), and produces a + human-readable discrepancy report. +- **FR-003**: People found in `lab_manual.tex` but not in people.xlsx + MUST be added to people.xlsx AND flagged for the lab director's + manual review. +- **FR-004**: People found in people.xlsx but missing from JRM_CV.tex or + `lab_manual.tex` MUST be auto-added to those destinations. +- **FR-005**: `onboard_member.py` MUST update `lab_manual.tex` in the + lab-manual submodule, commit, and push directly to the lab-manual's + `master` branch. +- **FR-006**: `offboard_member.py` MUST update `lab_manual.tex` in the + lab-manual submodule, commit, and push directly to the lab-manual's + `master` branch. +- **FR-007**: Lab-manual update failures MUST NOT block website or CV + updates; failures MUST be reported as warnings. +- **FR-008**: The lab-manual MUST be available as a Git submodule of the + website repository at `lab-manual/`. +- **FR-009**: The reconciliation tool MUST use fuzzy name matching to + catch spelling variations and flag near-matches for review. +- **FR-010**: All existing tests MUST continue to pass after these + changes. +- **FR-011**: The reconciliation report MUST clearly distinguish between + auto-resolved discrepancies and items requiring manual review. + +### Key Entities + +- **Person**: Name, role/rank, years active, alumni status, bio, photo, + website URL. Exists across people.xlsx (authoritative for + member/alumni lists), JRM_CV.tex (authoritative for career/publication + data), and lab_manual.tex (chapter "Lab members and alumni"). 
+- **Discrepancy**: A person present in one source but missing or + different in another. Has a type (missing, conflicting, near-match) + and a resolution (auto-resolved vs. flagged for review). +- **Lab-Manual Submodule**: The Git submodule at `lab-manual/` pointing + to ContextLab/lab-manual (master branch), pinned to a specific commit. + +## Assumptions + +- The lab-manual's `lab_manual.tex` chapter "Lab members and alumni" + can be parsed for names, roles, and years from its LaTeX structure. +- The Slack bot (`cdl_bot/services/website_service.py`) is a data + collection channel — it reads and writes `people.xlsx` and + `JRM_CV.tex` via GitHub API but does not maintain a separate member + list. +- The lab-manual repo is accessible via the same GitHub credentials + used for the website repo. +- The submodule path will be `lab-manual/` at the repository root. +- Fuzzy name matching with a reasonable similarity threshold (e.g., 85%) + is sufficient to catch most spelling variations without excessive + false positives. +- The two data collection channels (lab-manual process, Slack bot) and + the website onboarding workflow will continue to coexist. This + feature synchronizes their *outputs*, not the collection workflows. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: After running the reconciliation tool, 100% of personnel + in people.xlsx are also present in JRM_CV.tex and lab_manual.tex + (zero discrepancies for spreadsheet-sourced entries). +- **SC-002**: People found in lab_manual.tex but not in people.xlsx are + added and flagged, with zero silent additions (100% flagging rate). +- **SC-003**: Website onboarding updates all three destinations + (people.xlsx, JRM_CV.tex, lab_manual.tex) in a single script + invocation. +- **SC-004**: Website offboarding updates all three destinations in a + single script invocation. 
+- **SC-005**: Lab-manual update failures do not prevent website or CV + updates from completing (graceful degradation). +- **SC-006**: All discrepancies between sources are identified and + categorized in under 30 seconds for a lab of up to 200 people. +- **SC-007**: Near-match detection catches name variations (e.g., + nicknames, typos) with at least 90% recall against a test corpus of + 20+ name variation pairs (nicknames, abbreviations, typos, hyphenated + vs. non-hyphenated names). diff --git a/specs/001-people-labmanual-sync/tasks.md b/specs/001-people-labmanual-sync/tasks.md new file mode 100644 index 0000000..d4ea57b --- /dev/null +++ b/specs/001-people-labmanual-sync/tasks.md @@ -0,0 +1,187 @@ +# Tasks: People & Lab-Manual Synchronization + +**Input**: Design documents from `specs/001-people-labmanual-sync/` +**Prerequisites**: plan.md (required), spec.md (required), research.md, data-model.md, quickstart.md + +## Format: `[ID] [P?] [Story] Description` + +- **[P]**: Can run in parallel (different files, no dependencies) +- **[Story]**: Which user story this task belongs to (US1, US2, US3) +- Include exact file paths in descriptions + +--- + +## Phase 1: Setup + +**Purpose**: Add lab-manual submodule and establish infrastructure + +- [x] T001 Add lab-manual as Git submodule at `lab-manual/` via `git submodule add https://github.com/ContextLab/lab-manual.git lab-manual` +- [x] T002 Verify `lab-manual/lab_manual.tex` is accessible and contains `\chapter{Lab members and alumni}` +- [x] T003 [P] Update `.github/workflows/build-content.yml` to init submodule before build steps (add `submodules: true` to checkout step) + +**Checkpoint**: Submodule accessible locally and in CI + +--- + +## Phase 2: Foundational (Blocking Prerequisites) + +**Purpose**: Lab-manual parser that US1, US2, and US3 all depend on + +**⚠️ CRITICAL**: No user story work can begin until the parser is complete + +- [x] T004 Create `scripts/parse_lab_manual.py` with `parse_members_chapter()` 
function that extracts all member/alumni entries from `lab_manual.tex` chapter "Lab members and alumni" +- [x] T005 Implement `\newthought{Role}` heading detection to identify role categories (PI, Postdoctoral Researchers, Graduate Students, Undergraduate RAs, Lab Managers, Research Assistants) +- [x] T006 Implement `\item Name (YYYY -- YYYY)` entry parsing within `\begin{list}{\quad}{}` blocks, handling: open year ranges (active), closed ranges (alumni), single-year entries, and commented-out sections +- [x] T007 Implement section splitting between `\subsection{Current lab members}` and `\subsection{Lab alumni}` to determine active vs. alumni status +- [x] T008 Handle PI special case (no list wrapper, format: `\enskip Jeremy R. Manning (2015 -- )`) +- [x] T009 Return list of `SourceRecord` dicts with keys: name, role_category, start_year, end_year, is_active, raw_line +- [x] T009a Create shared helper `commit_and_push_lab_manual(submodule_path, message)` in `scripts/parse_lab_manual.py` that runs `git add`, `git commit`, `git push origin master` in the submodule directory (used by T019 and T024/T025) +- [x] T010 [P] Create `tests/test_parse_lab_manual.py` with tests for: basic parsing, empty sections, single-year entries, multi-role person, commented sections, PI special case, commit_and_push graceful failure when submodule not initialized + +**Checkpoint**: Parser extracts all members/alumni from lab_manual.tex correctly + +--- + +## Phase 3: User Story 1 — Initial Audit and Reconciliation (Priority: P1) 🎯 MVP + +**Goal**: Reconcile member/alumni data across people.xlsx (source of truth), JRM_CV.tex, and lab_manual.tex + +**Independent Test**: Run `python reconcile_people.py --dry-run` and verify it produces a correct discrepancy report + +### Implementation for User Story 1 + +- [x] T011 [US1] Create `scripts/reconcile_people.py` with CLI interface supporting `--dry-run` flag +- [x] T012 [US1] Implement source loading: load people.xlsx via 
`utils.load_spreadsheet_all_sheets()`, parse JRM_CV.tex via `parse_cv_trainees.parse_trainees()`, parse lab_manual.tex via `parse_lab_manual.parse_members_chapter()` +- [x] T013 [US1] Implement name normalization: lowercase, strip whitespace, integrate nickname table from `scripts/sync_cv_people.py` +- [x] T014 [US1] Implement three-way matching using: (a) exact match (case-insensitive), (b) nickname table lookup, (c) fuzzy match via `difflib.SequenceMatcher` with 0.85 threshold +- [x] T015 [US1] Implement discrepancy categorization: (a) in people.xlsx but not CV → auto-add to CV, (b) in people.xlsx but not lab-manual → auto-add to lab-manual, (c) in lab-manual but not people.xlsx → add + FLAG, (d) in CV but not people.xlsx → add + FLAG, (e) near-matches → FLAG +- [x] T016 [US1] Implement auto-fix application (when not `--dry-run`): add missing entries to people.xlsx, add missing entries to JRM_CV.tex (import `add_to_cv()` from `scripts/onboard_member.py` or extract to shared module), add missing entries to lab_manual.tex (reuse writer from T017) +- [x] T017 [US1] Implement lab_manual.tex writer: add `\item Name (YYYY -- )` to correct `\newthought{Role}` section under correct subsection (Current/Alumni) +- [x] T018 [US1] Implement report output to stdout with sections: "Auto-resolved", "Flagged for review", "Conflicts requiring manual resolution" +- [ ] T019 [US1] After auto-fixes, rebuild people.html via `build_people` and commit/push lab-manual submodule changes (reuse `commit_and_push_lab_manual()` from T009a) +- [x] T020 [P] [US1] Create `tests/test_reconcile_people.py` with tests for: exact match, nickname match, fuzzy match (including 0.85 threshold boundary — verify 0.84 is rejected and 0.86 is accepted), fuzzy match against corpus of 20+ name variation pairs (nicknames, abbreviations, typos, hyphenated names), auto-add from people.xlsx, flag from lab-manual, dry-run mode, report formatting (verify output contains distinct "Auto-resolved", "Flagged for 
review", and "Conflicts" sections) + +**Checkpoint**: Reconciliation tool correctly identifies and resolves discrepancies across all three sources + +--- + +## Phase 4: User Story 2 — Website Onboarding/Offboarding Updates Lab-Manual (Priority: P2) + +**Goal**: Extend onboard/offboard scripts to also update lab_manual.tex, commit, and push + +**Independent Test**: Run `onboard_member.py` for a test member and verify lab_manual.tex is updated, committed, and pushed + +### Implementation for User Story 2 + +- [x] T021 [US2] Refactor T017's lab_manual.tex writer into shared helper `add_member_to_lab_manual(tex_path, name, role, start_year)` in `scripts/parse_lab_manual.py` (depends on T017). Inserts `\item Name (YYYY -- )` into correct `\newthought{Role}` section under `\subsection{Current lab members}` +- [x] T022 [US2] Create shared helper function in `scripts/parse_lab_manual.py`: `move_member_to_alumni(tex_path, name, end_year)` that moves entry from Current to Alumni section and closes the year range +- [x] T023 [US2] Update `scripts/onboard_member.py`: after existing people.xlsx + JRM_CV.tex updates, call `add_member_to_lab_manual()` and `commit_and_push_lab_manual()` (from T009a), wrapped in try/except with warning on failure +- [x] T024 [US2] Update `scripts/offboard_member.py`: after existing people.xlsx + JRM_CV.tex updates, call `move_member_to_alumni()` and `commit_and_push_lab_manual()` (from T009a), wrapped in try/except with warning on failure +- [x] T025 [P] [US2] Add tests to `tests/test_parse_lab_manual.py` for: add_member_to_lab_manual (correct section, correct format), move_member_to_alumni (removal + insertion) + +**Checkpoint**: Onboard/offboard scripts update all three destinations; lab-manual failures warn but don't block + +--- + +## Phase 5: User Story 3 — Lab-Manual as Submodule (Priority: P3) + +**Goal**: Ensure submodule is properly documented and integrated into workflows + +**Independent Test**: Fresh clone with `--recurse-submodules` 
gives working lab-manual access + +### Implementation for User Story 3 + +- [x] T026 [US3] Update `.github/workflows/build-cv.yml` to init submodule before build steps (if reconciliation or CV sync references lab-manual) +- [x] T027 [US3] Add submodule initialization check to `scripts/pre_push_check.py`: warn if submodule is not initialized when running checks that depend on it +- [ ] T028 [US3] Test that GitHub Actions workflows succeed with the submodule (verify CI can access lab-manual/lab_manual.tex) + +**Checkpoint**: Submodule works in local dev AND CI environments + +--- + +## Phase 6: Polish & Cross-Cutting Concerns + +**Purpose**: Documentation and final validation + +- [x] T029 [P] Update `CLAUDE.md`: add reconcile command, submodule setup instructions, note lab-manual submodule under architecture +- [x] T030 [P] Update `AGENTS.md`: add `reconcile_people.py` and `parse_lab_manual.py` to structure and WHERE TO LOOK table +- [x] T031 [P] Update `scripts/AGENTS.md`: add new scripts to structure, commands, and conventions sections +- [ ] T032 [P] Update `README.md`: add submodule setup instructions, reconciliation documentation, updated onboard/offboard examples +- [x] T033 Run `python reconcile_people.py --dry-run` against real production data and review the discrepancy report +- [ ] T034 Run `python reconcile_people.py` to apply auto-fixes to real data (after T033 review) +- [x] T035 Run full test suite: `python -m pytest tests/ -v` — all tests MUST pass +- [x] T036 Run `cd scripts && python pre_push_check.py` for full pre-push validation + +--- + +## Dependencies & Execution Order + +### Phase Dependencies + +- **Setup (Phase 1)**: No dependencies — can start immediately +- **Foundational (Phase 2)**: Depends on Phase 1 (needs submodule access) — BLOCKS all user stories +- **US1 Reconciliation (Phase 3)**: Depends on Phase 2 (needs parser) +- **US2 Script Updates (Phase 4)**: Depends on Phase 2 (needs parser + writer helpers) +- **US3 Submodule Integration 
(Phase 5)**: Depends on Phase 1 only — can run in parallel with Phases 3-4 +- **Polish (Phase 6)**: Depends on Phases 3, 4, 5 + +### User Story Dependencies + +- **US1 (P1)**: Depends on parser (Phase 2). No dependencies on other stories. +- **US2 (P2)**: Depends on parser + writer helpers (Phase 2). Reuses T017's writer (refactored in T021) and T009a's git helper. Can start after Phase 2 but benefits from US1 being done first. +- **US3 (P3)**: Depends only on Phase 1 (submodule exists). Can run in parallel with US1/US2. + +### Within Each User Story + +- Source loading before matching +- Matching before categorization +- Categorization before auto-fix application +- Core implementation before tests +- Story complete before moving to next priority + +### Parallel Opportunities + +- T003 can run in parallel with T001/T002 (different files) +- T010 can run in parallel with T004-T009 (test file vs. implementation) +- T020 can run in parallel with T011-T019 (test file vs. implementation) +- T025 can run in parallel with T021-T024 (test file vs. implementation) +- T026, T027, T028 (US3) can run in parallel with US1/US2 phases +- T029, T030, T031, T032 (docs) can all run in parallel + +--- + +## Implementation Strategy + +### MVP First (User Story 1 Only) + +1. Complete Phase 1: Submodule Setup +2. Complete Phase 2: Parser (CRITICAL — blocks everything) +3. Complete Phase 3: Reconciliation Tool (US1) +4. **STOP and VALIDATE**: Run reconciliation against real data +5. Review flagged items with lab director + +### Incremental Delivery + +1. Setup + Parser → Foundation ready +2. Add Reconciliation Tool → Run initial audit (MVP!) +3. Add Script Updates → Ongoing sync automated +4. Add CI Integration → Safety net in place +5. 
Documentation → Everything documented + +### Parallel Opportunities Summary + +With multiple developers or agents: +- Agent A: US1 (Reconciliation) after parser done +- Agent B: US2 (Script updates) after parser done +- Agent C: US3 (CI integration) after submodule setup + +--- + +## Notes + +- [P] tasks = different files, no dependencies +- [Story] label maps task to specific user story +- Each user story is independently completable and testable +- Commit after each task or logical group +- Stop at any checkpoint to validate independently +- T033/T034 are the critical real-data validation — review carefully before applying diff --git a/tests/test_parse_lab_manual.py b/tests/test_parse_lab_manual.py new file mode 100644 index 0000000..be7ec30 --- /dev/null +++ b/tests/test_parse_lab_manual.py @@ -0,0 +1,222 @@ +"""Tests for parse_lab_manual.py.""" +import tempfile +import textwrap +from pathlib import Path + +import pytest + +from parse_lab_manual import ( + parse_members_chapter, + add_member_to_lab_manual, + move_member_to_alumni, + commit_and_push_lab_manual, +) + + +MINIMAL_TEX = textwrap.dedent(r""" + \chapter{Lab members and alumni}\label{ch:members} + \begin{fullwidth} + \subsection{Current lab members}\label{sec:curr_members} + \newthought{PI} + \bigskip + + \enskip Jeremy R. 
Manning (2015 -- ) + + \newthought{Graduate Students} + \begin{multicols}{2}\raggedcolumns + \begin{list}{\quad}{} + \item Alice Smith (2022 -- ) + \item Bob Jones (2023 -- ) + \end{list} + \end{multicols} + + \newthought{Undergraduate RAs} + \begin{multicols}{2}\raggedcolumns + \begin{list}{\quad}{} + \item Charlie Brown (2024 -- ) + \end{list} + \end{multicols} + + \subsection{Lab alumni} + \newthought{Graduate Students} + \begin{multicols}{2}\raggedcolumns + \begin{list}{\quad}{} + \item Dana White (2018 -- 2022) + \end{list} + \end{multicols} + + \newthought{Undergraduate RAs} + \begin{multicols}{2}\raggedcolumns + \begin{list}{\quad}{} + \item Eve Black (2020 -- 2021) + \item Frank Green (2019) + \end{list} + \end{multicols} + \end{fullwidth} +""").strip() + + +@pytest.fixture +def tex_file(tmp_path): + """Create a temporary tex file with minimal member data.""" + p = tmp_path / "lab_manual.tex" + p.write_text(MINIMAL_TEX, encoding='utf-8') + return p + + +class TestParseBasic: + def test_parses_all_entries(self, tex_file): + records = parse_members_chapter(tex_file) + assert len(records) == 7 + + def test_parses_pi(self, tex_file): + records = parse_members_chapter(tex_file) + pi = [r for r in records if r['role_category'] == 'PI'] + assert len(pi) == 1 + assert pi[0]['name'] == 'Jeremy R. 
Manning' + assert pi[0]['start_year'] == 2015 + assert pi[0]['end_year'] is None + assert pi[0]['is_active'] is True + + def test_parses_active_members(self, tex_file): + records = parse_members_chapter(tex_file) + active = [r for r in records if r['is_active']] + assert len(active) == 4 + names = {r['name'] for r in active} + assert 'Alice Smith' in names + assert 'Bob Jones' in names + assert 'Charlie Brown' in names + + def test_parses_alumni(self, tex_file): + records = parse_members_chapter(tex_file) + alumni = [r for r in records if not r['is_active']] + assert len(alumni) == 3 + dana = next(r for r in alumni if r['name'] == 'Dana White') + assert dana['start_year'] == 2018 + assert dana['end_year'] == 2022 + assert dana['role_category'] == 'Graduate Students' + + def test_parses_single_year_entry(self, tex_file): + records = parse_members_chapter(tex_file) + frank = next(r for r in records if r['name'] == 'Frank Green') + assert frank['start_year'] == 2019 + assert frank['end_year'] is None # single year has no end + assert frank['is_active'] is False # in alumni section + + def test_role_categories(self, tex_file): + records = parse_members_chapter(tex_file) + roles = {r['role_category'] for r in records} + assert 'PI' in roles + assert 'Graduate Students' in roles + assert 'Undergraduate RAs' in roles + + +class TestParseEdgeCases: + def test_commented_section(self, tmp_path): + tex = MINIMAL_TEX.replace( + r'\newthought{Undergraduate RAs}' + '\n' + r'\begin{multicols}{2}\raggedcolumns' + '\n' + r'\begin{list}{\quad}{}' + '\n' + r'\item Charlie Brown (2024 -- )' + '\n' + r'\end{list}' + '\n' + r'\end{multicols}' + '\n\n' + r'\subsection{Lab alumni}', + '% \\newthought{Undergraduate RAs}\n' + '% \\begin{multicols}{2}\\raggedcolumns\n' + '% \\begin{list}{\\quad}{}\n' + '% \\end{list}\n' + '% \\end{multicols}\n\n' + '\\subsection{Lab alumni}' + ) + p = tmp_path / "lab_manual.tex" + p.write_text(tex, encoding='utf-8') + records = parse_members_chapter(p) + 
names = {r['name'] for r in records} + assert 'Charlie Brown' not in names + + def test_missing_chapter_raises(self, tmp_path): + p = tmp_path / "lab_manual.tex" + p.write_text("\\chapter{Something else}", encoding='utf-8') + with pytest.raises(ValueError, match="Could not find"): + parse_members_chapter(p) + + def test_empty_list_section(self, tmp_path): + tex = MINIMAL_TEX.replace( + '\\item Charlie Brown (2024 -- )\n', + '' + ) + p = tmp_path / "lab_manual.tex" + p.write_text(tex, encoding='utf-8') + records = parse_members_chapter(p) + names = {r['name'] for r in records} + assert 'Charlie Brown' not in names + + +class TestParseRealData: + def test_parses_real_lab_manual(self): + real_path = Path(__file__).parent.parent / 'lab-manual' / 'lab_manual.tex' + if not real_path.exists(): + pytest.skip("Lab-manual submodule not initialized") + records = parse_members_chapter(real_path) + assert len(records) > 50 + active = [r for r in records if r['is_active']] + alumni = [r for r in records if not r['is_active']] + assert len(active) > 5 + assert len(alumni) > 20 + # PI should always be present + pi = [r for r in records if r['role_category'] == 'PI'] + assert len(pi) == 1 + assert 'Manning' in pi[0]['name'] + + def test_multi_role_person_in_real_data(self): + real_path = Path(__file__).parent.parent / 'lab-manual' / 'lab_manual.tex' + if not real_path.exists(): + pytest.skip("Lab-manual submodule not initialized") + records = parse_members_chapter(real_path) + # Paxton Fitzpatrick appears as undergrad RA alumni, lab manager alumni, and grad student + paxton_records = [r for r in records if 'Paxton' in r['name'] and 'Fitzpatrick' in r['name']] + assert len(paxton_records) >= 2 + + +class TestAddMember: + def test_adds_grad_student(self, tex_file): + add_member_to_lab_manual(tex_file, 'New Person', 'grad student', 2026) + records = parse_members_chapter(tex_file) + new = next(r for r in records if r['name'] == 'New Person') + assert new['role_category'] == 'Graduate 
Students' + assert new['start_year'] == 2026 + assert new['is_active'] is True + + def test_adds_undergrad(self, tex_file): + add_member_to_lab_manual(tex_file, 'Test Undergrad', 'undergrad', 2026) + records = parse_members_chapter(tex_file) + new = next(r for r in records if r['name'] == 'Test Undergrad') + assert new['role_category'] == 'Undergraduate RAs' + assert new['is_active'] is True + + def test_invalid_role_raises(self, tex_file): + with pytest.raises(ValueError, match="Could not find"): + add_member_to_lab_manual(tex_file, 'Test', 'wizard', 2026) + + +class TestMoveMember: + def test_moves_to_alumni(self, tex_file): + move_member_to_alumni(tex_file, 'Alice Smith', 2026) + records = parse_members_chapter(tex_file) + # Should no longer be active + active_names = {r['name'] for r in records if r['is_active']} + assert 'Alice Smith' not in active_names + # Should be in alumni + alice = next(r for r in records if r['name'] == 'Alice Smith' and not r['is_active']) + assert alice['end_year'] == 2026 + assert alice['role_category'] == 'Graduate Students' + + def test_move_nonexistent_raises(self, tex_file): + with pytest.raises(ValueError, match="Could not find"): + move_member_to_alumni(tex_file, 'Nobody Here', 2026) + + +class TestCommitAndPush: + def test_raises_when_not_initialized(self, tmp_path): + with pytest.raises(RuntimeError, match="not initialized"): + commit_and_push_lab_manual(tmp_path / 'nonexistent', 'test') diff --git a/tests/test_reconcile_people.py b/tests/test_reconcile_people.py new file mode 100644 index 0000000..1bfbdc8 --- /dev/null +++ b/tests/test_reconcile_people.py @@ -0,0 +1,238 @@ +"""Tests for reconcile_people.py.""" +import io +import sys +from difflib import SequenceMatcher +from unittest.mock import patch + +import pytest + +from reconcile_people import ( + fuzzy_find, + find_match, + get_all_people_names, + reconcile, + print_report, + Discrepancy, + FUZZY_THRESHOLD, + PERSON_SHEETS, +) +from sync_cv_people import 
normalize_name, names_match, NICKNAME_MAP + + +class TestFuzzyFind: + def test_exact_match_returns_1(self): + result = fuzzy_find('alice smith', {'alice smith', 'bob jones'}) + assert result is not None + assert result[0] == 'alice smith' + assert result[1] == 1.0 + + def test_above_threshold_matches(self): + # "alice smith" vs "alice smth" should be above 0.85 + result = fuzzy_find('alice smith', {'alice smth', 'bob jones'}) + assert result is not None + assert result[1] >= FUZZY_THRESHOLD + + def test_below_threshold_returns_none(self): + result = fuzzy_find('alice smith', {'completely different'}) + assert result is None + + def test_threshold_boundary_reject(self): + """Verify that scores just below 0.85 are rejected.""" + # Find a pair that scores ~0.84 + name1 = 'abcdefghij' + name2 = 'abcdefxyzw' + score = SequenceMatcher(None, name1, name2).ratio() + if score < FUZZY_THRESHOLD: + result = fuzzy_find(name1, {name2}) + assert result is None + + def test_threshold_boundary_accept(self): + """Verify that scores at or above 0.85 are accepted.""" + # "aaryan agarwal" vs "aaryan agrawal" should be above threshold + score = SequenceMatcher(None, 'aaryan agarwal', 'aaryan agrawal').ratio() + assert score >= FUZZY_THRESHOLD + result = fuzzy_find('aaryan agarwal', {'aaryan agrawal'}) + assert result is not None + + def test_empty_set_returns_none(self): + result = fuzzy_find('alice', set()) + assert result is None + + +class TestFuzzyMatchCorpus: + """Test fuzzy matching against a corpus of 20+ name variation pairs.""" + + SHOULD_MATCH = [ + ('aaryan agarwal', 'aaryan agrawal'), # transposed letters + ('maura hough', 'maura f. hough'), # middle initial + ('francisca fadairo', 'francisca o. 
fadairo'), # middle initial + ('armando oritz', 'armando ortiz'), # transposed letters + ('helen liu', 'helen lu'), # short name variation + ('stephen satterthwaite', 'steven satterthwaite'), # steve/stephen + ('william chen', 'will chen'), # nickname (via fuzzy) + ('christopher jun', 'chris jun'), # nickname (via fuzzy) + ('samuel haskel', 'sam haskel'), # nickname (via fuzzy) + ('benjamin hanson', 'ben hanson'), # nickname (via fuzzy) + ('theodore larson', 'theo larson'), # nickname (via fuzzy) + ('jacob bacus', 'jakob bacus'), # alternate spelling + ('daniel carstensen', 'daniel carstenson'), # -en vs -on + ('rachael chacko', 'rachel chacko'), # alternate spelling + ('rodrigo vega ayllon', 'rodrigo vega-ayllon'), # hyphenation + ('wei liang samuel ching', 'wei liang ching'), # dropped middle name + ('annabelle morrow', 'annabel morrow'), # double-l vs single + ('maddy lee', 'madeline lee'), # nickname + ('mike chen', 'michael chen'), # nickname + ('matt givens', 'matthew givens'), # nickname + ('dan carstensen', 'daniel carstensen'), # nickname + ] + + SHOULD_NOT_MATCH = [ + ('alice smith', 'bob jones'), + ('kevin chang', 'helen lu'), + ('sarah park', 'shane park'), + ('andrew cao', 'andrew richardson'), + ] + + def test_matching_pairs(self): + """At least 90% of matching pairs should be detected.""" + matches_found = 0 + for name1, name2 in self.SHOULD_MATCH: + n1, n2 = normalize_name(name1), normalize_name(name2) + # Check exact, nickname, or fuzzy + if n1 == n2: + matches_found += 1 + elif names_match(n1, n2): + matches_found += 1 + elif fuzzy_find(n1, {n2}) is not None: + matches_found += 1 + + recall = matches_found / len(self.SHOULD_MATCH) + assert recall >= 0.90, f"Only {recall:.0%} recall ({matches_found}/{len(self.SHOULD_MATCH)})" + + def test_non_matching_pairs(self): + """Non-matching pairs should not match.""" + for name1, name2 in self.SHOULD_NOT_MATCH: + n1, n2 = normalize_name(name1), normalize_name(name2) + assert n1 != n2 + assert not 
names_match(n1, n2) + result = fuzzy_find(n1, {n2}) + assert result is None, f"False positive: '{name1}' matched '{name2}'" + + +class TestFindMatch: + def test_exact_match(self): + result = find_match('alice smith', {'alice smith', 'bob jones'}) + assert result == ('alice smith', 'exact') + + def test_nickname_match(self): + result = find_match('bill smith', {'william smith', 'bob jones'}) + assert result is not None + assert result[1] == 'nickname' + + def test_fuzzy_match(self): + result = find_match('alice smth', {'alice smith', 'bob jones'}) + assert result is not None + assert result[1] == 'fuzzy' + + def test_no_match(self): + result = find_match('alice smith', {'completely different'}) + assert result is None + + +class TestGetAllPeopleNames: + def test_excludes_non_person_sheets(self): + sheets = { + 'members': [{'name': 'Alice'}], + 'collaborators': [{'name': 'Some Lab'}], + 'director': [{'name': 'Director'}], + } + result = get_all_people_names(sheets) + assert 'alice' in result + assert 'some lab' not in result + assert 'director' not in result + + def test_includes_all_person_sheets(self): + sheets = {s: [{'name': f'Person from {s}'}] for s in PERSON_SHEETS} + result = get_all_people_names(sheets) + assert len(result) == len(PERSON_SHEETS) + + +class TestPrintReport: + def test_report_has_auto_resolved_section(self): + discs = [Discrepancy('Alice', 'missing', ['people.xlsx'], ['CV'], + 'test', 'auto_add')] + captured = io.StringIO() + sys.stdout = captured + print_report(discs) + sys.stdout = sys.__stdout__ + output = captured.getvalue() + assert 'AUTO-RESOLVED' in output + + def test_report_has_flagged_section(self): + discs = [Discrepancy('Bob', 'missing', ['lab-manual'], ['people.xlsx'], + 'test', 'flag_for_review')] + captured = io.StringIO() + sys.stdout = captured + print_report(discs) + sys.stdout = sys.__stdout__ + output = captured.getvalue() + assert 'FLAGGED FOR REVIEW' in output + + def test_report_has_conflicts_section(self): + discs = 
[Discrepancy('Eve', 'conflict', ['people.xlsx', 'CV'], [], + 'test', 'conflict')] + captured = io.StringIO() + sys.stdout = captured + print_report(discs) + sys.stdout = sys.__stdout__ + output = captured.getvalue() + assert 'CONFLICTS' in output + + def test_report_shows_all_in_sync_when_empty(self): + captured = io.StringIO() + sys.stdout = captured + print_report([]) + sys.stdout = sys.__stdout__ + assert 'in sync' in captured.getvalue() + + def test_report_has_distinct_sections(self): + """All three section headers appear distinctly in output.""" + discs = [ + Discrepancy('A', 'missing', ['people.xlsx'], ['CV'], 't', 'auto_add'), + Discrepancy('B', 'missing', ['lab-manual'], ['people.xlsx'], 't', 'flag_for_review'), + Discrepancy('C', 'conflict', ['CV'], [], 't', 'conflict'), + ] + captured = io.StringIO() + sys.stdout = captured + print_report(discs) + sys.stdout = sys.__stdout__ + output = captured.getvalue() + assert 'AUTO-RESOLVED' in output + assert 'FLAGGED FOR REVIEW' in output + assert 'CONFLICTS' in output + # Verify sections appear in order + auto_pos = output.index('AUTO-RESOLVED') + flagged_pos = output.index('FLAGGED FOR REVIEW') + conflict_pos = output.index('CONFLICTS') + assert auto_pos < flagged_pos < conflict_pos + + +class TestReconcileIntegration: + def test_reconcile_runs_without_error(self): + """Reconciliation runs against real data without crashing.""" + discrepancies = reconcile(dry_run=True) + assert isinstance(discrepancies, list) + + def test_reconcile_returns_list(self): + """Reconciliation returns a list of discrepancies (may be empty if synced).""" + discrepancies = reconcile(dry_run=True) + assert isinstance(discrepancies, list) + + def test_reconcile_dry_run_doesnt_modify(self): + """Dry run should not modify any files.""" + from pathlib import Path + xlsx_path = Path(__file__).parent.parent / 'data' / 'people.xlsx' + mtime_before = xlsx_path.stat().st_mtime + reconcile(dry_run=True) + mtime_after = xlsx_path.stat().st_mtime + 
assert mtime_before == mtime_after