diff --git a/CLAUDE.md b/CLAUDE.md index 241cd83..de3a48d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -121,7 +121,9 @@ These are DIFFERENT VALUES. When bundling user data for the frontend, include bo Two scripts live in `scripts/` for diagnosing and backfilling team rosters on `/hack/<event_id>`: - `audit_hackathon_team_users.py --event-id <event_id>` (read-only) — walks `hackathons/{id}.teams[] -> teams/{id}.users[]` and reports per-team member counts, dangling refs (team points to deleted user doc), and "ghost" users (no name + no propel_id = imported but never logged in). -- `import_hackathon_users_from_csv.py --csv <path> --event-id <event_id> --csv-type {registrants|projects|roster} [--apply]` — dry-run by default. `projects` parses Devpost projects CSVs (variable-length team-member triplets starting at col 22, 1-indexed); `roster` parses a generic `team,email[,first_name,last_name,name]` CSV for backfilling memberships; `registrants` just seeds user docs. Users are matched by `email_address` (case-insensitive). Imported users get `imported=True`, `import_source`, `import_event_id`, blank `user_id`/`propel_id`. Team membership writes are additive — never removes existing members. Re-runnable. +- `import_hackathon_users_from_csv.py --csv <path> --event-id <event_id> --csv-type {registrants|projects|roster} [--apply]` — dry-run by default. `projects` parses Devpost projects CSVs; the team-member triplet offset is resolved by header lookup (`Team Member 1 First Name`), since old 23-col exports have no "Team Number" column while newer 24-col ones do. Each parsed "email" is validated with the email regex — rows where the triplet shifted off-axis are skipped with a warning rather than written as bogus user docs. `roster` parses a generic `team,email[,first_name,last_name,name]` CSV for backfilling memberships; `registrants` just seeds user docs. Users are matched by `email_address` (case-insensitive). Imported users get `imported=True`, `import_source`, `import_event_id`, blank `user_id`/`propel_id`. 
Team membership writes are additive — never removes existing members. Re-runnable. +- `cleanup_bogus_imported_users.py [--event-id <event_id>] [--apply]` — finds and removes the user docs left behind by the older off-by-one `parse_projects` bug. Fingerprint: `imported=True` AND `propel_id=""` AND `email_address` present but not a valid email AND `import_source` starts with `projects-`. For each matched user it prunes the doc-ref from every team's `users[]` that references it, then deletes the user doc. Dry-run by default. After running, re-run `import_hackathon_users_from_csv.py --csv-type projects` against the affected events to import the real members. +- `backfill_devpost_winners.py --event-id <event_id> --devpost-url <url> [--projects-csv <path>] [--apply]` — scrapes the Devpost project gallery for EVERY project tile, flagging winners (`aside.entry-badge img.winner`). For each project it matches to a Firestore team via a layered strategy: `teams.devpost_link` exact-URL → team name (case-insensitive) → email-overlap via Devpost projects CSV (auto-discovered from `/tmp/devpost_files/<event_id>/projects-*.csv`). Two backfills happen in one pass: (1) any matched team with an empty `devpost_link` gets the project's Devpost URL written; (2) matched WINNERS additionally get `/software/<slug>` fetched for prize text + member names, with prize strings mapped to status — "1st place" → `FOUNDING_ENGINEERS`, "Completion" or "2nd place" → `COMPLETION_SUPPORT`, anything else marked Winner → `CATEGORY_WINNER` (rank-based; multi-prize teams get the best status, all prize text retained in `awards: []`). Conflicts (team already has a different `devpost_link`) are logged but never overwritten. Unmatched winners exit with code 2 so a human notices; unmatched non-winners are listed for visibility but don't fail the run (typical for teams that registered only on Devpost). Only sets `status`, `awards`, `winners_backfilled_at/source`, and `devpost_link`; never touches `users[]`. Re-runnable. Adds `beautifulsoup4` to requirements. 
## Resend audience sync `scripts/sync_resend_audience.py --source {all|profiles|volunteers|mentors|judges|sponsors|helpers|leads} --audience "" [--event-id ] [--selected-only] [--apply]` — pulls emails from Firestore (`users.email_address`, `volunteers.email` filtered by `volunteer_type`, `leads.email`) and upserts contacts into a Resend audience (creates if missing). Dry-run by default. Re-runnable: lists existing audience contacts first and only POSTs new emails. Needs `RESEND_API_KEY` with audiences scope — the existing `RESEND_WELCOME_EMAIL_KEY` is send-only and will 401. Uses the deprecated `resend.Audiences` SDK class (now an alias for Segments) — fine for now, but if it breaks switch to `resend.Segments`. diff --git a/Dockerfile b/Dockerfile index 413811d..2568b09 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,4 +24,4 @@ ENV GUNICORN_CMD_ARGS="--bind=[::]:6060 --workers=2" # Copy project COPY . /app/ # Run the application -CMD ["venv/bin/gunicorn", "api.wsgi:app", "--log-file=-", "--log-level", "debug", "--preload", "--workers", "1"] \ No newline at end of file +CMD ["venv/bin/gunicorn", "api.wsgi:app", "--log-file=-", "--log-level", "debug", "--preload", "--workers", "1", "--timeout", "120"] \ No newline at end of file diff --git a/api/messages/messages_service.py b/api/messages/messages_service.py index 62ab73d..5c8cbb2 100644 --- a/api/messages/messages_service.py +++ b/api/messages/messages_service.py @@ -335,18 +335,71 @@ def get_profile_metadata_old(propel_id): } +# Fields returned to the admin /admin/profiles consumers (page + UserSearchDialog). +# Keep this in sync with frontend src/pages/admin/profile/index.js and +# src/components/admin/UserSearchDialog.js. Drop anything heavy (history) or +# unused (mailing address, propel_id, want_stickers) — those routes have their +# own /profile/ fetch when a row is opened. 
+_ADMIN_PROFILE_LEAN_FIELDS = ( + "name", + "nickname", + "email_address", + "user_id", + "profile_image", + "last_login", + "github", + "linkedin_url", + "instagram_url", + "company", + "education", + "role", + "shirt_size", + "expertise", + "why", +) + + +def _lean_admin_profile(doc): + """Project a Firestore user doc into the lean shape the admin search uses. + + Resolves DocumentReference lists (badges/teams/hackathons) to id strings + inline, since the frontend only reads `.length` on these arrays. Avoids + `doc_to_json`'s broader behavior and the heavy `history` field entirely. + """ + d = doc.to_dict() or {} + out = {"id": doc.id} + for key in _ADMIN_PROFILE_LEAN_FIELDS: + v = d.get(key) + if v is not None: + out[key] = v + + for ref_key in ("badges", "teams", "hackathons"): + value = d.get(ref_key) + if isinstance(value, list): + out[ref_key] = [ + v.id if isinstance(v, firestore.DocumentReference) else v + for v in value + ] + + vol = d.get("volunteering") + if isinstance(vol, list): + out["volunteering"] = [ + {"hours": v.get("hours", 0)} + for v in vol if isinstance(v, dict) + ] + + return out + + +# 5-minute TTL is enough to absorb tab refreshes / multiple admins loading the +# page in close succession while still picking up new signups within minutes. 
+@cached(cache=TTLCache(maxsize=1, ttl=300), key=lambda: "all") def get_all_profiles(): db = get_db() - docs = db.collection('users').stream() # steam() gets all records - if docs is None: - return {[]} - else: - results = [] - for doc in docs: - results.append(doc_to_json(docid=doc.id, doc=doc)) - + docs = db.collection('users').stream() + results = [_lean_admin_profile(doc) for doc in docs] logger.info(f"get_all_profiles returned {len(results)} profiles") - return { "profiles": results } + return {"profiles": results} # Caching is not needed because the parent method already is caching diff --git a/api/messages/messages_views.py b/api/messages/messages_views.py index 80b78fb..eb23cdc 100644 --- a/api/messages/messages_views.py +++ b/api/messages/messages_views.py @@ -204,7 +204,10 @@ def update_hackathon(): logger.info("PATCH /hackathon called") user_id = get_authenticated_user_id() if user_id: - return vars(save_hackathon(request.get_json(), user_id)) + result = save_hackathon(request.get_json(), user_id) + if isinstance(result, tuple): + return result + return vars(result) return {"error": "Unauthorized"}, 401 diff --git a/api/teams/teams_service.py b/api/teams/teams_service.py index 17565c7..1c57053 100644 --- a/api/teams/teams_service.py +++ b/api/teams/teams_service.py @@ -3,7 +3,7 @@ from datetime import datetime from db.db import get_db, get_user_doc_reference from api.messages.messages_service import get_problem_statement_from_id_old -from services.teams_service import get_teams_list +from services.teams_service import get_teams_list, get_team from services.nonprofits_service import get_single_npo from common.utils.firestore_helpers import clear_all_caches as clear_cache from services.users_service import ( @@ -20,6 +20,31 @@ logger = logging.getLogger("myapp") +# Slack user IDs for OHack admins who are auto-invited to every team channel +# and CCed on completion broadcasts. Single source of truth for both call sites. 
+TEAM_COMPLETION_SLACK_ADMINS = [ + "UCQKX6LPR", + "U035023T81Z", + "UC31XTRT5", + "UC2JW3T3K", + "UPD90QV17", + "UEP2U69AA", +] + +# Canonical 8-item Definition of Done. Must stay in lockstep with the frontend +# COMPLETION_ITEMS array in src/components/Teams/TeamCompletionChecklist.js. +COMPLETION_ITEMS = [ + {"slug": "deployed", "label": "Deployed", "blurb": "Code is live in production (AWS, fly.io, GCP)."}, + {"slug": "nonprofit_signoff", "label": "Nonprofit Signoff", "blurb": "Your nonprofit partner agrees the software meets their needs."}, + {"slug": "login_details", "label": "Login Details for Testing", "blurb": "Test credentials shared securely (changeable later)."}, + {"slug": "code_updated", "label": "Code Updated", "blurb": "All code, README, and docs in the designated GitHub repo."}, + {"slug": "tasks_closed", "label": "Tasks Closed", "blurb": "GitHub issues/tasks closed or addressed."}, + {"slug": "sensitive_info_security", "label": "Sensitive Info Secured", "blurb": "No secrets in the repo; shared securely elsewhere."}, + {"slug": "documentation", "label": "Documentation", "blurb": "How to use, deploy, update, and configure."}, + {"slug": "open_source", "label": "Open-Sourced (MIT)", "blurb": "Repo is public under MIT."}, +] +COMPLETION_ITEM_SLUGS = {item["slug"]: item for item in COMPLETION_ITEMS} + def add_team_member(team_id, user_id): """ Admin function to add a member to a team @@ -423,8 +448,7 @@ def queue_team(propel_user_id, json): # Add Slack admins - slack_admins = ["UCQKX6LPR", "U035023T81Z", "UC31XTRT5", "UC2JW3T3K", "UPD90QV17", "UEP2U69AA"] - for admin in slack_admins: + for admin in TEAM_COMPLETION_SLACK_ADMINS: logger.info("Inviting admin %s to slack channel %s", admin, slack_channel) invite_user_to_channel(admin, slack_channel) @@ -988,4 +1012,195 @@ def send_team_message(admin_user, teamid, json): "message": f"Message sent to team {teamid}", "success": True, "team_id": teamid - } \ No newline at end of file + } + + +def 
user_is_on_team(propel_user_id, team_id): + """ + Returns True if the caller (identified by their PropelAuth UUID) is one of + the team's users[]. Translates propel_id -> OAuth user_id via + get_propel_user_details_by_id (same pattern as get_my_teams_by_event_id), + then walks team.users[] DocumentReferences and compares each user doc's + `user_id` field (or its `propel_id` field as a fallback). + """ + if not propel_user_id or not team_id: + return False + db = get_db() + team_doc = db.collection("teams").document(team_id).get() + if not team_doc.exists: + return False + + try: + details = get_propel_user_details_by_id(propel_user_id) or () + caller_oauth_user_id = details[1] if len(details) > 1 else None + except Exception as e: + logger.warning("user_is_on_team: get_propel_user_details_by_id failed: %s", e) + caller_oauth_user_id = None + + team_data = team_doc.to_dict() or {} + for ref in team_data.get("users", []): + try: + snap = ref.get() if hasattr(ref, "get") else db.collection("users").document(ref).get() + if not snap.exists: + continue + user_data = snap.to_dict() or {} + stored_user_id = user_data.get("user_id") + stored_propel_id = user_data.get("propel_id") + if caller_oauth_user_id and stored_user_id and stored_user_id == caller_oauth_user_id: + return True + if stored_propel_id and stored_propel_id == propel_user_id: + return True + except Exception as e: + logger.warning("user_is_on_team: failed to resolve a user ref on team %s: %s", team_id, e) + continue + return False + + +def _completion_done_count(checklist): + if not isinstance(checklist, dict): + return 0 + return sum(1 for slug in COMPLETION_ITEM_SLUGS if checklist.get(slug, {}).get("done")) + + +def _project_link(team_data, team_id): + event_id = team_data.get("hackathon_event_id") or "" + if event_id: + return f"https://ohack.dev/hack/{event_id}/team/{team_id}" + return f"https://ohack.dev/hack/team/{team_id}" + + +def _completer_name(propel_user_id): + try: + details = 
get_propel_user_details_by_id(propel_user_id) or {} + return details.get("firstName") or details.get("email") or "A teammate" + except Exception: + return "A teammate" + + +def toggle_completion_item(propel_user_id, team_id, item_slug): + """ + Mark a single Definition-of-Done item complete for a team. Sends a Slack + message to the team's channel. Idempotent in the strict sense: once an item + is `done=True`, this returns 409 (no double-Slack, no unchecking). + """ + if item_slug not in COMPLETION_ITEM_SLUGS: + return {"error": f"Unknown checklist item: {item_slug}"}, 400 + + if not user_is_on_team(propel_user_id, team_id): + return {"error": "You must be on this team to update its completion checklist."}, 403 + + db = get_db() + team_doc = db.collection("teams").document(team_id) + snap = team_doc.get() + if not snap.exists: + return {"error": "Team not found"}, 404 + team_data = snap.to_dict() or {} + checklist = dict(team_data.get("completion_checklist") or {}) + if checklist.get(item_slug, {}).get("done"): + return {"error": "Already complete"}, 409 + + name = _completer_name(propel_user_id) + now_iso = datetime.now().isoformat() + checklist[item_slug] = { + "done": True, + "completed_at": now_iso, + "completed_by_propel_id": propel_user_id, + "completed_by_name": name, + } + done_n = _completion_done_count(checklist) + total_n = len(COMPLETION_ITEMS) + new_status = "complete" if done_n == total_n else "in_progress" + + update = { + "completion_checklist": checklist, + "completion_status": new_status, + } + team_doc.set(update, merge=True) + + item = COMPLETION_ITEM_SLUGS[item_slug] + slack_channel = team_data.get("slack_channel") + if slack_channel: + msg = ( + f":white_check_mark: *{name}* checked off *{item['label']}* " + f"({done_n}/{total_n} complete!)\n" + f"{item['blurb']}\n" + f"See the full Definition of Done → https://ohack.dev/about/completion" + ) + try: + send_slack(message=msg, channel=slack_channel) + except Exception as e: + 
logger.error("toggle_completion_item: send_slack failed for team %s: %s", team_id, e) + + send_slack_audit( + action="completion_toggle", + message=f"Team {team_id} item {item_slug} marked done by {name} ({done_n}/{total_n})", + payload={"team_id": team_id, "item": item_slug, "by": propel_user_id}, + ) + clear_cache() + + # Route through get_team so DocumentReferences on users[] are flattened + # via doc_to_json + enriched into profile dicts (Flask can't JSON-serialize + # raw DocumentReference objects). + fresh = (get_team(team_id) or {}).get("team") or {} + return {"success": True, "team": fresh, "done": done_n, "total": total_n}, 200 + + +def mark_team_complete(propel_user_id, team_id): + """ + Finalize a team's project. Requires all 8 checklist items to be done. Posts + a celebration message to the team's Slack channel CCing the OHack admins. + Idempotent: returns 409 if already complete. + """ + if not user_is_on_team(propel_user_id, team_id): + return {"error": "You must be on this team to mark it complete."}, 403 + + db = get_db() + team_doc = db.collection("teams").document(team_id) + snap = team_doc.get() + if not snap.exists: + return {"error": "Team not found"}, 404 + team_data = snap.to_dict() or {} + + if team_data.get("completion_status") == "complete": + return {"error": "Project already marked complete"}, 409 + + checklist = team_data.get("completion_checklist") or {} + missing = [s for s in COMPLETION_ITEM_SLUGS if not checklist.get(s, {}).get("done")] + if missing: + return {"error": "Cannot mark complete: items remaining", "missing": missing}, 409 + + name = _completer_name(propel_user_id) + now_iso = datetime.now().isoformat() + team_doc.set({ + "completion_status": "complete", + "completion_completed_at": now_iso, + "completion_completed_by_propel_id": propel_user_id, + "completion_completed_by_name": name, + }, merge=True) + + slack_channel = team_data.get("slack_channel") + team_name = team_data.get("name", "Your team") + project_link = 
_project_link(team_data, team_id) + admin_mentions = " ".join(f"<@{uid}>" for uid in TEAM_COMPLETION_SLACK_ADMINS) + msg = ( + f":tada: :rocket: :tada: *{team_name} marked their project COMPLETE!*\n" + f"Congratulations team — you shipped it. 🏆\n\n" + f"cc {admin_mentions}\n\n" + f"Project link: {project_link}\n" + f"See examples of completed projects: https://ohack.dev/about/success-stories" + ) + if slack_channel: + try: + send_slack(message=msg, channel=slack_channel) + except Exception as e: + logger.error("mark_team_complete: send_slack failed for team %s: %s", team_id, e) + + send_slack_audit( + action="completion_complete", + message=f"Team {team_id} ({team_name}) marked PROJECT COMPLETE by {name}", + payload={"team_id": team_id, "by": propel_user_id}, + ) + clear_cache() + + fresh = (get_team(team_id) or {}).get("team") or {} + return {"success": True, "team": fresh}, 200 \ No newline at end of file diff --git a/api/teams/teams_views.py b/api/teams/teams_views.py index 3b5d558..b9595e8 100644 --- a/api/teams/teams_views.py +++ b/api/teams/teams_views.py @@ -14,7 +14,9 @@ remove_team, get_teams_by_hackathon_id, get_my_teams_by_event_id, - send_team_message + send_team_message, + toggle_completion_item, + mark_team_complete, ) logger = logging.getLogger(__name__) @@ -111,6 +113,39 @@ def add_demo_video_to_team_api(teamid): logger.error("Could not obtain user details for POST /team//demo-video") return {"error": "Unauthorized"}, 401 +@bp.route("//completion/toggle", methods=["POST"]) +@auth.require_user +def toggle_completion_item_api(teamid): + """ + Mark a single Definition-of-Done item complete on a team. Self-serve for + team members. Posts a Slack message into the team's channel. Re-toggling an + already-done item returns 409 (no unchecking, no double-Slack). 
+ Body: { "item": "" } + """ + logger.info(f"POST /team/{teamid}/completion/toggle called") + if not (auth_user and auth_user.user_id): + return {"error": "Unauthorized"}, 401 + body = request.get_json() or {} + item_slug = body.get("item") + if not item_slug: + return {"error": "Missing 'item' in request body"}, 400 + return toggle_completion_item(auth_user.user_id, teamid, item_slug) + + +@bp.route("//completion/complete", methods=["POST"]) +@auth.require_user +def mark_team_complete_api(teamid): + """ + Mark a team's project COMPLETE. Self-serve for team members. Requires all + 8 checklist items to be done first. Posts a celebration message to the + team's Slack channel CCing the OHack admins. + """ + logger.info(f"POST /team/{teamid}/completion/complete called") + if not (auth_user and auth_user.user_id): + return {"error": "Unauthorized"}, 401 + return mark_team_complete(auth_user.user_id, teamid) + + @bp.route("//member", methods=["POST"]) @auth.require_user @auth.require_org_member_with_permission("volunteer.admin", req_to_org_id=getOrgId) diff --git a/common/utils/firestore_helpers.py b/common/utils/firestore_helpers.py index 352f565..2a1b334 100644 --- a/common/utils/firestore_helpers.py +++ b/common/utils/firestore_helpers.py @@ -20,11 +20,23 @@ def register_cache(cache_obj): def clear_all_caches(): - """Clear all registered caches and the doc_to_json cache.""" + """Clear all registered caches and the doc_to_json cache. + + A registered entry may be either an @cached-decorated function (which + cachetools gives a `cache_clear()` method) or a raw cache object such + as a TTLCache (which uses `.clear()`). Try both. 
+ """ doc_to_json.cache_clear() for cache_obj in _cache_registry: try: - cache_obj.cache_clear() + if hasattr(cache_obj, "cache_clear"): + cache_obj.cache_clear() + elif hasattr(cache_obj, "clear"): + cache_obj.clear() + else: + logger.warning( + f"Registered cache has neither cache_clear nor clear: {type(cache_obj).__name__}" + ) except Exception as e: logger.warning(f"Failed to clear a registered cache: {e}") @@ -67,7 +79,7 @@ def doc_to_json(docid=None, doc=None, depth=0): return doc if d_json is None: - logger.warn(f"doc.to_dict() is NoneType | docid={docid} doc={doc}") + logger.warning(f"doc.to_dict() is NoneType | docid={docid} doc={doc}") return # If any values in d_json is a list, add only the document id to the list for DocumentReference or DocumentSnapshot diff --git a/common/utils/openai_api.py b/common/utils/openai_api.py index 03b1f0c..89e011a 100644 --- a/common/utils/openai_api.py +++ b/common/utils/openai_api.py @@ -143,7 +143,8 @@ def generate_and_save_image_to_cdn(directory, text): model="gpt-image-1", prompt=prompt, n=1, - size="1024x1024" + size="1024x1024", + timeout=90.0 ) # Create a short filename from input text diff --git a/common/utils/validators.py b/common/utils/validators.py index aed8191..826628a 100644 --- a/common/utils/validators.py +++ b/common/utils/validators.py @@ -1,4 +1,5 @@ import re +from copy import deepcopy from urllib.parse import urlparse import logging from datetime import datetime @@ -165,6 +166,140 @@ def validate_hackathon_data(data): validate_planning_subobject(planning) +def validate_hackathon_data_partial(data): + """Validate hackathon data with partial-save semantics. + + Required fields and date integrity always raise ValueError (hard fail). + Each optional field is validated individually; failures strip the field + from the returned data and record it in skipped_fields. 
+ + Returns: + (cleaned_data, skipped_fields) where cleaned_data is a deep copy of + ``data`` with invalid optional fields removed, and skipped_fields is a + list of ``{"field": str, "reason": str}`` dicts. + + Raises: + ValueError: if a required field is missing/empty or dates are invalid. + """ + # Hard fail: required fields and date ordering + required_fields = ["title", "description", "location", "start_date", "end_date", "type", "image_url", "event_id"] + for field in required_fields: + if field not in data or not data[field]: + raise ValueError(f"Missing required field: {field}") + try: + start_date = datetime.fromisoformat(data["start_date"]) + end_date = datetime.fromisoformat(data["end_date"]) + if end_date <= start_date: + raise ValueError(f"End date must be after start date: start_date={start_date}, end_date={end_date}") + except ValueError: + raise + + cleaned = deepcopy(data) + skipped = [] + + def _skip(field, reason): + skipped.append({"field": field, "reason": reason}) + logger.warning("Field '%s' failed validation and will not be saved: %s", field, reason) + + # Timezone + timezone = cleaned.get("timezone") + if timezone: + try: + ZoneInfo(timezone) + except (ZoneInfoNotFoundError, KeyError): + _skip("timezone", f"Invalid timezone: {timezone}") + cleaned.pop("timezone", None) + + # Constraints — validate sub-fields individually + constraints = cleaned.get("constraints") + if constraints is not None: + if not isinstance(constraints, dict): + _skip("constraints", "constraints must be an object") + cleaned.pop("constraints", None) + else: + c = dict(constraints) + + for k in ["max_people_per_team", "max_teams_per_problem", "min_people_per_team"]: + if k in c and not isinstance(c[k], int): + _skip(f"constraints.{k}", "must be an integer") + c.pop(k) + + if "hacker_required_questions" in c: + try: + hrq = c["hacker_required_questions"] + questions = hrq.get("questions", []) + if not isinstance(questions, list): + raise 
ValueError("hacker_required_questions.questions must be a list") + for i, q in enumerate(questions): + if not isinstance(q, dict): + raise ValueError(f"Question {i} must be an object") + if not isinstance(q.get("question"), str) or not q.get("question"): + raise ValueError(f"Question {i} must have a non-empty 'question' string") + if not isinstance(q.get("required_answer"), bool): + raise ValueError(f"Question {i} must have a boolean 'required_answer'") + if not isinstance(q.get("error"), str) or not q.get("error"): + raise ValueError(f"Question {i} must have a non-empty 'error' string") + except ValueError as e: + _skip("constraints.hacker_required_questions", str(e)) + c.pop("hacker_required_questions") + + arrival = c.get("judge_venue_arrival_time") + if arrival not in (None, ""): + if not isinstance(arrival, str) or not re.match(r"^([01]\d|2[0-3]):[0-5]\d$", arrival): + _skip("constraints.judge_venue_arrival_time", "must be HH:MM (24-hour)") + c.pop("judge_venue_arrival_time") + + if "hacker_deposit" in c and c["hacker_deposit"] is not None: + try: + hd = c["hacker_deposit"] + if not isinstance(hd, dict): + raise ValueError("hacker_deposit must be an object") + if "enabled" in hd and not isinstance(hd["enabled"], bool): + raise ValueError("hacker_deposit.enabled must be boolean") + amount = hd.get("default_amount_cents") + if amount is not None: + if not isinstance(amount, int) or amount < 0 or amount > 50000: + raise ValueError("hacker_deposit.default_amount_cents must be a non-negative integer (cents) up to 50000") + except ValueError as e: + _skip("constraints.hacker_deposit", str(e)) + c.pop("hacker_deposit") + + if "meals" in c and c["meals"] is not None: + try: + validate_meals(c["meals"]) + except ValueError as e: + _skip("constraints.meals", str(e)) + c.pop("meals") + + cleaned["constraints"] = c + + # event_photos + if "event_photos" in cleaned and cleaned["event_photos"] is not None: + try: + validate_event_photos(cleaned["event_photos"]) + except 
ValueError as e: + _skip("event_photos", str(e)) + cleaned.pop("event_photos") + + # social_posts + if "social_posts" in cleaned and cleaned["social_posts"] is not None: + try: + validate_social_posts(cleaned["social_posts"]) + except ValueError as e: + _skip("social_posts", str(e)) + cleaned.pop("social_posts") + + # planning + if "planning" in cleaned and cleaned["planning"] is not None: + try: + validate_planning_subobject(cleaned["planning"]) + except ValueError as e: + _skip("planning", str(e)) + cleaned.pop("planning") + + return cleaned, skipped + + ALLOWED_DIETARY_TAGS = { "vegetarian", "vegan", diff --git a/db/firestore.py b/db/firestore.py index d64ce18..bf90cd3 100644 --- a/db/firestore.py +++ b/db/firestore.py @@ -325,8 +325,40 @@ def fetch_problem_statements(self): debug(logger, "Fetching all problem statements") db = self.get_db() try: - docs = db.collection('problem_statements').stream() - results = [convert_to_entity(doc, ProblemStatement) for doc in docs or []] + docs = list(db.collection('problem_statements').stream()) + + # Collect every unique event DocumentReference across all docs so we + # can batch-fetch them in a single RPC instead of one per ref (N+1). + raw_data = [] + all_event_refs: dict = {} + for doc in docs: + d = doc.to_dict() or {} + d['id'] = doc.id + raw_data.append(d) + for ref in d.get('events', []): + if isinstance(ref, firestore.DocumentReference): + all_event_refs[ref.id] = ref + + # Single batch read for all referenced hackathon docs. + hackathon_map: dict = {} + if all_event_refs: + for snap in db.get_all(list(all_event_refs.values())): + if snap.exists: + h_data = snap.to_dict() or {} + h_data['id'] = snap.id + hackathon_map[snap.id] = Hackathon.deserialize(h_data) + + # Build ProblemStatement objects using the prefetched hackathons. 
+ results = [] + for d in raw_data: + if 'events' in d: + d['events'] = [ + hackathon_map[ref.id] + for ref in d['events'] + if isinstance(ref, firestore.DocumentReference) and ref.id in hackathon_map + ] + results.append(ProblemStatement.deserialize(d)) + info(logger, "Successfully fetched problem statements", count=len(results)) return results except Exception as e: @@ -419,6 +451,8 @@ def update_problem_statement(self, problem_statement: ProblemStatement): update_data['skills'] = problem_statement.skills if hasattr(problem_statement, 'rank'): update_data['rank'] = problem_statement.rank + if hasattr(problem_statement, 'slack_channel'): + update_data['slack_channel'] = problem_statement.slack_channel # Use update() instead of set() to only modify specified fields update_res = collection.document(problem_statement.id).update(update_data) diff --git a/model/problem_statement.py b/model/problem_statement.py index 08e2f58..c539ff7 100644 --- a/model/problem_statement.py +++ b/model/problem_statement.py @@ -27,6 +27,7 @@ def __init__(self): self.events = [] # TODO: Breaking change. 
This used to be called "events" self.status = None self.skills = [] # This is a list of skills, not a string + self.slack_channel = None @classmethod @@ -40,6 +41,7 @@ def deserialize(cls, d): p.github = d['github'] if 'github' in d else None p.status = d['status'] if 'status' in d else None p.skills = d['skills'] if 'skills' in d else [] + p.slack_channel = d['slack_channel'] if 'slack_channel' in d else None if 'events' in d: p.events = d['events'] @@ -75,7 +77,6 @@ def serialize(self): for event in self.events: if event is not None: if isinstance(event, dict): - print(f"Event is already a dict: {event}") d['events'].append(event) else: d['events'].append(event.serialize()) @@ -101,7 +102,7 @@ def serialize(self): d['references'] = [] # Add remaining fields that aren't special cases - for field in ['github', 'status', 'first_thought_of', 'skills', 'rank']: + for field in ['github', 'status', 'first_thought_of', 'skills', 'rank', 'slack_channel']: if hasattr(self, field): d[field] = getattr(self, field) diff --git a/requirements.txt b/requirements.txt index 96de056..89b9d1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ python-dotenv>=1.0.1 six==1.16.0 Werkzeug>=3.1.3 requests>=2.32.3 +beautifulsoup4>=4.12.2 firebase_admin==6.5.0 ratelimit==2.2.1 cachetools==5.2.0 diff --git a/scripts/backfill_devpost_winners.py b/scripts/backfill_devpost_winners.py new file mode 100644 index 0000000..ba8d1a8 --- /dev/null +++ b/scripts/backfill_devpost_winners.py @@ -0,0 +1,610 @@ +#!/usr/bin/env python3 +""" +Backfill team.devpost_link and winning-team status on a hackathon from Devpost. + +DRY-RUN BY DEFAULT. Pass --apply to write to Firestore. + +What it does +------------ +Given a Devpost event URL (e.g. https://opportunity-hack-2025-arizona.devpost.com/) +and a matching hackathon event_id: + + 1. Scrape /project-gallery, collect EVERY tile and flag the ones + carrying the orange "WINNER" ribbon (img.winner inside aside.entry-badge). + 2. 
Match each project to a Firestore team in this event (see strategies + below). For matched teams whose devpost_link is empty, plan to set it + (this is the bulk backfill for old hackathons that never linked + submissions). + 3. For matched WINNERS, additionally fetch /software/ to extract: + - the prize strings under "Submitted to" (span.winner + free text) + - the team member names under
+ Map prize text -> team status enum: + "... 1st place" -> FOUNDING_ENGINEERS + "... Completion / 2nd ..." -> COMPLETION_SUPPORT + anything else marked Winner -> CATEGORY_WINNER + Multiple prizes are kept verbatim in `awards: []`. `status` is set to the + best (lowest-rank) status across all prizes the team won. + +Match strategies (layered) +-------------------------- + (a) teams.devpost_link == project_url (strongest signal) + (b) teams.name ~= project title (case-insensitive, normalized) + (c) email overlap via Devpost projects CSV: + project title -> CSV row -> member emails -> user docs -> + team in this event whose users[] contains those user docs. + Tie-breaker for (b): if multiple teams share a name, fall back to (c). + +Storage +------- +On a matched team doc: + devpost_link = project URL (only set when previously empty; + conflicts logged, never overwritten) + status = best status across all prizes won (winners only) + awards = list[str] of prize text from Devpost (winners only) + winners_backfilled_at = ISO timestamp (winners only) + winners_backfilled_source = "scripts/backfill_devpost_winners.py" (winners only) + +No other fields are touched. team.users[] is never modified by this script. + +Unmatched winners +----------------- +Logged and skipped. The script exits with code 2 when there are unmatched +winning projects so a human (or CI) notices. + +Unmatched non-winners are listed for visibility (these typically represent +teams that registered only on Devpost and never on ohack.dev) but do NOT +trigger a non-zero exit. 
+ +Usage +----- + cd backend-ohack.dev + + # Dry run (read-only); prints the plan + python scripts/backfill_devpost_winners.py \ + --event-id 2025_fall_az \ + --devpost-url https://opportunity-hack-2025-arizona.devpost.com/ + + # Optional: point at a specific projects CSV (otherwise auto-detected + # from /tmp/devpost_files//projects-*.csv) + python scripts/backfill_devpost_winners.py \ + --event-id 2024_fall \ + --devpost-url https://opportunity-hack-2024-arizona.devpost.com/ \ + --projects-csv /tmp/devpost_files/2024_fall/projects-....csv + + # Write to Firestore + python scripts/backfill_devpost_winners.py ... --apply + +Re-running is safe: writes are idempotent (set with merge=True) and only touch +status/awards/devpost_link/two metadata fields. +""" + +import argparse +import csv +import os +import re +import sys +import time +from collections import defaultdict +from datetime import datetime, timezone + +import requests +from bs4 import BeautifulSoup + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dotenv import load_dotenv +load_dotenv() + +from common.utils.firebase import get_db + + +HTTP_HEADERS = {"User-Agent": "Mozilla/5.0 ohack-backfill-script"} +HTTP_DELAY_SEC = 0.5 # polite delay between fetches + +# Frontend source of truth: frontend-ohack.dev/src/constants/teamStatus.js +STATUS_FOUNDING_ENGINEERS = "FOUNDING_ENGINEERS" +STATUS_COMPLETION_SUPPORT = "COMPLETION_SUPPORT" +STATUS_CATEGORY_WINNER = "CATEGORY_WINNER" + +# Lower rank wins when a team has multiple prizes. 
STATUS_RANK = {
    STATUS_FOUNDING_ENGINEERS: 1,
    STATUS_COMPLETION_SUPPORT: 2,
    STATUS_CATEGORY_WINNER: 3,
}


def norm(s):
    """Return *s* trimmed, with whitespace runs collapsed, lower-cased.

    ``None`` and other falsy inputs normalize to the empty string.
    """
    trimmed = (s or "").strip()
    collapsed = re.sub(r"\s+", " ", trimmed)
    return collapsed.lower()


def norm_email(e):
    """Return *e* trimmed and lower-cased ("" for ``None``/empty input)."""
    raw = e or ""
    return raw.strip().lower()


def prize_text_to_status(prize_text):
    """Map one Devpost prize string to a team status constant.

    A "1st place"/"first place" phrase wins first; otherwise any mention of
    "completion" or a "2nd place"/"second place" phrase; everything else is
    a category winner. Matching is case-insensitive.
    """
    text = (prize_text or "").lower()
    if re.search(r"\b1st\s+place\b|\bfirst\s+place\b", text):
        return STATUS_FOUNDING_ENGINEERS
    if "completion" in text:
        return STATUS_COMPLETION_SUPPORT
    if re.search(r"\b2nd\s+place\b|\bsecond\s+place\b", text):
        return STATUS_COMPLETION_SUPPORT
    return STATUS_CATEGORY_WINNER


def best_status(prizes):
    """Return the best (lowest STATUS_RANK) status over *prizes*, or None.

    Falsy entries in *prizes* are ignored; None is returned when nothing
    usable remains.
    """
    ranked = []
    for prize in prizes:
        if prize:
            ranked.append(prize_text_to_status(prize))
    return min(ranked, key=STATUS_RANK.__getitem__) if ranked else None


# --------------------------- Scraping ---------------------------

def fetch_html(url):
    """GET *url* and return the response body text.

    Sleeps HTTP_DELAY_SEC before every request and raises (via
    ``raise_for_status``) on an HTTP error status.
    """
    time.sleep(HTTP_DELAY_SEC)
    response = requests.get(url, headers=HTTP_HEADERS, timeout=30)
    response.raise_for_status()
    return response.text


def _gallery_url(devpost_url):
    """Return the project-gallery URL for a Devpost event URL.

    Trailing slashes are dropped; a URL already ending in
    ``/project-gallery`` is returned as-is (minus trailing slashes).
    """
    suffix = "/project-gallery"
    trimmed = devpost_url.rstrip("/")
    return trimmed if trimmed.endswith(suffix) else trimmed + suffix
def scrape_gallery_projects(devpost_url):
    """Walk every page of the project gallery and collect each project tile.

    Paging stops at the first page with no tiles or with no "next" link.
    Tiles without a project link, and project URLs already collected, are
    skipped.

    Returns: list of {"title": str, "project_url": str, "is_winner": bool}
    """
    first_page_url = _gallery_url(devpost_url)
    collected = []
    known_urls = set()
    page_no = 1
    more_pages = True
    while more_pages:
        page_url = first_page_url if page_no == 1 else f"{first_page_url}?page={page_no}"
        print(f" GET {page_url}")
        soup = BeautifulSoup(fetch_html(page_url), "html.parser")
        tiles = soup.select("div.gallery-item")
        if not tiles:
            break
        for tile in tiles:
            anchor = tile.select_one("a.link-to-software")
            if not anchor or not anchor.get("href"):
                continue
            href = anchor["href"].strip()
            if href in known_urls:
                continue
            known_urls.add(href)
            heading = tile.select_one("figcaption h5")
            # The winner ribbon is an img.winner inside aside.entry-badge.
            ribbon = tile.select_one("aside.entry-badge img.winner")
            collected.append({
                "title": heading.get_text(strip=True) if heading else "",
                "project_url": href,
                "is_winner": ribbon is not None,
            })
        # Pagination: only continue if a rel=next or .next_page link exists.
        more_pages = bool(soup.select_one("a[rel='next'], a.next_page"))
        page_no += 1
    return collected


def scrape_project(project_url):
    """Fetch one Devpost project page and extract prizes and team members.

    Returns {"prizes": [str], "members": [{"name", "profile_url"}]}; either
    list is empty when the corresponding page section is absent.
    """
    soup = BeautifulSoup(fetch_html(project_url), "html.parser")

    prizes = []
    submissions = soup.find("div", id="submissions")
    if submissions:
        # Each prize is a list item holding a span.winner label followed by
        # free text such as
        # "Founding Engineer Prize (Website Redesign 1st place)".
        for item in submissions.select("ul.no-bullet > li"):
            if not item.find("span", class_="winner"):
                continue
            item_text = item.get_text(" ", strip=True)
            # Strip the leading "Winner" label that came from the span.
            label_free = re.sub(r"^\s*Winner\s+", "", item_text, flags=re.I).strip()
            if label_free:
                prizes.append(label_free)

    members = []
    team = soup.find("section", id="app-team")
    if team:
        for item in team.select("li.software-team-member"):
            # Two anchors per member (avatar + name); pick the one with text.
            named = next(
                (a for a in item.select("a.user-profile-link") if a.get_text(strip=True)),
                None,
            )
            if not named:
                continue
            members.append({
                "name": named.get_text(strip=True),
                "profile_url": (named.get("href") or "").strip(),
            })

    return {"prizes": prizes, "members": members}
  • containing Winner + # followed by free text like "Founding Engineer Prize (Website Redesign 1st place)". + for li in submissions_div.select("ul.no-bullet > li"): + winner_span = li.find("span", class_="winner") + if not winner_span: + continue + full = li.get_text(" ", strip=True) + # Strip the leading "Winner" label that came from the span. + prize_text = re.sub(r"^\s*Winner\s+", "", full, flags=re.I).strip() + if prize_text: + prizes.append(prize_text) + + members = [] + team_section = soup.find("section", id="app-team") + if team_section: + for li in team_section.select("li.software-team-member"): + # Two anchors per member (avatar + name); pick the one with text. + name_link = None + for a in li.select("a.user-profile-link"): + text = a.get_text(strip=True) + if text: + name_link = a + break + if not name_link: + continue + members.append({ + "name": name_link.get_text(strip=True), + "profile_url": (name_link.get("href") or "").strip(), + }) + + return {"prizes": prizes, "members": members} + + +# --------------------------- CSV ingestion --------------------------- + +def parse_projects_csv(csv_path): + """Parse a Devpost projects CSV using header-based column resolution. + + Returns list of {title, submission_url, emails: [str], members: [{first, last, email}]} + """ + out = [] + with open(csv_path, newline="", encoding="utf-8") as f: + r = csv.reader(f) + header = next(r) + + def col(name): + try: + return header.index(name) + except ValueError: + return -1 + + i_title = col("Project Title") + i_url = col("Submission Url") + i_sf = col("Submitter First Name") + i_sl = col("Submitter Last Name") + i_se = col("Submitter Email") + i_tm1f = col("Team Member 1 First Name") + if i_title < 0 or i_se < 0 or i_tm1f < 0: + raise SystemExit( + f"projects CSV is missing required columns " + f"(Project Title / Submitter Email / Team Member 1 First Name). 
" + f"Header: {header}" + ) + + for row in r: + if not row: + continue + title = (row[i_title] if i_title < len(row) else "").strip() + if not title: + continue + url = (row[i_url] if 0 <= i_url < len(row) else "").strip() + sf = (row[i_sf] if 0 <= i_sf < len(row) else "").strip() + sl = (row[i_sl] if 0 <= i_sl < len(row) else "").strip() + se = norm_email(row[i_se] if i_se < len(row) else "") + members = [] + if se: + members.append({"first": sf, "last": sl, "email": se}) + tail = row[i_tm1f:] + for i in range(0, len(tail), 3): + if i + 2 >= len(tail): + break + f_ = (tail[i] or "").strip() + l_ = (tail[i + 1] or "").strip() + e_ = norm_email(tail[i + 2]) + if e_: + members.append({"first": f_, "last": l_, "email": e_}) + emails = sorted({m["email"] for m in members if m["email"]}) + out.append({ + "title": title, + "submission_url": url, + "members": members, + "emails": emails, + }) + return out + + +def auto_find_projects_csv(event_id): + folder = os.path.join("/tmp/devpost_files", event_id) + if not os.path.isdir(folder): + return None + candidates = [n for n in os.listdir(folder) + if n.startswith("projects-") and n.endswith(".csv")] + if not candidates: + return None + return os.path.join(folder, sorted(candidates)[0]) + + +# --------------------------- Firestore --------------------------- + +def load_hackathon_and_teams(db, event_id): + docs = list(db.collection("hackathons").where("event_id", "==", event_id).stream()) + if not docs: + raise SystemExit(f"hackathon with event_id={event_id!r} not found") + snap = docs[0] + team_refs = (snap.to_dict() or {}).get("teams") or [] + teams = [] + if team_refs: + team_docs = db.get_all(team_refs) + for ref, doc in zip(team_refs, team_docs): + if not doc.exists: + continue + teams.append({"id": doc.id, "ref": ref, "data": doc.to_dict() or {}}) + return snap, teams + + +def load_users_by_emails(db, emails): + """Return {email_lower: user_doc_id} for any users found in the users/ collection.""" + from 
def load_users_by_emails(db, emails):
    """Return {email_lower: user_doc_id} for any users found in the users/ collection.

    Emails are looked up in batches with an "in" filter on ``email_address``.
    If a batch fails, the failure is reported and that batch is retried one
    email at a time with an equality filter.
    """
    from google.cloud.firestore import FieldFilter
    out = {}
    unique = sorted({e for e in emails if e})
    if not unique:
        return out
    CHUNK = 30
    for i in range(0, len(unique), CHUNK):
        chunk = unique[i:i + CHUNK]
        try:
            docs = db.collection("users").where(
                filter=FieldFilter("email_address", "in", chunk)
            ).stream()
            for d in docs:
                ea = norm_email((d.to_dict() or {}).get("email_address"))
                if ea:
                    out[ea] = d.id
        except Exception as exc:
            # Best-effort fallback, but say why: a silent retry hides quota,
            # permission and index problems from whoever runs the script.
            print(f" WARN: batched email lookup failed ({exc!r}); "
                  f"retrying {len(chunk)} email(s) one at a time")
            for e in chunk:
                docs = list(db.collection("users").where(
                    filter=FieldFilter("email_address", "==", e)
                ).stream())
                if docs:
                    out[e] = docs[0].id
    return out


# --------------------------- Matching ---------------------------

def match_team(winner, teams, teams_by_link, teams_by_name, csv_index, email_to_user):
    """Match one gallery project to a team; return (team_or_None, reason).

    Strategies are tried in order:
      1. exact ``project_url`` key in *teams_by_link*;
      2. normalized title in *teams_by_name*, only when exactly one team has
         that name;
      3. when *csv_index* has a row for the title: the team whose ``users``
         refs overlap most with the user docs of that row's emails.

    In step 3 the first team reaching the highest overlap is kept; later
    teams with an equal overlap do not replace it.
    """
    url = winner["project_url"]
    title_norm = norm(winner["title"])

    if url in teams_by_link:
        return teams_by_link[url], f"devpost_link exact match ({url})"

    if title_norm and title_norm in teams_by_name:
        candidates = teams_by_name[title_norm]
        if len(candidates) == 1:
            return candidates[0], f"team name match {winner['title']!r}"
        # multiple teams with the same name in the same event: fall through to email overlap

    if csv_index:
        csv_row = csv_index.get(title_norm)
        if csv_row:
            user_doc_ids = {
                email_to_user[e] for e in csv_row["emails"] if e in email_to_user
            }
            if user_doc_ids:
                best_team = None
                best_overlap = 0
                for t in teams:
                    # users[] entries are expected to be document references;
                    # anything without an .id attribute is ignored.
                    team_user_ids = {
                        u.id for u in (t["data"].get("users") or [])
                        if hasattr(u, "id")
                    }
                    overlap = len(team_user_ids & user_doc_ids)
                    if overlap > best_overlap:
                        best_overlap = overlap
                        best_team = t
                if best_team and best_overlap > 0:
                    return best_team, (
                        f"email overlap via CSV "
                        f"({best_overlap}/{len(user_doc_ids)} member doc IDs matched "
                        f"team {best_team['data'].get('name')!r})"
                    )

    return None, "no devpost_link / team name / email-overlap match"
def main():
    """Command-line entry point: plan, print and optionally apply the backfill.

    Flow: parse arguments, locate the optional projects CSV, load the
    hackathon and its teams, scrape the gallery, match every project to a
    team, print the plan, and write to Firestore only when --apply is given.

    Exit status is 2 when at least one winning project could not be matched
    (in both dry-run and apply mode); a dry-run otherwise exits 0.
    """
    ap = argparse.ArgumentParser(
        description="Backfill winning-team status on a hackathon from Devpost.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    ap.add_argument("--event-id", required=True,
                    help="hackathons.event_id (e.g. 2025_fall_az)")
    ap.add_argument("--devpost-url", required=True,
                    help="Devpost event base URL (e.g. https://opportunity-hack-2025-arizona.devpost.com/)")
    # NOTE(review): the help path shows an empty segment ("//"); a placeholder
    # for the event id appears to be missing there - confirm against the file.
    ap.add_argument("--projects-csv",
                    help="Optional explicit path. Auto-detected from /tmp/devpost_files//projects-*.csv if omitted.")
    ap.add_argument("--apply", action="store_true",
                    help="Write to Firestore. Default is dry-run.")
    args = ap.parse_args()

    # An explicit --projects-csv wins; otherwise try auto-detection.
    csv_path = args.projects_csv or auto_find_projects_csv(args.event_id)
    if csv_path and not os.path.isfile(csv_path):
        raise SystemExit(f"--projects-csv not found: {csv_path}")
    if csv_path:
        print(f"Using projects CSV: {csv_path}")
    else:
        print(f"No projects CSV under /tmp/devpost_files/{args.event_id}/; "
              f"email-overlap fallback disabled.")

    db = get_db()
    hackathon_snap, teams = load_hackathon_and_teams(db, args.event_id)
    print(f"Hackathon: doc_id={hackathon_snap.id} teams={len(teams)}")

    # Lookup tables for the first two match strategies. If two teams carry
    # the same devpost_link, the later one overwrites the earlier one here.
    teams_by_link = {}
    teams_by_name = defaultdict(list)
    for t in teams:
        link = (t["data"].get("devpost_link") or "").strip()
        if link:
            teams_by_link[link] = t
        nm = norm(t["data"].get("name") or "")
        if nm:
            teams_by_name[nm].append(t)

    # Email-overlap inputs; csv_index stays None when no CSV is available,
    # which disables that strategy in match_team.
    csv_index = None
    email_to_user = {}
    if csv_path:
        csv_rows = parse_projects_csv(csv_path)
        # Rows sharing a normalized title collapse to the last such row.
        csv_index = {norm(r["title"]): r for r in csv_rows}
        all_emails = sorted({e for r in csv_rows for e in r["emails"]})
        print(f"CSV: {len(csv_rows)} project row(s), {len(all_emails)} unique member email(s)")
        email_to_user = load_users_by_emails(db, all_emails)
        print(f" -> {len(email_to_user)} email(s) already linked to a user doc")

    print(f"\nScraping gallery {args.devpost_url}")
    projects = scrape_gallery_projects(args.devpost_url)
    winner_count = sum(1 for p in projects if p["is_winner"])
    print(f"Found {len(projects)} project(s) in gallery ({winner_count} winner(s))")

    # Planning buckets:
    #   winner_plans      - matched winners: set status + awards + devpost_link
    #   link_only_plans   - matched non-winners with empty devpost_link: set link only
    #   link_already_set  - matched non-winners whose link already equals the gallery URL
    #   link_conflicts    - matched non-winners whose team has a devpost_link that points elsewhere
    #   unmatched_winners - winners we can't match (exit non-zero)
    #   unmatched_others  - non-winners we can't match (informational only)
    winner_plans = []
    link_only_plans = []
    link_already_set = []
    link_conflicts = []
    unmatched_winners = []
    unmatched_others = []

    for proj in projects:
        team, reason = match_team(
            proj, teams, teams_by_link, teams_by_name, csv_index, email_to_user
        )
        if not team:
            (unmatched_winners if proj["is_winner"] else unmatched_others).append(
                {"project": proj, "reason": reason}
            )
            continue

        # Classify the team's current devpost_link against the gallery URL.
        cur_link = (team["data"].get("devpost_link") or "").strip()
        link_status = None
        if not cur_link:
            link_status = "set"
        elif cur_link == proj["project_url"]:
            link_status = "already_matches"
        else:
            link_status = "conflict"

        if proj["is_winner"]:
            # Only winners cost an extra page fetch (prize text + members).
            print(f"\n Fetching {proj['project_url']} ({proj['title']!r})")
            details = scrape_project(proj["project_url"])
            if not details["prizes"]:
                print(f" WARN: gallery marks WINNER but no prize text on project page; "
                      f"defaulting status to CATEGORY_WINNER")
                # The placeholder is what ends up stored in awards.
                details["prizes"] = ["(unknown — gallery WINNER, no prize text found)"]
            status = best_status(details["prizes"]) or STATUS_CATEGORY_WINNER
            winner_plans.append({
                "project": proj,
                "team": team,
                "new_status": status,
                "awards": details["prizes"],
                "members": details["members"],
                "reason": reason,
                "link_status": link_status,
            })
        else:
            if link_status == "set":
                link_only_plans.append({"project": proj, "team": team, "reason": reason})
            elif link_status == "already_matches":
                link_already_set.append({"project": proj, "team": team})
            else:
                link_conflicts.append({
                    "project": proj, "team": team,
                    "existing_link": cur_link, "reason": reason,
                })

    # ---- Print the plan (identical in dry-run and apply mode) ----
    sep = "=" * 78
    print("\n" + sep)
    print(f"PLAN ({'APPLY' if args.apply else 'DRY-RUN'}) event_id={args.event_id}")
    print(sep)

    print(f"\n[WINNERS] {len(winner_plans)} status update(s) planned")
    for p in winner_plans:
        t = p["team"]
        cur_status = t["data"].get("status") or "(none)"
        cur_link = (t["data"].get("devpost_link") or "").strip()
        print(f"\n {p['project']['title']!r}")
        print(f" team: {t['id']} name={t['data'].get('name')!r}")
        print(f" match: {p['reason']}")
        print(f" devpost members: {[m['name'] for m in p['members']]}")
        print(f" prizes:")
        for pr in p["awards"]:
            print(f" - {pr}")
        print(f" status: {cur_status!r} -> {p['new_status']!r}")
        # Winner link conflicts are reported here only; they are not added
        # to link_conflicts and so do not appear in the summary count.
        if not cur_link:
            print(f" devpost_link: (empty) -> {p['project']['project_url']!r}")
        elif cur_link != p["project"]["project_url"]:
            print(f" devpost_link: {cur_link!r} (keeping; differs from {p['project']['project_url']!r})")

    print(f"\n[DEVPOST_LINK BACKFILL] {len(link_only_plans)} team(s) will get devpost_link set")
    for p in link_only_plans:
        t = p["team"]
        print(f" - teams/{t['id']} {t['data'].get('name')!r}")
        print(f" -> {p['project']['project_url']} ({p['reason']})")

    if link_already_set:
        print(f"\n[LINK ALREADY CORRECT] {len(link_already_set)} team(s) - no change needed")

    if link_conflicts:
        print(f"\n[LINK CONFLICTS] {len(link_conflicts)} team(s) have a different devpost_link already set (NOT overwriting)")
        for c in link_conflicts:
            t = c["team"]
            print(f" - teams/{t['id']} {t['data'].get('name')!r}")
            print(f" existing: {c['existing_link']}")
            print(f" gallery: {c['project']['project_url']} ({c['reason']})")

    print(f"\n[UNMATCHED WINNERS] {len(unmatched_winners)}")
    for u in unmatched_winners:
        p = u["project"]
        print(f" - {p['title']!r} {p['project_url']}")
        print(f" reason: {u['reason']}")

    print(f"\n[UNMATCHED NON-WINNERS] {len(unmatched_others)} (informational — these teams likely never registered on ohack.dev)")
    for u in unmatched_others:
        p = u["project"]
        print(f" - {p['title']!r} {p['project_url']}")

    print(
        f"\nSummary: gallery={len(projects)} winners={winner_count} "
        f"winner_status_updates={len(winner_plans)} "
        f"link_backfills={len(link_only_plans)} "
        f"link_conflicts={len(link_conflicts)} "
        f"unmatched_winners={len(unmatched_winners)} "
        f"unmatched_non_winners={len(unmatched_others)}"
    )

    if not args.apply:
        print("\nDRY-RUN. Re-run with --apply to write to Firestore.")
        # A dry-run also signals unmatched winners through the exit status.
        sys.exit(2 if unmatched_winners else 0)

    # ---- Apply ----
    print("\nApplying writes ...")
    now_iso = datetime.now(timezone.utc).isoformat()
    for p in winner_plans:
        t = p["team"]
        update = {
            "status": p["new_status"],
            "awards": p["awards"],
            "winners_backfilled_at": now_iso,
            "winners_backfilled_source": "scripts/backfill_devpost_winners.py",
        }
        # Never overwrite an existing link; only fill an empty one.
        if not (t["data"].get("devpost_link") or "").strip():
            update["devpost_link"] = p["project"]["project_url"]
        # merge=True leaves every field not named in `update` untouched.
        db.collection("teams").document(t["id"]).set(update, merge=True)
        print(f" wrote teams/{t['id']} status={p['new_status']!r} awards={len(p['awards'])}")

    for p in link_only_plans:
        t = p["team"]
        db.collection("teams").document(t["id"]).set(
            {"devpost_link": p["project"]["project_url"]},
            merge=True,
        )
        print(f" wrote teams/{t['id']} devpost_link={p['project']['project_url']}")

    if unmatched_winners:
        print(f"\nWARNING: {len(unmatched_winners)} unmatched winner(s) - see list above.")
        sys.exit(2)
    print("\nDone.")
created by the off-by-one bug in +import_hackathon_users_from_csv.py's `parse_projects`. + +DRY-RUN BY DEFAULT. Pass --apply to mutate Firestore. + +Background +---------- +Devpost projects CSVs come in two layouts: one with a "Team Number" column +(24 cols, team members start at col 20) and one without (23 cols, team +members start at col 19). The old parser hard-coded the team-member tail to +start at col 21, so each team-member triplet read shifted by one or two +columns. That produced user docs like: + email_address = "kankipati" + name = "sathvikmalla17@gmail.com Abhishek" + imported = True + propel_id = "" + +These docs are linked into the team's `users[]` array as DocumentReferences. +This script: + 1. Finds every user doc matching the bogus fingerprint + imported == True AND + propel_id == "" AND + email_address present but not a valid email AND + import_source starts with "projects-" + 2. For each bogus user, walks their import_event_id's hackathon -> + teams[] -> users[] and finds every team that still references the + bogus user. + 3. Plans removal of the bogus user_ref from each team's users[] and + deletion of the user doc. + +Re-running the (now-fixed) `import_hackathon_users_from_csv.py --csv-type +projects` against the same event will import the real members that were +never created the first time. 
EMAIL_RE = re.compile(r"^[^\s@]+@[^\s@]+\.[^\s@]+$")


def looks_bogus(data):
    """Return True iff the user doc *data* matches the bug fingerprint.

    All of these must hold: ``imported`` is truthy, ``propel_id`` is blank,
    ``email_address`` is non-blank but does not match EMAIL_RE, and
    ``import_source`` starts with "projects-".
    """
    if not data.get("imported"):
        return False
    if (data.get("propel_id") or "").strip():
        return False
    ea = (data.get("email_address") or "").strip()
    if not ea or EMAIL_RE.match(ea):
        return False
    src = (data.get("import_source") or "").strip()
    return src.startswith("projects-")


def find_bogus_users(db, event_filter=None):
    """Return list of dicts {id, data} for every bogus user.

    Streams the whole users/ collection and filters client-side.

    event_filter: optional event_id; restricts to users with that import_event_id.
    """
    out = []
    for d in db.collection("users").stream():
        data = d.to_dict() or {}
        if event_filter and (data.get("import_event_id") or "") != event_filter:
            continue
        if looks_bogus(data):
            out.append({"id": d.id, "data": data})
    return out


def build_team_index_for_events(db, event_ids):
    """Return {team_doc_id: {ref, data, hackathon_event_id}} for every team
    linked to any of the given hackathon event_ids.

    Events with no hackathon doc are reported and skipped; team refs whose
    document no longer exists are dropped. A team linked from several events
    keeps the entry of the last event processed.
    """
    out = {}
    for eid in event_ids:
        snaps = list(db.collection("hackathons").where("event_id", "==", eid).stream())
        if not snaps:
            print(f"WARN: hackathon with event_id={eid!r} not found; skipping")
            continue
        snap = snaps[0]
        team_refs = (snap.to_dict() or {}).get("teams") or []
        if not team_refs:
            continue
        # get_all() does not promise to return snapshots in the order of the
        # references passed in, so take each snapshot's own reference rather
        # than pairing by position.
        for doc in db.get_all(team_refs):
            if not doc.exists:
                continue
            out[doc.id] = {
                "ref": doc.reference,
                "data": doc.to_dict() or {},
                "hackathon_event_id": eid,
            }
    return out


def plan_team_user_removals(bogus, teams_by_id):
    """Build the per-team removal plan.

    Returns {team_id: {team_data, hackathon_event_id,
    user_ids_to_remove: [str], user_names: [(id, name)]}} containing only
    teams whose ``users`` list references at least one bogus user. Entries
    of ``users`` without an ``.id`` attribute are ignored.
    """
    name_by_id = {u["id"]: (u["data"].get("name") or "") for u in bogus}
    per_team = {}
    for tid, t in teams_by_id.items():
        hits = [
            u_ref.id for u_ref in (t["data"].get("users") or [])
            if hasattr(u_ref, "id") and u_ref.id in name_by_id
        ]
        if hits:
            per_team[tid] = {
                "team_data": t["data"],
                "hackathon_event_id": t["hackathon_event_id"],
                "user_ids_to_remove": hits,
                "user_names": [(uid, name_by_id.get(uid, "")) for uid in hits],
            }
    return per_team


def main():
    """Command-line entry point: find bogus users, print the plan, apply it.

    Nothing is written unless --apply is given. On apply, team ``users``
    lists are pruned first and the bogus user docs are deleted afterwards.
    """
    ap = argparse.ArgumentParser(
        description="Delete bogus user docs left behind by the projects-CSV import off-by-one bug.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    ap.add_argument("--event-id",
                    help="Only clean up users whose import_event_id matches. "
                         "Default: clean every event.")
    ap.add_argument("--apply", action="store_true",
                    help="Write to Firestore. Default is dry-run.")
    args = ap.parse_args()

    db = get_db()

    print("Scanning users/ for bogus imported docs ...")
    bogus = find_bogus_users(db, event_filter=args.event_id)
    print(f"Found {len(bogus)} bogus user doc(s)")

    if not bogus:
        print("Nothing to do.")
        return

    by_event = defaultdict(list)
    for u in bogus:
        by_event[u["data"].get("import_event_id") or "(missing)"].append(u)

    print("\nBogus users grouped by import_event_id:")
    for eid, users in sorted(by_event.items()):
        print(f" {eid}: {len(users)} user(s)")
        for u in users:
            d = u["data"]
            print(f" - users/{u['id']} email={d.get('email_address')!r} name={d.get('name')!r}")

    if "(missing)" in by_event:
        # These users are still deleted below, but without an event id there
        # is no hackathon to walk, so references to them cannot be pruned.
        print(f"WARN: {len(by_event['(missing)'])} bogus user(s) have no import_event_id; "
              f"their team links cannot be located and may be left dangling")

    event_ids = [eid for eid in by_event.keys() if eid != "(missing)"]
    teams_by_id = build_team_index_for_events(db, event_ids)
    print(f"\nLoaded {len(teams_by_id)} team(s) across {len(event_ids)} event(s) to scan for links")

    per_team = plan_team_user_removals(bogus, teams_by_id)
    print(f"{len(per_team)} team(s) reference a bogus user doc")

    sep = "=" * 78
    print("\n" + sep)
    print(f"PLAN ({'APPLY' if args.apply else 'DRY-RUN'})")
    print(sep)

    for tid, plan in per_team.items():
        td = plan["team_data"]
        print(f"\n teams/{tid} name={td.get('name')!r} event={plan['hackathon_event_id']}")
        for uid, name in plan["user_names"]:
            print(f" - remove user_ref users/{uid} (name={name!r})")

    print(f"\nDelete {len(bogus)} bogus user doc(s):")
    for u in bogus:
        print(f" - users/{u['id']}")

    if not args.apply:
        print("\nDRY-RUN. Re-run with --apply to execute.")
        return

    print("\nApplying writes ...")
    # Step 1: remove bogus refs from each team's users[]
    # NOTE(review): the new list is computed from the snapshot read earlier;
    # a member added to the team in between would be dropped by this write.
    for tid, plan in per_team.items():
        to_remove = set(plan["user_ids_to_remove"])
        td = plan["team_data"]
        existing = td.get("users") or []
        new_users = [u for u in existing if not (hasattr(u, "id") and u.id in to_remove)]
        if len(new_users) == len(existing):
            continue
        db.collection("teams").document(tid).set({"users": new_users}, merge=True)
        print(f" teams/{tid}: pruned {len(existing) - len(new_users)} user ref(s)")

    # Step 2: delete the bogus user docs
    for u in bogus:
        db.collection("users").document(u["id"]).delete()
        print(f" deleted users/{u['id']}")

    print("\nDone.")


if __name__ == "__main__":
    main()
+ + The number of summary columns DIFFERS between exports (older 23-col CSVs + have no "Team Number"; newer 24-col CSVs do), so the team-member tail + starts at col 19 or 20 depending on the file. Anchor on the header column + "Team Member 1 First Name" to find the right offset for this file. + + Each parsed "email" is validated with looks_like_email; rows where the + triplet shifted off-axis produce a name-as-email and are skipped with a + warning rather than written as bogus user docs. """ teams = [] + skipped_bogus = [] with open(csv_path, newline="", encoding="utf-8") as f: reader = csv.reader(f) - next(reader, None) # skip header + header = next(reader, None) or [] + try: + i_tm1f = header.index("Team Member 1 First Name") + except ValueError: + raise SystemExit( + "projects CSV is missing 'Team Member 1 First Name' header; " + f"cannot locate team-member triplets. Header: {header}" + ) for row in reader: if not row: continue @@ -184,8 +196,7 @@ def parse_projects(csv_path): "last_name": submitter_last, "is_submitter": True, }) - # repeating triplets starting at column 21 (0-indexed) - tail = row[21:] + tail = row[i_tm1f:] for i in range(0, len(tail), 3): if i + 2 >= len(tail): break @@ -194,6 +205,9 @@ def parse_projects(csv_path): email = norm_email(tail[i + 2]) if not email: continue + if not looks_like_email(email): + skipped_bogus.append((title, first, last, email)) + continue members.append({ "email": email, "first_name": first, @@ -201,6 +215,12 @@ def parse_projects(csv_path): "is_submitter": False, }) teams.append({"team_name": title, "members": members}) + if skipped_bogus: + print(f"WARNING: skipped {len(skipped_bogus)} team-member triplet(s) where " + f"the 'email' value didn't look like an email " + f"(usually means the CSV triplet shifted off-axis):") + for team, first, last, email in skipped_bogus: + print(f" - team={team!r} first={first!r} last={last!r} email={email!r}") return teams diff --git a/services/hackathons_service.py 
b/services/hackathons_service.py index 0f2fe24..8a4e304 100644 --- a/services/hackathons_service.py +++ b/services/hackathons_service.py @@ -16,7 +16,7 @@ get_volunteer_from_db_by_event, get_volunteer_checked_in_from_db_by_event, ) -from common.utils.validators import validate_hackathon_data +from common.utils.validators import validate_hackathon_data_partial from common.utils.firestore_helpers import ( doc_to_json, doc_to_json_recursive, @@ -918,54 +918,57 @@ def save_hackathon(json_data, propel_id): send_slack_audit(action="save_hackathon", message="Saving/Updating", payload=json_data) try: - validate_hackathon_data(json_data) + data, skipped_fields = validate_hackathon_data_partial(json_data) - doc_id = json_data.get("id") or uuid.uuid1().hex + if skipped_fields: + logger.warning("save_hackathon: %d field(s) skipped due to validation errors: %s", len(skipped_fields), skipped_fields) + + doc_id = data.get("id") or uuid.uuid1().hex is_update = "id" in json_data hackathon_data = { - "title": json_data["title"], - "description": json_data["description"], - "location": json_data["location"], - "start_date": json_data["start_date"], - "end_date": json_data["end_date"], - "type": json_data["type"], - "image_url": json_data["image_url"], - "event_id": json_data["event_id"], - "links": json_data.get("links", []), - "countdowns": json_data.get("countdowns", []), - "constraints": json_data.get("constraints", { + "title": data["title"], + "description": data["description"], + "location": data["location"], + "start_date": data["start_date"], + "end_date": data["end_date"], + "type": data["type"], + "image_url": data["image_url"], + "event_id": data["event_id"], + "links": data.get("links", []), + "countdowns": data.get("countdowns", []), + "constraints": data.get("constraints", { "max_people_per_team": 5, "max_teams_per_problem": 10, "min_people_per_team": 2, }), - "donation_current": json_data.get("donation_current", { + "donation_current": data.get("donation_current", { 
"food": "0", "prize": "0", "swag": "0", "thank_you": "", }), - "donation_goals": json_data.get("donation_goals", { + "donation_goals": data.get("donation_goals", { "food": "0", "prize": "0", "swag": "0", }), - "timezone": json_data.get("timezone", "America/Phoenix"), - "event_photos": json_data.get("event_photos", []), - "social_posts": json_data.get("social_posts", []), + "timezone": data.get("timezone", "America/Phoenix"), + "event_photos": data.get("event_photos", []), + "social_posts": data.get("social_posts", []), "last_updated": firestore.SERVER_TIMESTAMP, "last_updated_by": propel_id, } - if "planning" in json_data: - hackathon_data["planning"] = json_data["planning"] + if "planning" in data: + hackathon_data["planning"] = data["planning"] - if "nonprofits" in json_data: - hackathon_data["nonprofits"] = [db.collection("nonprofits").document(npo) for npo in json_data["nonprofits"]] - if "teams" in json_data: - hackathon_data["teams"] = [db.collection("teams").document(team) for team in json_data["teams"]] - if "visible_problem_statements" in json_data: - hackathon_data["visible_problem_statements"] = json_data["visible_problem_statements"] + if "nonprofits" in data: + hackathon_data["nonprofits"] = [db.collection("nonprofits").document(npo) for npo in data["nonprofits"]] + if "teams" in data: + hackathon_data["teams"] = [db.collection("teams").document(team) for team in data["teams"]] + if "visible_problem_statements" in data: + hackathon_data["visible_problem_statements"] = data["visible_problem_statements"] @firestore.transactional def update_hackathon(transaction): @@ -983,9 +986,10 @@ def update_hackathon(transaction): clear_cache() logger.info(f"Hackathon {'updated' if is_update else 'created'} successfully. 
ID: {doc_id}") - return Message( - "Saved Hackathon" - ) + msg = Message("Saved Hackathon") + if skipped_fields: + msg.skipped_fields = skipped_fields + return msg except ValueError as ve: logger.error(f"Validation error: {str(ve)}") diff --git a/services/news_service.py b/services/news_service.py index 4960d38..2715a73 100644 --- a/services/news_service.py +++ b/services/news_service.py @@ -40,10 +40,10 @@ def save_news(json): cdn_dir = "ohack.dev/news" try: news_image = generate_and_save_image_to_cdn(cdn_dir, json["title"]) + json["image"] = f"{CDN_SERVER}/{cdn_dir}/{news_image}" except Exception as e: logger.exception(f"Image generation failed for title '{json['title']}': {e}") - raise - json["image"] = f"{CDN_SERVER}/{cdn_dir}/{news_image}" + json["image"] = None json["last_updated"] = datetime.now().isoformat() news_id = upsert_news(json) diff --git a/services/nonprofits_service.py b/services/nonprofits_service.py index eaaef1f..905c1fb 100644 --- a/services/nonprofits_service.py +++ b/services/nonprofits_service.py @@ -13,13 +13,15 @@ from common.utils.slack import send_slack_audit, send_slack from common.utils.validators import validate_email, validate_url from common.exceptions import InvalidInputError -from common.utils.firestore_helpers import doc_to_json, doc_to_json_recursive +from common.utils.firestore_helpers import doc_to_json, register_cache from api.messages.message import Message logger = get_logger("nonprofits_service") ONE_MINUTE = 1*60 +_npo_list_cache: TTLCache = TTLCache(maxsize=1, ttl=300) # 5-min TTL; cleared on save/update/delete + def _get_db(): from db.db import get_db @@ -29,6 +31,7 @@ def _get_db(): def _clear_cache(): from services.hackathons_service import clear_cache clear_cache() + _npo_list_cache.clear() # ==================== Model-based functions (existing) ==================== @@ -186,20 +189,30 @@ def get_npo_by_hackathon_id(id): @limits(calls=20, period=ONE_MINUTE) -def get_npo_list(word_length=30): 
+@cached(cache=_npo_list_cache) +def get_npo_list(): logger.debug("NPO List Start") db = _get_db() - docs = db.collection('nonprofits').order_by( "rank" ).stream() - if docs is None: - return {[]} - else: - results = [] - for doc in docs: - logger.debug(f"Processing doc {doc.id} {doc}") - results.append(doc_to_json_recursive(doc=doc)) - - logger.debug(f"Found {len(results)} results {results}") - return { "nonprofits": results } + # stream() without order_by so docs missing 'rank' are included; sort in Python. + docs = db.collection('nonprofits').stream() + + results = [] + for doc in docs: + results.append(doc_to_json(docid=doc.id, doc=doc)) + + def rank_key(x): + rank = x.get('rank') + if rank is None: + return (True, 0) + try: + return (False, int(rank)) + except (ValueError, TypeError): + return (True, 0) + + results.sort(key=rank_key) + + logger.debug("NPO List end", extra={"count": len(results)}) + return {"nonprofits": results} @limits(calls=100, period=ONE_MINUTE) diff --git a/services/problem_statements_service.py b/services/problem_statements_service.py index ee5c2f2..62f80f3 100644 --- a/services/problem_statements_service.py +++ b/services/problem_statements_service.py @@ -21,6 +21,8 @@ ONE_MINUTE = 60 CACHE_TTL = 600 # 10 minutes +_ps_list_cache: TTLCache = TTLCache(maxsize=1, ttl=CACHE_TTL) + @limits(calls=50, period=ONE_MINUTE) def save_problem_statement(d): """ @@ -40,7 +42,7 @@ def save_problem_statement(d): # Clear relevant caches get_problem_statement.cache_clear() - #get_problem_statements.cache_clear() + _ps_list_cache.clear() send_slack_audit(action="save_problem_statement", message="Saving", payload=d) @@ -83,14 +85,14 @@ def remove_problem_statement(id): # Clear caches get_problem_statement.cache_clear() - #get_problem_statements.cache_clear() + _ps_list_cache.clear() return result except Exception as e: exception(logger, "Error deleting problem statement", exc_info=e, id=id) raise -# @cached(cache=TTLCache(maxsize=1, ttl=CACHE_TTL), 
key=lambda: hashkey('all_problem_statements')) +@cached(cache=_ps_list_cache) def get_problem_statements(): """Get all problem statements""" return fetch_problem_statements() diff --git a/services/teams_service.py b/services/teams_service.py index 3c85bb0..36a47e1 100644 --- a/services/teams_service.py +++ b/services/teams_service.py @@ -6,7 +6,7 @@ from firebase_admin import firestore from common.log import get_logger -from common.utils.firestore_helpers import doc_to_json, log_execution_time +from common.utils.firestore_helpers import doc_to_json, log_execution_time, register_cache from common.utils.slack import send_slack_audit, send_slack, create_slack_channel, invite_user_to_channel from common.utils.github import create_github_repo, validate_github_username, get_all_repos from common.utils.firebase import get_hackathon_by_event_id @@ -59,8 +59,57 @@ def get_teams_list(id=None): return { "teams": results } +def _enrich_team_users(team_data, db): + """ + Replace team_data["users"] (list of user doc-id strings, as flattened by + doc_to_json) with a list of slim profile dicts so the public team page can + render real names + avatars without N round-trips from the frontend. + + Safe to call repeatedly; a missing user doc becomes a stub instead of 500ing + the whole team request. 
+ """ + user_ids = team_data.get("users") + if not user_ids or not isinstance(user_ids, list): + return team_data + + user_refs = [db.collection("users").document(uid) for uid in user_ids if isinstance(uid, str)] + if not user_refs: + return team_data + + try: + snapshots = db.get_all(user_refs) + except Exception as e: + logger.warning(f"_enrich_team_users get_all failed, leaving ids in place: {e}") + return team_data + + enriched = [] + for snap in snapshots: + try: + if not snap.exists: + enriched.append({"id": snap.id, "name": None, "nickname": None, "profile_image": None, "user_id": None}) + continue + d = snap.to_dict() or {} + enriched.append({ + "id": snap.id, + "user_id": d.get("user_id"), + "name": d.get("name"), + "nickname": d.get("nickname"), + "profile_image": d.get("profile_image"), + }) + except Exception as e: + logger.warning(f"_enrich_team_users per-user failure ({snap.id}): {e}") + enriched.append({"id": snap.id, "name": None, "nickname": None, "profile_image": None, "user_id": None}) + + team_data["users"] = enriched + return team_data + + +_GET_TEAM_CACHE = TTLCache(maxsize=100, ttl=600) +register_cache(_GET_TEAM_CACHE) + + @limits(calls=2000, period=THIRTY_SECONDS) -@cached(cache=TTLCache(maxsize=100, ttl=600), key=lambda id: id) +@cached(cache=_GET_TEAM_CACHE, key=lambda id: id) @log_execution_time def get_team(id): if id is None: @@ -79,6 +128,7 @@ def get_team(id): return {} team_data = doc_to_json(docid=doc.id, doc=doc) + team_data = _enrich_team_users(team_data, db) logger.info(f"Successfully retrieved team with id={id}") return { "team" : team_data