From 8ef4d225007840b52aff2b324147c42f8ed91d60 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 6 May 2026 13:42:13 -0400 Subject: [PATCH 001/165] Include additional Augur copyright holder names in the CREDITS.md file Signed-off-by: Adrian Edwards --- CREDITS.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CREDITS.md b/CREDITS.md index 238478d63..3305fd3ca 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -13,6 +13,8 @@ The list of current CollectOSS maintainers can be found in the [MAINTAINERS](./M Augur has been supported by the University of Missouri through funding provided by the Alfred P. Sloan Foundation, Mozilla, The Reynolds Journalism Institute with contributions from VMWare, Red Hat LLC, Grace Hopper's Open Source Day, GitHub, Microsoft, Twitter, Adobe, the Gluster Project, Open Source Summit (NA/Europe), and the Linux Foundation Compliance Summit. +Augur has also been supported by the University of Nebraska at Omaha. + Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien, Remy Decausemaker, Google Summer of Code Students, and others including: ### Maintainers @@ -43,6 +45,18 @@ Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien - [Gary P. 
White](https://github.com/garypwhite) - [Shlok Gilda](https://github.com/shlokgilda) +### Credited Copyright Holders +These names came from the LICENSE.md file in the Augur project: +- Matt Germonprez +- Sean Goggins +- Gabe Heim +- Derek Howard +- Carter Landis +- Matt Snell +- Brian Warner +- University of Nebraska at Omaha +- University of Missouri + ### GSoC 2025 Participants - [Akshat Baranwal](https://github.com/akshatb2006) - [Asish Kumar](https://github.com/officialasishkumar) From 81e824c16ba5a98efa1e6d3c0e41393c8071bbcb Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 19 Mar 2026 14:28:05 -0400 Subject: [PATCH 002/165] add a function in GitHubDataAccess to query pull request availability Signed-off-by: Adrian Edwards --- collectoss/tasks/github/pull_requests/tasks.py | 4 ++++ .../tasks/github/util/github_data_access.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/collectoss/tasks/github/pull_requests/tasks.py b/collectoss/tasks/github/pull_requests/tasks.py index f8966ee6e..58362c9ba 100644 --- a/collectoss/tasks/github/pull_requests/tasks.py +++ b/collectoss/tasks/github/pull_requests/tasks.py @@ -79,6 +79,10 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ github_data_access = GithubDataAccess(key_auth, logger) + if not github_data_access.check_prs_enabled(owner, repo): + logger.info(f"{owner}/{repo}: Pull requests appear to be disabled for this repo. 
Skipping.") + return + num_pages = github_data_access.get_resource_page_count(url) logger.debug(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") diff --git a/collectoss/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py index 18256fe68..64d0f5d4e 100644 --- a/collectoss/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -60,6 +60,22 @@ def get_resource_count(self, url): return (100 * (num_pages -1)) + len(data) + def check_prs_enabled(self, owner: str, repo: str,) -> bool: + """ + Checks whether pull requests are enabled for a repository. + Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. + """ + + url = f"https://api.github.com/repos/{owner}/{repo}/pulls?per_page=1" + + try: + self.get_resource_page_count(url) + return True + except UrlNotFoundException: + self.logger.info(f"{owner}/{repo}: Pull requests are disabled. Skipping PR collection.") + return False + + def paginate_resource(self, url): response = self.make_request_with_retries(url) From 247777e1d330462b494d52140de6b095cbbf0a9b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 09:33:38 -0400 Subject: [PATCH 003/165] factor URL creation into a function in GithubDataAccess for better handling of future query param encoding needs and sharing responsibility for url creation (GhDA owns the domain/base url, the caller owns the path and query params they want to use) Signed-off-by: Adrian Edwards --- .../tasks/github/util/github_data_access.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/collectoss/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py index 64d0f5d4e..38d8e2deb 100644 --- a/collectoss/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -5,6 +5,7 @@ from urllib.parse import urlparse, parse_qs, urlencode from 
keyman.KeyClient import KeyClient from collectoss.util.keys import mask_key +import urllib.parse GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -44,6 +45,35 @@ def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.key = None self.expired_keys_for_request = [] + def endpoint_url(self, path: str, params: dict = None) -> str: + """Build a URL for a github endpoint using the specified path and query parameters + + Args: + path (str): the path to use (i.e. "/users/MoralCode") + params (dict): optional query parameters to add to the url, as a dict + + Returns: + str: the full URL to the specified resource. + """ + # using pythons url processing library this way helps handle accidental + # inclusion of query parameters in the path string, ensuring all query + # parameters are properly encoded and escaped + + input_url_parts = urllib.parse.urlsplit(path) + final_query_parameters = dict() + + if input_url_parts.query != '': + final_query_parameters.update( + parse_qs(input_url_parts.query) + ) + + if params != None: + final_query_parameters.update(params) + + return urllib.parse.urlunsplit( + ('https', 'api.github.com', input_url_parts.path, urllib.parse.urlencode(final_query_parameters), '') + ) + def get_resource_count(self, url): # set per_page to 100 explicitly so we know each page is 100 long @@ -65,10 +95,8 @@ def check_prs_enabled(self, owner: str, repo: str,) -> bool: Checks whether pull requests are enabled for a repository. Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. 
""" - - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?per_page=1" - try: + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) self.get_resource_page_count(url) return True except UrlNotFoundException: From 9878d80bf6bdf0af3198b3226de2f16db0a3dbe4 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 09:34:08 -0400 Subject: [PATCH 004/165] perform pull requests lookup with the new endpoint url builder Signed-off-by: Adrian Edwards --- collectoss/tasks/github/pull_requests/tasks.py | 5 +++-- collectoss/tasks/github/util/github_data_access.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/collectoss/tasks/github/pull_requests/tasks.py b/collectoss/tasks/github/pull_requests/tasks.py index 58362c9ba..3efaddf3b 100644 --- a/collectoss/tasks/github/pull_requests/tasks.py +++ b/collectoss/tasks/github/pull_requests/tasks.py @@ -75,10 +75,11 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ logger.debug(f"Collecting pull requests for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated" - github_data_access = GithubDataAccess(key_auth, logger) + search_args = {"state": "all", "direction": "desc", "sort": "updated"} + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", search_args) + if not github_data_access.check_prs_enabled(owner, repo): logger.info(f"{owner}/{repo}: Pull requests appear to be disabled for this repo. Skipping.") return diff --git a/collectoss/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py index 38d8e2deb..618322390 100644 --- a/collectoss/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -96,7 +96,7 @@ def check_prs_enabled(self, owner: str, repo: str,) -> bool: Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. 
""" try: - url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) + url = self.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) self.get_resource_page_count(url) return True except UrlNotFoundException: From babd132b043daaf4ecf4d92731c74747b7655461 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 19:29:31 -0400 Subject: [PATCH 005/165] add some deprecation warnings to functions that should be replaced by this in the future Signed-off-by: Adrian Edwards --- .../contributor_interfaceable/contributor_interface.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 1e064f033..78ab1a47c 100644 --- a/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -8,7 +8,7 @@ # Debugger from collectoss.tasks.github.util.github_paginator import GithubApiResult from collectoss.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id - +from typing_extensions import deprecated ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -107,7 +107,7 @@ def request_dict_from_endpoint(logger, session, url, timeout_wait=10): return response_data - +@deprecated("Please use GithubDataAcess.endpoint_url() instead") def create_endpoint_from_email(email): # Note: I added "+type:user" to avoid having user owned organizations be returned # Also stopped splitting per note above. 
@@ -117,7 +117,7 @@ def create_endpoint_from_email(email): return url - +@deprecated("Please use GithubDataAcess.endpoint_url() instead") def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): logger.debug( f"Trying to create endpoint from commit hash: {commit_sha}") From 0496029ae59c34262bdbc916da00a0837e8bf610 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 27 Mar 2026 08:21:26 -0400 Subject: [PATCH 006/165] refactor to reuse an existing internal function, rather than rewriting what it already does Signed-off-by: Adrian Edwards --- .../tasks/github/util/github_data_access.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/collectoss/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py index 618322390..0fc017124 100644 --- a/collectoss/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -5,7 +5,6 @@ from urllib.parse import urlparse, parse_qs, urlencode from keyman.KeyClient import KeyClient from collectoss.util.keys import mask_key -import urllib.parse GITHUB_RATELIMIT_REMAINING_CAP = 50 @@ -55,24 +54,16 @@ def endpoint_url(self, path: str, params: dict = None) -> str: Returns: str: the full URL to the specified resource. 
""" - # using pythons url processing library this way helps handle accidental + # using pythons url processing library helps handle accidental # inclusion of query parameters in the path string, ensuring all query # parameters are properly encoded and escaped - input_url_parts = urllib.parse.urlsplit(path) - final_query_parameters = dict() + if not path.startswith("/"): + path = "/" + path - if input_url_parts.query != '': - final_query_parameters.update( - parse_qs(input_url_parts.query) - ) - - if params != None: - final_query_parameters.update(params) + url = "https://api.github.com" + path - return urllib.parse.urlunsplit( - ('https', 'api.github.com', input_url_parts.path, urllib.parse.urlencode(final_query_parameters), '') - ) + return self.__add_query_params(url, params or {}) def get_resource_count(self, url): From 73f3b6cee5dd76166476714fb703b980a3eead79 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 19:07:28 -0400 Subject: [PATCH 007/165] mark a lib config fetcher as deprecated for future removal Signed-off-by: Adrian Edwards --- collectoss/application/db/lib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collectoss/application/db/lib.py b/collectoss/application/db/lib.py index 4d719d1ac..ed7613d11 100644 --- a/collectoss/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -9,6 +9,7 @@ from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union +from typing_extensions import deprecated from collectoss.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup from collectoss.tasks.util.collection_state import CollectionState @@ -18,7 +19,7 @@ logger = logging.getLogger("db_lib") - +@deprecated("This is a legacy method. 
Use AugurConfig.get_value instead") def get_value(section_name: str, setting_name: str) -> Optional[Any]: """Get the value of a setting from the config. From 9a68982f351efdf0c46570dc62062d952f7579b4 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 19:08:13 -0400 Subject: [PATCH 008/165] mark GitHubRandomKeyAuth as deprecated Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/github_random_key_auth.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/collectoss/tasks/github/util/github_random_key_auth.py b/collectoss/tasks/github/util/github_random_key_auth.py index 6797ba785..1dbbd2d65 100644 --- a/collectoss/tasks/github/util/github_random_key_auth.py +++ b/collectoss/tasks/github/util/github_random_key_auth.py @@ -3,7 +3,9 @@ from collectoss.tasks.util.random_key_auth import RandomKeyAuth from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from sqlalchemy.orm import Session +from typing_extensions import deprecated +@deprecated("This class is deprecated. 
Use the KeyClient interface to the Keymanager process instead.") class GithubRandomKeyAuth(RandomKeyAuth): """Defines a github specific RandomKeyAuth class so github collections can have a class randomly selects an api key for each request From 7327356663fc43c934a05dee4f12251ec83af251 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 20 Mar 2026 19:21:14 -0400 Subject: [PATCH 009/165] find some deprecated things based on comments Signed-off-by: Adrian Edwards --- .../git/util/facade_worker/facade_worker/repofetch.py | 8 ++++---- .../contributor_interfaceable/contributor_interface.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py index 658ddc1d0..3f7ab07e9 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -36,6 +36,7 @@ from collectoss.application.db.models.augur_operations import CollectionStatus from collectoss.application.db.util import execute_session_query, convert_orm_list_to_dict_list from collectoss.application.db.lib import execute_sql, get_repo_by_repo_git +from typing_extensions import deprecated class GitCloneError(Exception): pass @@ -174,8 +175,7 @@ def git_repo_initialize(facade_helper, session, repo_git): facade_helper.log_activity('Info', f"Fetching new repos (complete)") -# Deprecated functionality. No longer used -# Should be re-purposed in start_tasks when tasks are being scheduled +@deprecated("Deprecated functionality. No longer used. Should be re-purposed in start_tasks when tasks are being scheduled") def check_for_repo_updates(session, repo_git): # Check the last time a repo was updated and if it has been longer than the @@ -244,7 +244,7 @@ def check_for_repo_updates(session, repo_git): # Deprecated. No longer used. 
- +@deprecated("This functionality is deprecated and won't work with present facade versions") def force_repo_updates(session, repo_git): raise NotImplementedError( "This functionality is deprecated and won't work with present facade versions") @@ -263,7 +263,7 @@ def force_repo_updates(session, repo_git): # Deprecated. No longer used. - +@deprecated("This functionality is deprecated and won't work with present facade versions") def force_repo_analysis(session, repo_git): raise NotImplementedError( "This functionality is deprecated and won't work with present facade versions") diff --git a/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 1e064f033..36fb67488 100644 --- a/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -8,7 +8,7 @@ # Debugger from collectoss.tasks.github.util.github_paginator import GithubApiResult from collectoss.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id - +from typing_extensions import deprecated ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -27,7 +27,7 @@ def clean_dict(d): return {k: ("" if v is None else v) for k, v in d.items()} -# deprecated in favor of GithubDataAcess.get_resource() +@deprecated("Please use GithubDataAcess.get_resource() instead") def request_dict_from_endpoint(logger, session, url, timeout_wait=10): """Hit the endpoint specified by the url and return the json that it returns if it returns a dict. 
From d1971e5fa2a895c3eef18e5299f11a98ffe04ddc Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 7 May 2026 09:24:53 -0400 Subject: [PATCH 010/165] deprecate hit_api Signed-off-by: Adrian Edwards --- collectoss/tasks/github/util/github_paginator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collectoss/tasks/github/util/github_paginator.py b/collectoss/tasks/github/util/github_paginator.py index 990bc4f73..4a5469552 100644 --- a/collectoss/tasks/github/util/github_paginator.py +++ b/collectoss/tasks/github/util/github_paginator.py @@ -7,8 +7,9 @@ from typing import Optional from enum import Enum +from typing_extensions import deprecated - +@deprecated("Deprecated. Use GithubDataAccess class instead") def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', follow_redirects=True) -> Optional[httpx.Response]: """Ping the api and get the data back for the page. From 2a812259b3f52fcc3b4e484819c8bbcf585fc2d4 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 7 May 2026 15:28:27 -0400 Subject: [PATCH 011/165] Introduce new table definition for forge_instance operational table. 
Signed-off-by: Adrian Edwards --- .../application/db/models/augur_operations.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/collectoss/application/db/models/augur_operations.py b/collectoss/application/db/models/augur_operations.py index 41a4cef6b..cd68a1fa4 100644 --- a/collectoss/application/db/models/augur_operations.py +++ b/collectoss/application/db/models/augur_operations.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from sqlalchemy import BigInteger, SmallInteger, Column, Index, Integer, String, Table, text, UniqueConstraint, Boolean, ForeignKey, update, CheckConstraint, Sequence +from sqlalchemy import BigInteger, SmallInteger, Column, Index, Integer, String, Table, text, UniqueConstraint, Boolean, ForeignKey, update, CheckConstraint, Sequence, DateTime, func from sqlalchemy.dialects.postgresql import TIMESTAMP from sqlalchemy.orm.exc import NoResultFound, MultipleResultsFound from sqlalchemy.exc import IntegrityError @@ -1072,6 +1072,20 @@ def get_by_id(session, client_id): session.rollback() raise e +class ForgeInstance(Base): + __tablename__ = "forge_instance" + __table_args__ = { "schema": "augur_operations" } + + id = Column(Integer, primary_key=True, nullable=False, comment="Internal unique identifier for this forge instance") + # platform_type stores an integer that CollectOSS maps/will map to it's internal platform + # identifer Enum (as used in ContributorUUID) for identifying the API endpoints and tasks to use for collection + platform_type = Column(Integer, nullable=False, comment="Type specifier identifying the relevant platform API interface to CollectOSS") + name = Column(String, nullable=False, comment="User-specified name for this forge instance") + # https://stackoverflow.com/a/54800233 + date_added = Column(DateTime(timezone=True), nullable=False, default=func.now()) + domain_name = Column(String, nullable=False, comment="The base domain name (without the scheme) where this instance is hosted") + enabled = 
Column(Boolean, default=True, nullable=False, comment="denotes whether collection should run for this instance") + class Subscription(Base): __tablename__ = "subscriptions" From 52eacc0c9ffb8ea3ff3754ab620245b7670f3c7a Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 7 May 2026 15:52:39 -0400 Subject: [PATCH 012/165] create new alembic revision for forge_instance table Signed-off-by: Adrian Edwards --- .../42_introduce_empty_instances_table.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py diff --git a/collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py b/collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py new file mode 100644 index 000000000..fd80723d6 --- /dev/null +++ b/collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py @@ -0,0 +1,37 @@ +"""introduce empty instances table + +Revision ID: 42 +Revises: 41 +Create Date: 2026-05-07 15:51:17.510641 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '42' +down_revision = '41' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('forge_instance', + sa.Column('id', sa.Integer(), nullable=False, comment='Internal unique identifier for this forge instance'), + sa.Column('platform_type', sa.Integer(), nullable=False, comment='Type specifier identifying the relevant platform API interface to CollectOSS'), + sa.Column('name', sa.String(), nullable=False, comment='User-specified name for this forge instance'), + sa.Column('date_added', sa.DateTime(timezone=True), nullable=False), + sa.Column('domain_name', sa.String(), nullable=False, comment='The base domain name (without the scheme) where this instance is hosted'), + sa.Column('enabled', sa.Boolean(), nullable=False, comment='denotes whether collection should run for this instance'), + sa.PrimaryKeyConstraint('id'), + schema='augur_operations' + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('forge_instance', schema='augur_operations') + # ### end Alembic commands ### From 69a123b4296392b193e8b46624db455c2058be8d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 11:04:04 -0400 Subject: [PATCH 013/165] correct comment typo Signed-off-by: Adrian Edwards --- collectoss/application/db/models/augur_operations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collectoss/application/db/models/augur_operations.py b/collectoss/application/db/models/augur_operations.py index cd68a1fa4..760ea6c1a 100644 --- a/collectoss/application/db/models/augur_operations.py +++ b/collectoss/application/db/models/augur_operations.py @@ -1077,8 +1077,8 @@ class ForgeInstance(Base): __table_args__ = { "schema": "augur_operations" } id = Column(Integer, primary_key=True, nullable=False, comment="Internal unique identifier for this forge instance") - # platform_type stores an integer that CollectOSS maps/will map to it's internal platform - # identifer Enum (as used in ContributorUUID) for identifying the API endpoints and 
tasks to use for collection + # platform_type stores an integer that CollectOSS maps/will map to it's internal platform identifier Enum + # (as used in ContributorUUID) for identifying the API endpoints and tasks to use for collection platform_type = Column(Integer, nullable=False, comment="Type specifier identifying the relevant platform API interface to CollectOSS") name = Column(String, nullable=False, comment="User-specified name for this forge instance") # https://stackoverflow.com/a/54800233 From 74dadd16e16f1132aaf9e61b1fc9750b75e578c6 Mon Sep 17 00:00:00 2001 From: Phanindra899 Date: Thu, 14 May 2026 13:37:35 +0530 Subject: [PATCH 014/165] Improve UrlNotFoundException logging context Signed-off-by: Phanindra899 --- .../contributor_breadth_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 108326b50..4c8aec067 100644 --- a/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -113,11 +113,11 @@ def contributor_breadth_model(self) -> None: if len(cntrb_events) == 0: logger.info("There are no cntrb events, or new events for this user.\n") continue - except UrlNotFoundException as e: - logger.warning(e) + logger.warning( + f"UrlNotFoundException while processing contributor {cntrb['gh_login']}: {e}" + ) continue - events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) logger.info(f"Inserting {len(events)} events") From 6463450d5a62a841fb5c88f42e9fbb3e9ed80280 Mon Sep 17 00:00:00 2001 From: Lovelace <61972457+Lovlace777@users.noreply.github.com> Date: Sat, 16 May 2026 23:09:40 +0800 Subject: [PATCH 015/165] docs: update README metrics link Signed-off-by: Lovelace 
<61972457+Lovlace777@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee4782dce..be144d755 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ CollectOSS's main focus is to measure the overall health and sustainability of o The data CollectOSS collects covers more than just code contributions and extends to anything that can be derived from forge data, including comments, change reviews, releases, and other project activity or interactions. This data is stored in a relational database (PostgreSQL), enabling large-scale data aggregation across any number of repositories to provide context about the way these communities evolve. -CollectOSS is part of [CHAOSS](https://chaoss.community), which is a Linux Foundation® project. Many of our metrics are implementations of the [metrics](https://chaoss.community/metrics/) defined by the CHAOSS community. +CollectOSS is part of [CHAOSS](https://chaoss.community), which is a Linux Foundation® project. Many of our metrics are implementations of the [metrics](https://chaoss.community/kb-metrics-and-metrics-models/) defined by the CHAOSS community. ## Versions and support CollectOSS is a Python project distributed via container images and aims to support all currently-supported versions of Python on macOS and Linux platforms. Docker is the primary supported container runtime, but Podman is also supported and used by some maintainers, although it requires configuring some extra permissions to run correctly. From ab2d2acad7d1f14463e2754154ebdce8b506fc04 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 18 May 2026 13:58:22 -0400 Subject: [PATCH 016/165] return early with a log message if badges model data is not correctly found This needs a larger refactor at a later time, this is just a small improvement to prevent a crash when data cannot be found. 
Signed-off-by: Adrian Edwards --- collectoss/tasks/github/repo_info/core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/collectoss/tasks/github/repo_info/core.py b/collectoss/tasks/github/repo_info/core.py index 55b1def2a..25b1b25d1 100644 --- a/collectoss/tasks/github/repo_info/core.py +++ b/collectoss/tasks/github/repo_info/core.py @@ -282,6 +282,10 @@ def badges_model(logger,repo_git,repo_id,db): #Hit cii api with no api key. response = hit_api(None, url, logger) + if not response: + logger.error(f"An error occurred fetching data from {url} in badges_model") + return + try: response_data = response.json() except: From 0de09121b912385ce77f51042637ede2f51ce7a5 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 16 Mar 2026 10:49:04 -0400 Subject: [PATCH 017/165] inject application name into engine connection args based on https://stackoverflow.com/questions/15685861/setting-application-name-on-postgres-sqlalchemy Signed-off-by: Adrian Edwards --- collectoss/api/server.py | 2 +- collectoss/application/cli/db.py | 2 +- collectoss/application/db/__init__.py | 4 ++-- conftest.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/collectoss/api/server.py b/collectoss/api/server.py index 2c71dfcd1..a4d212f58 100644 --- a/collectoss/api/server.py +++ b/collectoss/api/server.py @@ -329,7 +329,7 @@ def get_server_cache(cache_manager) -> Cache: logger = SystemLogger("server").get_logger() url = get_database_string() -engine = create_database_engine(url, poolclass=StaticPool) +engine = create_database_engine(url, poolclass=StaticPool, connect_args={"application_name": f"collectoss v{code_version} api"}) db_session = DatabaseSession(logger, engine) system_config = SystemConfig(logger, db_session) diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index fd5db52cf..1a790e3c0 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -511,7 +511,7 @@ def 
run_psql_command_in_database(target_type, target): database_name = db_config["database_name"] db_conn_string = f"postgresql+psycopg2://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database_name']}" - engine = s.create_engine(db_conn_string) + engine = s.create_engine(db_conn_string, connect_args={"application_name": f"collectoss cli"}) check_call( [ diff --git a/collectoss/application/db/__init__.py b/collectoss/application/db/__init__.py index b10b00b44..84e2d977f 100644 --- a/collectoss/application/db/__init__.py +++ b/collectoss/application/db/__init__.py @@ -12,7 +12,7 @@ def get_engine(): if engine is None: url = get_database_string() - engine = create_database_engine(url=url, poolclass=StaticPool) + engine = create_database_engine(url=url, poolclass=StaticPool, connect_args={"application_name": f"collectoss"}) Session = sessionmaker(bind=engine) return engine @@ -42,7 +42,7 @@ def get_session(): def temporary_database_engine(): url = get_database_string() - temporary_database_engine = create_database_engine(url=url, poolclass=StaticPool) + temporary_database_engine = create_database_engine(url=url, poolclass=StaticPool, connect_args={"application_name": f"collectoss temporary/testing"}) try: yield temporary_database_engine diff --git a/conftest.py b/conftest.py index 55eae98b5..db2e95b78 100644 --- a/conftest.py +++ b/conftest.py @@ -14,7 +14,6 @@ from collectoss.application.config import SystemConfig from collectoss.application.db.engine import get_database_string, create_database_engine, parse_database_string, execute_sql_file - logger = logging.getLogger(__name__) default_repo_id = "25430" @@ -104,7 +103,8 @@ def generate_db_from_template(template_name): create_database(conn, cursor, test_db_name, template_name) # create engine to connect to db - engine = create_database_engine(test_db_string, poolclass=StaticPool) + engine = create_database_engine(test_db_string, poolclass=StaticPool, 
connect_args={"application_name": f"collectoss tests"}) + yield engine From 6b23c4052b2962c893afe6efac610e016ff443e3 Mon Sep 17 00:00:00 2001 From: Inengs Date: Tue, 3 Mar 2026 10:13:46 +0100 Subject: [PATCH 018/165] tests: add unit tests for AugurUUID and subclasses Signed-off-by: Inengs --- .../test_util/test_augur_uuid.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py diff --git a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py new file mode 100644 index 000000000..0202980dc --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py @@ -0,0 +1,86 @@ +import pytest +import uuid +from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, GitlabUUID, UnresolvableUUID + +# AugurUUID tests + +# this checks whether a brand new AugurUUID object starts as 16 zero bytes +def test_augur_uuid_initializes_with_16_zero_bytes(): + uid = AugurUUID() + assert len(uid.bytes) == 16 + assert all(b == 0 for b in uid.bytes) + +# checks that githubUUID sets its platform number to 1 +def test_github_uuid_platform_is_1(): + uid = GithubUUID() + assert uid["platform"] == 1 + +# checks that gitlabUUID sets its platform number to 2 +def test_gitlab_uuid_platform_is_2(): + uid = GitlabUUID() + assert uid["platform"] == 2 + +def test_unresolvable_uuid_platform_is_0(): + uid = UnresolvableUUID() + assert uid["platform"] == 0 + +# checks the that you can store a value in the user field +def test_github_uuid_set_user(): + uid = GithubUUID() + uid["user"] = 12345 + assert uid["user"] == 12345 + +# checks the that you can store a value in the user field +def test_gitlab_uuid_set_user(): + uid = GitlabUUID() + uid["user"] = 99999 + assert uid["user"] == 99999 + +# checks that to_UUID returs the uuid.UUID object +def test_to_uuid_returns_valid_uuid(): + uid = GithubUUID() + uid["user"] = 
15 + result = uid.to_UUID() + assert isinstance(result, uuid.UUID) + +# checks that set_byte correctly rejects a value that is too large +def test_set_byte_raises_on_invalid_value(): + uid = AugurUUID() + with pytest.raises(ValueError): + uid.set_byte(0, 256) # too big for one byte + +# checks that set_byte rejects an index that doesnt exist +def test_set_byte_raises_on_out_of_range_index(): + uid = AugurUUID() + with pytest.raises(IndexError): + uid.set_byte(16, 1) # index 16 is out of bounds + +# checks that 2 UUIDs with the same values are considered equal. +def test_equality(): + uid1 = GithubUUID() + uid2 = GithubUUID() + uid1["user"] = 100 + uid2["user"] = 100 + assert uid1 == uid2 + +# checks that 2 UUIDs with different values are not equal +def test_inequality(): + uid1 = GithubUUID() + uid2 = GithubUUID() + uid1["user"] = 100 + uid2["user"] = 200 + assert uid1 != uid2 + +# checks that writeint correctly rejects a number +def test_write_int_raises_on_overflow(): + uid = GithubUUID() + with pytest.raises(ValueError): + uid["user"] = 99999999999 # too big for 4 bytes + +# checks that the same user produces different user IDs across platforms +def test_github_and_gitlab_different_for_same_user(): + github_uid = GithubUUID() + gitlab_uid = GitlabUUID() + github_uid["user"] = 100 + gitlab_uid["user"] = 100 + assert github_uid != gitlab_uid \ No newline at end of file From f87fda64ed2ec6fa48f6495fa84bcdd3e0c0865d Mon Sep 17 00:00:00 2001 From: Inengs Date: Wed, 4 Mar 2026 01:04:34 +0100 Subject: [PATCH 019/165] tests: add test_augur_uuid.py to pytest testpaths in pyproject.toml Signed-off-by: Inengs --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 85f11e3e6..25edd9a69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,7 +147,8 @@ testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", 
"tests/test_tasks/test_task_utilities/test_util/test_worker_util.py" - # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of CollectOSS + "tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py", + # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", # "tests/test_tasks", # "tests/test_application", From a1ae7a57cdb5b9741e96f743f59d0320d8a84c22 Mon Sep 17 00:00:00 2001 From: Inengs Date: Fri, 6 Mar 2026 07:29:05 +0100 Subject: [PATCH 020/165] tests: add edge case and boundary tests for AugurUUID Signed-off-by: Inengs --- .../test_util/test_augur_uuid.py | 128 +++++++++++++++--- 1 file changed, 112 insertions(+), 16 deletions(-) diff --git a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py index 0202980dc..2f81a8f84 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py @@ -20,21 +20,32 @@ def test_gitlab_uuid_platform_is_2(): uid = GitlabUUID() assert uid["platform"] == 2 -def test_unresolvable_uuid_platform_is_0(): - uid = UnresolvableUUID() - assert uid["platform"] == 0 - # checks the that you can store a value in the user field def test_github_uuid_set_user(): uid = GithubUUID() uid["user"] = 12345 assert uid["user"] == 12345 -# checks the that you can store a value in the user field -def test_gitlab_uuid_set_user(): - uid = GitlabUUID() - uid["user"] = 99999 - assert uid["user"] == 99999 +# tests platform_id edge cases +def test_set_platform_id_raises_on_non_integer(): + uid = AugurUUID() + with pytest.raises(ValueError): + uid.set_platform_id("github") + +def test_set_platform_id_raises_on_overflow(): + uid = AugurUUID() + with pytest.raises(ValueError): + uid.set_platform_id(256) # too big for 1 byte + +# checks that writing to one field doesnt accidentally overwrite 
bytes belonging to another field +def test_fields_dont_overlap(): + uid = GithubUUID() + + uid["user"] = 12345 + uid["repo"] = 99999 + + assert uid["user"] == 12345 + assert uid["repo"] == 99999 # checks that to_UUID returs the uuid.UUID object def test_to_uuid_returns_valid_uuid(): @@ -43,6 +54,46 @@ def test_to_uuid_returns_valid_uuid(): result = uid.to_UUID() assert isinstance(result, uuid.UUID) +# checks the start_byte is within range(0, 16) for set_bytes +def test_set_bytes_raises_on_invalid_start_byte(): + uid = AugurUUID() + with pytest.raises(ValueError): + uid.set_bytes([1, 2, 3], 16) + +# checks that set_bytes correctly raises an error when you write more bytes that will fit in the UUID starting at a given position +def test_set_bytes_raises_on_too_many_bytes(): + uid = AugurUUID() + with pytest.raises(ValueError): + uid.set_bytes([1] * 10, 10) + +# checks that writeint correctly rejects a number +def test_write_int_raises_on_overflow(): + uid = GithubUUID() + with pytest.raises(ValueError): + uid["user"] = 99999999999 # too big for 4 bytes + +def test_write_int_with_non_integer(): + uid = GithubUUID() + + with pytest.raises(ValueError): + uid.write_int("abc", 1, 4) + +def test_write_int_and_get_int_roundtrip(): + uid = AugurUUID() + uid.write_int(65535, 1, 2) + assert uid.get_int(1, 2) == 65535 + +# checks __int__ method +def test_int_conversion(): + uid = AugurUUID() + uid.set_byte(15, 1) + assert int(uid) == 1 + +def test_get_byte_invalid_index(): + uid = AugurUUID() + with pytest.raises(IndexError): + uid.get_byte(20) + # checks that set_byte correctly rejects a value that is too large def test_set_byte_raises_on_invalid_value(): uid = AugurUUID() @@ -55,6 +106,11 @@ def test_set_byte_raises_on_out_of_range_index(): with pytest.raises(IndexError): uid.set_byte(16, 1) # index 16 is out of bounds +def test_set_byte_raises_on_non_integer(): + uid = AugurUUID() + with pytest.raises(ValueError): + uid.set_byte(0, "hello") + # checks that 2 UUIDs with the 
same values are considered equal. def test_equality(): uid1 = GithubUUID() @@ -71,16 +127,56 @@ def test_inequality(): uid2["user"] = 200 assert uid1 != uid2 -# checks that writeint correctly rejects a number -def test_write_int_raises_on_overflow(): - uid = GithubUUID() - with pytest.raises(ValueError): - uid["user"] = 99999999999 # too big for 4 bytes - # checks that the same user produces different user IDs across platforms def test_github_and_gitlab_different_for_same_user(): github_uid = GithubUUID() gitlab_uid = GitlabUUID() github_uid["user"] = 100 gitlab_uid["user"] = 100 - assert github_uid != gitlab_uid \ No newline at end of file + assert github_uid != gitlab_uid + +# checks the maximum value that fits into user field +def test_user_field_max_value(): + uid = GithubUUID() + uid["user"] = 4294967295 + assert uid["user"] == 4294967295 + +# checks the minimum boundary +def test_user_field_zero_value(): + uid = GithubUUID() + uid["user"] = 0 + assert uid["user"] == 0 + +def test_len_returns_16(): + uid = AugurUUID() + assert len(uid) == 16 + +def test_dict_representation(): + uid = GithubUUID() + uid["user"] = 10 + + result = uid.__dict__() + + assert result["platform"] == 1 + assert result["user"] == 10 + +def test_string_representation(): + uid = GithubUUID() + uid["user"] = 10 + + result = str(uid) + + assert "user" in result + assert "platform" in result + +def test_iteration_over_bytes(): + uid = AugurUUID() + bytes_list = list(uid) + + assert len(bytes_list) == 16 + +def test_setting_same_field_twice(): + uid = GithubUUID() + uid["user"] = 42 + uid["user"] = 100 # overwrite with different value + assert uid["user"] == 100 \ No newline at end of file From 29ce9588e76e79c67ed82b20b869af4243ed2e73 Mon Sep 17 00:00:00 2001 From: Inengiye Emmanuel Date: Tue, 17 Mar 2026 18:09:09 +0100 Subject: [PATCH 021/165] Remove redundant UUID tests Signed-off-by: Inengiye Emmanuel --- .../test_util/test_augur_uuid.py | 22 ------------------- 1 file changed, 22 
deletions(-) diff --git a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py index 2f81a8f84..a377468e4 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py @@ -135,22 +135,6 @@ def test_github_and_gitlab_different_for_same_user(): gitlab_uid["user"] = 100 assert github_uid != gitlab_uid -# checks the maximum value that fits into user field -def test_user_field_max_value(): - uid = GithubUUID() - uid["user"] = 4294967295 - assert uid["user"] == 4294967295 - -# checks the minimum boundary -def test_user_field_zero_value(): - uid = GithubUUID() - uid["user"] = 0 - assert uid["user"] == 0 - -def test_len_returns_16(): - uid = AugurUUID() - assert len(uid) == 16 - def test_dict_representation(): uid = GithubUUID() uid["user"] = 10 @@ -169,12 +153,6 @@ def test_string_representation(): assert "user" in result assert "platform" in result -def test_iteration_over_bytes(): - uid = AugurUUID() - bytes_list = list(uid) - - assert len(bytes_list) == 16 - def test_setting_same_field_twice(): uid = GithubUUID() uid["user"] = 42 From e87983d4d57bec5df43ee643ce9e74dfa405ce1c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 7 May 2026 09:42:32 -0400 Subject: [PATCH 022/165] add some basic unit tests to assert that the URL generation works as intended These pass but require that the collectoss/tasks/github/__init__.py file be commented out first before they will run Co-Authored-By: Cursor Signed-off-by: Adrian Edwards --- tests/test_classes/test_github_data_access.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/test_classes/test_github_data_access.py diff --git a/tests/test_classes/test_github_data_access.py b/tests/test_classes/test_github_data_access.py new file mode 100644 index 000000000..7a3064d8d --- /dev/null +++ 
b/tests/test_classes/test_github_data_access.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: MIT +import pytest +from unittest.mock import Mock, patch + +from collectoss.tasks.github.util.github_data_access import GithubDataAccess + + +@pytest.fixture +def mock_logger(): + return Mock() + + +@pytest.fixture +def mock_key_manager(): + return Mock() + + +@pytest.fixture +def gda(mock_key_manager, mock_logger): + with patch("collectoss.tasks.github.util.github_data_access.KeyClient"): + return GithubDataAccess(mock_key_manager, mock_logger) + + +class TestEndpointUrl: + + def test_basic_path(self, gda): + result = gda.endpoint_url("/users/MoralCode") + assert result == "https://api.github.com/users/MoralCode" + + def test_path_without_leading_slash(self, gda): + result = gda.endpoint_url("repos/owner/repo") + assert result == "https://api.github.com/repos/owner/repo" + + def test_with_single_param(self, gda): + result = gda.endpoint_url("/users/MoralCode", {"per_page": "100"}) + assert "per_page=100" in result + assert result.startswith("https://api.github.com/users/MoralCode") + + def test_with_multiple_params(self, gda): + result = gda.endpoint_url("/repos/owner/repo/pulls", {"per_page": "50", "state": "open"}) + assert "per_page=50" in result + assert "state=open" in result + assert result.startswith("https://api.github.com/repos/owner/repo/pulls") + + def test_none_params_produces_no_query_string(self, gda): + result = gda.endpoint_url("/users/MoralCode", None) + assert result == "https://api.github.com/users/MoralCode" + + def test_empty_params_produces_no_query_string(self, gda): + result = gda.endpoint_url("/users/MoralCode", {}) + assert result == "https://api.github.com/users/MoralCode" + + def test_path_with_existing_query_params(self, gda): + result = gda.endpoint_url("/search/repositories?q=python", {"per_page": "10"}) + assert "q=python" in result + assert "per_page=10" in result + assert result.startswith("https://api.github.com/search/repositories") 
From e6976be4ca786cbe60dc787a80c14a5d5cfbab82 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 11:41:14 -0400 Subject: [PATCH 023/165] fix merge conflict error Signed-off-by: Adrian Edwards --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 25edd9a69..5c155b7d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -146,7 +146,7 @@ addopts = "-ra -s" testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", - "tests/test_tasks/test_task_utilities/test_util/test_worker_util.py" + "tests/test_tasks/test_task_utilities/test_util/test_worker_util.py", "tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py", # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", From c47b20b21592b5f79fcbee56e895aa470f0ebd1f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 7 May 2026 09:45:00 -0400 Subject: [PATCH 024/165] move task imports to the one place they're actually needed this prevents them from messing with unit testing stuff Signed-off-by: Adrian Edwards --- collectoss/tasks/github/__init__.py | 7 ------- collectoss/tasks/start_tasks.py | 8 +++++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/collectoss/tasks/github/__init__.py b/collectoss/tasks/github/__init__.py index de3f37bd8..e69de29bb 100644 --- a/collectoss/tasks/github/__init__.py +++ b/collectoss/tasks/github/__init__.py @@ -1,7 +0,0 @@ -from collectoss.tasks.github.contributors import * -from collectoss.tasks.github.events import * -from collectoss.tasks.github.issues import * -from collectoss.tasks.github.messages import * -from collectoss.tasks.github.pull_requests.tasks import * -from collectoss.tasks.github.repo_info.tasks import * -from collectoss.tasks.github.releases.tasks import * diff --git a/collectoss/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py index 644b6cbc4..8e130f926 
100644 --- a/collectoss/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -7,7 +7,13 @@ import sqlalchemy as s -from collectoss.tasks.github import * +from collectoss.tasks.github.contributors import * +from collectoss.tasks.github.events import * +from collectoss.tasks.github.issues import * +from collectoss.tasks.github.messages import * +from collectoss.tasks.github.pull_requests.tasks import * +from collectoss.tasks.github.repo_info.tasks import * +from collectoss.tasks.github.releases.tasks import * if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": from collectoss.tasks.data_analysis import * from collectoss.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary From 309f8521411cd01094c778a5789999d923c32e96 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 11:41:23 -0400 Subject: [PATCH 025/165] rename the class Signed-off-by: Adrian Edwards --- .../test_util/test_augur_uuid.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py index a377468e4..22d7703bb 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py @@ -1,12 +1,12 @@ import pytest import uuid -from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, GitlabUUID, UnresolvableUUID +from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, GitlabUUID, UnresolvableUUID -# AugurUUID tests +# ContributorUUID tests -# this checks whether a brand new AugurUUID object starts as 16 zero bytes +# this checks whether a brand new ContributorUUID object starts as 16 zero bytes def test_augur_uuid_initializes_with_16_zero_bytes(): - uid = AugurUUID() + uid = ContributorUUID() assert len(uid.bytes) == 16 assert all(b == 0 for b in uid.bytes) @@ -28,12 
+28,12 @@ def test_github_uuid_set_user(): # tests platform_id edge cases def test_set_platform_id_raises_on_non_integer(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(ValueError): uid.set_platform_id("github") def test_set_platform_id_raises_on_overflow(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(ValueError): uid.set_platform_id(256) # too big for 1 byte @@ -56,13 +56,13 @@ def test_to_uuid_returns_valid_uuid(): # checks the start_byte is within range(0, 16) for set_bytes def test_set_bytes_raises_on_invalid_start_byte(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(ValueError): uid.set_bytes([1, 2, 3], 16) # checks that set_bytes correctly raises an error when you write more bytes that will fit in the UUID starting at a given position def test_set_bytes_raises_on_too_many_bytes(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(ValueError): uid.set_bytes([1] * 10, 10) @@ -79,35 +79,35 @@ def test_write_int_with_non_integer(): uid.write_int("abc", 1, 4) def test_write_int_and_get_int_roundtrip(): - uid = AugurUUID() + uid = ContributorUUID() uid.write_int(65535, 1, 2) assert uid.get_int(1, 2) == 65535 # checks __int__ method def test_int_conversion(): - uid = AugurUUID() + uid = ContributorUUID() uid.set_byte(15, 1) assert int(uid) == 1 def test_get_byte_invalid_index(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(IndexError): uid.get_byte(20) # checks that set_byte correctly rejects a value that is too large def test_set_byte_raises_on_invalid_value(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(ValueError): uid.set_byte(0, 256) # too big for one byte # checks that set_byte rejects an index that doesnt exist def test_set_byte_raises_on_out_of_range_index(): - uid = AugurUUID() + uid = ContributorUUID() with pytest.raises(IndexError): uid.set_byte(16, 1) # index 16 is out of bounds def test_set_byte_raises_on_non_integer(): - uid = 
AugurUUID() + uid = ContributorUUID() with pytest.raises(ValueError): uid.set_byte(0, "hello") From a63d96a0c8d72f286b1530ccae7120b6e7505e1b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 11:25:33 -0400 Subject: [PATCH 026/165] fix remove database dependency from the worker util test Signed-off-by: Adrian Edwards --- pyproject.toml | 1 + .../test_task_utilities/test_util/test_worker_util.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6445d832e..85f11e3e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -146,6 +146,7 @@ addopts = "-ra -s" testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", + "tests/test_tasks/test_task_utilities/test_util/test_worker_util.py" # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of CollectOSS # "tests/test_metrics", # "tests/test_tasks", diff --git a/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py b/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py index affd40248..da420a2ab 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py @@ -2,11 +2,11 @@ import pytest import sqlalchemy as s -from collectoss.tasks.util.worker_util import * +from collectoss.tasks.util.worker_util import remove_duplicates_by_uniques logger = logging.getLogger(__name__) -def test_remove_duplicates_by_uniques(test_db_engine): +def test_remove_duplicates_by_uniques(): data_1 = {"cntrb_login": "Bob", "gh_user_id": 4, "gh_login": "bob", "cntrb_id": "01003f7a-8500-0000-0000-000000000000"} data_2 = {"cntrb_login": "amazing", "gh_user_id": 1700, "gh_login": "hello", "cntrb_id": "01003f7a-8500-0000-0000-000123002000"} From ecfa0d754ce0a01d105e3a0c0ff3df31edb8eff8 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 09:06:09 -0400 Subject: [PATCH 027/165] actually 
run all the configured tests in CI Signed-off-by: Adrian Edwards --- .github/workflows/functional_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/functional_test.yml b/.github/workflows/functional_test.yml index eaa50adf3..5ec4dc2b9 100644 --- a/.github/workflows/functional_test.yml +++ b/.github/workflows/functional_test.yml @@ -27,5 +27,4 @@ jobs: - name: Run Tests run: | uv run --python ${{ matrix.env }} pytest \ - tests/test_classes \ --color=yes From bc66238059322a18b08d19a4df6dc74272fbac1f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 09:14:09 -0400 Subject: [PATCH 028/165] rename duplicate test function Signed-off-by: Adrian Edwards --- .../test_paginators/test_github_paginator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py b/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py index a8ea375f3..ec0c27745 100644 --- a/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py +++ b/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py @@ -92,7 +92,7 @@ def test_github_paginator_len(key_auth): assert len_contributors_list == 0 -def test_github_paginator_get_item(key_auth): +def test_github_paginator_get_item_2(key_auth): owner = "chaoss" name = "whitepaper" From 5840b23a12ae25407b625549eb6e23159a63452f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 09:59:50 -0400 Subject: [PATCH 029/165] refactor most of the JSONConfig store tests into a class Signed-off-by: Adrian Edwards --- tests/test_classes/test_config_stores.py | 236 ++++++++++++----------- 1 file changed, 119 insertions(+), 117 deletions(-) diff --git a/tests/test_classes/test_config_stores.py b/tests/test_classes/test_config_stores.py index 8c15fd020..13fb6119d 100644 --- a/tests/test_classes/test_config_stores.py +++ b/tests/test_classes/test_config_stores.py @@ 
-14,73 +14,125 @@ def mock_session(): return Mock() -def test_jsonconfig_readonly_flags(mock_logger): - cfg = JsonConfig({"A": {"x": 1}}, mock_logger) - assert cfg.writable is False - assert cfg.empty is False - - -def test_jsonconfig_empty_true_false(mock_logger): - assert JsonConfig({}, mock_logger).empty is True - assert JsonConfig({"A": {}}, mock_logger).empty is False - - -def test_jsonconfig_write_protection(mock_logger): - # JsonConfig should be not writeable by default, so we should be unable to change - # its values, even by abusing references - - data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} - cfg = JsonConfig(data, mock_logger) - - # mutation via input - data["Alpha"]["a"] = 2 - - config_test = cfg.retrieve_dict() - assert config_test != data # the data in the config should not change - - # mutation via output - config_test["Alpha"]["a"] = 3 - - config_test = cfg.retrieve_dict() - assert config_test != data # the data in the config should not change - -def test_jsonconfig_retrieve_has_get(mock_logger): - data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} - cfg = JsonConfig(data, mock_logger) - - # retrieve full dict - assert cfg.retrieve_dict() == data - - # has/get section - assert cfg.has_section("Alpha") is True - assert cfg.has_section("Missing") is False - assert cfg.get_section("Alpha") == {"a": 1, "b": "str"} - assert cfg.get_section("Missing") is None - - # has/get value - assert cfg.has_value("Alpha", "a") is True - assert cfg.has_value("Alpha", "missing") is False - assert cfg.has_value("Missing", "a") is False - assert cfg.get_value("Alpha", "a") == 1 - assert cfg.get_value("Alpha", "missing") is None - assert cfg.get_value("Missing", "a") is None - - -@pytest.mark.parametrize( - "callable_name, args, kwargs", - [ - ("load_dict", ({"X": {"y": 2}},), {"ignore_existing": False}), - ("clear", tuple(), {}), - ("remove_section", ("X",), {}), - ("create_section", ("X", {"y": 2}), {"ignore_existing": False}), - ("remove_value", ("X", "y"), {}), 
- ("add_value", ("X", "y", 2), {"ignore_existing": False}), - ], -) -def test_jsonconfig_mutations_raise_not_writable(mock_logger, callable_name, args, kwargs): - cfg = JsonConfig({"A": {"x": 1}}, mock_logger) - with pytest.raises(NotWriteableException): - getattr(cfg, callable_name)(*args, **kwargs) +class TestJSONConfig: + + def test_jsonconfig_readonly_flags(self, mock_logger): + cfg = JsonConfig({"A": {"x": 1}}, mock_logger) + assert cfg.writable is False + assert cfg.empty is False + + + def test_jsonconfig_empty_true_false(self, mock_logger): + assert JsonConfig({}, mock_logger).empty is True + assert JsonConfig({"A": {}}, mock_logger).empty is False + + + def test_jsonconfig_write_protection(self, mock_logger): + # JsonConfig should be not writeable by default, so we should be unable to change + # its values, even by abusing references + + data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} + cfg = JsonConfig(data, mock_logger) + + # mutation via input + data["Alpha"]["a"] = 2 + + config_test = cfg.retrieve_dict() + assert config_test != data # the data in the config should not change + + # mutation via output + config_test["Alpha"]["a"] = 3 + + config_test = cfg.retrieve_dict() + assert config_test != data # the data in the config should not change + + def test_jsonconfig_retrieve_has_get(self, mock_logger): + data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} + cfg = JsonConfig(data, mock_logger) + + # retrieve full dict + assert cfg.retrieve_dict() == data + + # has/get section + assert cfg.has_section("Alpha") is True + assert cfg.has_section("Missing") is False + assert cfg.get_section("Alpha") == {"a": 1, "b": "str"} + assert cfg.get_section("Missing") is None + + # has/get value + assert cfg.has_value("Alpha", "a") is True + assert cfg.has_value("Alpha", "missing") is False + assert cfg.has_value("Missing", "a") is False + assert cfg.get_value("Alpha", "a") == 1 + assert cfg.get_value("Alpha", "missing") is None + assert cfg.get_value("Missing", "a") 
is None + + + @pytest.mark.parametrize( + "callable_name, args, kwargs", + [ + ("load_dict", ({"X": {"y": 2}},), {"ignore_existing": False}), + ("clear", tuple(), {}), + ("remove_section", ("X",), {}), + ("create_section", ("X", {"y": 2}), {"ignore_existing": False}), + ("remove_value", ("X", "y"), {}), + ("add_value", ("X", "y", 2), {"ignore_existing": False}), + ], + ) + def test_jsonconfig_mutations_raise_not_writable(self, mock_logger, callable_name, args, kwargs): + cfg = JsonConfig({"A": {"x": 1}}, mock_logger) + with pytest.raises(NotWriteableException): + getattr(cfg, callable_name)(*args, **kwargs) + + + def test_fetching_real_defaults(self, mock_logger, mock_session): + cfg = SystemConfig(mock_logger, mock_session) + cfg.config_sources = [JsonConfig(default_config, mock_logger)] + + assert cfg.get_value("Redis", "cache_group") == 0 + + + def test_load_config_utilizes_hierarchy(self): + + default_dict = { + "Section1": {"alpha": 1, "beta": "x"}, + "Section2": {"gamma": False, "delta": 3.14}, + } + + override_dict = { + "Section1": {"beta": "y"}, + "Section2": {"Epsilon": True, "delta": 6.28}, + "Section3": {"hi": "there"} + } + + cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) + + expected_dict = { + "Section1": {"alpha": 1, "beta": "y"}, + "Section2": {"gamma": False, "Epsilon": True, "delta": 6.28}, + "Section3": {"hi": "there"} # test that new sections are accounted for too + } + + assert cfg.load_config() == expected_dict + + + def test_get_section_incorporates_hierarchy(self): + + default_dict = { + "Section1": {"alpha": 1, "beta": "x"}, + "Section2": {"gamma": False, "delta": 3.14}, + } + + override_dict = { + "Section1": {"beta": "y"}, + "Section2": {"gamma": False, "delta": 3.14}, + } + + cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) + + expected_dict = {"alpha": 1, "beta": "y"} + + assert cfg.get_section("Section1") == 
expected_dict def test_dict_to_config_table_happy_path(): @@ -122,53 +174,3 @@ def test_dict_to_config_table_happy_path(): assert rows == expected - -def test_fetching_real_defaults(mock_logger, mock_session): - cfg = SystemConfig(mock_logger, mock_session) - cfg.config_sources = [JsonConfig(default_config, mock_logger)] - - assert cfg.get_value("Redis", "cache_group") == 0 - - -def test_load_config_utilizes_hierarchy(): - - default_dict = { - "Section1": {"alpha": 1, "beta": "x"}, - "Section2": {"gamma": False, "delta": 3.14}, - } - - override_dict = { - "Section1": {"beta": "y"}, - "Section2": {"Epsilon": True, "delta": 6.28}, - "Section3": {"hi": "there"} - } - - cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) - - expected_dict = { - "Section1": {"alpha": 1, "beta": "y"}, - "Section2": {"gamma": False, "Epsilon": True, "delta": 6.28}, - "Section3": {"hi": "there"} # test that new sections are accounted for too - } - - assert cfg.load_config() == expected_dict - - -def test_get_section_incorporates_hierarchy(): - - default_dict = { - "Section1": {"alpha": 1, "beta": "x"}, - "Section2": {"gamma": False, "delta": 3.14}, - } - - override_dict = { - "Section1": {"beta": "y"}, - "Section2": {"gamma": False, "delta": 3.14}, - } - - cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) - - expected_dict = {"alpha": 1, "beta": "y"} - - assert cfg.get_section("Section1") == expected_dict - From 606024dee3f0804e49ae082d69456371ce6942ca Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 10:11:43 -0400 Subject: [PATCH 030/165] encapsulate contributor UUID into a class Signed-off-by: Adrian Edwards --- .../test_util/test_augur_uuid.py | 310 +++++++++--------- 1 file changed, 155 insertions(+), 155 deletions(-) diff --git a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py 
b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py index 22d7703bb..b1ea766e7 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py @@ -3,158 +3,158 @@ from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, GitlabUUID, UnresolvableUUID # ContributorUUID tests - -# this checks whether a brand new ContributorUUID object starts as 16 zero bytes -def test_augur_uuid_initializes_with_16_zero_bytes(): - uid = ContributorUUID() - assert len(uid.bytes) == 16 - assert all(b == 0 for b in uid.bytes) - -# checks that githubUUID sets its platform number to 1 -def test_github_uuid_platform_is_1(): - uid = GithubUUID() - assert uid["platform"] == 1 - -# checks that gitlabUUID sets its platform number to 2 -def test_gitlab_uuid_platform_is_2(): - uid = GitlabUUID() - assert uid["platform"] == 2 - -# checks the that you can store a value in the user field -def test_github_uuid_set_user(): - uid = GithubUUID() - uid["user"] = 12345 - assert uid["user"] == 12345 - -# tests platform_id edge cases -def test_set_platform_id_raises_on_non_integer(): - uid = ContributorUUID() - with pytest.raises(ValueError): - uid.set_platform_id("github") - -def test_set_platform_id_raises_on_overflow(): - uid = ContributorUUID() - with pytest.raises(ValueError): - uid.set_platform_id(256) # too big for 1 byte - -# checks that writing to one field doesnt accidentally overwrite bytes belonging to another field -def test_fields_dont_overlap(): - uid = GithubUUID() - - uid["user"] = 12345 - uid["repo"] = 99999 - - assert uid["user"] == 12345 - assert uid["repo"] == 99999 - -# checks that to_UUID returs the uuid.UUID object -def test_to_uuid_returns_valid_uuid(): - uid = GithubUUID() - uid["user"] = 15 - result = uid.to_UUID() - assert isinstance(result, uuid.UUID) - -# checks the start_byte is within range(0, 16) for set_bytes -def 
test_set_bytes_raises_on_invalid_start_byte(): - uid = ContributorUUID() - with pytest.raises(ValueError): - uid.set_bytes([1, 2, 3], 16) - -# checks that set_bytes correctly raises an error when you write more bytes that will fit in the UUID starting at a given position -def test_set_bytes_raises_on_too_many_bytes(): - uid = ContributorUUID() - with pytest.raises(ValueError): - uid.set_bytes([1] * 10, 10) - -# checks that writeint correctly rejects a number -def test_write_int_raises_on_overflow(): - uid = GithubUUID() - with pytest.raises(ValueError): - uid["user"] = 99999999999 # too big for 4 bytes - -def test_write_int_with_non_integer(): - uid = GithubUUID() - - with pytest.raises(ValueError): - uid.write_int("abc", 1, 4) - -def test_write_int_and_get_int_roundtrip(): - uid = ContributorUUID() - uid.write_int(65535, 1, 2) - assert uid.get_int(1, 2) == 65535 - -# checks __int__ method -def test_int_conversion(): - uid = ContributorUUID() - uid.set_byte(15, 1) - assert int(uid) == 1 - -def test_get_byte_invalid_index(): - uid = ContributorUUID() - with pytest.raises(IndexError): - uid.get_byte(20) - -# checks that set_byte correctly rejects a value that is too large -def test_set_byte_raises_on_invalid_value(): - uid = ContributorUUID() - with pytest.raises(ValueError): - uid.set_byte(0, 256) # too big for one byte - -# checks that set_byte rejects an index that doesnt exist -def test_set_byte_raises_on_out_of_range_index(): - uid = ContributorUUID() - with pytest.raises(IndexError): - uid.set_byte(16, 1) # index 16 is out of bounds - -def test_set_byte_raises_on_non_integer(): - uid = ContributorUUID() - with pytest.raises(ValueError): - uid.set_byte(0, "hello") - -# checks that 2 UUIDs with the same values are considered equal. 
-def test_equality(): - uid1 = GithubUUID() - uid2 = GithubUUID() - uid1["user"] = 100 - uid2["user"] = 100 - assert uid1 == uid2 - -# checks that 2 UUIDs with different values are not equal -def test_inequality(): - uid1 = GithubUUID() - uid2 = GithubUUID() - uid1["user"] = 100 - uid2["user"] = 200 - assert uid1 != uid2 - -# checks that the same user produces different user IDs across platforms -def test_github_and_gitlab_different_for_same_user(): - github_uid = GithubUUID() - gitlab_uid = GitlabUUID() - github_uid["user"] = 100 - gitlab_uid["user"] = 100 - assert github_uid != gitlab_uid - -def test_dict_representation(): - uid = GithubUUID() - uid["user"] = 10 - - result = uid.__dict__() - - assert result["platform"] == 1 - assert result["user"] == 10 - -def test_string_representation(): - uid = GithubUUID() - uid["user"] = 10 - - result = str(uid) - - assert "user" in result - assert "platform" in result - -def test_setting_same_field_twice(): - uid = GithubUUID() - uid["user"] = 42 - uid["user"] = 100 # overwrite with different value - assert uid["user"] == 100 \ No newline at end of file +class TestContributorUUID: + # this checks whether a brand new ContributorUUID object starts as 16 zero bytes + def test_augur_uuid_initializes_with_16_zero_bytes(self): + uid = ContributorUUID() + assert len(uid.bytes) == 16 + assert all(b == 0 for b in uid.bytes) + + # checks that githubUUID sets its platform number to 1 + def test_github_uuid_platform_is_1(self): + uid = GithubUUID() + assert uid["platform"] == 1 + + # checks that gitlabUUID sets its platform number to 2 + def test_gitlab_uuid_platform_is_2(self): + uid = GitlabUUID() + assert uid["platform"] == 2 + + # checks the that you can store a value in the user field + def test_github_uuid_set_user(self): + uid = GithubUUID() + uid["user"] = 12345 + assert uid["user"] == 12345 + + # tests platform_id edge cases + def test_set_platform_id_raises_on_non_integer(self): + uid = ContributorUUID() + with 
pytest.raises(ValueError): + uid.set_platform_id("github") + + def test_set_platform_id_raises_on_overflow(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_platform_id(256) # too big for 1 byte + + # checks that writing to one field doesnt accidentally overwrite bytes belonging to another field + def test_fields_dont_overlap(self): + uid = GithubUUID() + + uid["user"] = 12345 + uid["repo"] = 99999 + + assert uid["user"] == 12345 + assert uid["repo"] == 99999 + + # checks that to_UUID returs the uuid.UUID object + def test_to_uuid_returns_valid_uuid(self): + uid = GithubUUID() + uid["user"] = 15 + result = uid.to_UUID() + assert isinstance(result, uuid.UUID) + + # checks the start_byte is within range(0, 16) for set_bytes + def test_set_bytes_raises_on_invalid_start_byte(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_bytes([1, 2, 3], 16) + + # checks that set_bytes correctly raises an error when you write more bytes that will fit in the UUID starting at a given position + def test_set_bytes_raises_on_too_many_bytes(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_bytes([1] * 10, 10) + + # checks that writeint correctly rejects a number + def test_write_int_raises_on_overflow(self): + uid = GithubUUID() + with pytest.raises(ValueError): + uid["user"] = 99999999999 # too big for 4 bytes + + def test_write_int_with_non_integer(self): + uid = GithubUUID() + + with pytest.raises(ValueError): + uid.write_int("abc", 1, 4) + + def test_write_int_and_get_int_roundtrip(self): + uid = ContributorUUID() + uid.write_int(65535, 1, 2) + assert uid.get_int(1, 2) == 65535 + + # checks __int__ method + def test_int_conversion(self): + uid = ContributorUUID() + uid.set_byte(15, 1) + assert int(uid) == 1 + + def test_get_byte_invalid_index(self): + uid = ContributorUUID() + with pytest.raises(IndexError): + uid.get_byte(20) + + # checks that set_byte correctly rejects a value that is too large + def 
test_set_byte_raises_on_invalid_value(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_byte(0, 256) # too big for one byte + + # checks that set_byte rejects an index that doesnt exist + def test_set_byte_raises_on_out_of_range_index(self): + uid = ContributorUUID() + with pytest.raises(IndexError): + uid.set_byte(16, 1) # index 16 is out of bounds + + def test_set_byte_raises_on_non_integer(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_byte(0, "hello") + + # checks that 2 UUIDs with the same values are considered equal. + def test_equality(self): + uid1 = GithubUUID() + uid2 = GithubUUID() + uid1["user"] = 100 + uid2["user"] = 100 + assert uid1 == uid2 + + # checks that 2 UUIDs with different values are not equal + def test_inequality(self): + uid1 = GithubUUID() + uid2 = GithubUUID() + uid1["user"] = 100 + uid2["user"] = 200 + assert uid1 != uid2 + + # checks that the same user produces different user IDs across platforms + def test_github_and_gitlab_different_for_same_user(self): + github_uid = GithubUUID() + gitlab_uid = GitlabUUID() + github_uid["user"] = 100 + gitlab_uid["user"] = 100 + assert github_uid != gitlab_uid + + def test_dict_representation(self): + uid = GithubUUID() + uid["user"] = 10 + + result = uid.__dict__() + + assert result["platform"] == 1 + assert result["user"] == 10 + + def test_string_representation(self): + uid = GithubUUID() + uid["user"] = 10 + + result = str(uid) + + assert "user" in result + assert "platform" in result + + def test_setting_same_field_twice(self): + uid = GithubUUID() + uid["user"] = 42 + uid["user"] = 100 # overwrite with different value + assert uid["user"] == 100 \ No newline at end of file From 9199b85da7c6c39ed06a10ad38342c572bdd429b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 10:09:02 -0400 Subject: [PATCH 031/165] rename unit test file Signed-off-by: Adrian Edwards --- pyproject.toml | 2 +- .../test_util/{test_augur_uuid.py => 
test_contributor_uuid.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/test_tasks/test_task_utilities/test_util/{test_augur_uuid.py => test_contributor_uuid.py} (100%) diff --git a/pyproject.toml b/pyproject.toml index 5c155b7d7..ebe2b0043 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,7 +147,7 @@ testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", "tests/test_tasks/test_task_utilities/test_util/test_worker_util.py", - "tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py", + "tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py", # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", # "tests/test_tasks", diff --git a/tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py similarity index 100% rename from tests/test_tasks/test_task_utilities/test_util/test_augur_uuid.py rename to tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py From 1c23358d37ac25f3ce1186cb9d24ac3ab16dfe7a Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 10:04:37 -0400 Subject: [PATCH 032/165] configure and apply test markers to separate unit and integration tests Signed-off-by: Adrian Edwards --- pyproject.toml | 5 +++++ tests/test_application/test_cli/test_csv_utils.py | 12 +++++++----- tests/test_classes/test_config_stores.py | 1 + tests/test_classes/test_github_data_access.py | 2 +- .../test_util/test_contributor_uuid.py | 1 + .../test_util/test_worker_util.py | 1 + 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ebe2b0043..d14c871f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,6 +156,11 @@ testpaths = [ # "tests/test_workers/worker_persistence/", # "tests/test_routes/runner.py" ] +markers = [ + "unit: pure logic tests with no external 
dependencies", + "integration: tests requiring a database, Redis, or network access", +] + [tool.mypy] files = ['collectoss/application/db/*.py'] diff --git a/tests/test_application/test_cli/test_csv_utils.py b/tests/test_application/test_cli/test_csv_utils.py index 395ed0936..d15a7f04b 100644 --- a/tests/test_application/test_cli/test_csv_utils.py +++ b/tests/test_application/test_cli/test_csv_utils.py @@ -14,7 +14,7 @@ MAX_FILE_SIZE_BYTES, ) - +@pytest.mark.unit class TestValidateGitUrl: """Tests for validate_git_url function""" @@ -40,7 +40,7 @@ def test_whitespace_handling(self): """Test that whitespace is properly stripped""" assert validate_git_url(" https://github.com/chaoss/collectoss ") - +@pytest.mark.unit class TestValidatePositiveInt: """Tests for validate_positive_int function""" @@ -71,7 +71,7 @@ def test_whitespace_handling(self): """Test that whitespace is properly stripped""" assert validate_positive_int(" 42 ") - +@pytest.mark.unit class TestDetectColumnOrder: """Tests for detect_column_order function""" @@ -153,7 +153,7 @@ def test_no_match_found_raises_error(self): with pytest.raises(ValueError, match="Could not detect column"): detect_column_order(sample_rows, validators) - +@pytest.mark.unit class TestProcessCsv: """Tests for process_csv function""" @@ -252,7 +252,7 @@ def test_whitespace_in_values(self, tmp_path): result = process_csv(str(csv_file), validators) assert result[0] == {"repo_url": "https://github.com/chaoss/collectoss", "repo_group_id": "10"} - +@pytest.mark.unit class TestProcessRepoCsv: """Tests for process_repo_csv function""" @@ -275,6 +275,7 @@ def test_process_repo_csv_without_headers(self, tmp_path): assert len(result) == 2 +@pytest.mark.unit class TestProcessRepoGroupCsv: """Tests for process_repo_group_csv function""" @@ -310,6 +311,7 @@ def test_empty_group_name_invalid(self, tmp_path): assert len(result) >= 1 +@pytest.mark.unit class TestEdgeCases: """Tests for edge cases and error conditions""" diff --git 
a/tests/test_classes/test_config_stores.py b/tests/test_classes/test_config_stores.py index 13fb6119d..cf23f646f 100644 --- a/tests/test_classes/test_config_stores.py +++ b/tests/test_classes/test_config_stores.py @@ -135,6 +135,7 @@ def test_get_section_incorporates_hierarchy(self): assert cfg.get_section("Section1") == expected_dict +@pytest.mark.unit def test_dict_to_config_table_happy_path(): input_dict = { "Section1": {"alpha": 1, "beta": "x"}, diff --git a/tests/test_classes/test_github_data_access.py b/tests/test_classes/test_github_data_access.py index 7a3064d8d..3ebd4db79 100644 --- a/tests/test_classes/test_github_data_access.py +++ b/tests/test_classes/test_github_data_access.py @@ -20,7 +20,7 @@ def gda(mock_key_manager, mock_logger): with patch("collectoss.tasks.github.util.github_data_access.KeyClient"): return GithubDataAccess(mock_key_manager, mock_logger) - +@pytest.mark.unit class TestEndpointUrl: def test_basic_path(self, gda): diff --git a/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py index b1ea766e7..40f5cdc27 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py @@ -3,6 +3,7 @@ from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, GitlabUUID, UnresolvableUUID # ContributorUUID tests +@pytest.mark.unit class TestContributorUUID: # this checks whether a brand new ContributorUUID object starts as 16 zero bytes def test_augur_uuid_initializes_with_16_zero_bytes(self): diff --git a/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py b/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py index da420a2ab..410c1ef70 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py @@ -6,6 +6,7 @@ logger = 
logging.getLogger(__name__) +@pytest.mark.unit def test_remove_duplicates_by_uniques(): data_1 = {"cntrb_login": "Bob", "gh_user_id": 4, "gh_login": "bob", "cntrb_id": "01003f7a-8500-0000-0000-000000000000"} From 19b20eba5c17c12cff8e406cd1b4459ade109a59 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 15:28:19 -0400 Subject: [PATCH 033/165] provide new repo name when updating a repo URL in move detection Signed-off-by: Adrian Edwards --- collectoss/tasks/github/detect_move/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/tasks/github/detect_move/core.py b/collectoss/tasks/github/detect_move/core.py index 1c0d7dba8..5adcc83fa 100644 --- a/collectoss/tasks/github/detect_move/core.py +++ b/collectoss/tasks/github/detect_move/core.py @@ -110,7 +110,7 @@ def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='c repo_update_dict = { 'repo_git': f"https://github.com/{owner}/{name}", 'repo_path': None, - 'repo_name': None, + 'repo_name': name, 'description': f"(Originally hosted at {url}) {old_description}" } From b5ab88fab6dc50500195a7ca0efe97ba5166ca16 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 15:13:09 -0400 Subject: [PATCH 034/165] Document the types of testing being done and how to run them Signed-off-by: Adrian Edwards --- docs/source/development-guide/testing/toc.rst | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/docs/source/development-guide/testing/toc.rst b/docs/source/development-guide/testing/toc.rst index cbe469805..dd28c64a7 100644 --- a/docs/source/development-guide/testing/toc.rst +++ b/docs/source/development-guide/testing/toc.rst @@ -1,8 +1,44 @@ Testing =============== -**THIS SECTION IS UNDER CONSTRUCTION.** +CollectOSS aims to have a comprehensive set of tests to enable more rapid iteration and greater confidence that changes have not caused new breakage. + +These tests fall into one of several general types. 
+* unit tests - standalone tests that are simple to run and test single units of functionality (often individual functions or classes) +* integration tests - small subsystem tests that require bringing up additional pieces, such as redis or a database, to perform the test +* end-to-end tests - complete system tests that require running everything + +Unit Tests +----------- + +Unit tests are implemented via pytest and tagged as ``unit`` to make them easy to run. + +To run the unit tests, clone the CollectOSS repository and run ``uv run pytest -m unit`` + + +Integration Tests +------------------ +Integration tests are also implemented via pytest and tagged as ``integration``. +Because they require additional components, they are not quite as easy to run. + + +To run the integration tests you will need to start up the associated services. This can be done as follows: + +1. Enter the tests directory with ``cd tests/``, this ensures you use the correct dockerfile. +2. Bring up the associated services using the ``docker-compose.yml`` file by running ``docker compose up`` or the podman equivalent. +3. The tests can now be run in a new terminal using ``uv run pytest -m integration`` + +End to End Tests +------------------ + +The end to end tests are currently run as part of a CI job in github actions that is run on pull request. + +The main form of end to end test is the smoke test. This test brings up and runs the full container stack for three minutes. +A script monitors the output logs and looks for specific log statements that indicate that CollectOSS is coming up and behaving as expected. + +Future end to end tests may also run CollectOSS to the point of fully collecting on some smaller repositories and validating that the database is as expected. + If you have questions or would like to help please open an issue on GitHub_. -.. _GitHub: https://github.com/chaoss/collectoss/toss/issues +.. 
_GitHub: https://github.com/chaoss/collectoss/issues From da2856bc96e743866a39a503ed1b53cd0836600b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 17:10:31 -0400 Subject: [PATCH 035/165] Adjust headings so another top level heading can be added Signed-off-by: Adrian Edwards --- docs/source/development-guide/testing/toc.rst | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/development-guide/testing/toc.rst b/docs/source/development-guide/testing/toc.rst index dd28c64a7..8d3515b60 100644 --- a/docs/source/development-guide/testing/toc.rst +++ b/docs/source/development-guide/testing/toc.rst @@ -1,15 +1,20 @@ Testing =============== + CollectOSS aims to have a comprehensive set of tests to enable more rapid iteration and greater confidence that changes have not caused new breakage. -These tests fall into one of several general types. + +Types of Testing +----------------- + +The tests of the CollectOSS app fall into one of several general types. * unit tests - standalone tests that are simple to run and test single units of functionality (often individual functions or classes) * integration tests - small subsystem tests that require bringing up additional pieces, such as redis or a database, to perform the test * end-to-end tests - complete system tests that require running everything Unit Tests ------------ +~~~~~~~~~~~ Unit tests are implemented via pytest and tagged as ``unit`` to make them easy to run. @@ -17,7 +22,7 @@ To run the unit tests, clone the CollectOSS repository and run ``uv run pytest - Integration Tests ------------------- +~~~~~~~~~~~~~~~~~~ Integration tests are also implemented via pytest and tagged as ``integration``. Because they require additional components, they are not quite as easy to run. @@ -28,8 +33,8 @@ To run the integration tests you will need to start up the associated services. 2.
Bring up the associated services using the ``docker-compose.yml`` file by running ``docker compose up`` or the podman equivalent. 3. The tests can now be run in a new terminal using ``uv run pytest -m integration`` -End to End Tests ------------------- +End to End (E2E) Tests +~~~~~~~~~~~~~~~~~~~~~~~ The end to end tests are currently run as part of a CI job in github actions that is run on pull request. From b4360e3d2463773a5e624f992ab6ff05c1e86fb8 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 17:10:51 -0400 Subject: [PATCH 036/165] include slack in testing signoff Signed-off-by: Adrian Edwards --- docs/source/development-guide/testing/toc.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/development-guide/testing/toc.rst b/docs/source/development-guide/testing/toc.rst index 8d3515b60..b9c2c2170 100644 --- a/docs/source/development-guide/testing/toc.rst +++ b/docs/source/development-guide/testing/toc.rst @@ -44,6 +44,6 @@ A script monitors the output logs and looks for specific log statements that ind Future end to end tests may also run CollectOSS to the point of fully collecting on some smaller repositories and validating that the database is as expected. -If you have questions or would like to help please open an issue on GitHub_. +If you have questions about testing in CollectOSS or would like to help please reach out via the `CHAOSS Slack `_ (in the #wg-collectoss-8knot channel) or open an issue on GitHub_. .. 
_GitHub: https://github.com/chaoss/collectoss/issues From a79b67b432afc81047dce342632e5cb349ffa762 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 17:10:58 -0400 Subject: [PATCH 037/165] add a testing standards section Signed-off-by: Adrian Edwards --- docs/source/development-guide/testing/toc.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/development-guide/testing/toc.rst b/docs/source/development-guide/testing/toc.rst index b9c2c2170..5a973f209 100644 --- a/docs/source/development-guide/testing/toc.rst +++ b/docs/source/development-guide/testing/toc.rst @@ -44,6 +44,25 @@ A script monitors the output logs and looks for specific log statements that ind Future end to end tests may also run CollectOSS to the point of fully collecting on some smaller repositories and validating that the database is as expected. +Testing Standards +----------------- + +Different parts of the CollectOSS codebase are held to different standards when it comes to how thoroughly changes are expected to be tested/validated before being allowed to merge. + +An approximate, non-exhaustive list of the various levels of testing includes: + +* **Code Review** - only a code review is needed to make sure things look okay (spelling/grammar, formatting etc). Typically used for README changes or changes to other simple, non-functional text files in the repo +* **Sanity Check** - a simple, automated check, such as a build job, should be run to ensure that syntax is correct and that the changes aren't causing a build failure. Typically used for documentation (what you are reading now) +* **Automated Functional Test** - A more complex automated check, such as unit tests, integration tests, E2E smoke tests, etc should be run to ensure that CollectOSS can at least start up successfully with the new code.
Typically used for trivial changes to subcomponents that already have automated tests +* **Manual Functional Test Procedure** - A set of pre-defined testing steps designed to exercise the specific code/problem being changed. This will usually be derived from the reproduction steps for the bug being solved or documented in the related issue/PR before testing so others can reproduce it. Typically used to test fixes for specific bugs +* **Full Collection Test** - The change should be built and run on a small instance (with relevant repos being added to the collection set if necessary) and the instance should be allowed to run to full collection (all collection stages for all repos marked as "success" in the ``collection_status`` operations table). Typically used for basic/generalized behavior changes +* **Difficult Repo Test** - Either the manual functional test or the full collection test can be made more "difficult" by including one or more known-difficult repositories, such as `chaoss/jank `_ (an artificial repo intended to contain a bunch of examples of problematic git data), or any other repo demonstrating a relevant and extreme/difficult scenario (huge overall size, huge commit count, 50-100k+ commits, etc). Typically used for parsing/performance tests +* **Stress/Scale Test** - the change should be run on an instance (likely pre-existing) with at least 10k diverse repositories for at least one or more full cycles of the collection interval (about 1-2 weeks) to ensure that nothing breaks under load or other scaling-related conditions. Typically used for performance issues, bugs unique to large scale repos, and code that's important enough to require testing on a wide range of different repositories. + +Both the final merge decisions as well as decisions about which level of testing is appropriate for a given PR rest with the project maintainers.
+ + + If you have questions about testing in CollectOSS or would like to help please reach out via the `CHAOSS Slack `_ (in the #wg-collectoss-8knot channel) or open an issue on GitHub_. .. _GitHub: https://github.com/chaoss/collectoss/issues From e46a6a94ee3a0c3f8e43d98e5ea6ff226950d9c6 Mon Sep 17 00:00:00 2001 From: Suraj Date: Tue, 26 May 2026 12:57:27 +0000 Subject: [PATCH 038/165] docs(docker): remove reference to nonexistent database-compose.yml The file database-compose.yml no longer exists in the repository. The Docker Compose setup now includes the database service directly in docker-compose.yml. Closes #333 Signed-off-by: Suraj --- docs/source/docker/docker-compose.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/docker/docker-compose.rst b/docs/source/docker/docker-compose.rst index ae38fab6c..5c5d16a47 100644 --- a/docs/source/docker/docker-compose.rst +++ b/docs/source/docker/docker-compose.rst @@ -50,7 +50,7 @@ To run CollectOSS **with** the database container: .. code-block:: bash - docker compose -f docker-compose.yml -f database-compose.yml up + docker compose up Stopping the containers From eb1b831f48e67ff1ca0c74ba3190bb3c5fd69030 Mon Sep 17 00:00:00 2001 From: chrisx9z <287960381+chrisx9z@users.noreply.github.com> Date: Tue, 26 May 2026 22:04:46 +0700 Subject: [PATCH 039/165] Update CollectOSS documentation URLs This commit replaces the old Read the Docs host with docs.collectoss.org and points CLI help text to live documentation pages. 
Signed-off-by: chrisx9z <287960381+chrisx9z@users.noreply.github.com> --- CONTRIBUTING.md | 4 ++-- README.md | 2 +- docker/backend/Dockerfile | 2 +- docker/database/Dockerfile | 2 +- docker/keyman/Dockerfile | 2 +- docker/rabbitmq/Dockerfile | 2 +- pyproject.toml | 2 +- scripts/docker/config.sh | 4 ++-- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3d9182b26..42011a252 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ These resources are a great way to meet the people behind the project, ask quest ## Learn about the project -If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://collectoss.readthedocs.io/en/release/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. +If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://docs.collectoss.org/en/latest/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. ## Opening an issue If you're experiencing an issue with CollectOSS you can search for your problem or question on our [issues](https://github.com/chaoss/collectoss/issues) page to see if someone else has already reported it. If you cannot find your issue, please feel free to [open a new one](https://github.com/chaoss/collectoss/issues/new/choose). 
@@ -53,7 +53,7 @@ Github has an article called [Syncing a fork](https://docs.github.com/en/pull-re ## Helpful Links -- [CollectOSS stable documentation](https://collectoss.readthedocs.io/en/release/) +- [CollectOSS stable documentation](https://docs.collectoss.org/en/latest/) - [CHAOSS Getting Started page](https://chaoss.community/kb-getting-started/) **Git & GitHub** diff --git a/README.md b/README.md index be144d755..983fdfe66 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Basic initial setup can be completed in a few minutes as follows: 3. Copy the `environment.txt` file to a new file called `.env` and fill in values for the required variables 4. Run `docker compose up` to start the containers -Check out the [CollectOSS Documentation](https://collectoss.readthedocs.io) for more detailed setup instructions and troubleshooting steps. +Check out the [CollectOSS Documentation](https://docs.collectoss.org) for more detailed setup instructions and troubleshooting steps. ## Contributing We strongly believe that communities are what makes open source so impactful. We invite you to join our community, regardless of your experience level or coding abilities! 
diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 98e9f06ef..d3ada5bf0 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -22,7 +22,7 @@ FROM python:3.11-slim-bullseye LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index aa769649c..c239db9b6 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -4,7 +4,7 @@ FROM postgres:16 LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/docker/keyman/Dockerfile b/docker/keyman/Dockerfile index 93de9fc28..33413a680 100644 --- a/docker/keyman/Dockerfile +++ b/docker/keyman/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.11.12-alpine LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 6e8916f32..aea2806ac 100644 --- 
a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -3,7 +3,7 @@ FROM rabbitmq:4.1-management-alpine LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/pyproject.toml b/pyproject.toml index d14c871f7..490e14dc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,7 @@ collectoss = "collectoss.application.cli._multicommand:run" [project.urls] Homepage = "https://github.com/chaoss/collectoss" -Documentation = "https://collectoss.readthedocs.io/en/latest/" +Documentation = "https://docs.collectoss.org/en/latest/" ############################################################ diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh index 6170ea57e..6f92c9a36 100755 --- a/scripts/docker/config.sh +++ b/scripts/docker/config.sh @@ -43,7 +43,7 @@ function get_github_api_key() { echo echo "Please provide a valid GitHub API key." echo "For more information on how to create the key, visit:" - echo "https://collectoss.readthedocs.io/en/latest/getting-started/installation.html#backend" + echo "https://docs.collectoss.org/en/latest/getting-started/collecting-data.html" echo "** This is required for CollectOSS to gather data ***" read -p "GitHub API Key: " github_api_key blank_confirm github_api_key @@ -63,7 +63,7 @@ function get_gitlab_api_key() { echo echo "Please provide a valid GitLab API key." 
echo "For more information on how to create the key, visit:" - echo "https://collectoss.readthedocs.io/en/latest/getting-started/installation.html#backend" + echo "https://docs.collectoss.org/en/latest/getting-started/collecting-data.html" echo "** This is required for CollectOSS to gather data ***" read -p "GitLab API Key: " gitlab_api_key blank_confirm gitlab_api_key From 983aa82f42ea8d26e46862ab1469b64551d914e8 Mon Sep 17 00:00:00 2001 From: Inengiye Emmanuel Date: Tue, 17 Mar 2026 20:08:26 +0100 Subject: [PATCH 040/165] Add unit tests for check_swapped_emails Signed-off-by: Inengiye Emmanuel --- .../test_util/test_check_swapped_emails.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py diff --git a/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py b/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py new file mode 100644 index 000000000..ed8245a82 --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py @@ -0,0 +1,34 @@ +import pytest +from collectoss.tasks.git.util.facade_worker.facade_worker.analyzecommit import check_swapped_emails + +def test_correct_input_unchanged(): + name, email = check_swapped_emails("John Smith", "john@gmail.com") + assert name == "John Smith" + assert email == "john@gmail.com" + +def test_swapped_input_is_corrected(): + name, email = check_swapped_emails("john@gmail.com", "John Smith") + assert name == "John Smith" + assert email == "john@gmail.com" + +def test_name_field_contains_mixed_name_and_email(): + # name field has both a name and email mixed together + name, email = check_swapped_emails("John Smith john@gmail.com", "") + assert name == "John Smith john@gmail.com" + assert email == "" + +def test_email_field_contains_mixed_name_and_email(): + # email field has both a name and email mixed together + name, email = 
check_swapped_emails("John Smith", "John Smith john@gmail.com") + assert name == "John Smith" + assert email == "John Smith john@gmail.com" + +def test_both_fields_contain_mixed_name_and_email(): + name, email = check_swapped_emails("John Smith john@gmail.com", "Jane Doe jane@gmail.com") + assert name == "John Smith john@gmail.com" + assert email == "Jane Doe jane@gmail.com" + +def test_when_both_empty_strings(): + name, email = check_swapped_emails("", "") + assert name == "" + assert email == "" From c6e81ab6b412aadfe10f56525bb709c5299b6076 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 19 May 2026 16:41:24 -0400 Subject: [PATCH 041/165] enable all tests within test_util Signed-off-by: Adrian Edwards --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d14c871f7..c83a0dd03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -146,8 +146,7 @@ addopts = "-ra -s" testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", - "tests/test_tasks/test_task_utilities/test_util/test_worker_util.py", - "tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py", + "tests/test_tasks/test_task_utilities/test_util/", # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", # "tests/test_tasks", From 2423f1836496409c571bf083a0dc593809faa68e Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 13:25:14 -0400 Subject: [PATCH 042/165] Correct the failing test Signed-off-by: Adrian Edwards --- .../test_util/test_check_swapped_emails.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py b/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py index ed8245a82..2ae9e8807 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py +++ 
b/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py @@ -14,8 +14,8 @@ def test_swapped_input_is_corrected(): def test_name_field_contains_mixed_name_and_email(): # name field has both a name and email mixed together name, email = check_swapped_emails("John Smith john@gmail.com", "") - assert name == "John Smith john@gmail.com" - assert email == "" + assert name == "" + assert email == "John Smith john@gmail.com" def test_email_field_contains_mixed_name_and_email(): # email field has both a name and email mixed together From e5ece605e18efb2dc44a5b37fc02b1efab10029f Mon Sep 17 00:00:00 2001 From: Suraj Date: Wed, 27 May 2026 03:47:12 +0000 Subject: [PATCH 043/165] fix(tasks): remove debug print statements causing log spam MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These print(libdays) statements were outputting bare numbers to stdout during dependency libyear analysis, creating log spam in task logs. Removed from both util.py and pypi_libyear_util.py — addressed the root cause of issue #289. 
Signed-off-by: Suraj --- .../dependency_libyear_tasks/libyear_util/pypi_libyear_util.py | 1 - .../tasks/git/dependency_libyear_tasks/libyear_util/util.py | 1 - 2 files changed, 2 deletions(-) diff --git a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py index 46304490f..752582d64 100644 --- a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py +++ b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py @@ -117,6 +117,5 @@ def get_libyear(current_version, current_release_date, latest_version, latest_re latest_release_date = dateutil.parser.parse(latest_release_date) libdays = (latest_release_date - current_release_date).days - print(libdays) libyear = libdays/365 return libyear \ No newline at end of file diff --git a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 372e64c82..0a74492f2 100644 --- a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -95,7 +95,6 @@ def get_libyear(current_version, current_release_date, latest_version, latest_re latest_release_date = dateutil.parser.parse(latest_release_date) libdays = (latest_release_date - current_release_date).days - print(libdays) libyear = libdays/365 return libyear From 6dfb9f9a5ce067275773c2befb5364e33593dce5 Mon Sep 17 00:00:00 2001 From: Shlok Gilda Date: Wed, 7 Jan 2026 02:38:49 -0500 Subject: [PATCH 044/165] implement timestamp correction utilities and tests for git commit data Signed-off-by: Shlok Gilda --- collectoss/application/db/lib.py | 55 +++---- collectoss/tasks/git/correction.py | 129 +++++++++++++++ tests/test_tasks/test_git/__init__.py | 0 tests/test_tasks/test_git/test_correction.py | 158 +++++++++++++++++++ 4 files changed, 307 insertions(+), 35 
deletions(-) create mode 100644 collectoss/tasks/git/correction.py create mode 100644 tests/test_tasks/test_git/__init__.py create mode 100644 tests/test_tasks/test_git/test_correction.py diff --git a/collectoss/application/db/lib.py b/collectoss/application/db/lib.py index ed7613d11..4752f094e 100644 --- a/collectoss/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -13,6 +13,7 @@ from collectoss.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup from collectoss.tasks.util.collection_state import CollectionState +from collectoss.tasks.git.correction import correct_timestamp from collectoss.application.db import get_session, get_engine from collectoss.application.db.util import execute_session_query, convert_type_of_value from collectoss.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts @@ -218,46 +219,30 @@ def facade_bulk_insert_commits(logger, records): facade_bulk_insert_commits(logger, firsthalfRecords) facade_bulk_insert_commits(logger, secondhalfRecords) elif len(records) == 1: + # Binary search isolated the problematic record + # Try to fix invalid timestamps (rare but possible from git corruption) commit_record = records[0] - #replace incomprehensible dates with epoch. 
- #2021-10-11 11:57:46 -0500 - - # placeholder_date = "1970-01-01 00:00:15 -0500" - placeholder_date = commit_record['cmt_author_timestamp'] - - postgres_valid_timezones = { - -1200, -1100, -1000, -930, -900, -800, -700, - -600, -500, -400, -300, -230, -200, -100, 000, - 100, 200, 300, 330, 400, 430, 500, 530, 545, 600, - 630, 700, 800, 845, 900, 930, 1000, 1030, 1100, 1200, - 1245, 1300, 1400 - } - - # Reconstruct timezone portion of the date string to UTC - placeholder_date_segments = re.split(" ", placeholder_date) - tzdata = placeholder_date_segments.pop() - - if ":" in tzdata: - tzdata = tzdata.replace(":", "") - - if int(tzdata) not in postgres_valid_timezones: - tzdata = "+0000" - else: - raise e - placeholder_date_segments.append(tzdata) - - placeholder_date = " ".join(placeholder_date_segments) + # Correct both author and committer timestamps + author_corrected = correct_timestamp( + commit_record.get('cmt_author_timestamp', ''), + fallback=None, + logger=logger + ) + committer_corrected = correct_timestamp( + commit_record.get('cmt_committer_timestamp', ''), + fallback=author_corrected, + logger=logger + ) - #Check for improper utc timezone offset - #UTC timezone offset should be between -14:00 and +14:00 + commit_record['cmt_author_timestamp'] = author_corrected + commit_record['cmt_committer_timestamp'] = committer_corrected - # analyzecommit.generate_commit_record() defines the keys on the commit_record dictionary - commit_record['cmt_author_timestamp'] = placeholder_date - commit_record['cmt_committer_timestamp'] = placeholder_date - - logger.warning(f"commit with invalid timezone set to UTC: {commit_record['cmt_commit_hash']}") + logger.warning( + f"Corrected invalid timestamp(s) for commit {commit_record.get('cmt_commit_hash')}" + ) + # Retry insert with corrected timestamps session.execute( s.insert(Commit), [commit_record], diff --git a/collectoss/tasks/git/correction.py b/collectoss/tasks/git/correction.py new file mode 100644 index 
000000000..df2d97d04 --- /dev/null +++ b/collectoss/tasks/git/correction.py @@ -0,0 +1,129 @@ +""" +Timestamp correction utilities for git commit data. + +This module provides functions to validate and correct timestamp strings +before database insertion, specifically handling invalid timezone offsets +that PostgreSQL cannot process. +""" + +import logging +from typing import List, Optional + + +# Valid PostgreSQL timezone offsets in format ±HHMM (e.g., -0500, +0530) +# Range: -12:00 to +14:00 including all real-world fractional hour offsets +POSTGRES_VALID_TIMEZONES = { + -1200, -1100, -1000, -930, -900, -800, -700, + -600, -500, -430, -400, -330, -300, -230, -200, -100, 0, + 100, 200, 300, 330, 400, 430, 500, 530, 545, 600, + 630, 700, 800, 845, 900, 930, 1000, 1030, 1100, 1130, 1200, + 1245, 1300, 1345, 1400 +} + + +def correct_timestamp( + timestamp_str: str, + fallback: Optional[str] = None, + logger: Optional[logging.Logger] = None +) -> str: + """Fix invalid timezone in timestamp string. + + Validates the timezone portion of a timestamp and corrects it if invalid. + Handles three cases: + 1. Valid timezone → return as-is + 2. Invalid timezone → replace with fallback or UTC + 3. 
Unparseable format → return fallback or default + + Args: + timestamp_str: Timestamp string in format 'YYYY-MM-DD HH:MM:SS ±HHMM' + fallback: Optional fallback timestamp to use if correction needed + logger: Optional logger for recording corrections + + Returns: + Corrected timestamp string safe for PostgreSQL insertion + """ + if not timestamp_str: + return fallback or "1970-01-01 00:00:15 +0000" + + # Split on last space to separate date/time from timezone + # Example: '2025-11-03 16:28:43 -0500' → ['2025-11-03 16:28:43', '-0500'] + parts = timestamp_str.strip().rsplit(' ', 1) + + if len(parts) != 2: + # No space found, can't parse + if logger: + logger.warning(f"Unparseable timestamp format (no space): {timestamp_str}") + return fallback or "1970-01-01 00:00:15 +0000" + + date_time, tz_string = parts + + # Validate timezone starts with + or - + if not tz_string or tz_string[0] not in ('+', '-'): + if logger: + logger.warning(f"Unparseable timezone (no sign): {tz_string}") + return fallback or "1970-01-01 00:00:15 +0000" + + # Normalize timezone: remove colons (handles both -0500 and -05:00) + tz_normalized = tz_string.replace(':', '') + + # Try to parse as integer + try: + tz_offset = int(tz_normalized) + except ValueError: + if logger: + logger.warning(f"Could not parse timezone as integer: {tz_string}") + return fallback or "1970-01-01 00:00:15 +0000" + + # Check if timezone is valid + if tz_offset in POSTGRES_VALID_TIMEZONES: + # Valid timezone, return original + return timestamp_str + + # Invalid timezone detected + if fallback: + if logger: + logger.info(f"Invalid timezone {tz_offset} in '{timestamp_str}', using fallback") + return fallback + + # No fallback, default to UTC + if logger: + logger.warning(f"Invalid timezone {tz_offset} in '{timestamp_str}', defaulting to UTC") + return f"{date_time} +0000" + + +def clean_commit_timestamps(records: List[dict], logger: logging.Logger) -> None: + """Validate and correct timestamps in commit records in-place. 
+ + Processes a batch of commit records, validating both author and committer + timestamps. For invalid committer timestamps, uses the corrected author + timestamp as a fallback before defaulting to UTC. + + This prevents PostgreSQL insertion failures due to invalid timezone offsets + (e.g., -13068837 which is outside the valid ±14:00 range). + + Args: + records: List of commit record dicts with keys: + - 'cmt_author_timestamp' + - 'cmt_committer_timestamp' + logger: Logger for recording corrections + + Returns: + None (modifies records in-place) + """ + for record in records: + author_ts = record.get('cmt_author_timestamp', '') + committer_ts = record.get('cmt_committer_timestamp', '') + + # Correct author timestamp first (no fallback, will use UTC if invalid) + author_corrected = correct_timestamp(author_ts, fallback=None, logger=logger) + + # Correct committer timestamp, using corrected author as fallback + # This minimizes data loss per issue discussion (prefer author time over UTC) + committer_corrected = correct_timestamp( + committer_ts, + fallback=author_corrected, + logger=logger + ) + + record['cmt_author_timestamp'] = author_corrected + record['cmt_committer_timestamp'] = committer_corrected diff --git a/tests/test_tasks/test_git/__init__.py b/tests/test_tasks/test_git/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_tasks/test_git/test_correction.py b/tests/test_tasks/test_git/test_correction.py new file mode 100644 index 000000000..32dc87c47 --- /dev/null +++ b/tests/test_tasks/test_git/test_correction.py @@ -0,0 +1,158 @@ +""" +Unit tests for git commit timestamp correction functions. + +Tests the correction.py module which validates and corrects invalid +timezone offsets in git commit timestamps before PostgreSQL insertion. 
+""" + +import pytest +import logging +from augur.tasks.git.correction import ( + correct_timestamp, + clean_commit_timestamps, + POSTGRES_VALID_TIMEZONES +) + + +@pytest.fixture +def logger(): + """Provide a basic logger for tests.""" + return logging.getLogger("test_correction") + + +class TestCorrectTimestamp: + """Tests for the correct_timestamp function.""" + + def test_valid_timestamp_unchanged(self, logger): + """Valid timestamp should pass through unchanged.""" + valid_ts = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(valid_ts, logger=logger) + assert result == valid_ts + + def test_valid_utc_timestamp(self, logger): + """UTC timestamp (offset 0) should pass through unchanged.""" + utc_ts = "2025-11-03 16:28:43 +0000" + result = correct_timestamp(utc_ts, logger=logger) + assert result == utc_ts + + def test_invalid_timezone_uses_fallback(self, logger): + """Invalid timezone should use fallback timestamp.""" + invalid_ts = "2106-02-07 06:28:23 -13068837" + fallback_ts = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(invalid_ts, fallback=fallback_ts, logger=logger) + assert result == fallback_ts + + def test_invalid_timezone_uses_utc_if_no_fallback(self, logger): + """Invalid timezone without fallback should default to UTC.""" + invalid_ts = "2106-02-07 06:28:23 -13068837" + result = correct_timestamp(invalid_ts, fallback=None, logger=logger) + # Should replace timezone with +0000, keep date/time + assert result == "2106-02-07 06:28:23 +0000" + + def test_empty_string_returns_default(self, logger): + """Empty timestamp string should return default epoch.""" + result = correct_timestamp("", logger=logger) + assert result == "1970-01-01 00:00:15 +0000" + + def test_unparseable_format_returns_default(self, logger): + """Unparseable timestamp format should return default.""" + unparseable = "not a timestamp" + result = correct_timestamp(unparseable, logger=logger) + assert result == "1970-01-01 00:00:15 +0000" + + def 
test_unparseable_with_fallback_returns_fallback(self, logger): + """Unparseable timestamp with fallback should return fallback.""" + unparseable = "not a timestamp" + fallback = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(unparseable, fallback=fallback, logger=logger) + assert result == fallback + + +class TestCleanCommitTimestamps: + """Tests for the clean_commit_timestamps function.""" + + def test_issue_3472_exact_case(self, logger): + """Reproduce the exact bug from issue #3472. + + Author timestamp has valid timezone (-0500). + Committer timestamp has invalid timezone (-13068837). + Should use author timestamp as fallback for committer. + """ + records = [ + { + 'cmt_commit_hash': '5de262a839', + 'cmt_author_timestamp': '2025-11-03 16:28:43 -0500', + 'cmt_committer_timestamp': '2106-02-07 06:28:23 -13068837' + } + ] + + clean_commit_timestamps(records, logger) + + # Author should be unchanged (valid) + assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 -0500' + + # Committer should use author as fallback (invalid → fallback) + assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 -0500' + + def test_clean_commit_timestamps_batch(self, logger): + """Test batch processing of multiple commits.""" + records = [ + { + 'cmt_author_timestamp': '2025-11-03 16:28:43 -0500', # Valid + 'cmt_committer_timestamp': '2025-11-03 16:28:43 -0500' # Valid + }, + { + 'cmt_author_timestamp': '2025-11-04 10:00:00 +0000', # Valid + 'cmt_committer_timestamp': '2106-02-07 06:28:23 -99999' # Invalid + }, + { + 'cmt_author_timestamp': '2025-11-05 12:00:00 -12345', # Invalid + 'cmt_committer_timestamp': '2025-11-05 13:00:00 +0530' # Valid + } + ] + + clean_commit_timestamps(records, logger) + + # Record 1: Both valid, unchanged + assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 -0500' + assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 -0500' + + # Record 2: Author valid, committer invalid → use author as 
fallback + assert records[1]['cmt_author_timestamp'] == '2025-11-04 10:00:00 +0000' + assert records[1]['cmt_committer_timestamp'] == '2025-11-04 10:00:00 +0000' + + # Record 3: Author invalid → UTC, committer valid → unchanged + assert records[2]['cmt_author_timestamp'] == '2025-11-05 12:00:00 +0000' + assert records[2]['cmt_committer_timestamp'] == '2025-11-05 13:00:00 +0530' + + def test_both_timestamps_invalid(self, logger): + """When both timestamps invalid, both should default to UTC.""" + records = [ + { + 'cmt_author_timestamp': '2025-11-03 16:28:43 -99999', + 'cmt_committer_timestamp': '2106-02-07 06:28:23 -88888' + } + ] + + clean_commit_timestamps(records, logger) + + # Author invalid → UTC (no fallback) + assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 +0000' + + # Committer invalid → fallback to corrected author (which is UTC) + assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 +0000' + + +class TestPostgresValidTimezones: + """Verify the POSTGRES_VALID_TIMEZONES set is correct.""" + + def test_valid_timezones_range(self): + """Valid timezones should be in range -12:00 to +14:00.""" + for tz in POSTGRES_VALID_TIMEZONES: + assert -1200 <= tz <= 1400 + + def test_common_timezones_present(self): + """Common timezone offsets should be in the set.""" + common = [0, -500, -400, -800, 100, 530, 800] # UTC, EST, EDT, PST, CET, IST, CST + for tz in common: + assert tz in POSTGRES_VALID_TIMEZONES From a32b2d071513bac9619f82b76b9dfc7984a3215e Mon Sep 17 00:00:00 2001 From: Shlok Gilda Date: Wed, 7 Jan 2026 11:47:48 -0500 Subject: [PATCH 045/165] improve exception handling and update logger usage in timestamp correction tests Signed-off-by: Shlok Gilda --- collectoss/application/db/lib.py | 8 ++-- tests/test_tasks/test_git/test_correction.py | 42 ++++++++++---------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/collectoss/application/db/lib.py b/collectoss/application/db/lib.py index 4752f094e..9250896b2 
100644 --- a/collectoss/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -1,10 +1,8 @@ -import re import time import random import logging import sqlalchemy as s -from sqlalchemy import func -from sqlalchemy.exc import DataError +from sqlalchemy import func from sqlalchemy.dialects import postgresql from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected @@ -162,7 +160,7 @@ def get_working_commits_by_repo_id(repo_id): try: working_commits = fetchall_data_from_sql_text(query) - except: + except Exception: working_commits = [] return working_commits @@ -178,7 +176,7 @@ def get_missing_commit_message_hashes(repo_id): try: missing_commit_hashes = fetchall_data_from_sql_text(fetch_missing_hashes_sql) - except: + except Exception: missing_commit_hashes = [] return missing_commit_hashes diff --git a/tests/test_tasks/test_git/test_correction.py b/tests/test_tasks/test_git/test_correction.py index 32dc87c47..bd9913d48 100644 --- a/tests/test_tasks/test_git/test_correction.py +++ b/tests/test_tasks/test_git/test_correction.py @@ -15,7 +15,7 @@ @pytest.fixture -def logger(): +def test_logger(): """Provide a basic logger for tests.""" return logging.getLogger("test_correction") @@ -23,55 +23,55 @@ def logger(): class TestCorrectTimestamp: """Tests for the correct_timestamp function.""" - def test_valid_timestamp_unchanged(self, logger): + def test_valid_timestamp_unchanged(self, test_logger): """Valid timestamp should pass through unchanged.""" valid_ts = "2025-11-03 16:28:43 -0500" - result = correct_timestamp(valid_ts, logger=logger) + result = correct_timestamp(valid_ts, logger=test_logger) assert result == valid_ts - def test_valid_utc_timestamp(self, logger): + def test_valid_utc_timestamp(self, test_logger): """UTC timestamp (offset 0) should pass through unchanged.""" utc_ts = "2025-11-03 16:28:43 +0000" - result = correct_timestamp(utc_ts, logger=logger) + result = correct_timestamp(utc_ts, logger=test_logger) assert 
result == utc_ts - def test_invalid_timezone_uses_fallback(self, logger): + def test_invalid_timezone_uses_fallback(self, test_logger): """Invalid timezone should use fallback timestamp.""" invalid_ts = "2106-02-07 06:28:23 -13068837" fallback_ts = "2025-11-03 16:28:43 -0500" - result = correct_timestamp(invalid_ts, fallback=fallback_ts, logger=logger) + result = correct_timestamp(invalid_ts, fallback=fallback_ts, logger=test_logger) assert result == fallback_ts - def test_invalid_timezone_uses_utc_if_no_fallback(self, logger): + def test_invalid_timezone_uses_utc_if_no_fallback(self, test_logger): """Invalid timezone without fallback should default to UTC.""" invalid_ts = "2106-02-07 06:28:23 -13068837" - result = correct_timestamp(invalid_ts, fallback=None, logger=logger) + result = correct_timestamp(invalid_ts, fallback=None, logger=test_logger) # Should replace timezone with +0000, keep date/time assert result == "2106-02-07 06:28:23 +0000" - def test_empty_string_returns_default(self, logger): + def test_empty_string_returns_default(self, test_logger): """Empty timestamp string should return default epoch.""" - result = correct_timestamp("", logger=logger) + result = correct_timestamp("", logger=test_logger) assert result == "1970-01-01 00:00:15 +0000" - def test_unparseable_format_returns_default(self, logger): + def test_unparseable_format_returns_default(self, test_logger): """Unparseable timestamp format should return default.""" unparseable = "not a timestamp" - result = correct_timestamp(unparseable, logger=logger) + result = correct_timestamp(unparseable, logger=test_logger) assert result == "1970-01-01 00:00:15 +0000" - def test_unparseable_with_fallback_returns_fallback(self, logger): + def test_unparseable_with_fallback_returns_fallback(self, test_logger): """Unparseable timestamp with fallback should return fallback.""" unparseable = "not a timestamp" fallback = "2025-11-03 16:28:43 -0500" - result = correct_timestamp(unparseable, fallback=fallback, 
logger=logger) + result = correct_timestamp(unparseable, fallback=fallback, logger=test_logger) assert result == fallback class TestCleanCommitTimestamps: """Tests for the clean_commit_timestamps function.""" - def test_issue_3472_exact_case(self, logger): + def test_issue_3472_exact_case(self, test_logger): """Reproduce the exact bug from issue #3472. Author timestamp has valid timezone (-0500). @@ -86,7 +86,7 @@ def test_issue_3472_exact_case(self, logger): } ] - clean_commit_timestamps(records, logger) + clean_commit_timestamps(records, test_logger) # Author should be unchanged (valid) assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 -0500' @@ -94,7 +94,7 @@ def test_issue_3472_exact_case(self, logger): # Committer should use author as fallback (invalid → fallback) assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 -0500' - def test_clean_commit_timestamps_batch(self, logger): + def test_clean_commit_timestamps_batch(self, test_logger): """Test batch processing of multiple commits.""" records = [ { @@ -111,7 +111,7 @@ def test_clean_commit_timestamps_batch(self, logger): } ] - clean_commit_timestamps(records, logger) + clean_commit_timestamps(records, test_logger) # Record 1: Both valid, unchanged assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 -0500' @@ -125,7 +125,7 @@ def test_clean_commit_timestamps_batch(self, logger): assert records[2]['cmt_author_timestamp'] == '2025-11-05 12:00:00 +0000' assert records[2]['cmt_committer_timestamp'] == '2025-11-05 13:00:00 +0530' - def test_both_timestamps_invalid(self, logger): + def test_both_timestamps_invalid(self, test_logger): """When both timestamps invalid, both should default to UTC.""" records = [ { @@ -134,7 +134,7 @@ def test_both_timestamps_invalid(self, logger): } ] - clean_commit_timestamps(records, logger) + clean_commit_timestamps(records, test_logger) # Author invalid → UTC (no fallback) assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 
+0000' From 2a5f4f83fecf5e0311e064083ce290da92dc1115 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 16 Feb 2026 13:19:46 -0500 Subject: [PATCH 046/165] add new unit tests to list Signed-off-by: Adrian Edwards --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c83a0dd03..c6bd0fe77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,6 +147,7 @@ testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", "tests/test_tasks/test_task_utilities/test_util/", + "tests/test_tasks/test_git", # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", # "tests/test_tasks", From 48b4e9882f17c89ea381cf7f05b13c8732509b92 Mon Sep 17 00:00:00 2001 From: Shlok Gilda Date: Wed, 25 Feb 2026 00:38:54 -0500 Subject: [PATCH 047/165] Move timestamp correction utilities from tasks layer to db layer Addresses PR #3518 review feedback from Ulincsys: the db module must not import from the tasks layer, as it's intended to be pip-installable as a standalone package. Moved correction.py to augur/application/db/timestamp_utils.py and relocated its tests accordingly. 
Signed-off-by: Shlok Gilda --- collectoss/application/db/lib.py | 4 +++- .../db/timestamp_utils.py} | 3 ++- pyproject.toml | 2 +- .../test_db/test_timestamp_utils.py} | 15 +++++++++++++-- 4 files changed, 19 insertions(+), 5 deletions(-) rename collectoss/{tasks/git/correction.py => application/db/timestamp_utils.py} (97%) rename tests/{test_tasks/test_git/test_correction.py => test_application/test_db/test_timestamp_utils.py} (90%) diff --git a/collectoss/application/db/lib.py b/collectoss/application/db/lib.py index 9250896b2..c6c52406d 100644 --- a/collectoss/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -10,8 +10,10 @@ from typing_extensions import deprecated from collectoss.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup +# TODO: CollectionState should be moved to augur/application/db/ to eliminate +# this cross-layer dependency — same issue as the correction.py import above. 
from collectoss.tasks.util.collection_state import CollectionState -from collectoss.tasks.git.correction import correct_timestamp +from collectoss.db.timestamp_utils import correct_timestamp from collectoss.application.db import get_session, get_engine from collectoss.application.db.util import execute_session_query, convert_type_of_value from collectoss.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts diff --git a/collectoss/tasks/git/correction.py b/collectoss/application/db/timestamp_utils.py similarity index 97% rename from collectoss/tasks/git/correction.py rename to collectoss/application/db/timestamp_utils.py index df2d97d04..3e969f957 100644 --- a/collectoss/tasks/git/correction.py +++ b/collectoss/application/db/timestamp_utils.py @@ -3,7 +3,8 @@ This module provides functions to validate and correct timestamp strings before database insertion, specifically handling invalid timezone offsets -that PostgreSQL cannot process. +that PostgreSQL cannot process. Resides in the db layer so it can be used +by db-layer bulk insert logic without crossing into the tasks layer. 
""" import logging diff --git a/pyproject.toml b/pyproject.toml index c6bd0fe77..fc09336cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,7 +147,7 @@ testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", "tests/test_tasks/test_task_utilities/test_util/", - "tests/test_tasks/test_git", + "tests/test_application/test_db/test_timestamp_utils.py", # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", # "tests/test_tasks", diff --git a/tests/test_tasks/test_git/test_correction.py b/tests/test_application/test_db/test_timestamp_utils.py similarity index 90% rename from tests/test_tasks/test_git/test_correction.py rename to tests/test_application/test_db/test_timestamp_utils.py index bd9913d48..f822ce96b 100644 --- a/tests/test_tasks/test_git/test_correction.py +++ b/tests/test_application/test_db/test_timestamp_utils.py @@ -1,13 +1,13 @@ """ Unit tests for git commit timestamp correction functions. -Tests the correction.py module which validates and corrects invalid +Tests the timestamp_utils module which validates and corrects invalid timezone offsets in git commit timestamps before PostgreSQL insertion. """ import pytest import logging -from augur.tasks.git.correction import ( +from augur.application.db.timestamp_utils import ( correct_timestamp, clean_commit_timestamps, POSTGRES_VALID_TIMEZONES @@ -67,6 +67,17 @@ def test_unparseable_with_fallback_returns_fallback(self, test_logger): result = correct_timestamp(unparseable, fallback=fallback, logger=test_logger) assert result == fallback + def test_none_returns_default(self, test_logger): + """None timestamp (e.g. 
from record.get() with no default) should return default epoch.""" + result = correct_timestamp(None, logger=test_logger) + assert result == "1970-01-01 00:00:15 +0000" + + def test_none_with_fallback_returns_fallback(self, test_logger): + """None timestamp with fallback should return fallback.""" + fallback = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(None, fallback=fallback, logger=test_logger) + assert result == fallback + class TestCleanCommitTimestamps: """Tests for the clean_commit_timestamps function.""" From 05f5eb6b9235e4199c78ec67b17257d9088fbfab Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 09:04:42 -0400 Subject: [PATCH 048/165] update package name and correct an error with a previous import Signed-off-by: Adrian Edwards --- collectoss/application/db/lib.py | 2 +- tests/test_application/test_db/test_timestamp_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collectoss/application/db/lib.py b/collectoss/application/db/lib.py index c6c52406d..c5394365d 100644 --- a/collectoss/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -13,7 +13,7 @@ # TODO: CollectionState should be moved to augur/application/db/ to eliminate # this cross-layer dependency — same issue as the correction.py import above. 
from collectoss.tasks.util.collection_state import CollectionState -from collectoss.db.timestamp_utils import correct_timestamp +from collectoss.application.db.timestamp_utils import correct_timestamp from collectoss.application.db import get_session, get_engine from collectoss.application.db.util import execute_session_query, convert_type_of_value from collectoss.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts diff --git a/tests/test_application/test_db/test_timestamp_utils.py b/tests/test_application/test_db/test_timestamp_utils.py index f822ce96b..76b1706c6 100644 --- a/tests/test_application/test_db/test_timestamp_utils.py +++ b/tests/test_application/test_db/test_timestamp_utils.py @@ -7,7 +7,7 @@ import pytest import logging -from augur.application.db.timestamp_utils import ( +from collectoss.application.db.timestamp_utils import ( correct_timestamp, clean_commit_timestamps, POSTGRES_VALID_TIMEZONES From 08c638850ac9c116143996a910726a0d6e267e68 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 11:15:45 -0400 Subject: [PATCH 049/165] remove rabbitmq default user tags Signed-off-by: Adrian Edwards --- docker/rabbitmq/collectoss.conf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker/rabbitmq/collectoss.conf b/docker/rabbitmq/collectoss.conf index d31435d02..ee8ed92c2 100644 --- a/docker/rabbitmq/collectoss.conf +++ b/docker/rabbitmq/collectoss.conf @@ -5,7 +5,3 @@ default_permissions.read = .* default_permissions.write = .* default_user_tags.administrator = true -default_user_tags.augur = true -default_user_tags.augurTag = true -default_user_tags.collectoss = true -default_user_tags.collectossTag = true From 2adc4812ed80e0efd4de45d4430a4867beee1650 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 26 May 2026 13:43:43 -0400 Subject: [PATCH 050/165] mark functions related to weight-based scheduling as deprecated Signed-off-by: Adrian Edwards --- 
.../git/util/facade_worker/facade_worker/utilitymethods.py | 7 +++++-- collectoss/tasks/github/util/util.py | 3 +++ collectoss/tasks/util/collection_util.py | 3 ++- collectoss/tasks/util/worker_util.py | 2 ++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 513390d07..afba70fa2 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -35,6 +35,7 @@ from collectoss.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session from collectoss.application.db.util import execute_session_query #from collectoss.tasks.git.util.facade_worker.facade +from typing_extensions import deprecated def update_repo_log(logger, facade_helper, repos_id,status): @@ -176,6 +177,7 @@ def get_repo_commit_count(logger, facade_helper, repo_git): return commit_count +@deprecated("This method of scheduling is legacy and should be removed") def get_facade_weight_time_factor(repo_git): with get_session() as session: @@ -194,15 +196,16 @@ def get_facade_weight_time_factor(repo_git): return time_factor +@deprecated("This method of scheduling is legacy and should be removed") def get_facade_weight_with_commit_count(repo_git, commit_count): return commit_count - get_facade_weight_time_factor(repo_git) - +@deprecated("This method of scheduling is legacy and should be removed") def get_repo_weight_by_commit(logger, repo_git): facade_helper = FacadeHelper(logger) return get_repo_commit_count(logger, facade_helper, repo_git) - get_facade_weight_time_factor(repo_git) - +@deprecated("This method of scheduling is legacy and should be removed") def update_facade_scheduling_fields(repo_git, weight, commit_count): 
repo = get_repo_by_repo_git(repo_git) diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index a0f009855..c25c738d9 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -8,6 +8,7 @@ from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from collectoss.application.db.lib import get_repo_by_repo_git from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps +from typing_extensions import deprecated def get_repo_src_id(owner, repo, logger): @@ -87,6 +88,7 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic logger.warning(f"invalid return. Response was: {response.text}. Exception: {e}") return json.loads(json.dumps(response.text)) +@deprecated("This method of scheduling is legacy and should be removed") def get_repo_weight_by_issue(logger,repo_git): """ Retrieve the sum of the number of issues and prs in a repository from a graphql query. 
@@ -111,6 +113,7 @@ def get_repo_weight_by_issue(logger,repo_git): return number_of_issues_and_prs #Get the weight for each repo for the core collection hook +@deprecated("This method of scheduling is legacy and should be removed") def get_repo_weight_core(logger,repo_git): repo = get_repo_by_repo_git(repo_git) diff --git a/collectoss/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py index 18009d207..9ca6bb059 100644 --- a/collectoss/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -17,7 +17,7 @@ from collectoss.tasks.util.collection_state import CollectionState from collectoss.application.db.session import DatabaseSession from collectoss.application.config import SystemConfig - +from typing_extensions import deprecated class CollectionRequest: def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab_phases=None): @@ -252,6 +252,7 @@ def core_task_success_util(self, repo_git): issue_pr_task_update_weight_util([int(raw_count)],repo_git=repo_git,session=session) #Update the existing core and secondary weights as well as the raw sum of issues and prs +@deprecated("This method of scheduling is legacy and should be removed") def update_issue_pr_weights(logger,session,repo_git,raw_sum): repo = Repo.get_by_repo_git(session, repo_git) status = repo.collection_status[0] diff --git a/collectoss/tasks/util/worker_util.py b/collectoss/tasks/util/worker_util.py index 2c5943560..7f315d5b0 100644 --- a/collectoss/tasks/util/worker_util.py +++ b/collectoss/tasks/util/worker_util.py @@ -11,6 +11,7 @@ import json import subprocess +from typing_extensions import deprecated from collectoss.tasks.util.metadata_exception import MetadataException @@ -109,6 +110,7 @@ def remove_duplicate_naturals(data, natural_keys): def date_weight_factor(days_since_last_collection,domain_shift=0): return (days_since_last_collection - domain_shift) ** 4 +@deprecated("This method of scheduling is legacy and should be removed") 
def calculate_date_weight_from_timestamps(added,last_collection,domain_start_days=30): #Get the time since last collection as well as when the repo was added. if last_collection is None: From 313ca99f3a20f724d5601861a7f645958010683f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 28 May 2026 13:00:31 -0400 Subject: [PATCH 051/165] update bandit actions config to a more current version Signed-off-by: Adrian Edwards --- .github/workflows/bandit.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml index 138b175f5..ce8b50bb7 100644 --- a/.github/workflows/bandit.yml +++ b/.github/workflows/bandit.yml @@ -13,12 +13,12 @@ name: Bandit on: push: - branches: [ "main", "release" ] + branches: ["main", "release"] pull_request: # The branches below must be a subset of the branches above - branches: [ "main" ] + branches: ["main"] schedule: - - cron: '24 2 * * 2' + - cron: "24 2 * * 2" jobs: bandit: @@ -30,7 +30,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Bandit Scan - uses: shundor/python-bandit-scan@ab1d87dfccc5a0ffab88be3aaac6ffe35c10d6cd + uses: reactive-firewall/python-bandit-scan@11a72c7c18aab77758bf6f5d9456f1018ec107b0 with: # optional arguments # exit with 0, even with results found exit_zero: true # optional, default is DEFAULT @@ -43,9 +43,9 @@ jobs: # Report only issues of a given confidence level or higher. Can be LOW, MEDIUM or HIGH. 
Default is UNDEFINED (everything) # confidence: # optional, default is UNDEFINED # comma-separated list of paths (glob patterns supported) to exclude from scan (note that these are in addition to the excluded paths provided in the config file) (default: .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg) - excluded_paths: tests + # excluded_paths: # comma-separated list of test IDs to skip # skips: # optional, default is DEFAULT # path to a .bandit file that supplies command line arguments # ini_path: # optional, default is DEFAULT - + config_path: pyproject.toml From f47a72641c7cb986149d952f0e1291b8f02d1d8c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 14:51:47 -0400 Subject: [PATCH 052/165] remove make section of docs Signed-off-by: Adrian Edwards --- .../development-guide/make/development.rst | 10 --- .../development-guide/make/documentation.rst | 71 ------------------ .../development-guide/make/installation.rst | 74 ------------------- .../source/development-guide/make/testing.rst | 58 --------------- docs/source/development-guide/make/toc.rst | 16 ---- docs/source/development-guide/toc.rst | 1 - 6 files changed, 230 deletions(-) delete mode 100644 docs/source/development-guide/make/development.rst delete mode 100644 docs/source/development-guide/make/documentation.rst delete mode 100644 docs/source/development-guide/make/installation.rst delete mode 100644 docs/source/development-guide/make/testing.rst delete mode 100644 docs/source/development-guide/make/toc.rst diff --git a/docs/source/development-guide/make/development.rst b/docs/source/development-guide/make/development.rst deleted file mode 100644 index e2be118e1..000000000 --- a/docs/source/development-guide/make/development.rst +++ /dev/null @@ -1,10 +0,0 @@ -Development -============ - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. 
_GitHub: https://github.com/chaoss/collectoss/issues - -These commands are used to control CollectOSS's backend and frontend servers simultaneously. diff --git a/docs/source/development-guide/make/documentation.rst b/docs/source/development-guide/make/documentation.rst deleted file mode 100644 index dc8ff0c14..000000000 --- a/docs/source/development-guide/make/documentation.rst +++ /dev/null @@ -1,71 +0,0 @@ -Documentation -============= - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -These commands are used to build and view CollectOSS's documentation. -Before making any documentation changes, please read the `documentation guide <../documentation.html>`_. - --------------------------- - -``make docs`` --------------- -Generate both library and API documentation. - -Example\: - -.. code-block:: bash - - $ make docs - --------------------------- - -``make library-docs`` ----------------------- -Generate the library documentation (the documentation you're reading). - -Example\: - -.. code-block:: bash - - $ make library-docs - --------------------------- - -``make library-docs-view`` --------------------------- -Generate the library documentation, and automatically open a new browser tab to view it. - -Example\: - -.. code-block:: bash - - $ make library-docs-view - --------------------------- - -``make api-docs`` ------------------- -Generate the API documentation. - -Example\: - -.. code-block:: bash - - $ make api-docs - --------------------------- - -``make api-docs-view`` ------------------------ -Generate the API documentation, and automatically open a new browser tab to view it. - -Example\: - -.. 
code-block:: bash - - $ make api-docs-view diff --git a/docs/source/development-guide/make/installation.rst b/docs/source/development-guide/make/installation.rst deleted file mode 100644 index 647ea00f5..000000000 --- a/docs/source/development-guide/make/installation.rst +++ /dev/null @@ -1,74 +0,0 @@ -Installation -============= - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/toss/issues - -This section explicitly explains the commands that are used to manage the installation of CollectOSS locally. - ---------------- - -``make install`` ------------------ -This command installs the project dependencies, sets up the default configuration file, and gathers database credentials. - -Example\: - -.. code-block:: bash - - $ make install - ---------------- - -``make install-dev`` ---------------------- -The same as ``make install``, except it installs the additional developer dependencies and installs the packages in editable mode. - -Example\: - -.. code-block:: bash - - $ make install-dev - ---------------- - -``make clean`` ----------------- -Removes logs, caches, and some other cruft that can get annoying. This command is used when things aren't building properly or you think an old version of collectoss is getting in the way. - -Example\: - -.. code-block:: bash - - $ make clean - ---------------- - -``make rebuild`` ----------------- -Used in conjunction with ``make clean`` to remove all build/compiled files and binaries and reinstall the project. Useful for upgrading in place. - -Example\: - -.. code-block:: bash - - $ make rebuild - ---------------- - -``make rebuild-dev`` ---------------------- -The same as ``make rebuild``, except it installs the additional developer dependencies and installs the packages in editable mode. - -.. note:: - - You can still use ``make clean`` as normal if something went wrong. - -Example\: - -.. 
code-block:: bash - - $ make rebuild-dev diff --git a/docs/source/development-guide/make/testing.rst b/docs/source/development-guide/make/testing.rst deleted file mode 100644 index 05508154e..000000000 --- a/docs/source/development-guide/make/testing.rst +++ /dev/null @@ -1,58 +0,0 @@ -Testing -======= - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -These commands are used to run specific subsets of unit tests. We use ``pytest`` as our test runner. - --------------- - -``make test`` -------------- -This command runs ALL available tests for both the metric functions and their API endpoints. - -Example\: - -.. code-block:: bash - - $ make test - --------------- - -``make test-metrics`` ------------------------- -This command will run ALL unit tests for the metric functions. - -Example\: - -.. code-block:: bash - - $ make test-metrics - --------------- - -``make test-metrics-api`` --------------------------- -The above command runs ALL tests for the metrics API. - -Example\: - -.. code-block:: bash - - $ make test-metrics-api - --------------- - -``pytest`` ----------- -You can also run the tests directly using the ``pytest`` command. - -Example\: - -.. code-block:: bash - - $ uv run pytest diff --git a/docs/source/development-guide/make/toc.rst b/docs/source/development-guide/make/toc.rst deleted file mode 100644 index 75da18238..000000000 --- a/docs/source/development-guide/make/toc.rst +++ /dev/null @@ -1,16 +0,0 @@ -Make commands -=============== - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -.. 
toctree:: - :maxdepth: 1 - - installation - development - testing - documentation diff --git a/docs/source/development-guide/toc.rst b/docs/source/development-guide/toc.rst index fc447be06..c6b10af1c 100644 --- a/docs/source/development-guide/toc.rst +++ b/docs/source/development-guide/toc.rst @@ -7,7 +7,6 @@ This is the development guide for CollectOSS. See our `Contributing to CollectOS :maxdepth: 1 installation - make/toc logging documentation workers/toc From c9f66cd3f9029c98f66e5b3038a33cf1f65c7ce0 Mon Sep 17 00:00:00 2001 From: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:28:03 -0400 Subject: [PATCH 053/165] set stable docs urls back to "release" Co-authored-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Signed-off-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 42011a252..ad19e84e3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ These resources are a great way to meet the people behind the project, ask quest ## Learn about the project -If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://docs.collectoss.org/en/latest/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. +If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://docs.collectoss.org/en/release/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. 
## Opening an issue If you're experiencing an issue with CollectOSS you can search for your problem or question on our [issues](https://github.com/chaoss/collectoss/issues) page to see if someone else has already reported it. If you cannot find your issue, please feel free to [open a new one](https://github.com/chaoss/collectoss/issues/new/choose). @@ -53,7 +53,7 @@ Github has an article called [Syncing a fork](https://docs.github.com/en/pull-re ## Helpful Links -- [CollectOSS stable documentation](https://docs.collectoss.org/en/latest/) +- [CollectOSS stable documentation](https://docs.collectoss.org/en/release/) - [CHAOSS Getting Started page](https://chaoss.community/kb-getting-started/) **Git & GitHub** From 1c653c32dbb08526c4d79009c7187bb7111958e2 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 5 Jun 2026 13:26:16 -0400 Subject: [PATCH 054/165] nullify migration that should never have been a migration in the first place Signed-off-by: Adrian Edwards --- .../versions/34_add_contrib_to_config.py | 34 ++----------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py b/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py index f4e17a08b..ba0c4568a 100644 --- a/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py +++ b/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py @@ -20,38 +20,8 @@ logger = logging.getLogger(__name__) def upgrade(): - - with DatabaseSession(logger) as session: - config = SystemConfig(logger,session) - config_dict = config.load_config() - - #Update the missing fields of the facade section in the config - section = config_dict.get("Facade") - - #Just copy the default if section doesn't exist. 
- if section: - if 'facade_contributor_full_recollect' not in section.keys(): - section['facade_contributor_full_recollect'] = 0 - - else: - section = config.default_config["Facade"] - - config.add_section_from_json("Facade", section) + pass def downgrade(): - - conn = op.get_bind() - - conn.execute(text(f""" - DELETE FROM augur_operations.config - WHERE section_name='Facade' AND (setting_name='facade_contributor_full_recollect'); - """)) - - try: - conn.execute(text(f""" - DELETE FROM augur_operations.config - WHERE section_name='Facade' AND (setting_name='facade_contributor_full_recollect'); - """)) - except: - pass \ No newline at end of file + pass \ No newline at end of file From 2eaa54f921c7f99c993136c3feb1e3bfdbf0e262 Mon Sep 17 00:00:00 2001 From: Diptesh Roy Date: Sun, 7 Jun 2026 02:11:27 +0530 Subject: [PATCH 055/165] fix: correct biweely typo to biweekly in GOVERNANCE.md Signed-off-by: Diptesh Roy --- GOVERNANCE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GOVERNANCE.md b/GOVERNANCE.md index bdd6f2560..4b01e769a 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -121,7 +121,7 @@ While most business in CollectOSS is conducted by "[lazy consensus](https://comm periodically the Maintainers may need to vote on specific actions or changes. A vote can be taken on the project's public Slack channel (#wg-collectoss-8knot in the [CHAOSS Slack](https://chaoss.community/kb-getting-started/)) or the private Maintainer Slack channel for security or conduct matters. -Votes may also be taken at the biweely developer meeting. Any Maintainer may +Votes may also be taken at the biweekly developer meeting. Any Maintainer may demand a vote be taken. 
Most votes require a simple majority of all Maintainers to succeed, except where From 9c0b67c51605c440966c5f7e1e7615f6cad1e433 Mon Sep 17 00:00:00 2001 From: Diptesh Roy Date: Sun, 7 Jun 2026 02:15:40 +0530 Subject: [PATCH 056/165] docs: re-introduce data sources section to scope.rst Re-adds the data sources list that was removed from the README in PR #2 and adds it to the readthedocs documentation as intended. Also adds a 5th entry for OpenSSF Scorecard analysis as suggested. Fixes #335 Signed-off-by: Diptesh Roy --- docs/source/about/scope.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/source/about/scope.rst b/docs/source/about/scope.rst index e89b319dc..70ccd4749 100644 --- a/docs/source/about/scope.rst +++ b/docs/source/about/scope.rst @@ -7,4 +7,15 @@ The data CollectOSS collects covers more than just code contributions and extend This scope is intentionally narrower than that of the CHAOSS project as a whole to help keep the CollectOSS project sustainable with the resources available. Usecases and discussion of perspectives outside this defined scope are still welcome in the CHAOSS community, but may not be a good fit for direct contributions to CollectOSS. These usecases may work best as a complementary add-on project, new working group, or third-party addon to collectoss that depends on or extends CollectOSS functionality. -Future expansions of CollectOSS's scope may also bring in these community addons into the main codebase if new resources become available to sustain such expansion. \ No newline at end of file +Future expansions of CollectOSS's scope may also bring in these community addons into the main codebase if new resources become available to sustain such expansion. + +Data Sources +------------ + +CollectOSS collects data from a variety of sources: + +1. Raw Git commit logs (commits, contributors) +2. GitHub's API (issues, pull requests, contributors, releases, repository metadata) +3. 
The Linux Foundation's `Core Infrastructure Initiative `_ API (repository metadata) +4. `Succinct Code Counter `_, a blazingly fast Sloc, Cloc, and Code tool that also performs COCOMO calculations +5. `OpenSSF Scorecard `_ analysis (security health metrics for open source projects) \ No newline at end of file From a5d6143e8afe67a5f12152f168df9d19d13d0724 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:08:02 -0400 Subject: [PATCH 057/165] rename augur_data model file to data Signed-off-by: Adrian Edwards --- collectoss/application/db/models/__init__.py | 2 +- collectoss/application/db/models/{augur_data.py => data.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename collectoss/application/db/models/{augur_data.py => data.py} (100%) diff --git a/collectoss/application/db/models/__init__.py b/collectoss/application/db/models/__init__.py index bed0e4c8e..11c6b38fe 100644 --- a/collectoss/application/db/models/__init__.py +++ b/collectoss/application/db/models/__init__.py @@ -1,4 +1,4 @@ -from collectoss.application.db.models.augur_data import ( +from collectoss.application.db.models.data import ( ChaossMetricStatus, ChaossUser, ContributorAffiliation, diff --git a/collectoss/application/db/models/augur_data.py b/collectoss/application/db/models/data.py similarity index 100% rename from collectoss/application/db/models/augur_data.py rename to collectoss/application/db/models/data.py From f5a8d143a41c57a4310a1aafd23c115250a02c9d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:09:34 -0400 Subject: [PATCH 058/165] rename augur_data to collection_data in schema arguments Signed-off-by: Adrian Edwards --- collectoss/application/db/models/data.py | 42 ++++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/collectoss/application/db/models/data.py b/collectoss/application/db/models/data.py index 7ea85eefc..a4e6b7e53 100644 --- a/collectoss/application/db/models/data.py +++ 
b/collectoss/application/db/models/data.py @@ -54,7 +54,7 @@ nullable=False, server_default=text("CURRENT_TIMESTAMP"), ), - schema="augur_data", + schema="collection_data", ) Index('repos_id', t_analysis_log.c.repos_id) @@ -337,7 +337,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): ), Index("repo_id,email_copy_1", "repo_id", "email"), Index("repo_id,affiliation_copy_1", "repo_id", "affiliation"), - schema="augur_data", + schema="collection_data", ) @@ -363,7 +363,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): ), Index("projects_id,email_copy_1", "repo_group_id", "email"), Index("projects_id,affiliation_copy_1", "repo_group_id", "affiliation"), - schema="augur_data", + schema="collection_data", ) @@ -394,7 +394,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index( "projects_id,year,affiliation_copy_1", "repo_group_id", "year", "affiliation" ), - schema="augur_data", + schema="collection_data", ) @@ -423,7 +423,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("projects_id,email", "repo_group_id", "email"), Index("projects_id,year,email", "repo_group_id", "year", "email"), Index("projects_id,year,affiliation", "repo_group_id", "year", "affiliation"), - schema="augur_data", + schema="collection_data", ) @@ -452,7 +452,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("repo_id,year,affiliation_copy_1", "repo_id", "year", "affiliation"), Index("repo_id,affiliation_copy_2", "repo_id", "affiliation"), Index("repo_id,email_copy_2", "repo_id", "email"), - schema="augur_data", + schema="collection_data", ) @@ -481,7 +481,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("repo_id,email", "repo_id", "email"), Index("repo_id,year,email", "repo_id", "year", "email"), Index("repo_id,year,affiliation", "repo_id", "year", "affiliation"), - schema="augur_data", + 
schema="collection_data", ) @@ -530,7 +530,7 @@ class Platform(Base): pltfrm_id = Column( BigInteger, - Sequence('platform_pltfrm_id_seq', start=25430, schema="augur_data"), + Sequence('platform_pltfrm_id_seq', start=25430, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.platform_pltfrm_id_seq'::regclass)"), ) @@ -622,7 +622,7 @@ def get_by_name(session, rg_name): ), Index("repos_id,status", "repos_id", "status"), Index("repos_id,statusops", "repos_id", "status"), - schema="augur_data", + schema="collection_data", ) @@ -678,7 +678,7 @@ class TopicWord(Base): server_default=text("CURRENT_TIMESTAMP"), ), Index("type,projects_id", "type", "repo_group_id"), - schema="augur_data", + schema="collection_data", ) @@ -710,7 +710,7 @@ class UtilityLog(Base): id = Column( BigInteger, - Sequence('utility_log_id_seq1', start=1, schema="augur_data"), + Sequence('utility_log_id_seq1', start=1, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.utility_log_id_seq1'::regclass)"), ) @@ -728,7 +728,7 @@ class UtilityLog(Base): Column( "working_commit", String(40), server_default=text("'NULL'::character varying") ), - schema="augur_data", + schema="collection_data", ) @@ -1326,7 +1326,7 @@ class Commit(Base): cmt_id = Column( BigInteger, - Sequence('commits_cmt_id_seq', start=25430, schema="augur_data"), + Sequence('commits_cmt_id_seq', start=25430, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), ) @@ -1411,7 +1411,7 @@ class CommitMessage(Base): cmt_msg_id = Column( BigInteger, - Sequence('commits_cmt_id_seq', start=25430, schema="augur_data"), + Sequence('commits_cmt_id_seq', start=25430, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), ) @@ -1447,7 +1447,7 @@ class Issue(Base): issue_id = Column( BigInteger, - Sequence('issue_seq', start=31000, schema="augur_data"), + 
Sequence('issue_seq', start=31000, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.issue_seq'::regclass)"), ) @@ -1513,7 +1513,7 @@ class Library(Base): library_id = Column( BigInteger, - Sequence('libraries_library_id_seq', start=25430, schema="augur_data"), + Sequence('libraries_library_id_seq', start=25430, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.libraries_library_id_seq'::regclass)"), ) @@ -1597,7 +1597,7 @@ class Message(Base): msg_id = Column( BigInteger, - Sequence('message_msg_id_seq', start=25430, schema="augur_data"), + Sequence('message_msg_id_seq', start=25430, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.message_msg_id_seq'::regclass)"), ) @@ -1887,7 +1887,7 @@ class Release(Base): release_id = Column( CHAR(256), - Sequence('releases_release_id_seq', start=1, schema="augur_data"), + Sequence('releases_release_id_seq', start=1, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.releases_release_id_seq'::regclass)"), ) @@ -2149,7 +2149,7 @@ class RepoInsight(Base): ri_id = Column( BigInteger, - Sequence('repo_insights_ri_id_seq', start=25430, schema="augur_data"), + Sequence('repo_insights_ri_id_seq', start=25430, schema="collection_data"), primary_key=True, server_default=text("nextval('augur_data.repo_insights_ri_id_seq'::regclass)"), ) @@ -2268,7 +2268,7 @@ class RepoMeta(Base): ) rmeta_id = Column( BigInteger, - Sequence('repo_meta_rmeta_id_seq', start=25430, schema="augur_data"), + Sequence('repo_meta_rmeta_id_seq', start=25430, schema="collection_data"), primary_key=True, nullable=False, server_default=text("nextval('augur_data.repo_meta_rmeta_id_seq'::regclass)"), @@ -2312,7 +2312,7 @@ class RepoStat(Base): ) rstat_id = Column( BigInteger, - Sequence('repo_stats_rstat_id_seq', start=25430, schema="augur_data"), + Sequence('repo_stats_rstat_id_seq', start=25430, schema="collection_data"), 
primary_key=True, nullable=False, server_default=text("nextval('augur_data.repo_stats_rstat_id_seq'::regclass)"), From 87db0a19c5821a5624c71fc37719e04c0a506718 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:10:56 -0400 Subject: [PATCH 059/165] rename augur_data to collection_data in table arguments Signed-off-by: Adrian Edwards --- .../application/db/models/augur_operations.py | 2 +- collectoss/application/db/models/data.py | 138 +++++++++--------- 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/collectoss/application/db/models/augur_operations.py b/collectoss/application/db/models/augur_operations.py index 760ea6c1a..5f25a92f4 100644 --- a/collectoss/application/db/models/augur_operations.py +++ b/collectoss/application/db/models/augur_operations.py @@ -221,7 +221,7 @@ class WorkerSettingsFacade(Base): class BadgingDEI(Base): __tablename__ = 'dei_badging' - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} id = Column(Integer, primary_key=True, nullable=False) badging_id = Column(Integer, nullable=False) level = Column(String, nullable=False) diff --git a/collectoss/application/db/models/data.py b/collectoss/application/db/models/data.py index a4e6b7e53..8d6dd17a5 100644 --- a/collectoss/application/db/models/data.py +++ b/collectoss/application/db/models/data.py @@ -63,7 +63,7 @@ class ChaossMetricStatus(Base): __tablename__ = "chaoss_metric_status" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": "This table used to track CHAOSS Metric implementations, but due to the constantly changing location of that information, it is for the moment not actively populated. 
", } @@ -97,7 +97,7 @@ class ChaossMetricStatus(Base): class ChaossUser(Base): __tablename__ = "chaoss_user" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} chaoss_id = Column( BigInteger, @@ -122,7 +122,7 @@ class ChaossUser(Base): class ContributorAffiliation(Base): __tablename__ = "contributor_affiliations" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": "This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and collectoss augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when CollectOSS is finished counting commits using the Facade Worker. ", } @@ -178,7 +178,7 @@ class Contributor(Base): Index("login-contributor-idx", "cntrb_login"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "For GitHub, this should be repeated from gh_login. for other systems, it should be that systems login. \nGithub now allows a user to change their login name, but their user id remains the same in this case. So, the natural key is the combination of id and login, but there should never be repeated logins. 
", }, ) @@ -487,7 +487,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): class Exclude(Base): __tablename__ = "exclude" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} id = Column(Integer, primary_key=True) projects_id = Column(Integer, nullable=False) @@ -497,7 +497,7 @@ class Exclude(Base): class LstmAnomalyModel(Base): __tablename__ = "lstm_anomaly_models" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} model_id = Column( BigInteger, @@ -525,7 +525,7 @@ class Platform(Base): __tablename__ = "platform" __table_args__ = ( Index("plat", "pltfrm_id", unique=True), - {"schema": "augur_data"} + {"schema": "collection_data"} ) pltfrm_id = Column( @@ -548,7 +548,7 @@ class RepoGroup(Base): __table_args__ = ( Index("rgidm", "repo_group_id", unique=True), Index("rgnameindex", "rg_name"), - {"schema": "augur_data", + {"schema": "collection_data", "comment": "rg_type is intended to be either a GitHub Organization or a User Created Repo Group. 
"}, ) @@ -628,7 +628,7 @@ def get_by_name(session, rg_name): class Settings(Base): __tablename__ = "settings" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} id = Column(Integer, primary_key=True) setting = Column(String(32), nullable=False) @@ -640,7 +640,7 @@ class Settings(Base): class TopicWord(Base): __tablename__ = "topic_words" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} topic_words_id = Column( BigInteger, @@ -684,7 +684,7 @@ class TopicWord(Base): class UnresolvedCommitEmail(Base): __tablename__ = "unresolved_commit_emails" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} email_unresolved_id = Column( BigInteger, @@ -706,7 +706,7 @@ class UnresolvedCommitEmail(Base): class UtilityLog(Base): __tablename__ = "utility_log" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} id = Column( BigInteger, @@ -737,7 +737,7 @@ class ContributorRepo(Base): __table_args__ = ( UniqueConstraint("event_id", "tool_version"), { - "schema": "augur_data", + "schema": "collection_data", "comment": 'Developed in Partnership with Andrew Brain.', }, ) @@ -782,7 +782,7 @@ class ContributorsAlias(Base): __table_args__ = ( UniqueConstraint("cntrb_id","alias_email", name="cntrb-email-insert-unique"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "Every open source user may have more than one email used to make contributions over time. CollectOSS selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. 
Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ", }, ) @@ -838,7 +838,7 @@ class Repo(Base): Index("therepo", "repo_id", unique=True), { - "schema": "augur_data", + "schema": "collection_data", "comment": "This table is a combination of the columns in Facade’s repo table and GHTorrent’s projects table. ", }, ) @@ -1192,7 +1192,7 @@ class HistoricalRepoURLs(Base): """ __tablename__ = "historical_repo_urls" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} repo_id = Column(ForeignKey("augur_data.repo.repo_id"), primary_key=True) git_url = Column(String, primary_key=True) @@ -1200,7 +1200,7 @@ class HistoricalRepoURLs(Base): class RepoTestCoverage(Base): __tablename__ = "repo_test_coverage" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} repo_id = Column( ForeignKey("augur_data.repo.repo_id"), @@ -1231,7 +1231,7 @@ class RepoTestCoverage(Base): class RepoGroupInsight(Base): __tablename__ = "repo_group_insights" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": 'This table is output from an analytical worker. It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. 
', } @@ -1266,7 +1266,7 @@ class RepoGroupsListServe(Base): __table_args__ = ( UniqueConstraint("rgls_id", "repo_group_id"), Index("lister", "rgls_id", "repo_group_id", unique=True), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) rgls_id = Column( @@ -1319,7 +1319,7 @@ class Commit(Base): Index("repo_id,commit", "repo_id", "cmt_commit_hash"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "Commits.\nEach row represents changes to one FILE within a single commit. So you will encounter multiple rows per commit hash in many cases. ", }, ) @@ -1404,7 +1404,7 @@ class CommitMessage(Base): __table_args__ = ( UniqueConstraint("repo_id","cmt_hash", name="commit-message-insert-unique"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "This table holds commit messages", } ) @@ -1442,7 +1442,7 @@ class Issue(Base): UniqueConstraint("repo_id", "gh_issue_id"), UniqueConstraint("issue_url", name="issue-insert-unique"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) issue_id = Column( @@ -1509,7 +1509,7 @@ class Issue(Base): class Library(Base): __tablename__ = "libraries" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} library_id = Column( BigInteger, @@ -1551,7 +1551,7 @@ class Library(Base): class LstmAnomalyResult(Base): __tablename__ = "lstm_anomaly_results" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} result_id = Column( BigInteger, @@ -1592,7 +1592,7 @@ class Message(Base): Index("msg-cntrb-id-idx", "cntrb_id"), Index("platformgrouper", "msg_id", "pltfrm_id"), Index("messagegrouper", "msg_id", "rgls_id", unique=True), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) msg_id = Column( @@ -1661,7 +1661,7 @@ class Message(Base): class MessageAnalysisSummary(Base): __tablename__ = "message_analysis_summary" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": "In 
a relationally perfect world, we would have a table called “message_analysis_run” the incremented the “worker_run_id” for both message_analysis and message_analysis_summary. For now, we decided this was overkill. ", } @@ -1701,7 +1701,7 @@ class MessageAnalysisSummary(Base): class MessageSentimentSummary(Base): __tablename__ = "message_sentiment_summary" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": "In a relationally perfect world, we would have a table called “message_sentiment_run” the incremented the “worker_run_id” for both message_sentiment and message_sentiment_summary. For now, we decided this was overkill. ", } @@ -1749,7 +1749,7 @@ class PullRequest(Base): "pull_requests_idx_repo_id_data_datex", "repo_id", "data_collection_date" ), Index("pr_ID_prs_table", "pull_request_id"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pull_request_id = Column( @@ -1883,7 +1883,7 @@ def from_github(cls, pr, repo_id, tool_source, tool_version): class Release(Base): __tablename__ = "releases" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} release_id = Column( CHAR(256), @@ -1916,7 +1916,7 @@ class Release(Base): class RepoBadging(Base): __tablename__ = "repo_badging" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": "This will be collected from the LF’s Badging API\nhttps://bestpractices.coreinfrastructure.org/projects.json?pq=https%3A%2F%2Fgithub.com%2Fchaoss%2Faugur\n", } @@ -1960,7 +1960,7 @@ def insert(session, repo_id: int, data: dict) -> dict: class RepoClusterMessage(Base): __tablename__ = "repo_cluster_messages" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} msg_cluster_id = Column( BigInteger, @@ -1988,7 +1988,7 @@ class RepoDependency(Base): __table_args__ = ( UniqueConstraint("repo_id","dep_name","data_collection_date", name="deps-insert-unique"), { - "schema": "augur_data", + 
"schema": "collection_data", "comment": "Contains the dependencies for a repo." }, ) @@ -2021,7 +2021,7 @@ class RepoDepsLibyear(Base): __tablename__ = "repo_deps_libyear" __table_args__ = ( UniqueConstraint("repo_id","name", "data_collection_date", name="deps-libyear-insert-unique"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) repo_deps_libyear_id = Column( @@ -2056,7 +2056,7 @@ class RepoDepsScorecard(Base): __tablename__ = "repo_deps_scorecard" __table_args__ = ( UniqueConstraint("repo_id","name", "data_collection_date", name="deps_scorecard_new_unique"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) repo_deps_scorecard_id = Column( @@ -2087,7 +2087,7 @@ class RepoInfo(Base): __table_args__ = ( Index("repo_info_idx_repo_id_data_date_1x", "repo_id", "data_collection_date"), Index("repo_info_idx_repo_id_data_datex", "repo_id", "data_collection_date"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) repo_info_id = Column( @@ -2143,7 +2143,7 @@ class RepoInfo(Base): class RepoInsight(Base): __tablename__ = "repo_insights" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": 'This table is output from an analytical worker. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. 
', } @@ -2178,7 +2178,7 @@ class RepoInsightsRecord(Base): __tablename__ = "repo_insights_records" __table_args__ = ( Index("dater", "ri_date"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) ri_id = Column( @@ -2223,7 +2223,7 @@ class RepoLabor(Base): __table_args__ = ( UniqueConstraint("repo_id", "rl_analysis_date", "file_path", "file_name"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "repo_labor is a derivative of tables used to store scc code and complexity counting statistics that are inputs to labor analysis, which are components of CHAOSS value metric calculations. ", }, ) @@ -2261,7 +2261,7 @@ class RepoLabor(Base): class RepoMeta(Base): __tablename__ = "repo_meta" - __table_args__ = {"schema": "augur_data", "comment": "Project Languages"} + __table_args__ = {"schema": "collection_data", "comment": "Project Languages"} repo_id = Column( ForeignKey("augur_data.repo.repo_id"), primary_key=True, nullable=False @@ -2285,7 +2285,7 @@ class RepoMeta(Base): class RepoSbomScan(Base): __tablename__ = "repo_sbom_scans" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} rsb_id = Column( BigInteger, @@ -2305,7 +2305,7 @@ class RepoSbomScan(Base): class RepoStat(Base): __tablename__ = "repo_stats" - __table_args__ = {"schema": "augur_data", "comment": "Project Watchers"} + __table_args__ = {"schema": "collection_data", "comment": "Project Watchers"} repo_id = Column( ForeignKey("augur_data.repo.repo_id"), primary_key=True, nullable=False @@ -2329,7 +2329,7 @@ class RepoStat(Base): class RepoTopic(Base): __tablename__ = "repo_topic" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} repo_topic_id = Column( BigInteger, @@ -2356,7 +2356,7 @@ class CommitCommentRef(Base): __tablename__ = "commit_comment_ref" __table_args__ = ( Index("comment_id", "cmt_comment_src_id", "cmt_comment_id", "msg_id"), - {"schema": "augur_data"}, + {"schema": 
"collection_data"}, ) cmt_comment_id = Column( @@ -2413,7 +2413,7 @@ class CommitParent(Base): __table_args__ = ( Index("commit_parents_ibfk_1", "cmt_id"), Index("commit_parents_ibfk_2", "parent_id"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) cmt_id = Column( @@ -2446,7 +2446,7 @@ class CommitParent(Base): class DiscourseInsight(Base): __tablename__ = "discourse_insights" __table_args__ = { - "schema": "augur_data", + "schema": "collection_data", "comment": "This table is populated by the “Discourse_Analysis_Worker”. It examines sequential discourse, using computational linguistic methods, to draw statistical inferences regarding the discourse in a particular comment thread. ", } @@ -2475,7 +2475,7 @@ class IssueAssignee(Base): __table_args__ = ( Index("issue-cntrb-assign-idx-1", "cntrb_id"), UniqueConstraint("issue_assignee_src_id", "issue_id", name="issue-assignee-insert-unique"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) issue_assignee_id = Column( @@ -2535,7 +2535,7 @@ class IssueEvent(Base): Index("issue_events_ibfk_1", "issue_id"), Index("issue_events_ibfk_2", "cntrb_id"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) event_id = Column( @@ -2620,7 +2620,7 @@ class IssueLabel(Base): __tablename__ = "issue_labels" __table_args__ = ( UniqueConstraint("label_src_id", "issue_id"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) issue_label_id = Column( @@ -2677,7 +2677,7 @@ class IssueMessageRef(Base): __tablename__ = "issue_message_ref" __table_args__ = ( UniqueConstraint("issue_msg_ref_src_comment_id", "issue_id", name="issue-message-ref-insert-unique"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) issue_msg_ref_id = Column( @@ -2739,7 +2739,7 @@ class LibraryDependency(Base): __tablename__ = "library_dependencies" __table_args__ = ( Index("REPO_DEP", "library_id"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) lib_dependency_id = Column( @@ -2767,7 +2767,7 @@ 
class LibraryDependency(Base): class LibraryVersion(Base): __tablename__ = "library_version" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} library_version_id = Column( BigInteger, @@ -2793,7 +2793,7 @@ class LibraryVersion(Base): class MessageAnalysis(Base): __tablename__ = "message_analysis" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} msg_analysis_id = Column( BigInteger, @@ -2836,7 +2836,7 @@ class MessageAnalysis(Base): class MessageSentiment(Base): __tablename__ = "message_sentiment" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} msg_analysis_id = Column( BigInteger, @@ -2915,7 +2915,7 @@ class PullRequestAnalysis(Base): __table_args__ = ( Index("pr_anal_idx", pull_request_id), Index("probability_idx", merge_probability.desc().nullslast()), - {"schema": "augur_data"} + {"schema": "collection_data"} ) pull_request = relationship("PullRequest") @@ -2926,7 +2926,7 @@ class PullRequestAssignee(Base): __table_args__ = ( Index("pr_meta_cntrb-idx", "contrib_id"), UniqueConstraint("pull_request_id", "pr_assignee_src_id", name="assigniees-unique"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) pr_assignee_map_id = Column( @@ -2987,7 +2987,7 @@ class PullRequestCommit(Base): __table_args__ = ( UniqueConstraint("pull_request_id", "repo_id", "pr_cmt_sha"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. 
In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. ", }, ) @@ -3044,7 +3044,7 @@ class PullRequestEvent(Base): UniqueConstraint("repo_id", "issue_event_src_id", name="pr_events_repo_id_event_src_id_unique"), UniqueConstraint("platform_id", "node_id", name="unique-pr-event-id"), UniqueConstraint("node_id", name="pr-unqiue-event"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pr_event_id = Column( @@ -3142,7 +3142,7 @@ class PullRequestFile(Base): Index("pr_id_pr_files","pull_request_id"), UniqueConstraint("pull_request_id", "repo_id", "pr_file_path", name="prfiles_unique"), { - "schema": "augur_data", + "schema": "collection_data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. 
", }, ) @@ -3196,7 +3196,7 @@ class PullRequestLabel(Base): __tablename__ = "pull_request_labels" __table_args__ = ( UniqueConstraint("pr_src_id", "pull_request_id"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pr_label_id = Column( @@ -3258,7 +3258,7 @@ class PullRequestMessageRef(Base): __tablename__ = "pull_request_message_ref" __table_args__ = ( UniqueConstraint("pr_message_ref_src_comment_id", "pull_request_id", name="pull-request-message-ref-insert-unique"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pr_msg_ref_id = Column( @@ -3310,7 +3310,7 @@ class PullRequestMeta(Base): __table_args__ = ( Index("pr_meta-cntrbid-idx", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_head_or_base", 'pr_sha', name="pull-request-meta-insert-unique"), - {"schema": "augur_data", + {"schema": "collection_data", "comment": 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. 
Similar functions exist in GitLab, though the language here is based on GitHub.'}, ) @@ -3384,7 +3384,7 @@ class PullRequestReviewer(Base): __table_args__ = ( Index("pr-reviewers-cntrb-idx1", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_reviewer_src_id"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pr_reviewer_map_id = Column( @@ -3446,7 +3446,7 @@ class PullRequestReview(Base): __table_args__ = ( UniqueConstraint("pr_review_src_id", name="pr_review_unique"), Index("pr_id_pr_reviews", "pull_request_id"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pr_review_id = Column( @@ -3515,7 +3515,7 @@ class PullRequestReview(Base): class PullRequestTeam(Base): __tablename__ = "pull_request_teams" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} pr_team_id = Column( BigInteger, @@ -3557,7 +3557,7 @@ class PullRequestRepo(Base): __tablename__ = "pull_request_repo" __table_args__ = ( Index("pr-cntrb-idx-repo", "pr_cntrb_id"), - {"schema": "augur_data", + {"schema": "collection_data", "comment": "This table is for storing information about forks that exist as part of a pull request. Generally we do not want to track these like ordinary repositories. 
"}, ) @@ -3601,7 +3601,7 @@ class PullRequestReviewMessageRef(Base): __tablename__ = "pull_request_review_message_ref" __table_args__ = ( UniqueConstraint("pr_review_msg_src_id", name="pull-request-review-message-ref-insert-unique"), - {"schema": "augur_data"}, + {"schema": "collection_data"}, ) pr_review_msg_ref_id = Column( @@ -3675,7 +3675,7 @@ class PullRequestReviewMessageRef(Base): class RepoClone(Base): __tablename__ = "repo_clones_data" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} repo_clone_data_id = Column( BigInteger, @@ -3704,7 +3704,7 @@ class RepoClone(Base): class TopicModelMeta(Base): __tablename__ = "topic_model_meta" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "collection_data"} model_id = Column( UUID(as_uuid=True), @@ -3811,7 +3811,7 @@ class TopicModelEvent(Base): __table_args__ = ( Index("ix_tme_repo_ts", "repo_id", "ts"), Index("ix_tme_event", "event"), - {"schema": "augur_data"} + {"schema": "collection_data"} ) event_id = Column( From 8a1e9a07bbcce4c98e65659fc2ff53736dfeff0f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:12:27 -0400 Subject: [PATCH 060/165] schema arguments (but single quotes this time) Signed-off-by: Adrian Edwards --- collectoss/application/db/models/data.py | 104 +++++++++++------------ 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/collectoss/application/db/models/data.py b/collectoss/application/db/models/data.py index 8d6dd17a5..650c793cb 100644 --- a/collectoss/application/db/models/data.py +++ b/collectoss/application/db/models/data.py @@ -69,7 +69,7 @@ class ChaossMetricStatus(Base): cms_id = Column( BigInteger, - Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='augur_data'), + Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.chaoss_metric_status_cms_id_seq'::regclass)" @@ -101,7 +101,7 @@ class 
ChaossUser(Base): chaoss_id = Column( BigInteger, - Sequence('chaoss_user_chaoss_id_seq', start=1, schema='augur_data'), + Sequence('chaoss_user_chaoss_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.chaoss_user_chaoss_id_seq'::regclass)" @@ -128,7 +128,7 @@ class ContributorAffiliation(Base): ca_id = Column( BigInteger, - Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='augur_data'), + Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.contributor_affiliations_ca_id_seq'::regclass)" @@ -501,7 +501,7 @@ class LstmAnomalyModel(Base): model_id = Column( BigInteger, - Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='augur_data'), + Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.lstm_anomaly_models_model_id_seq'::regclass)" @@ -554,7 +554,7 @@ class RepoGroup(Base): repo_group_id = Column( BigInteger, - Sequence('repo_groups_repo_group_id_seq', start=25430, schema='augur_data'), + Sequence('repo_groups_repo_group_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_groups_repo_group_id_seq'::regclass)" @@ -644,7 +644,7 @@ class TopicWord(Base): topic_words_id = Column( BigInteger, - Sequence('topic_words_topic_words_id_seq', start=1, schema='augur_data'), + Sequence('topic_words_topic_words_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.topic_words_topic_words_id_seq'::regclass)" @@ -688,7 +688,7 @@ class UnresolvedCommitEmail(Base): email_unresolved_id = Column( BigInteger, - Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='augur_data'), + Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='collection_data'), 
primary_key=True, server_default=text( "nextval('augur_data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" @@ -744,7 +744,7 @@ class ContributorRepo(Base): cntrb_repo_id = Column( BigInteger, - Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='augur_data'), + Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.contributor_repo_cntrb_repo_id_seq'::regclass)" @@ -789,7 +789,7 @@ class ContributorsAlias(Base): cntrb_alias_id = Column( BigInteger, - Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='augur_data'), + Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.contributors_aliases_cntrb_alias_id_seq'::regclass)" @@ -845,7 +845,7 @@ class Repo(Base): repo_id = Column( BigInteger, - Sequence('repo_repo_id_seq', start=25480, schema='augur_data'), + Sequence('repo_repo_id_seq', start=25480, schema='collection_data'), primary_key=True, server_default=text("nextval('augur_data.repo_repo_id_seq'::regclass)"), ) @@ -1204,7 +1204,7 @@ class RepoTestCoverage(Base): repo_id = Column( ForeignKey("augur_data.repo.repo_id"), - Sequence('repo_test_coverage_repo_id_seq', start=1, schema='augur_data'), + Sequence('repo_test_coverage_repo_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_test_coverage_repo_id_seq'::regclass)" @@ -1237,7 +1237,7 @@ class RepoGroupInsight(Base): rgi_id = Column( BigInteger, - Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='augur_data'), + Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_group_insights_rgi_id_seq'::regclass)" @@ -1271,7 +1271,7 @@ class RepoGroupsListServe(Base): rgls_id = Column( BigInteger, - 
Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='augur_data'), + Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_groups_list_serve_rgls_id_seq'::regclass)" @@ -1555,7 +1555,7 @@ class LstmAnomalyResult(Base): result_id = Column( BigInteger, - Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='augur_data'), + Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.lstm_anomaly_results_result_id_seq'::regclass)" @@ -1667,7 +1667,7 @@ class MessageAnalysisSummary(Base): msg_summary_id = Column( BigInteger, - Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='augur_data'), + Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.message_analysis_summary_msg_summary_id_seq'::regclass)" @@ -1707,7 +1707,7 @@ class MessageSentimentSummary(Base): msg_summary_id = Column( BigInteger, - Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='augur_data'), + Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.message_sentiment_summary_msg_summary_id_seq'::regclass)" @@ -1754,7 +1754,7 @@ class PullRequest(Base): pull_request_id = Column( BigInteger, - Sequence('pull_requests_pull_request_id_seq', start=25430, schema='augur_data'), + Sequence('pull_requests_pull_request_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_requests_pull_request_id_seq'::regclass)" @@ -1922,7 +1922,7 @@ class RepoBadging(Base): badge_collection_id = Column( BigInteger, - Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='augur_data'), + 
Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_badging_badge_collection_id_seq'::regclass)" @@ -1964,7 +1964,7 @@ class RepoClusterMessage(Base): msg_cluster_id = Column( BigInteger, - Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='augur_data'), + Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" @@ -1995,7 +1995,7 @@ class RepoDependency(Base): repo_dependencies_id = Column( BigInteger, - Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='augur_data'), + Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_dependencies_repo_dependencies_id_seq'::regclass)" @@ -2026,7 +2026,7 @@ class RepoDepsLibyear(Base): repo_deps_libyear_id = Column( BigInteger, - Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='augur_data'), + Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" @@ -2061,7 +2061,7 @@ class RepoDepsScorecard(Base): repo_deps_scorecard_id = Column( BigInteger, - Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='augur_data'), + Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" @@ -2092,7 +2092,7 @@ class RepoInfo(Base): repo_info_id = Column( BigInteger, - Sequence('repo_info_repo_info_id_seq', start=25430, schema='augur_data'), + Sequence('repo_info_repo_info_id_seq', start=25430, 
schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_info_repo_info_id_seq'::regclass)" @@ -2183,7 +2183,7 @@ class RepoInsightsRecord(Base): ri_id = Column( BigInteger, - Sequence('repo_insights_records_ri_id_seq', start=1, schema='augur_data'), + Sequence('repo_insights_records_ri_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_insights_records_ri_id_seq'::regclass)" @@ -2230,7 +2230,7 @@ class RepoLabor(Base): repo_labor_id = Column( BigInteger, - Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='augur_data'), + Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_labor_repo_labor_id_seq'::regclass)" @@ -2289,7 +2289,7 @@ class RepoSbomScan(Base): rsb_id = Column( BigInteger, - Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='augur_data'), + Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_sbom_scans_rsb_id_seq'::regclass)" @@ -2333,7 +2333,7 @@ class RepoTopic(Base): repo_topic_id = Column( BigInteger, - Sequence('repo_topic_repo_topic_id_seq', start=1, schema='augur_data'), + Sequence('repo_topic_repo_topic_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_topic_repo_topic_id_seq'::regclass)" @@ -2361,7 +2361,7 @@ class CommitCommentRef(Base): cmt_comment_id = Column( BigInteger, - Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='augur_data'), + Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.commit_comment_ref_cmt_comment_id_seq'::regclass)" @@ -2423,7 +2423,7 @@ class CommitParent(Base): ) parent_id = Column( ForeignKey("augur_data.commits.cmt_id"), - 
Sequence('commit_parents_parent_id_seq', start=25430, schema='augur_data'), + Sequence('commit_parents_parent_id_seq', start=25430, schema='collection_data'), primary_key=True, nullable=False, server_default=text( @@ -2452,7 +2452,7 @@ class DiscourseInsight(Base): msg_discourse_id = Column( BigInteger, - Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='augur_data'), + Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.discourse_insights_msg_discourse_id_seq1'::regclass)" @@ -2480,7 +2480,7 @@ class IssueAssignee(Base): issue_assignee_id = Column( BigInteger, - Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='augur_data'), + Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_assignees_issue_assignee_id_seq'::regclass)" @@ -2540,7 +2540,7 @@ class IssueEvent(Base): event_id = Column( BigInteger, - Sequence('issue_events_event_id_seq', start=25430, schema='augur_data'), + Sequence('issue_events_event_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_events_event_id_seq'::regclass)" @@ -2625,7 +2625,7 @@ class IssueLabel(Base): issue_label_id = Column( BigInteger, - Sequence('issue_labels_issue_label_id_seq', start=25430, schema='augur_data'), + Sequence('issue_labels_issue_label_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.issue_labels_issue_label_id_seq'::regclass)" @@ -2682,7 +2682,7 @@ class IssueMessageRef(Base): issue_msg_ref_id = Column( BigInteger, - Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='augur_data'), + Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( 
"nextval('augur_data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" @@ -2744,7 +2744,7 @@ class LibraryDependency(Base): lib_dependency_id = Column( BigInteger, - Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='augur_data'), + Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.library_dependencies_lib_dependency_id_seq'::regclass)" @@ -2771,7 +2771,7 @@ class LibraryVersion(Base): library_version_id = Column( BigInteger, - Sequence('library_version_library_version_id_seq', start=25430, schema='augur_data'), + Sequence('library_version_library_version_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.library_version_library_version_id_seq'::regclass)" @@ -2797,7 +2797,7 @@ class MessageAnalysis(Base): msg_analysis_id = Column( BigInteger, - Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='augur_data'), + Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.message_analysis_msg_analysis_id_seq'::regclass)" @@ -2840,7 +2840,7 @@ class MessageSentiment(Base): msg_analysis_id = Column( BigInteger, - Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='augur_data'), + Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.message_sentiment_msg_analysis_id_seq'::regclass)" @@ -2881,7 +2881,7 @@ class PullRequestAnalysis(Base): pull_request_analysis_id = Column( BigInteger, - Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( 
"nextval('augur_data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" @@ -2931,7 +2931,7 @@ class PullRequestAssignee(Base): pr_assignee_map_id = Column( BigInteger, - Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" @@ -2994,7 +2994,7 @@ class PullRequestCommit(Base): pr_cmt_id = Column( BigInteger, - Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_commits_pr_cmt_id_seq'::regclass)" @@ -3049,7 +3049,7 @@ class PullRequestEvent(Base): pr_event_id = Column( BigInteger, - Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_events_pr_event_id_seq'::regclass)" @@ -3149,7 +3149,7 @@ class PullRequestFile(Base): pr_file_id = Column( BigInteger, - Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='augur_data'), + Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_files_pr_file_id_seq'::regclass)" @@ -3201,7 +3201,7 @@ class PullRequestLabel(Base): pr_label_id = Column( BigInteger, - Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_labels_pr_label_id_seq'::regclass)" @@ -3263,7 +3263,7 @@ class 
PullRequestMessageRef(Base): pr_msg_ref_id = Column( BigInteger, - Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" @@ -3316,7 +3316,7 @@ class PullRequestMeta(Base): pr_repo_meta_id = Column( BigInteger, - Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" @@ -3389,7 +3389,7 @@ class PullRequestReviewer(Base): pr_reviewer_map_id = Column( BigInteger, - Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" @@ -3451,7 +3451,7 @@ class PullRequestReview(Base): pr_review_id = Column( BigInteger, - Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_reviews_pr_review_id_seq'::regclass)" @@ -3519,7 +3519,7 @@ class PullRequestTeam(Base): pr_team_id = Column( BigInteger, - Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_teams_pr_team_id_seq'::regclass)" @@ -3563,7 +3563,7 @@ class PullRequestRepo(Base): pr_repo_id = Column( BigInteger, - 
Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_repo_pr_repo_id_seq'::regclass)" @@ -3606,7 +3606,7 @@ class PullRequestReviewMessageRef(Base): pr_review_msg_ref_id = Column( BigInteger, - Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" @@ -3679,7 +3679,7 @@ class RepoClone(Base): repo_clone_data_id = Column( BigInteger, - Sequence('repo_clones_data_id_seq', start=1, schema='augur_data'), + Sequence('repo_clones_data_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( "nextval('augur_data.repo_clones_data_id_seq'::regclass)" From f36619ebcfc2df15bfba4349581f70c7ff58ee1b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:14:19 -0400 Subject: [PATCH 061/165] rename all references (foreign keys, sequences etc) within SQL Signed-off-by: Adrian Edwards --- .../application/db/models/augur_operations.py | 6 +- collectoss/application/db/models/data.py | 322 +++++++++--------- 2 files changed, 164 insertions(+), 164 deletions(-) diff --git a/collectoss/application/db/models/augur_operations.py b/collectoss/application/db/models/augur_operations.py index 5f25a92f4..68aaac06d 100644 --- a/collectoss/application/db/models/augur_operations.py +++ b/collectoss/application/db/models/augur_operations.py @@ -227,7 +227,7 @@ class BadgingDEI(Base): level = Column(String, nullable=False) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False + ForeignKey("collection_data.repo.repo_id", 
name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) repo = relationship("Repo") @@ -749,7 +749,7 @@ class UserRepo(Base): ForeignKey("augur_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False + ForeignKey("collection_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) repo = relationship("Repo", back_populates="user_repo") @@ -1204,7 +1204,7 @@ class CollectionStatus(Base): {"schema": "augur_operations"} ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) + repo_id = Column(ForeignKey("collection_data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) core_data_last_collected = Column(TIMESTAMP) core_status = Column(String, nullable=False, server_default=text("'Pending'")) core_task_id = Column(String) diff --git a/collectoss/application/db/models/data.py b/collectoss/application/db/models/data.py index 650c793cb..4b8d7e5a9 100644 --- a/collectoss/application/db/models/data.py +++ b/collectoss/application/db/models/data.py @@ -72,7 +72,7 @@ class ChaossMetricStatus(Base): Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.chaoss_metric_status_cms_id_seq'::regclass)" + "nextval('collection_data.chaoss_metric_status_cms_id_seq'::regclass)" ), ) cm_group = Column(String) @@ -104,7 +104,7 @@ class ChaossUser(Base): Sequence('chaoss_user_chaoss_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.chaoss_user_chaoss_id_seq'::regclass)" + "nextval('collection_data.chaoss_user_chaoss_id_seq'::regclass)" ), ) chaoss_login_name = Column(String) @@ -131,7 +131,7 @@ class ContributorAffiliation(Base): 
Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.contributor_affiliations_ca_id_seq'::regclass)" + "nextval('collection_data.contributor_affiliations_ca_id_seq'::regclass)" ), ) ca_domain = Column(String(64), nullable=False, unique=True) @@ -504,7 +504,7 @@ class LstmAnomalyModel(Base): Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.lstm_anomaly_models_model_id_seq'::regclass)" + "nextval('collection_data.lstm_anomaly_models_model_id_seq'::regclass)" ), ) model_name = Column(String) @@ -532,7 +532,7 @@ class Platform(Base): BigInteger, Sequence('platform_pltfrm_id_seq', start=25430, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.platform_pltfrm_id_seq'::regclass)"), + server_default=text("nextval('collection_data.platform_pltfrm_id_seq'::regclass)"), ) pltfrm_name = Column(String) pltfrm_version = Column(String) @@ -557,7 +557,7 @@ class RepoGroup(Base): Sequence('repo_groups_repo_group_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_groups_repo_group_id_seq'::regclass)" + "nextval('collection_data.repo_groups_repo_group_id_seq'::regclass)" ), ) rg_name = Column(String, nullable=False) @@ -647,7 +647,7 @@ class TopicWord(Base): Sequence('topic_words_topic_words_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.topic_words_topic_words_id_seq'::regclass)" + "nextval('collection_data.topic_words_topic_words_id_seq'::regclass)" ), ) topic_id = Column(BigInteger) @@ -691,7 +691,7 @@ class UnresolvedCommitEmail(Base): Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - 
"nextval('augur_data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" + "nextval('collection_data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" ), ) email = Column(String, nullable=False, unique=True) @@ -712,7 +712,7 @@ class UtilityLog(Base): BigInteger, Sequence('utility_log_id_seq1', start=1, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.utility_log_id_seq1'::regclass)"), + server_default=text("nextval('collection_data.utility_log_id_seq1'::regclass)"), ) level = Column(String(8), nullable=False) status = Column(String, nullable=False) @@ -747,12 +747,12 @@ class ContributorRepo(Base): Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.contributor_repo_cntrb_repo_id_seq'::regclass)" + "nextval('collection_data.contributor_repo_cntrb_repo_id_seq'::regclass)" ), ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, comment="This is not null because what is the point without the contributor in this table? 
", @@ -792,12 +792,12 @@ class ContributorsAlias(Base): Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.contributors_aliases_cntrb_alias_id_seq'::regclass)" + "nextval('collection_data.contributors_aliases_cntrb_alias_id_seq'::regclass)" ), ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", + "collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -847,10 +847,10 @@ class Repo(Base): BigInteger, Sequence('repo_repo_id_seq', start=25480, schema='collection_data'), primary_key=True, - server_default=text("nextval('augur_data.repo_repo_id_seq'::regclass)"), + server_default=text("nextval('collection_data.repo_repo_id_seq'::regclass)"), ) repo_group_id = Column( - ForeignKey("augur_data.repo_groups.repo_group_id"), nullable=False + ForeignKey("collection_data.repo_groups.repo_group_id"), nullable=False ) repo_git = Column(String, nullable=False) @@ -1194,7 +1194,7 @@ class HistoricalRepoURLs(Base): __tablename__ = "historical_repo_urls" __table_args__ = {"schema": "collection_data"} - repo_id = Column(ForeignKey("augur_data.repo.repo_id"), primary_key=True) + repo_id = Column(ForeignKey("collection_data.repo.repo_id"), primary_key=True) git_url = Column(String, primary_key=True) date_collected = Column(DateTime(timezone=True), server_default=func.now(), nullable=True) @@ -1203,11 +1203,11 @@ class RepoTestCoverage(Base): __table_args__ = {"schema": "collection_data"} repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), + ForeignKey("collection_data.repo.repo_id"), Sequence('repo_test_coverage_repo_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_test_coverage_repo_id_seq'::regclass)" + "nextval('collection_data.repo_test_coverage_repo_id_seq'::regclass)" ), ) repo_clone_date = Column(TIMESTAMP(precision=0)) @@ -1240,10 +1240,10 @@ class 
RepoGroupInsight(Base): Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_group_insights_rgi_id_seq'::regclass)" + "nextval('collection_data.repo_group_insights_rgi_id_seq'::regclass)" ), ) - repo_group_id = Column(ForeignKey("augur_data.repo_groups.repo_group_id")) + repo_group_id = Column(ForeignKey("collection_data.repo_groups.repo_group_id")) rgi_metric = Column(String) rgi_value = Column(String) cms_id = Column(BigInteger) @@ -1274,11 +1274,11 @@ class RepoGroupsListServe(Base): Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_groups_list_serve_rgls_id_seq'::regclass)" + "nextval('collection_data.repo_groups_list_serve_rgls_id_seq'::regclass)" ), ) repo_group_id = Column( - ForeignKey("augur_data.repo_groups.repo_group_id"), nullable=False + ForeignKey("collection_data.repo_groups.repo_group_id"), nullable=False ) rgls_name = Column(String) rgls_description = Column(String(3000)) @@ -1328,10 +1328,10 @@ class Commit(Base): BigInteger, Sequence('commits_cmt_id_seq', start=25430, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), + server_default=text("nextval('collection_data.commits_cmt_id_seq'::regclass)"), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=False, ) cmt_commit_hash = Column(String(80), nullable=False) @@ -1355,7 +1355,7 @@ class Commit(Base): cmt_filename = Column(String, nullable=False) cmt_date_attempted = Column(TIMESTAMP(precision=0), nullable=False) cmt_ght_author_id = Column(ForeignKey( - "augur_data.contributors.cntrb_id", + "collection_data.contributors.cntrb_id", name="cmt_ght_author_cntrb_id_fk", onupdate="CASCADE", 
ondelete="RESTRICT", @@ -1368,7 +1368,7 @@ class Commit(Base): cmt_author_timestamp = Column(TIMESTAMP(True, 0)) cmt_author_platform_username = Column( ForeignKey( - "augur_data.contributors.cntrb_login", + "collection_data.contributors.cntrb_login", name="fk_commits_contributors_3", ondelete="CASCADE", onupdate="CASCADE", @@ -1376,7 +1376,7 @@ class Commit(Base): deferrable=True, ), ForeignKey( - "augur_data.contributors.cntrb_login", + "collection_data.contributors.cntrb_login", name="fk_commits_contributors_4", ondelete="CASCADE", onupdate="CASCADE", @@ -1413,11 +1413,11 @@ class CommitMessage(Base): BigInteger, Sequence('commits_cmt_id_seq', start=25430, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), + server_default=text("nextval('collection_data.commits_cmt_id_seq'::regclass)"), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=False, ) @@ -1449,13 +1449,13 @@ class Issue(Base): BigInteger, Sequence('issue_seq', start=31000, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.issue_seq'::regclass)"), + server_default=text("nextval('collection_data.issue_seq'::regclass)"), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), + ForeignKey("collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), ) reporter_id = Column( - ForeignKey("augur_data.contributors.cntrb_id"), + ForeignKey("collection_data.contributors.cntrb_id"), comment="The ID of the person who opened the issue. 
", ) pull_request = Column(BigInteger) @@ -1464,7 +1464,7 @@ class Issue(Base): issue_title = Column(String) issue_body = Column(String) cntrb_id = Column( - ForeignKey("augur_data.contributors.cntrb_id"), + ForeignKey("collection_data.contributors.cntrb_id"), comment="The ID of the person who closed the issue. ", ) comment_count = Column(BigInteger) @@ -1515,9 +1515,9 @@ class Library(Base): BigInteger, Sequence('libraries_library_id_seq', start=25430, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.libraries_library_id_seq'::regclass)"), + server_default=text("nextval('collection_data.libraries_library_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) platform = Column(String) name = Column(String) created_timestamp = Column( @@ -1558,12 +1558,12 @@ class LstmAnomalyResult(Base): Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.lstm_anomaly_results_result_id_seq'::regclass)" + "nextval('collection_data.lstm_anomaly_results_result_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) repo_category = Column(String) - model_id = Column(ForeignKey("augur_data.lstm_anomaly_models.model_id")) + model_id = Column(ForeignKey("collection_data.lstm_anomaly_models.model_id")) metric = Column(String) contamination_factor = Column(Float(53)) mean_absolute_error = Column(Float(53)) @@ -1599,11 +1599,11 @@ class Message(Base): BigInteger, Sequence('message_msg_id_seq', start=25430, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.message_msg_id_seq'::regclass)"), + server_default=text("nextval('collection_data.message_msg_id_seq'::regclass)"), ) rgls_id = Column( ForeignKey( - "augur_data.repo_groups_list_serve.rgls_id", + 
"collection_data.repo_groups_list_serve.rgls_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -1612,7 +1612,7 @@ class Message(Base): platform_node_id = Column(String) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -1621,7 +1621,7 @@ class Message(Base): ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ), comment="Not populated for mailing lists. Populated for GitHub issues. ", ) @@ -1631,7 +1631,7 @@ class Message(Base): msg_header = Column(String) pltfrm_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", ondelete="CASCADE", onupdate="CASCADE" + "collection_data.platform.pltfrm_id", ondelete="CASCADE", onupdate="CASCADE" ), nullable=False, ) @@ -1670,10 +1670,10 @@ class MessageAnalysisSummary(Base): Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.message_analysis_summary_msg_summary_id_seq'::regclass)" + "nextval('collection_data.message_analysis_summary_msg_summary_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) worker_run_id = Column( BigInteger, comment='This value should reflect the worker_run_id for the messages summarized in the table. There is not a relation between these two tables for that purpose because its not *really*, relationaly a concept unless we create a third table for "worker_run_id", which we determined was unnecessarily complex. 
', @@ -1710,10 +1710,10 @@ class MessageSentimentSummary(Base): Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.message_sentiment_summary_msg_summary_id_seq'::regclass)" + "nextval('collection_data.message_sentiment_summary_msg_summary_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) worker_run_id = Column( BigInteger, comment='This value should reflect the worker_run_id for the messages summarized in the table. There is not a relation between these two tables for that purpose because its not *really*, relationaly a concept unless we create a third table for "worker_run_id", which we determined was unnecessarily complex. ', @@ -1757,11 +1757,11 @@ class PullRequest(Base): Sequence('pull_requests_pull_request_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_requests_pull_request_id_seq'::regclass)" + "nextval('collection_data.pull_requests_pull_request_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), + ForeignKey("collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), server_default=text("0"), ) pr_url = Column(String) @@ -1784,7 +1784,7 @@ class PullRequest(Base): pr_src_title = Column(String) pr_augur_contributor_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), comment="This is to link to the contributor record. 
", ) @@ -1889,9 +1889,9 @@ class Release(Base): CHAR(256), Sequence('releases_release_id_seq', start=1, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.releases_release_id_seq'::regclass)"), + server_default=text("nextval('collection_data.releases_release_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id"), nullable=False) + repo_id = Column(ForeignKey("collection_data.repo.repo_id"), nullable=False) release_name = Column(String) release_description = Column(String) release_author = Column(String) @@ -1925,10 +1925,10 @@ class RepoBadging(Base): Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_badging_badge_collection_id_seq'::regclass)" + "nextval('collection_data.repo_badging_badge_collection_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) created_at = Column( TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) @@ -1967,10 +1967,10 @@ class RepoClusterMessage(Base): Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" + "nextval('collection_data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) cluster_content = Column(Integer) cluster_mechanism = Column(Integer) tool_source = Column(String) @@ -1998,11 +1998,11 @@ class RepoDependency(Base): Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_dependencies_repo_dependencies_id_seq'::regclass)" + 
"nextval('collection_data.repo_dependencies_repo_dependencies_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), comment="Forign key for repo id. " + ForeignKey("collection_data.repo.repo_id"), comment="Forign key for repo id. " ) dep_name = Column(String, comment="Name of the dependancy found in project. ") dep_count = Column(Integer, comment="Number of times the dependancy was found. ") @@ -2029,10 +2029,10 @@ class RepoDepsLibyear(Base): Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" + "nextval('collection_data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) name = Column(String) requirement = Column(String) type = Column(String) @@ -2064,10 +2064,10 @@ class RepoDepsScorecard(Base): Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" + "nextval('collection_data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) name = Column(String) #status = Column(String) scorecard_check_details = Column(JSONB) @@ -2095,10 +2095,10 @@ class RepoInfo(Base): Sequence('repo_info_repo_info_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_info_repo_info_id_seq'::regclass)" + "nextval('collection_data.repo_info_repo_info_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id"), nullable=False) + repo_id = Column(ForeignKey("collection_data.repo.repo_id"), nullable=False) 
last_updated = Column( TIMESTAMP(precision=0), server_default=text("NULL::timestamp without time zone") ) @@ -2151,9 +2151,9 @@ class RepoInsight(Base): BigInteger, Sequence('repo_insights_ri_id_seq', start=25430, schema="collection_data"), primary_key=True, - server_default=text("nextval('augur_data.repo_insights_ri_id_seq'::regclass)"), + server_default=text("nextval('collection_data.repo_insights_ri_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) ri_metric = Column(String) ri_value = Column(String) ri_date = Column(TIMESTAMP(precision=0)) @@ -2186,12 +2186,12 @@ class RepoInsightsRecord(Base): Sequence('repo_insights_records_ri_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_insights_records_ri_id_seq'::regclass)" + "nextval('collection_data.repo_insights_records_ri_id_seq'::regclass)" ), comment="Primary key. ", ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="SET NULL", onupdate="CASCADE"), + ForeignKey("collection_data.repo.repo_id", ondelete="SET NULL", onupdate="CASCADE"), comment="Refers to repo table primary key. 
Will have a foreign key", ) ri_metric = Column(String, comment="The metric endpoint") @@ -2233,10 +2233,10 @@ class RepoLabor(Base): Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_labor_repo_labor_id_seq'::regclass)" + "nextval('collection_data.repo_labor_repo_labor_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) repo_clone_date = Column(TIMESTAMP(precision=0)) rl_analysis_date = Column(TIMESTAMP(precision=0)) programming_language = Column(String) @@ -2264,14 +2264,14 @@ class RepoMeta(Base): __table_args__ = {"schema": "collection_data", "comment": "Project Languages"} repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), primary_key=True, nullable=False + ForeignKey("collection_data.repo.repo_id"), primary_key=True, nullable=False ) rmeta_id = Column( BigInteger, Sequence('repo_meta_rmeta_id_seq', start=25430, schema="collection_data"), primary_key=True, nullable=False, - server_default=text("nextval('augur_data.repo_meta_rmeta_id_seq'::regclass)"), + server_default=text("nextval('collection_data.repo_meta_rmeta_id_seq'::regclass)"), ) rmeta_name = Column(String) rmeta_value = Column(String, server_default=text("0")) @@ -2292,11 +2292,11 @@ class RepoSbomScan(Base): Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_sbom_scans_rsb_id_seq'::regclass)" + "nextval('collection_data.repo_sbom_scans_rsb_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE") ) sbom_scan = Column(JSON) @@ -2308,14 +2308,14 @@ class RepoStat(Base): __table_args__ = {"schema": "collection_data", "comment": "Project Watchers"} repo_id = Column( - 
ForeignKey("augur_data.repo.repo_id"), primary_key=True, nullable=False + ForeignKey("collection_data.repo.repo_id"), primary_key=True, nullable=False ) rstat_id = Column( BigInteger, Sequence('repo_stats_rstat_id_seq', start=25430, schema="collection_data"), primary_key=True, nullable=False, - server_default=text("nextval('augur_data.repo_stats_rstat_id_seq'::regclass)"), + server_default=text("nextval('collection_data.repo_stats_rstat_id_seq'::regclass)"), ) rstat_name = Column(String(400)) rstat_value = Column(BigInteger) @@ -2336,10 +2336,10 @@ class RepoTopic(Base): Sequence('repo_topic_repo_topic_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_topic_repo_topic_id_seq'::regclass)" + "nextval('collection_data.repo_topic_repo_topic_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("collection_data.repo.repo_id")) topic_id = Column(Integer) topic_prob = Column(Float(53)) tool_source = Column(String) @@ -2364,19 +2364,19 @@ class CommitCommentRef(Base): Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.commit_comment_ref_cmt_comment_id_seq'::regclass)" + "nextval('collection_data.commit_comment_ref_cmt_comment_id_seq'::regclass)" ), ) cmt_id = Column( ForeignKey( - "augur_data.commits.cmt_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.commits.cmt_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) repo_id = Column(BigInteger) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -2417,17 +2417,17 @@ class CommitParent(Base): ) cmt_id = Column( - ForeignKey("augur_data.commits.cmt_id"), + ForeignKey("collection_data.commits.cmt_id"), primary_key=True, nullable=False, 
) parent_id = Column( - ForeignKey("augur_data.commits.cmt_id"), + ForeignKey("collection_data.commits.cmt_id"), Sequence('commit_parents_parent_id_seq', start=25430, schema='collection_data'), primary_key=True, nullable=False, server_default=text( - "nextval('augur_data.commit_parents_parent_id_seq'::regclass)" + "nextval('collection_data.commit_parents_parent_id_seq'::regclass)" ), ) tool_source = Column(String) @@ -2455,10 +2455,10 @@ class DiscourseInsight(Base): Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.discourse_insights_msg_discourse_id_seq1'::regclass)" + "nextval('collection_data.discourse_insights_msg_discourse_id_seq1'::regclass)" ), ) - msg_id = Column(ForeignKey("augur_data.message.msg_id")) + msg_id = Column(ForeignKey("collection_data.message.msg_id")) discourse_act = Column(String) tool_source = Column(String) tool_version = Column(String) @@ -2483,14 +2483,14 @@ class IssueAssignee(Base): Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_assignees_issue_assignee_id_seq'::regclass)" + "nextval('collection_data.issue_assignees_issue_assignee_id_seq'::regclass)" ), ) - issue_id = Column(ForeignKey("augur_data.issues.issue_id")) + issue_id = Column(ForeignKey("collection_data.issues.issue_id")) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) - cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + cntrb_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) issue_assignee_src_id = Column( BigInteger, comment="This ID comes from the source. In the case of GitHub, it is the id that is the first field returned from the issue events API in the issue_assignees embedded JSON object. 
We may discover it is an ID for the person themselves; but my hypothesis is that its not.", @@ -2543,21 +2543,21 @@ class IssueEvent(Base): Sequence('issue_events_event_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_events_event_id_seq'::regclass)" + "nextval('collection_data.issue_events_event_id_seq'::regclass)" ), ) issue_id = Column( ForeignKey( - "augur_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE" + "collection_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE" ), nullable=False, ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ) ) action = Column(String, nullable=False) @@ -2572,7 +2572,7 @@ class IssueEvent(Base): node_url = Column(String) platform_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -2628,14 +2628,14 @@ class IssueLabel(Base): Sequence('issue_labels_issue_label_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_labels_issue_label_id_seq'::regclass)" + "nextval('collection_data.issue_labels_issue_label_id_seq'::regclass)" ), ) issue_id = Column( - ForeignKey("augur_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE") + ForeignKey("collection_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE") ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) 
label_text = Column(String) label_description = Column(String) @@ -2685,12 +2685,12 @@ class IssueMessageRef(Base): Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" + "nextval('collection_data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" ), ) issue_id = Column( ForeignKey( - "augur_data.issues.issue_id", + "collection_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -2699,7 +2699,7 @@ class IssueMessageRef(Base): ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -2708,7 +2708,7 @@ class IssueMessageRef(Base): ) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", + "collection_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -2747,10 +2747,10 @@ class LibraryDependency(Base): Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.library_dependencies_lib_dependency_id_seq'::regclass)" + "nextval('collection_data.library_dependencies_lib_dependency_id_seq'::regclass)" ), ) - library_id = Column(ForeignKey("augur_data.libraries.library_id")) + library_id = Column(ForeignKey("collection_data.libraries.library_id")) manifest_platform = Column(String) manifest_filepath = Column( String(1000), server_default=text("NULL::character varying") @@ -2774,10 +2774,10 @@ class LibraryVersion(Base): Sequence('library_version_library_version_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.library_version_library_version_id_seq'::regclass)" + "nextval('collection_data.library_version_library_version_id_seq'::regclass)" ), ) - library_id = 
Column(ForeignKey("augur_data.libraries.library_id")) + library_id = Column(ForeignKey("collection_data.libraries.library_id")) library_platform = Column(String) version_number = Column(String) version_release_date = Column( @@ -2800,10 +2800,10 @@ class MessageAnalysis(Base): Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.message_analysis_msg_analysis_id_seq'::regclass)" + "nextval('collection_data.message_analysis_msg_analysis_id_seq'::regclass)" ), ) - msg_id = Column(ForeignKey("augur_data.message.msg_id")) + msg_id = Column(ForeignKey("collection_data.message.msg_id")) worker_run_id = Column( BigInteger, comment="This column is used to indicate analyses run by a worker during the same execution period, and is useful for grouping, and time series analysis. ", @@ -2843,10 +2843,10 @@ class MessageSentiment(Base): Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.message_sentiment_msg_analysis_id_seq'::regclass)" + "nextval('collection_data.message_sentiment_msg_analysis_id_seq'::regclass)" ), ) - msg_id = Column(ForeignKey("augur_data.message.msg_id")) + msg_id = Column(ForeignKey("collection_data.message.msg_id")) worker_run_id = Column( BigInteger, comment="This column is used to indicate analyses run by a worker during the same execution period, and is useful for grouping, and time series analysis. 
", @@ -2884,12 +2884,12 @@ class PullRequestAnalysis(Base): Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" + "nextval('collection_data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), @@ -2934,26 +2934,26 @@ class PullRequestAssignee(Base): Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" + "nextval('collection_data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, initially="DEFERRED", ) ) - contrib_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + contrib_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) pr_assignee_src_id = Column(BigInteger) tool_source = Column(String) tool_version = Column(String) @@ -2997,18 +2997,18 @@ class PullRequestCommit(Base): Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_commits_pr_cmt_id_seq'::regclass)" + "nextval('collection_data.pull_request_commits_pr_cmt_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", 
ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) pr_cmt_sha = Column( String, @@ -3019,7 +3019,7 @@ class PullRequestCommit(Base): pr_cmt_comments_url = Column(String) pr_cmt_author_cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ) ) pr_cmt_timestamp = Column(TIMESTAMP(precision=0)) @@ -3052,12 +3052,12 @@ class PullRequestEvent(Base): Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_events_pr_event_id_seq'::regclass)" + "nextval('collection_data.pull_request_events_pr_event_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), @@ -3065,7 +3065,7 @@ class PullRequestEvent(Base): ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="RESTRICT", deferrable=True, @@ -3073,7 +3073,7 @@ class PullRequestEvent(Base): ) ) cntrb_id = Column( - ForeignKey("augur_data.contributors.cntrb_id") + ForeignKey("collection_data.contributors.cntrb_id") ) action = Column(String, nullable=False) action_commit_hash = Column(String) @@ -3091,7 +3091,7 @@ class PullRequestEvent(Base): node_url = Column(String) platform_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", + "collection_data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="RESTRICT", deferrable=True, @@ -3152,19 +3152,19 @@ class PullRequestFile(Base): Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='collection_data'), primary_key=True, 
server_default=text( - "nextval('augur_data.pull_request_files_pr_file_id_seq'::regclass)" + "nextval('collection_data.pull_request_files_pr_file_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3204,18 +3204,18 @@ class PullRequestLabel(Base): Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_labels_pr_label_id_seq'::regclass)" + "nextval('collection_data.pull_request_labels_pr_label_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) pr_src_id = Column(BigInteger) pr_src_node_id = Column(String) @@ -3266,12 +3266,12 @@ class PullRequestMessageRef(Base): Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" + "nextval('collection_data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -3279,11 +3279,11 @@ class PullRequestMessageRef(Base): ) ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + 
ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", + "collection_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3319,19 +3319,19 @@ class PullRequestMeta(Base): Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" + "nextval('collection_data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3348,7 +3348,7 @@ class PullRequestMeta(Base): ) pr_src_meta_ref = Column(String) pr_sha = Column(String) - cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + cntrb_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) tool_source = Column(String) tool_version = Column(String) data_source = Column(String) @@ -3392,12 +3392,12 @@ class PullRequestReviewer(Base): Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" + "nextval('collection_data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3409,7 +3409,7 @@ class PullRequestReviewer(Base): repo_id = Column(BigInteger) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + 
"collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ), ) pr_reviewer_src_id = Column( @@ -3454,23 +3454,23 @@ class PullRequestReview(Base): Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_reviews_pr_review_id_seq'::regclass)" + "nextval('collection_data.pull_request_reviews_pr_review_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), nullable=False, ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -3485,7 +3485,7 @@ class PullRequestReview(Base): pr_review_commit_id = Column(String) platform_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", + "collection_data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3522,12 +3522,12 @@ class PullRequestTeam(Base): Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_teams_pr_team_id_seq'::regclass)" + "nextval('collection_data.pull_request_teams_pr_team_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "collection_data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3566,12 +3566,12 @@ class PullRequestRepo(Base): Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='collection_data'), primary_key=True, server_default=text( - 
"nextval('augur_data.pull_request_repo_pr_repo_id_seq'::regclass)" + "nextval('collection_data.pull_request_repo_pr_repo_id_seq'::regclass)" ), ) pr_repo_meta_id = Column( ForeignKey( - "augur_data.pull_request_meta.pr_repo_meta_id", + "collection_data.pull_request_meta.pr_repo_meta_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3585,7 +3585,7 @@ class PullRequestRepo(Base): pr_repo_name = Column(String) pr_repo_full_name = Column(String) pr_repo_private_bool = Column(Boolean) - pr_cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + pr_cntrb_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) tool_source = Column(String) tool_version = Column(String) data_source = Column(String) @@ -3609,12 +3609,12 @@ class PullRequestReviewMessageRef(Base): Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" + "nextval('collection_data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" ), ) pr_review_id = Column( ForeignKey( - "augur_data.pull_request_reviews.pr_review_id", + "collection_data.pull_request_reviews.pr_review_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3624,7 +3624,7 @@ class PullRequestReviewMessageRef(Base): ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3633,7 +3633,7 @@ class PullRequestReviewMessageRef(Base): ) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", + "collection_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3682,12 +3682,12 @@ class RepoClone(Base): Sequence('repo_clones_data_id_seq', start=1, schema='collection_data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_clones_data_id_seq'::regclass)" + 
"nextval('collection_data.repo_clones_data_id_seq'::regclass)" ), ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3713,7 +3713,7 @@ class TopicModelMeta(Base): comment="Unique identifier for the topic model" ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), + ForeignKey("collection_data.repo.repo_id"), comment="Repository this model was trained on" ) model_method = Column( @@ -3827,14 +3827,14 @@ class TopicModelEvent(Base): ) repo_id = Column( Integer, - ForeignKey("augur_data.repo.repo_id", name="fk_tme_repo_id"), + ForeignKey("collection_data.repo.repo_id", name="fk_tme_repo_id"), nullable=True, comment="Repository associated with this event" ) model_id = Column( UUID(as_uuid=True), ForeignKey( - "augur_data.topic_model_meta.model_id", + "collection_data.topic_model_meta.model_id", name="fk_tme_model_id", ondelete="SET NULL" ), From 7f280a00fcc1897e6f89135385358a720c5697f7 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:17:28 -0400 Subject: [PATCH 062/165] rename all references to augur_data in the rest of the codebase Signed-off-by: Adrian Edwards --- collectoss/api/metrics/commit.py | 2 +- collectoss/api/metrics/deps.py | 32 ++-- collectoss/api/metrics/message.py | 20 +-- collectoss/api/metrics/pull_request.py | 6 +- collectoss/api/metrics/repo_meta.py | 12 +- collectoss/api/metrics/toss.py | 2 +- collectoss/api/routes/collection_status.py | 18 +-- collectoss/api/routes/complexity.py | 144 +++++++++--------- collectoss/api/routes/metadata.py | 2 +- collectoss/application/cli/backend.py | 2 +- collectoss/application/cli/collection.py | 2 +- collectoss/application/cli/db.py | 4 +- .../data_analysis/clustering_worker/tasks.py | 26 ++-- .../data_analysis/discourse_analysis/tasks.py | 8 +- .../data_analysis/message_insights/tasks.py | 54 +++---- .../pull_request_analysis_worker/tasks.py | 22 +-- 
.../tasks/db/refresh_materialized_views.py | 28 ++-- .../tasks/github/facade_github/tasks.py | 12 +- collectoss/tasks/util/collection_util.py | 2 +- collectoss/util/repo_load_controller.py | 4 +- conftest.py | 2 +- .../test_application/test_db/test_session.py | 24 +-- .../test_github_tasks/test_pull_requests.py | 10 +- .../test_endpoints.py | 6 +- 24 files changed, 222 insertions(+), 222 deletions(-) diff --git a/collectoss/api/metrics/commit.py b/collectoss/api/metrics/commit.py index de2c84809..3b55a1520 100644 --- a/collectoss/api/metrics/commit.py +++ b/collectoss/api/metrics/commit.py @@ -231,7 +231,7 @@ def annual_commit_count_ranked_by_repo_in_repo_group(repo_group_id, repo_id=None if timeframe == 'all': cdRgTpRankedCommitsSQL = s.sql.text(""" SELECT repo.repo_id, repo_name as name, SUM(added - removed - whitespace) as net, patches - FROM augur_data.dm_repo_annual, repo, repo_groups + FROM collection_data.dm_repo_annual, repo, repo_groups WHERE repo.repo_group_id = :repo_group_id AND repo.repo_group_id = repo_groups.repo_group_id AND dm_repo_annual.repo_id = repo.repo_id diff --git a/collectoss/api/metrics/deps.py b/collectoss/api/metrics/deps.py index ef13aee7d..61f34092f 100644 --- a/collectoss/api/metrics/deps.py +++ b/collectoss/api/metrics/deps.py @@ -33,13 +33,13 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No depsSQL = s.sql.text(""" SELECT - augur_data.repo_dependencies.*, - augur_data.repo_groups.repo_group_id + collection_data.repo_dependencies.*, + collection_data.repo_groups.repo_group_id FROM - augur_data.repo_dependencies, - augur_data.repo_groups, - augur_data.repo, - ( SELECT MAX ( date_trunc( 'day', augur_data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies WHERE repo_id = repo_id ) C + collection_data.repo_dependencies, + collection_data.repo_groups, + collection_data.repo, + ( SELECT MAX ( date_trunc( 'day', collection_data.repo_dependencies.data_collection_date 
) ) AS data_collection_date FROM repo_dependencies WHERE repo_id = repo_id ) C WHERE repo_dependencies.repo_id = repo.repo_id AND repo.repo_group_id = repo_groups.repo_group_id @@ -54,13 +54,13 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No depsSQL = s.sql.text(""" SELECT - augur_data.repo_dependencies.*, - augur_data.repo_groups.repo_group_id + collection_data.repo_dependencies.*, + collection_data.repo_groups.repo_group_id FROM - augur_data.repo_dependencies, - augur_data.repo_groups, - augur_data.repo, - ( SELECT MAX ( date_trunc( 'day', augur_data.repo_dependencies.data_collection_date ) ) AS data_collection_date + collection_data.repo_dependencies, + collection_data.repo_groups, + collection_data.repo, + ( SELECT MAX ( date_trunc( 'day', collection_data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies, repo, repo_groups WHERE repo.repo_group_id = repo_groups.repo_group_id and repo_dependencies.repo_id = repo.repo_id and @@ -134,8 +134,8 @@ def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date f.libyear, f.data_collection_date FROM - ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, - augur_data.repo_deps_libyear f + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM collection_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, + collection_data.repo_deps_libyear f WHERE e.data_collection_date = f.data_collection_date and e.repo_id = f.repo_id @@ -203,8 +203,8 @@ def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date f.libyear, f.data_collection_date FROM - ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, - augur_data.repo_deps_libyear 
f + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM collection_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, + collection_data.repo_deps_libyear f WHERE e.data_collection_date = f.data_collection_date and e.repo_id = f.repo_id diff --git a/collectoss/api/metrics/message.py b/collectoss/api/metrics/message.py index f76aabd28..26ce99cf3 100644 --- a/collectoss/api/metrics/message.py +++ b/collectoss/api/metrics/message.py @@ -40,12 +40,12 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en COUNT ( * ), repo_name FROM - augur_data.repo, - augur_data.message + collection_data.repo, + collection_data.message WHERE - augur_data.repo.repo_id = augur_data.message.repo_id + collection_data.repo.repo_id = collection_data.message.repo_id AND - augur_data.repo.repo_id = :repo_id + collection_data.repo.repo_id = :repo_id AND message.msg_timestamp BETWEEN :begin_date AND :end_date GROUP BY @@ -69,14 +69,14 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en COUNT ( * ), rg_name FROM - augur_data.repo, - augur_data.repo_groups, - augur_data.message + collection_data.repo, + collection_data.repo_groups, + collection_data.message WHERE - augur_data.repo.repo_id = augur_data.message.repo_id - AND augur_data.repo_groups.repo_group_id = repo.repo_group_id + collection_data.repo.repo_id = collection_data.message.repo_id + AND collection_data.repo_groups.repo_group_id = repo.repo_group_id AND - augur_data.repo_groups.repo_group_id = :repo_group_id + collection_data.repo_groups.repo_group_id = :repo_group_id AND message.msg_timestamp BETWEEN :begin_date AND :end_date GROUP BY diff --git a/collectoss/api/metrics/pull_request.py b/collectoss/api/metrics/pull_request.py index 20d6be893..8516ec999 100644 --- a/collectoss/api/metrics/pull_request.py +++ b/collectoss/api/metrics/pull_request.py @@ -787,8 +787,8 @@ def pull_request_average_commit_counts(repo_group_id, 
repo_id=None, group_by='mo pr_merged_at, pr_closed_at, pr_created_at - FROM augur_data.pull_request_commits, augur_data.pull_request_meta,augur_data.repo_groups, - augur_data.pull_requests JOIN repo ON pull_requests.repo_id = repo.repo_id + FROM collection_data.pull_request_commits, collection_data.pull_request_meta,collection_data.repo_groups, + collection_data.pull_requests JOIN repo ON pull_requests.repo_id = repo.repo_id WHERE pull_requests.repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) AND pull_requests.pull_request_id = pull_request_commits.pull_request_id @@ -821,7 +821,7 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo pr_merged_at, pr_closed_at, pr_created_at - FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + FROM collection_data.pull_request_commits, collection_data.pull_requests, collection_data.pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id AND pull_requests.repo_id = :repo_id diff --git a/collectoss/api/metrics/repo_meta.py b/collectoss/api/metrics/repo_meta.py index 7c4129081..a609066ab 100644 --- a/collectoss/api/metrics/repo_meta.py +++ b/collectoss/api/metrics/repo_meta.py @@ -190,7 +190,7 @@ def sbom_download(repo_group_id, repo_id=None): :return: dosocs sbom """ dosocs_SQL = s.sql.text(""" - select * from augur_data.repo_sbom_scans + select * from collection_data.repo_sbom_scans where repo_id = :repo_id; """) @@ -313,7 +313,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): if not repo_id: cii_best_practices_badge_SQL = s.sql.text(""" SELECT data - FROM augur_data.repo_badging + FROM collection_data.repo_badging WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) ORDER BY created_at DESC LIMIT 1 @@ -321,7 +321,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): else: 
cii_best_practices_badge_SQL = s.sql.text(""" SELECT data - FROM augur_data.repo_badging + FROM collection_data.repo_badging WHERE repo_id = :repo_id ORDER BY created_at DESC LIMIT 1 @@ -1270,7 +1270,7 @@ def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): clone_data_timestamp AS date, count_clones AS total_clones, unique_clones - FROM augur_data.repo_clones_data + FROM collection_data.repo_clones_data WHERE repo_id = :repo_id AND clone_data_timestamp BETWEEN :begin_date AND :end_date ORDER BY clone_data_timestamp @@ -1289,9 +1289,9 @@ def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): clone_data_timestamp AS date, count_clones AS total_clones, unique_clones - FROM augur_data.repo_clones_data + FROM collection_data.repo_clones_data WHERE repo_id IN ( - SELECT repo_id FROM augur_data.repo WHERE repo_group_id = :repo_group_id + SELECT repo_id FROM collection_data.repo WHERE repo_group_id = :repo_group_id ) AND clone_data_timestamp BETWEEN :begin_date AND :end_date ORDER BY repo_id, clone_data_timestamp diff --git a/collectoss/api/metrics/toss.py b/collectoss/api/metrics/toss.py index 698b4cf31..69597da66 100644 --- a/collectoss/api/metrics/toss.py +++ b/collectoss/api/metrics/toss.py @@ -114,7 +114,7 @@ def toss_repo_info(repo_id): repo_info.default_branch, repo.repo_git FROM - augur_data.repo_info + collection_data.repo_info JOIN repo ON repo.repo_id = repo_info.repo_id WHERE repo_info.repo_id = :repo_id diff --git a/collectoss/api/routes/collection_status.py b/collectoss/api/routes/collection_status.py index eaa374f4c..ba8373440 100644 --- a/collectoss/api/routes/collection_status.py +++ b/collectoss/api/routes/collection_status.py @@ -61,10 +61,10 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? 
( CAST (( COUNT ( * )) +1 AS DOUBLE PRECISION ) / CAST ( b.issues_count + 1 AS DOUBLE PRECISION )) AS ratio_issues FROM - augur_data.repo A, - augur_data.issues d, - augur_data.repo_info b, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, + collection_data.repo A, + collection_data.issues d, + collection_data.repo_info b, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM collection_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, ( SELECT repo_id, MAX ( data_collection_date ) AS most_recently_collected_issue FROM issues GROUP BY repo_id ORDER BY repo_id ) f WHERE A.repo_id = b.repo_id @@ -135,11 +135,11 @@ def pull_request_collection_status(): # TODO: make this name automatic - wrappe ABS ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_abs, ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_issues FROM - augur_data.repo A, - augur_data.pull_requests d, - augur_data.repo_info b, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM augur_data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f + collection_data.repo A, + collection_data.pull_requests d, + collection_data.repo_info b, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM collection_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM collection_data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f WHERE A.repo_id = b.repo_id AND LOWER ( A.repo_git ) LIKE'%github.com%' diff --git a/collectoss/api/routes/complexity.py b/collectoss/api/routes/complexity.py index 11fbf5ebe..1b1a2c6a5 100644 --- 
a/collectoss/api/routes/complexity.py +++ b/collectoss/api/routes/complexity.py @@ -17,13 +17,13 @@ def get_project_languages(): project_languages_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + collection_data.repo.repo_git, + collection_data.repo.repo_name, e.programming_language, e.code_lines, e.files FROM - augur_data.repo, + collection_data.repo, (SELECT d.repo_id, d.programming_language, @@ -31,22 +31,22 @@ def get_project_languages(): COUNT(*)::int AS files FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.programming_language, - augur_data.repo_labor.code_lines + collection_data.repo_labor.repo_id, + collection_data.repo_labor.programming_language, + collection_data.repo_labor.code_lines FROM - augur_data.repo_labor, + collection_data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + collection_data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + collection_data.repo_labor + GROUP BY collection_data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + collection_data.repo_labor.repo_id = recent.repo_id + AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id, d.programming_language) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE collection_data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) @@ -62,30 +62,30 @@ def get_project_files(): project_files_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + collection_data.repo.repo_git, + collection_data.repo.repo_name, e.files FROM - augur_data.repo, + collection_data.repo, (SELECT d.repo_id, count(*) AS files FROM (SELECT - augur_data.repo_labor.repo_id + 
collection_data.repo_labor.repo_id FROM - augur_data.repo_labor, + collection_data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + collection_data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + collection_data.repo_labor + GROUP BY collection_data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + collection_data.repo_labor.repo_id = recent.repo_id + AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE collection_data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) @@ -103,33 +103,33 @@ def get_project_lines(): project_lines_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + collection_data.repo.repo_git, + collection_data.repo.repo_name, e.total_lines, e.average_lines FROM - augur_data.repo, + collection_data.repo, (SELECT d.repo_id, SUM(d.total_lines) AS total_lines, AVG(d.total_lines)::INT AS average_lines FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.total_lines + collection_data.repo_labor.repo_id, + collection_data.repo_labor.total_lines FROM - augur_data.repo_labor, + collection_data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + collection_data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + collection_data.repo_labor + GROUP BY collection_data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + collection_data.repo_labor.repo_id = recent.repo_id + AND 
collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id and augur_data.repo.repo_id = :repo_id_param + WHERE collection_data.repo.repo_id = e.repo_id and collection_data.repo.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -147,33 +147,33 @@ def get_project_comment_lines(): comment_lines_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + collection_data.repo.repo_git, + collection_data.repo.repo_name, e.comment_lines, e.avg_comment_lines FROM - augur_data.repo, + collection_data.repo, (SELECT d.repo_id, SUM(d.comment_lines) AS comment_lines, AVG(d.comment_lines)::INT AS avg_comment_lines FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.comment_lines + collection_data.repo_labor.repo_id, + collection_data.repo_labor.comment_lines FROM - augur_data.repo_labor, + collection_data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + collection_data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + collection_data.repo_labor + GROUP BY collection_data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + collection_data.repo_labor.repo_id = recent.repo_id + AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE collection_data.repo.repo_id = e.repo_id AND e.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -192,33 +192,33 @@ def get_project_blank_lines(): blank_lines_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + 
collection_data.repo.repo_git, + collection_data.repo.repo_name, e.blank_lines, e.avg_blank_lines FROM - augur_data.repo, + collection_data.repo, (SELECT d.repo_id, SUM(d.blank_lines) AS blank_lines, AVG(d.blank_lines)::int AS avg_blank_lines FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.blank_lines + collection_data.repo_labor.repo_id, + collection_data.repo_labor.blank_lines FROM - augur_data.repo_labor, + collection_data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + collection_data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + collection_data.repo_labor + GROUP BY collection_data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + collection_data.repo_labor.repo_id = recent.repo_id + AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE collection_data.repo.repo_id = e.repo_id AND e.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -236,33 +236,33 @@ def get_project_file_complexity(): project_file_complexity_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + collection_data.repo.repo_git, + collection_data.repo.repo_name, e.sum_code_complexity, e.average_code_complexity FROM - augur_data.repo, + collection_data.repo, (SELECT d.repo_id, SUM(d.code_complexity) AS sum_code_complexity, AVG(d.code_complexity)::int AS average_code_complexity FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.code_complexity + collection_data.repo_labor.repo_id, + collection_data.repo_labor.code_complexity FROM - augur_data.repo_labor, + collection_data.repo_labor, ( SELECT - 
augur_data.repo_labor.repo_id, + collection_data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + collection_data.repo_labor + GROUP BY collection_data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + collection_data.repo_labor.repo_id = recent.repo_id + AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE collection_data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) diff --git a/collectoss/api/routes/metadata.py b/collectoss/api/routes/metadata.py index edd65f595..7b09cfabe 100644 --- a/collectoss/api/routes/metadata.py +++ b/collectoss/api/routes/metadata.py @@ -31,7 +31,7 @@ def get_repo_info(): FROM repo_info, repo, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM collection_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e WHERE repo_info.repo_id = repo.repo_id AND e.repo_id = repo_info.repo_id diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index a07ddf198..f85a4e105 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -386,7 +386,7 @@ def repo_reset(backend_app): UPDATE augur_operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; - TRUNCATE augur_data.commits CASCADE; + TRUNCATE collection_data.commits CASCADE; """)) logger.info("Repos successfully reset") diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index b1a93ce80..3f46d10d1 100644 --- 
a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -211,7 +211,7 @@ def repo_reset(ctx): UPDATE augur_operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; - TRUNCATE augur_data.commits CASCADE; + TRUNCATE collection_data.commits CASCADE; """)) logger.info("Repos successfully reset") diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index 1a790e3c0..6bc475711 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -140,7 +140,7 @@ def get_repo_groups(ctx: click.Context) -> pd.DataFrame: with ctx.obj.engine.connect() as connection: df = pd.read_sql( s.sql.text( - "SELECT repo_group_id, rg_name, rg_description FROM augur_data.repo_groups" + "SELECT repo_group_id, rg_name, rg_description FROM collection_data.repo_groups" ), connection, ) @@ -179,7 +179,7 @@ def add_repo_groups(ctx: click.Context, filename: str) -> None: with ctx.obj.engine.begin() as connection: # Get existing repo group IDs df = pd.read_sql( - s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), + s.sql.text("SELECT repo_group_id FROM collection_data.repo_groups"), connection, ) repo_group_IDs = df["repo_group_id"].values.tolist() diff --git a/collectoss/tasks/data_analysis/clustering_worker/tasks.py b/collectoss/tasks/data_analysis/clustering_worker/tasks.py index c9e269e5f..7ec48414d 100644 --- a/collectoss/tasks/data_analysis/clustering_worker/tasks.py +++ b/collectoss/tasks/data_analysis/clustering_worker/tasks.py @@ -78,10 +78,10 @@ def clustering_model(repo_git: str,logger,engine) -> None: i.issue_title thread_title, M.msg_id FROM - augur_data.repo r, - augur_data.issues i, - augur_data.message M, - augur_data.issue_message_ref imr + collection_data.repo r, + collection_data.issues i, + collection_data.message M, + collection_data.issue_message_ref imr WHERE r.repo_id = i.repo_id AND imr.issue_id = i.issue_id @@ -98,10 
+98,10 @@ def clustering_model(repo_git: str,logger,engine) -> None: pr.pr_src_title thread_title, M.msg_id FROM - augur_data.repo r, - augur_data.pull_requests pr, - augur_data.message M, - augur_data.pull_request_message_ref prmr + collection_data.repo r, + collection_data.pull_requests pr, + collection_data.message M, + collection_data.pull_request_message_ref prmr WHERE r.repo_id = pr.repo_id AND prmr.pull_request_id = pr.pull_request_id @@ -289,15 +289,15 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): get_messages_sql = s.sql.text( """ SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.issues i, - augur_data.message m, augur_data.issue_message_ref imr + FROM collection_data.repo r, collection_data.issues i, + collection_data.message m, collection_data.issue_message_ref imr WHERE r.repo_id=i.repo_id AND imr.issue_id=i.issue_id AND imr.msg_id=m.msg_id UNION SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.pull_requests pr, - augur_data.message m, augur_data.pull_request_message_ref prmr + FROM collection_data.repo r, collection_data.pull_requests pr, + collection_data.message m, collection_data.pull_request_message_ref prmr WHERE r.repo_id=pr.repo_id AND prmr.pull_request_id=pr.pull_request_id AND prmr.msg_id=m.msg_id @@ -365,7 +365,7 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): # key_sequence_words_sql = s.sql.text( # """ - # SELECT nextval('augur_data.topic_words_topic_words_id_seq'::text) + # SELECT nextval('collection_data.topic_words_topic_words_id_seq'::text) # """ # ) diff --git a/collectoss/tasks/data_analysis/discourse_analysis/tasks.py b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py index a95756b8c..cad3856ab 100644 --- 
a/collectoss/tasks/data_analysis/discourse_analysis/tasks.py +++ b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py @@ -51,16 +51,16 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: get_messages_for_repo_sql = s.sql.text(""" (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.issues i, - augur_data.message m, augur_data.issue_message_ref imr + FROM collection_data.repo r, collection_data.issues i, + collection_data.message m, collection_data.issue_message_ref imr WHERE r.repo_id=i.repo_id AND imr.issue_id=i.issue_id AND imr.msg_id=m.msg_id AND r.repo_id = :repo_id) UNION (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.pull_requests pr, - augur_data.message m, augur_data.pull_request_message_ref prmr + FROM collection_data.repo r, collection_data.pull_requests pr, + collection_data.message m, collection_data.pull_request_message_ref prmr WHERE r.repo_id=pr.repo_id AND prmr.pull_request_id=pr.pull_request_id AND prmr.msg_id=m.msg_id diff --git a/collectoss/tasks/data_analysis/message_insights/tasks.py b/collectoss/tasks/data_analysis/message_insights/tasks.py index 7913a5d13..f01de4305 100644 --- a/collectoss/tasks/data_analysis/message_insights/tasks.py +++ b/collectoss/tasks/data_analysis/message_insights/tasks.py @@ -52,7 +52,7 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Check to see if repo has been analyzed previously repo_exists_SQL = s.sql.text(""" - SELECT exists (SELECT 1 FROM augur_data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") + SELECT exists (SELECT 1 FROM collection_data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") with engine.connect() as conn: df_rep = pd.read_sql_query(repo_exists_SQL, conn, params={'repo_id': repo_id}) 
@@ -66,17 +66,17 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Fetch the timestamp of last analyzed message for the repo past_SQL = s.sql.text(""" select message_analysis.msg_id, message.msg_timestamp - from augur_data.message_analysis - inner join augur_data.message on message.msg_id = message_analysis.msg_id - inner join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - inner join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + from collection_data.message_analysis + inner join collection_data.message on message.msg_id = message_analysis.msg_id + inner join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + inner join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id UNION select message_analysis.msg_id, message.msg_timestamp - from augur_data.message_analysis - inner join augur_data.message on message.msg_id = message_analysis.msg_id - inner join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - inner join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + from collection_data.message_analysis + inner join collection_data.message on message.msg_id = message_analysis.msg_id + inner join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + inner join collection_data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id """) @@ -97,28 +97,28 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Fetch only recent messages join_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on 
pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select message.msg_id, msg_timestamp, msg_text from collection_data.message + left outer join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id and msg_timestamp > :begin_date UNION - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + select message.msg_id, msg_timestamp, msg_text from collection_data.message + left outer join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id and msg_timestamp > :begin_date""") else: logger.info(f'Fetching all past messages of repo {repo_id}...') # Fetch all messages join_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select message.msg_id, msg_timestamp, msg_text from collection_data.message + left outer join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id UNION - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join 
augur_data.issues on issue_message_ref.issue_id = issues.issue_id + select message.msg_id, msg_timestamp, msg_text from collection_data.message + left outer join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id""") with engine.connect() as conn: @@ -147,14 +147,14 @@ def message_insight_model(repo_git: str,logger,engine) -> None: if not full_train: merge_SQL = s.sql.text(""" - select novelty_flag, reconstruction_error from augur_data.message_analysis - left outer join augur_data.pull_request_message_ref on message_analysis.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select novelty_flag, reconstruction_error from collection_data.message_analysis + left outer join collection_data.pull_request_message_ref on message_analysis.msg_id = pull_request_message_ref.msg_id + left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id UNION - select novelty_flag, reconstruction_error from augur_data.message_analysis - left outer join augur_data.issue_message_ref on message_analysis.msg_id = issue_message_ref.msg_id - left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + select novelty_flag, reconstruction_error from collection_data.message_analysis + left outer join collection_data.issue_message_ref on message_analysis.msg_id = issue_message_ref.msg_id + left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") with engine.connect() as conn: diff --git a/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py index 
aa8d5a0a0..2c6e4365e 100644 --- a/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -59,8 +59,8 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: pull_request_commits.pr_cmt_id, pr_augur_contributor_id, pr_src_author_association - from augur_data.pull_requests - INNER JOIN augur_data.pull_request_commits on pull_requests.pull_request_id = pull_request_commits.pull_request_id + from collection_data.pull_requests + INNER JOIN collection_data.pull_request_commits on pull_requests.pull_request_id = pull_request_commits.pull_request_id where pr_created_at > :begin_date and pull_requests.repo_id = :repo_id and pr_src_state like 'open' @@ -90,13 +90,13 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Get sentiment score of all messages relating to the PR messages_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from augur_data.message - left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id + select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from collection_data.message + left outer join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id UNION - select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from augur_data.message - left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = 
:repo_id""") + select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from collection_data.message + left outer join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") with engine.connect() as conn: df_message = pd.read_sql_query(messages_SQL, conn, params={'repo_id': repo_id}) @@ -104,7 +104,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Map PR to its corresponding messages - pr_ref_sql = s.sql.text("select * from augur_data.pull_request_message_ref") + pr_ref_sql = s.sql.text("select * from collection_data.pull_request_message_ref") with engine.connect() as conn: df_pr_ref = pd.read_sql_query(pr_ref_sql, conn) df_merge = pd.merge(df_pr, df_pr_ref, on='pull_request_id', how='left') @@ -142,7 +142,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: ''' # Get cntrb info from API - cntrb_sql = 'SELECT cntrb_id, gh_login FROM augur_data.contributors' + cntrb_sql = 'SELECT cntrb_id, gh_login FROM collection_data.contributors' df_ctrb = pd.read_sql_query(cntrb_SQL, create_database_engine()) df_fin1 = pd.merge(df_fin,df_ctrb,left_on='pr_augur_contributor_id', right_on='cntrb_id', how='left') df_fin1 = df_fin1.drop(['cntrb_id'],axis=1) @@ -157,7 +157,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Get repo info repo_sql = s.sql.text(""" SELECT repo_id, pull_requests_merged, pull_request_count,watchers_count, last_updated FROM - augur_data.repo_info where repo_id = :repo_id + collection_data.repo_info where repo_id = :repo_id """) with engine.connect() as conn: diff --git a/collectoss/tasks/db/refresh_materialized_views.py b/collectoss/tasks/db/refresh_materialized_views.py index 95f169722..d8eeabf97 100644 --- a/collectoss/tasks/db/refresh_materialized_views.py +++ b/collectoss/tasks/db/refresh_materialized_views.py @@ 
-19,78 +19,78 @@ def refresh_materialized_views(self): #self.logger = logging.getLogger(refresh_materialized_views.__name__) mv1_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repo_prs with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.api_get_all_repo_prs with data; COMMIT; """) mv2_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repos_commits with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.api_get_all_repos_commits with data; COMMIT; """) mv3_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repos_issues with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.api_get_all_repos_issues with data; COMMIT; """) mv4_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.augur_new_contributors with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.augur_new_contributors with data; COMMIT; """) mv5_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_commits_and_committers_daily_count with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_commits_and_committers_daily_count with data; COMMIT; """) mv6_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_new_contributors with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_new_contributors with data; COMMIT; """) mv7_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_entry_list with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_entry_list with data; COMMIT; """) mv8_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_contributor_actions with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_contributor_actions with data; COMMIT; """) mv9_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW 
concurrently augur_data.explorer_user_repos with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_user_repos with data; COMMIT; """) mv10_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response_times with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_pr_response_times with data; COMMIT; """) mv11_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_assignments with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_pr_assignments with data; COMMIT; """) mv12_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_issue_assignments with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_issue_assignments with data; COMMIT; """) mv13_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_pr_response with data; COMMIT; """) mv14_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_repo_languages with data; + REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_repo_languages with data; COMMIT; """) diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index ab7a18eab..732b70dcc 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -207,12 +207,12 @@ def insert_facade_contributors(self, repo_git): commits.cmt_commit_hash AS hash, commits.cmt_author_raw_email AS email_raw FROM - augur_data.commits + collection_data.commits WHERE commits.repo_id = :repo_id AND commits.cmt_ght_author_id IS NULL AND commits.cmt_author_raw_email NOT IN ( - SELECT email FROM augur_data.unresolved_commit_emails + SELECT email FROM collection_data.unresolved_commit_emails ) """).bindparams(repo_id=repo_id) @@ -253,19 
+253,19 @@ def insert_facade_contributors(self, repo_git): resolve_email_to_cntrb_id_sql = s.sql.text(""" WITH email_to_contributor AS ( SELECT cntrb_email AS email, cntrb_id - FROM augur_data.contributors + FROM collection_data.contributors WHERE cntrb_email IS NOT NULL UNION ALL SELECT cntrb_canonical AS email, cntrb_id - FROM augur_data.contributors + FROM collection_data.contributors WHERE cntrb_canonical IS NOT NULL UNION ALL SELECT alias_email AS email, cntrb_id - FROM augur_data.contributors_aliases + FROM collection_data.contributors_aliases WHERE alias_email IS NOT NULL ), deduplicated AS ( @@ -277,7 +277,7 @@ def insert_facade_contributors(self, repo_git): d.cntrb_id, c.cmt_author_email AS email FROM - augur_data.commits c + collection_data.commits c INNER JOIN deduplicated d ON c.cmt_author_email = d.email diff --git a/collectoss/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py index 9ca6bb059..3781377b0 100644 --- a/collectoss/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -74,7 +74,7 @@ def get_newly_added_repos(session, limit, hook): repo_query = s.sql.text(f""" select repo_git - from augur_operations.collection_status x, augur_data.repo y + from augur_operations.collection_status x, collection_data.repo y where x.repo_id=y.repo_id and {condition_string} order by {order_by_field} diff --git a/collectoss/util/repo_load_controller.py b/collectoss/util/repo_load_controller.py index 5455411e4..7e3a0548a 100644 --- a/collectoss/util/repo_load_controller.py +++ b/collectoss/util/repo_load_controller.py @@ -22,8 +22,8 @@ augur_data_schema = MetaData(schema = "augur_data") augur_data_schema.reflect(bind = engine, views = True) - commits_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_commits"] - issues_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_issues"] + commits_materialized_view: Table = 
augur_data_schema.tables["collection_data.api_get_all_repos_commits"] + issues_materialized_view: Table = augur_data_schema.tables["collection_data.api_get_all_repos_issues"] class RepoLoadController: diff --git a/conftest.py b/conftest.py index db2e95b78..38f7e7055 100644 --- a/conftest.py +++ b/conftest.py @@ -195,7 +195,7 @@ def read_only_db(empty_db): database_name = empty_db.url.database test_username = "testuser" test_password = "testpass" - schemas = ["public", "augur_data", "augur_operations"] + schemas = ["public", "collection_data", "augur_operations"] # create read-only user empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) diff --git a/tests/test_application/test_db/test_session.py b/tests/test_application/test_db/test_session.py index 36698b217..856a3f194 100644 --- a/tests/test_application/test_db/test_session.py +++ b/tests/test_application/test_db/test_session.py @@ -35,7 +35,7 @@ def test_execute_sql(test_db_engine): with DatabaseSession(logger, engine=test_db_engine) as session: cntrb_id = data['cntrb_id'] - result = session.execute_sql(f"SELECT * FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = session.execute_sql(f"SELECT * FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert isinstance(result[0], s.engine.result.RowProxy) @@ -57,7 +57,7 @@ def test_execute_sql(test_db_engine): for data in all_data: cntrb_id = data["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_duplicates(test_db_engine): @@ -79,7 +79,7 @@ def test_insert_data_with_duplicates(test_db_engine): cntrb_id = data_1['cntrb_id'] - result = session.execute_sql(f"SELECT * FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = 
session.execute_sql(f"SELECT * FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == 3 @@ -94,7 +94,7 @@ def test_insert_data_with_duplicates(test_db_engine): for data in duplicate_data_list: cntrb_id = data["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_updates(test_db_engine): @@ -117,7 +117,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1['cntrb_id'] - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert dict(result[0])["gh_user_id"] == 6 @@ -127,7 +127,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_bulk(test_db_engine): @@ -145,7 +145,7 @@ def test_insert_data_with_bulk(test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == 4 @@ -160,7 +160,7 @@ def test_insert_data_with_bulk(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = 
data_1["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") @@ -183,7 +183,7 @@ def test_insert_data_partial_update(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1['cntrb_id'] - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert dict(result[0])["gh_user_id"] == 6 @@ -193,7 +193,7 @@ def test_insert_data_partial_update(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") issue_data_with_null_strings = [] @@ -232,7 +232,7 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): return_columns=issue_return_columns, string_fields=issue_string_columns) data_inserted_count = len(issue_data_with_null_strings) - result = connection.execute(f"Select * FROM augur_data.issues;").fetchall() + result = connection.execute(f"Select * FROM collection_data.issues;").fetchall() assert issue_return_data is not None assert len(issue_return_data) == data_inserted_count @@ -242,7 +242,7 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): with test_db_engine.connect() as connection: connection.execute(""" - DELETE FROM augur_data.issues; + DELETE FROM collection_data.issues; DELETE FROM "augur_data"."repo"; DELETE FROM "augur_data"."repo_groups"; """) diff --git a/tests/test_tasks/test_github_tasks/test_pull_requests.py b/tests/test_tasks/test_github_tasks/test_pull_requests.py 
index 4dc2c9e73..83751ea22 100644 --- a/tests/test_tasks/test_github_tasks/test_pull_requests.py +++ b/tests/test_tasks/test_github_tasks/test_pull_requests.py @@ -263,7 +263,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb with test_db_session.engine.connect() as connection: - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == len(unique_contributors) @@ -277,7 +277,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb with test_db_session.engine.connect() as connection: - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") repos = [] repos.append({"owner": "chaoss", "repo": "augur"}) @@ -336,7 +336,7 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: - result = connection.execute(f"SELECT * FROM augur_data.pull_requests;").fetchall() + result = connection.execute(f"SELECT * FROM collection_data.pull_requests;").fetchall() assert result is not None assert len(result) == len(prs) == len(return_data) @@ -353,11 +353,11 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: - connection.execute(f"DELETE FROM augur_data.pull_requests;") + connection.execute(f"DELETE FROM collection_data.pull_requests;") connection.execute("""DELETE FROM "augur_data"."repo"; DELETE FROM "augur_data"."repo_groups"; """) - connection.execute(f"DELETE FROM 
augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") diff --git a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py index 980f09fb3..14bd8bfe5 100644 --- a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py +++ b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py @@ -9,7 +9,7 @@ @pytest.fixture def set_up_repo_groups(database_connection): - df = pd.read_sql(s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), database_connection) + df = pd.read_sql(s.sql.text("SELECT repo_group_id FROM collection_data.repo_groups"), database_connection) repo_group_IDs = df['repo_group_id'].values.tolist() insert_repo_group_sql = s.sql.text(""" @@ -31,12 +31,12 @@ def set_up_repo_groups(database_connection): - df = database_connection.execute(s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups")) + df = database_connection.execute(s.sql.text("SELECT repo_group_id FROM collection_data.repo_groups")) repo_group_IDs = [group[0] for group in df.fetchall()] insertSQL = s.sql.text(""" - INSERT INTO augur_data.repo(repo_group_id, repo_git, + INSERT INTO collection_data.repo(repo_group_id, repo_git, tool_source, tool_version, data_source, data_collection_date) VALUES (:repo_group_id, :repo_git, 'CLI', 1.0, 'Git', CURRENT_TIMESTAMP) """) From 754d1221fb47caf1e32869a1afdb5d42c4b3fccc Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:44:29 -0400 Subject: [PATCH 063/165] rename augur_operations file to operations Signed-off-by: Adrian Edwards --- collectoss/application/db/models/__init__.py | 2 +- .../db/models/{augur_operations.py => operations.py} | 0 2 files changed, 1 insertion(+), 1 
deletion(-) rename collectoss/application/db/models/{augur_operations.py => operations.py} (100%) diff --git a/collectoss/application/db/models/__init__.py b/collectoss/application/db/models/__init__.py index 11c6b38fe..80d3cf9b4 100644 --- a/collectoss/application/db/models/__init__.py +++ b/collectoss/application/db/models/__init__.py @@ -95,7 +95,7 @@ SpdxIdentifier, ) -from collectoss.application.db.models.augur_operations import ( +from collectoss.application.db.models.operations import ( Settings, WorkerHistory, WorkerJob, diff --git a/collectoss/application/db/models/augur_operations.py b/collectoss/application/db/models/operations.py similarity index 100% rename from collectoss/application/db/models/augur_operations.py rename to collectoss/application/db/models/operations.py From 67a6552591824dae96f5bbc7ffc3b1616ccccaa6 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:48:14 -0400 Subject: [PATCH 064/165] rename operations schema specifiers in tables and columns Signed-off-by: Adrian Edwards --- .../application/db/models/operations.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/collectoss/application/db/models/operations.py b/collectoss/application/db/models/operations.py index 68aaac06d..34c99fe0e 100644 --- a/collectoss/application/db/models/operations.py +++ b/collectoss/application/db/models/operations.py @@ -83,20 +83,20 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: Column("Count", String), Column("WeightedComplexity", String), Column("Files", String), - schema="augur_operations", + schema="collection_operations", ) class Settings(Base): __tablename__ = "augur_settings" __table_args__ = { - "schema": "augur_operations", + "schema": "collection_operations", "comment": "CollectOSS settings include the schema version, and the CollectOSS API Key as of 10/25/2020. Future augur settings may be stored in this table, which has the basic structure of a name-value pair. 
", } id = Column( BigInteger, - Sequence("augur_settings_id_seq", start=1, schema="augur_operations"), + Sequence("augur_settings_id_seq", start=1, schema="collection_operations"), primary_key=True, server_default=text( "nextval('augur_operations.augur_settings_id_seq'::regclass)" @@ -119,20 +119,20 @@ class Settings(Base): server_default=text("CURRENT_TIMESTAMP"), ), Index("repos_id,statusops", "repos_id", "status"), - schema="augur_operations", + schema="collection_operations", comment="For future use when we move all working tables to the augur_operations schema. ", ) class WorkerHistory(Base): __tablename__ = "worker_history" __table_args__ = { - "schema": "augur_operations", + "schema": "collection_operations", "comment": "This table stores the complete history of job execution, including success and failure. It is useful for troubleshooting. ", } history_id = Column( BigInteger, - Sequence("gh_worker_history_history_id_seq", start=1, schema="augur_operations"), + Sequence("gh_worker_history_history_id_seq", start=1, schema="collection_operations"), primary_key=True, server_default=text( "nextval('augur_operations.gh_worker_history_history_id_seq'::regclass)" @@ -150,7 +150,7 @@ class WorkerHistory(Base): class WorkerJob(Base): __tablename__ = "worker_job" __table_args__ = { - "schema": "augur_operations", + "schema": "collection_operations", "comment": "This table stores the jobs workers collect data for. A job is found in the code, and in the augur.config.json under the construct of a “model”. ", } @@ -172,13 +172,13 @@ class WorkerJob(Base): class WorkerOauth(Base): __tablename__ = "worker_oauth" __table_args__ = { - "schema": "augur_operations", + "schema": "collection_operations", "comment": "This table stores credentials for retrieving data from platform API’s. Entries in this table must comply with the terms of service for each platform. 
", } oauth_id = Column( BigInteger, - Sequence("worker_oauth_oauth_id_seq", start=1000, schema="augur_operations"), + Sequence("worker_oauth_oauth_id_seq", start=1000, schema="collection_operations"), primary_key=True, server_default=text( "nextval('augur_operations.worker_oauth_oauth_id_seq'::regclass)" @@ -196,7 +196,7 @@ class WorkerOauth(Base): class WorkerSettingsFacade(Base): __tablename__ = "worker_settings_facade" __table_args__ = { - "schema": "augur_operations", + "schema": "collection_operations", "comment": "For future use when we move all working tables to the augur_operations schema. ", } @@ -215,7 +215,7 @@ class WorkerSettingsFacade(Base): Column( "working_commit", String(40), server_default=text("'NULL'::character varying") ), - schema="augur_operations", + schema="collection_operations", comment="For future use when we move all working tables to the augur_operations schema. ", ) @@ -237,7 +237,7 @@ class Config(Base): __tablename__ = 'config' __table_args__ = ( UniqueConstraint('section_name', "setting_name", name='unique-config-setting'), - {"schema": "augur_operations"} + {"schema": "collection_operations"} ) id = Column(SmallInteger, primary_key=True, nullable=False) @@ -255,7 +255,7 @@ class User(Base): UniqueConstraint('email', name='user-unique-email'), UniqueConstraint('login_name', name='user-unique-name'), UniqueConstraint('text_phone', name='user-unique-phone'), - {"schema": "augur_operations"} + {"schema": "collection_operations"} ) user_id = Column(Integer, primary_key=True) @@ -634,7 +634,7 @@ class UserGroup(Base): __tablename__ = 'user_groups' __table_args__ = ( UniqueConstraint('user_id', 'name', name='user_groups_user_id_name_key'), - {"schema": "augur_operations"} + {"schema": "collection_operations"} ) group_id = Column(BigInteger, primary_key=True) @@ -743,7 +743,7 @@ def convert_group_name_to_id(session, user_id: int, group_name: str) -> int: class UserRepo(Base): __tablename__ = "user_repos" - __table_args__ = { "schema": 
"augur_operations" } + __table_args__ = { "schema": "collection_operations" } group_id = Column( ForeignKey("augur_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False @@ -1010,7 +1010,7 @@ def add_github_org_repos(session, url: List[str], user_id: int, group_name: int) class UserSessionToken(Base): __tablename__ = "user_session_tokens" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "collection_operations" } token = Column(String, primary_key=True, nullable=False) user_id = Column(ForeignKey("augur_operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) @@ -1048,7 +1048,7 @@ def delete_refresh_tokens(self, session): class ClientApplication(Base): __tablename__ = "client_applications" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "collection_operations" } id = Column(String, primary_key=True, nullable=False) user_id = Column(ForeignKey("augur_operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) @@ -1074,7 +1074,7 @@ def get_by_id(session, client_id): class ForgeInstance(Base): __tablename__ = "forge_instance" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "collection_operations" } id = Column(Integer, primary_key=True, nullable=False, comment="Internal unique identifier for this forge instance") # platform_type stores an integer that CollectOSS maps/will map to it's internal platform identifier Enum @@ -1089,7 +1089,7 @@ class ForgeInstance(Base): class Subscription(Base): __tablename__ = "subscriptions" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "collection_operations" } application_id = Column(ForeignKey("augur_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) type_id = Column(ForeignKey("augur_operations.subscription_types.id", name="subscriptions_type_id_fkey"), 
primary_key=True) @@ -1101,7 +1101,7 @@ class SubscriptionType(Base): __tablename__ = "subscription_types" __table_args__ = ( UniqueConstraint('name', name='subscription_type_title_unique'), - {"schema": "augur_operations"} + {"schema": "collection_operations"} ) @@ -1115,7 +1115,7 @@ class RefreshToken(Base): __tablename__ = "refresh_tokens" __table_args__ = ( UniqueConstraint('user_session_token', name='refresh_token_user_session_token_id_unique'), - {"schema": "augur_operations"} + {"schema": "collection_operations"} ) id = Column(String, primary_key=True) @@ -1201,7 +1201,7 @@ class CollectionStatus(Base): "NOT (core_status = 'Pending' AND secondary_status = 'Collecting')", name='core_secondary_dependency_check' ), - {"schema": "augur_operations"} + {"schema": "collection_operations"} ) repo_id = Column(ForeignKey("collection_data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) From 8b1ff46cccf2cc74bb2dc7db9da9440103eebebd Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:48:29 -0400 Subject: [PATCH 065/165] rename in-SQL references to operations schema within the models Signed-off-by: Adrian Edwards --- .../application/db/models/operations.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/collectoss/application/db/models/operations.py b/collectoss/application/db/models/operations.py index 34c99fe0e..417230a50 100644 --- a/collectoss/application/db/models/operations.py +++ b/collectoss/application/db/models/operations.py @@ -99,7 +99,7 @@ class Settings(Base): Sequence("augur_settings_id_seq", start=1, schema="collection_operations"), primary_key=True, server_default=text( - "nextval('augur_operations.augur_settings_id_seq'::regclass)" + "nextval('collection_operations.augur_settings_id_seq'::regclass)" ), ) setting = Column(String) @@ -135,7 +135,7 @@ class WorkerHistory(Base): Sequence("gh_worker_history_history_id_seq", start=1, schema="collection_operations"), 
primary_key=True, server_default=text( - "nextval('augur_operations.gh_worker_history_history_id_seq'::regclass)" + "nextval('collection_operations.gh_worker_history_history_id_seq'::regclass)" ), ) repo_id = Column(BigInteger) @@ -181,7 +181,7 @@ class WorkerOauth(Base): Sequence("worker_oauth_oauth_id_seq", start=1000, schema="collection_operations"), primary_key=True, server_default=text( - "nextval('augur_operations.worker_oauth_oauth_id_seq'::regclass)" + "nextval('collection_operations.worker_oauth_oauth_id_seq'::regclass)" ), ) name = Column(String(255), nullable=False) @@ -639,7 +639,7 @@ class UserGroup(Base): group_id = Column(BigInteger, primary_key=True) user_id = Column(Integer, - ForeignKey("augur_operations.users.user_id", name="user_group_user_id_fkey"), nullable=False + ForeignKey("collection_operations.users.user_id", name="user_group_user_id_fkey"), nullable=False ) name = Column(String, nullable=False) favorited = Column(Boolean, nullable=False, server_default=text("FALSE")) @@ -746,7 +746,7 @@ class UserRepo(Base): __table_args__ = { "schema": "collection_operations" } group_id = Column( - ForeignKey("augur_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False + ForeignKey("collection_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False ) repo_id = Column( ForeignKey("collection_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False @@ -1013,9 +1013,9 @@ class UserSessionToken(Base): __table_args__ = { "schema": "collection_operations" } token = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("augur_operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) + user_id = Column(ForeignKey("collection_operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) expiration = Column(BigInteger) - application_id = 
Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey")) + application_id = Column(ForeignKey("collection_operations.client_applications.id", name="user_session_token_application_id_fkey")) created_at = Column(BigInteger) user = relationship("User", back_populates="tokens") @@ -1051,7 +1051,7 @@ class ClientApplication(Base): __table_args__ = { "schema": "collection_operations" } id = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("augur_operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) + user_id = Column(ForeignKey("collection_operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) name = Column(String, nullable=False) redirect_url = Column(String, nullable=False) api_key = Column(String, nullable=False) @@ -1091,8 +1091,8 @@ class Subscription(Base): __tablename__ = "subscriptions" __table_args__ = { "schema": "collection_operations" } - application_id = Column(ForeignKey("augur_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) - type_id = Column(ForeignKey("augur_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) + application_id = Column(ForeignKey("collection_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) + type_id = Column(ForeignKey("collection_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) application = relationship("ClientApplication", back_populates="subscriptions") type = relationship("SubscriptionType", back_populates="subscriptions") @@ -1119,7 +1119,7 @@ class RefreshToken(Base): ) id = Column(String, primary_key=True) - user_session_token = Column(ForeignKey("augur_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) + user_session_token = 
Column(ForeignKey("collection_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) user_session = relationship("UserSessionToken", back_populates="refresh_tokens") From e6b185e3c25e0c53ff82d800c01f94d26fe4926d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:50:16 -0400 Subject: [PATCH 066/165] rename all references to augur_operations in the rest of the codebase Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 8 ++--- collectoss/application/cli/collection.py | 22 ++++++------ collectoss/application/cli/db.py | 8 ++--- collectoss/application/service_manager.py | 14 ++++---- collectoss/tasks/start_tasks.py | 2 +- collectoss/tasks/util/collection_util.py | 4 +-- .../configuration-file-reference.rst | 2 +- .../development-guide/tech-breakdown.rst | 2 +- keyman/README.md | 2 +- .../test_config/test_config.py | 34 +++++++++---------- .../test_github_api_key_handler.py | 6 ++-- 11 files changed, 52 insertions(+), 52 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index f85a4e105..378a8a0fa 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -340,7 +340,7 @@ def stop_processes(signal, logger, engine): def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.user_repos) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM collection_operations.user_repos) """) repos = session.execute_sql(query).fetchall() @@ -377,13 +377,13 @@ def repo_reset(backend_app): Refresh repo collection to force data collection """ backend_app.database.execute(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; - UPDATE augur_operations.collection_status 
+ UPDATE collection_operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; TRUNCATE collection_data.commits CASCADE; diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index 3f46d10d1..471606d62 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -202,13 +202,13 @@ def repo_reset(ctx): """ with ctx.obj.engine.connect() as connection: connection.execute(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; TRUNCATE collection_data.commits CASCADE; @@ -279,31 +279,31 @@ def cleanup_after_collection_halt(logger_instance, engine): #Make sure that database reflects collection status when processes are killed/stopped. 
def clean_collection_status(session): session.execute_sql(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET core_status='Pending',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET core_status='Success',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET secondary_status='Success',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Update', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Success', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Pending', facade_task_id=NULL WHERE facade_status='Failed Clone' OR facade_status='Initializing'; """)) @@ -311,7 +311,7 @@ def clean_collection_status(session): def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.user_repos) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM collection_operations.user_repos) """) repos = session.execute_sql(query).fetchall() 
diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index 6bc475711..10e830735 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -262,7 +262,7 @@ def add_github_org(ctx, organization_name): def get_db_version(engine): db_version_sql = s.sql.text( """ - SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' + SELECT * FROM collection_operations.augur_settings WHERE setting = 'augur_data_version' """ ) @@ -342,11 +342,11 @@ def update_api_key(ctx, api_key): """ update_api_key_sql = s.sql.text( """ - INSERT INTO augur_operations.augur_settings (setting,VALUE) VALUES ('augur_api_key','HudMhTyPW7wiaWopUKgRoGCxlIUulw4g') ON CONFLICT (setting) + INSERT INTO collection_operations.augur_settings (setting,VALUE) VALUES ('augur_api_key','HudMhTyPW7wiaWopUKgRoGCxlIUulw4g') ON CONFLICT (setting) DO UPDATE SET VALUE='HudMhTyPW7wiaWopUKgRoGCxlIUulw4g'; - --UPDATE augur_operations.augur_settings SET VALUE = :api_key WHERE setting='augur_api_key'; + --UPDATE collection_operations.augur_settings SET VALUE = :api_key WHERE setting='augur_api_key'; """ ) @@ -363,7 +363,7 @@ def update_api_key(ctx, api_key): def get_api_key(ctx): get_api_key_sql = s.sql.text( """ - SELECT value FROM augur_operations.augur_settings WHERE setting='augur_api_key'; + SELECT value FROM collection_operations.augur_settings WHERE setting='augur_api_key'; """ ) diff --git a/collectoss/application/service_manager.py b/collectoss/application/service_manager.py index 3cebb4d34..0ade300e8 100644 --- a/collectoss/application/service_manager.py +++ b/collectoss/application/service_manager.py @@ -110,31 +110,31 @@ def clear_redis_caches(): #Make sure that database reflects collection status when processes are killed/stopped. 
def clean_collection_status(session): session.execute_sql(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET core_status='Pending',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET core_status='Success',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET secondary_status='Success',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Update', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Success', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE collection_operations.collection_status SET facade_status='Pending', facade_task_id=NULL WHERE facade_status='Failed Clone' OR facade_status='Initializing'; """)) diff --git a/collectoss/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py index 8e130f926..42ebed21e 100644 --- a/collectoss/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -383,7 +383,7 @@ def create_collection_status_records(self): logger = logging.getLogger(create_collection_status_records.__name__) query = s.sql.text(""" - SELECT repo_id FROM repo WHERE 
repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM collection_operations.collection_status) """) repo = execute_sql(query).first() diff --git a/collectoss/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py index 3781377b0..59dd22e22 100644 --- a/collectoss/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -74,7 +74,7 @@ def get_newly_added_repos(session, limit, hook): repo_query = s.sql.text(f""" select repo_git - from augur_operations.collection_status x, collection_data.repo y + from collection_operations.collection_status x, collection_data.repo y where x.repo_id=y.repo_id and {condition_string} order by {order_by_field} @@ -96,7 +96,7 @@ def get_repos_for_recollection(session, limit, hook, days_until_collect_again): repo_query = s.sql.text(f""" select repo_git - from augur_operations.collection_status x, repo y + from collection_operations.collection_status x, repo y where x.repo_id = y.repo_id and {condition_string} and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' diff --git a/docs/source/development-guide/configuration-file-reference.rst b/docs/source/development-guide/configuration-file-reference.rst index ecca79590..a2da864cd 100644 --- a/docs/source/development-guide/configuration-file-reference.rst +++ b/docs/source/development-guide/configuration-file-reference.rst @@ -1,7 +1,7 @@ Configuration file reference =============================== -CollectOSS's configuration template file, which generates your locally deployed ``augur.config.json`` file, is found at ``collectoss/config.py``. You will notice a small collection of workers are turned on to start with, by examining the ``switch`` variable within the ``Workers`` block of the config file. You can also specify the number of processes to spawn for each worker using the ``workers`` command. 
The default is one, and we recommend you start here. If you are going to spawn multiple workers, be sure you have enough credentials cached in the ``augur_operations.worker_oath`` table for the platforms you use. +CollectOSS's configuration template file, which generates your locally deployed ``augur.config.json`` file, is found at ``collectoss/config.py``. You will notice a small collection of workers are turned on to start with, by examining the ``switch`` variable within the ``Workers`` block of the config file. You can also specify the number of processes to spawn for each worker using the ``workers`` command. The default is one, and we recommend you start here. If you are going to spawn multiple workers, be sure you have enough credentials cached in the ``collection_operations.worker_oauth`` table for the platforms you use. If you have questions or would like to help please open an issue on GitHub_. diff --git a/docs/source/development-guide/tech-breakdown.rst b/docs/source/development-guide/tech-breakdown.rst index ce4425877..0e002bcfa 100644 --- a/docs/source/development-guide/tech-breakdown.rst +++ b/docs/source/development-guide/tech-breakdown.rst @@ -127,7 +127,7 @@ Your CollectOSS instance will now be available at http://servername-or-ip:port_number Note: CollectOSS will run on port 5000 by default (you probably need to -change that in augur_operations.config for OSX) +change that in collection_operations.config for OSX) Stopping your CollectOSS Instance --------------------------------- diff --git a/keyman/README.md b/keyman/README.md index 1deb1b8b9..18622c914 100644 --- a/keyman/README.md +++ b/keyman/README.md @@ -119,7 +119,7 @@ python keyman/Orchestrator.py ## Adding Keys ```sql -INSERT INTO augur_operations.worker_oauth +INSERT INTO collection_operations.worker_oauth (name, consumer_key, consumer_secret, access_token, access_token_secret, platform) VALUES ('My GitHub Key', 'not_used', 'not_used', 'ghp_YOURTOKEN', 'not_used', 'github_rest'); diff
--git a/tests/test_application/test_config/test_config.py b/tests/test_application/test_config/test_config.py index b03db89c6..8341b12da 100644 --- a/tests/test_application/test_config/test_config.py +++ b/tests/test_application/test_config/test_config.py @@ -26,7 +26,7 @@ def test_config_get_value(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_get_section(test_db_config, test_db_engine): @@ -62,7 +62,7 @@ def test_config_get_section(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_load_config(test_db_config, test_db_engine): @@ -102,7 +102,7 @@ def test_config_load_config(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_empty(test_db_config, test_db_engine): @@ -132,7 +132,7 @@ def test_config_empty(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_is_section_in_config(test_db_config, test_db_engine): @@ -163,7 +163,7 @@ def test_config_is_section_in_config(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_add_settings(test_db_config, test_db_engine): @@ -174,7 +174,7 @@ def test_config_add_settings(test_db_config, test_db_engine): with 
test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() assert result is not None assert len(result) == 2 @@ -189,7 +189,7 @@ def test_config_add_settings(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_update_settings(test_db_config, test_db_engine): @@ -222,7 +222,7 @@ def test_config_update_settings(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() assert len(result) == 3 @@ -235,7 +235,7 @@ def test_config_update_settings(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_add_section_from_json(test_db_config, test_db_engine): @@ -252,7 +252,7 @@ def test_config_add_section_from_json(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""") + result = connection.execute("""SELECT * FROM collection_operations.config""") for row in result: dict_data = dict(row) @@ -266,7 +266,7 @@ def test_config_add_section_from_json(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_load_config_file(test_db_config): @@ -312,7 +312,7 @@ def 
test_config_load_config_from_dict(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() for row in result: dict_data = dict(row) @@ -328,7 +328,7 @@ def test_config_load_config_from_dict(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_config_clear(test_db_config, test_db_engine): @@ -350,14 +350,14 @@ def test_config_clear(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() assert len(result) == 0 finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_remove_section(test_db_config, test_db_engine): @@ -385,7 +385,7 @@ def test_remove_section(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() for row in result: dict_data = dict(row) @@ -395,7 +395,7 @@ def test_remove_section(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") diff --git a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py 
b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py index edf5ac3cf..98008a8a3 100644 --- a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py +++ b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py @@ -43,7 +43,7 @@ def test_get_config_key(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM collection_operations.config""") def test_get_config_key_with_none_specified(key_handler, test_db_engine): @@ -78,7 +78,7 @@ def test_get_api_keys_from_database(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.worker_oauth""") + connection.execute("""DELETE FROM collection_operations.worker_oauth""") api_key_list = ["asdfdfkey", "jloire", "zdfdr", "asdrxer"] @pytest.mark.parametrize("api_key", api_key_list) @@ -112,4 +112,4 @@ def test_get_api_keys(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.worker_oauth""") \ No newline at end of file + connection.execute("""DELETE FROM collection_operations.worker_oauth""") \ No newline at end of file From 4a6204a66394267aa4b89cdb3def7c133bf41541 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:51:03 -0400 Subject: [PATCH 067/165] create database migration for the schema rename Signed-off-by: Adrian Edwards --- .../alembic/versions/43_rename_schema.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 collectoss/application/schema/alembic/versions/43_rename_schema.py diff --git a/collectoss/application/schema/alembic/versions/43_rename_schema.py b/collectoss/application/schema/alembic/versions/43_rename_schema.py new file mode 100644 index 000000000..c5b1a9acd --- /dev/null +++ 
b/collectoss/application/schema/alembic/versions/43_rename_schema.py @@ -0,0 +1,30 @@ +"""rename schema + +Revision ID: 43 +Revises: 42 +Create Date: 2026-05-27 15:28:12.439500 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '43' +down_revision = '42' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + conn = op.get_bind() + conn.execute(text("ALTER SCHEMA augur_data RENAME TO collection_data;")) + conn.execute(text("ALTER SCHEMA augur_operations RENAME TO collection_operations;")) + + + +def downgrade() -> None: + conn = op.get_bind() + conn.execute(text("ALTER SCHEMA collection_data RENAME TO augur_data;")) + conn.execute(text("ALTER SCHEMA collection_operations RENAME TO augur_operations;")) \ No newline at end of file From fc20087b6dc888700151ce55273a7ad18961d7b1 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:59:08 -0400 Subject: [PATCH 068/165] correct more imports of the ops schema Signed-off-by: Adrian Edwards --- collectoss/api/routes/dei.py | 2 +- collectoss/tasks/frontend.py | 2 +- .../tasks/git/util/facade_worker/facade_worker/repofetch.py | 2 +- collectoss/util/repo_load_controller.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/collectoss/api/routes/dei.py b/collectoss/api/routes/dei.py index 64af957bf..5e18dceb8 100644 --- a/collectoss/api/routes/dei.py +++ b/collectoss/api/routes/dei.py @@ -21,7 +21,7 @@ logger = logging.getLogger(__name__) from collectoss.api.routes import API_VERSION -from collectoss.application.db.models.augur_operations import FRONTEND_REPO_GROUP_NAME +from collectoss.application.db.models.operations import FRONTEND_REPO_GROUP_NAME @app.route(f"/{API_VERSION}/dei/repo/add", methods=['POST']) @ssl_required diff --git a/collectoss/tasks/frontend.py b/collectoss/tasks/frontend.py index 4ed2e24aa..d78fc1e1d 100644 --- 
a/collectoss/tasks/frontend.py +++ b/collectoss/tasks/frontend.py @@ -10,7 +10,7 @@ from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from collectoss.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_github_repo_by_src_id, get_gitlab_repo_by_src_id from collectoss.tasks.github.util.util import get_owner_repo -from collectoss.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus +from collectoss.application.db.models.operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus from collectoss.tasks.github.util.github_paginator import hit_api from collectoss.application.db.models import UserRepo, Repo diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py index 3f7ab07e9..968c4c54d 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -33,7 +33,7 @@ from .utilitymethods import update_repo_log, get_absolute_repo_path from sqlalchemy.orm.exc import NoResultFound from collectoss.application.db.models.augur_data import * -from collectoss.application.db.models.augur_operations import CollectionStatus +from collectoss.application.db.models.operations import CollectionStatus from collectoss.application.db.util import execute_session_query, convert_orm_list_to_dict_list from collectoss.application.db.lib import execute_sql, get_repo_by_repo_git from typing_extensions import deprecated diff --git a/collectoss/util/repo_load_controller.py b/collectoss/util/repo_load_controller.py index 7e3a0548a..2fde93bb0 100644 --- a/collectoss/util/repo_load_controller.py +++ b/collectoss/util/repo_load_controller.py @@ -6,7 +6,7 @@ from collectoss.application.db.engine import DatabaseEngine from collectoss.application.db.models import 
Repo, UserRepo, RepoGroup, UserGroup, User, CollectionStatus -from collectoss.application.db.models.augur_operations import retrieve_owner_repos +from collectoss.application.db.models.operations import retrieve_owner_repos from collectoss.application.db.util import execute_session_query from sqlalchemy import Column, Table, MetaData, or_ From 071c543b3ec50907ee95a38ece3949b8d81bbd7a Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 15:59:36 -0400 Subject: [PATCH 069/165] fix search paths Signed-off-by: Adrian Edwards --- collectoss/application/db/engine.py | 2 +- tests/test_helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collectoss/application/db/engine.py b/collectoss/application/db/engine.py index ef582dbed..e2ba3902e 100644 --- a/collectoss/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -105,7 +105,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,augur_data,augur_operations,spdx") + cursor.execute("SET SESSION search_path=public,collection_data,collection_operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a0401f369..dd850a23f 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -14,7 +14,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,augur_data,augur_operations,spdx") + cursor.execute("SET SESSION search_path=public,collection_data,collection_operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit From d6d25f538e359f8149fed9e8533e7108b6f2e3a3 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date:
Wed, 27 May 2026 16:00:29 -0400 Subject: [PATCH 070/165] update test fixtures for worker tests Signed-off-by: Adrian Edwards --- tests/test_workers/test_set_up_fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_workers/test_set_up_fixtures.py b/tests/test_workers/test_set_up_fixtures.py index 8d3ad70f1..fa4496cd8 100644 --- a/tests/test_workers/test_set_up_fixtures.py +++ b/tests/test_workers/test_set_up_fixtures.py @@ -153,8 +153,8 @@ def initialize_database_connections(self): "augur", "augur", "172.17.0.1", 5400, "test" ) - self.db_schema = 'augur_data' - self.helper_schema = 'augur_operations' + self.db_schema = 'collection_data' + self.helper_schema = 'collection_operations' self.helper_db = s.create_engine(DB_STR, poolclass=s.pool.NullPool, connect_args={'options': '-csearch_path={}'.format(self.helper_schema)}) From daf8d7a5f6cac694a056a68847b348a84fa4a778 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 16:00:47 -0400 Subject: [PATCH 071/165] update unit test references to operations schema in SQL Signed-off-by: Adrian Edwards --- .../test_config/test_config.py | 16 ++++++++-------- .../test_augur_operations/test_user_group.py | 10 +++++----- .../test_augur_operations/test_user_repo.py | 6 +++--- .../test_repo_load_controller/helper.py | 18 +++++++++--------- .../test_repo_load_controller/util.py | 16 ++++++++-------- .../test_github_api_key_handler.py | 6 +++--- 6 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tests/test_application/test_config/test_config.py b/tests/test_application/test_config/test_config.py index 8341b12da..4ed62d9c5 100644 --- a/tests/test_application/test_config/test_config.py +++ b/tests/test_application/test_config/test_config.py @@ -15,7 +15,7 @@ def test_config_get_value(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type")
VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -43,7 +43,7 @@ def test_config_get_section(test_db_config, test_db_engine): for data in network_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -82,7 +82,7 @@ def test_config_load_config(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -123,7 +123,7 @@ def test_config_empty(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -151,7 +151,7 @@ def test_config_is_section_in_config(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO 
"collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -212,7 +212,7 @@ def test_config_update_settings(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -342,7 +342,7 @@ def test_config_clear(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -377,7 +377,7 @@ def test_remove_section(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) diff --git a/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py b/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py index 1eb7c7492..f09582511 100644 --- a/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py +++ b/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py @@ -73,19 
+73,19 @@ def test_add_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_groups";""") + query = s.text("""SELECT * FROM "collection_operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None assert len(result) == 3 - query = s.text("""SELECT * FROM "augur_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][0]["id"])) + query = s.text("""SELECT * FROM "collection_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][0]["id"])) result = connection.execute(query).fetchall() assert result is not None assert len(result) == 2 - query = s.text("""SELECT * FROM "augur_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][1]["id"])) + query = s.text("""SELECT * FROM "collection_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][1]["id"])) result = connection.execute(query).fetchall() assert result is not None @@ -212,7 +212,7 @@ def test_remove_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_groups";""") + query = s.text("""SELECT * FROM "collection_operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None @@ -226,7 +226,7 @@ def test_remove_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_groups";""") + query = s.text("""SELECT * FROM "collection_operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None diff --git a/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py b/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py index ee7abf4c1..58b1488d9 100644 --- a/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py +++ 
b/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py @@ -74,7 +74,7 @@ def test_add_repo_to_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_repos";""") + query = s.text("""SELECT * FROM "collection_operations"."user_repos";""") # WHERE "group_id"=:user_group_id AND "repo_id"=:repo_id result = connection.execute(query).fetchall() @@ -82,14 +82,14 @@ def test_add_repo_to_user_group(test_db_engine): assert len(result) == 4 - query = s.text("""SELECT * FROM "augur_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) + query = s.text("""SELECT * FROM "collection_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) result = connection.execute(query).fetchall() assert result is not None assert len(result) == 2 - query = s.text("""SELECT * FROM "augur_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) + query = s.text("""SELECT * FROM "collection_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) result = connection.execute(query).fetchall() assert result is not None diff --git a/tests/test_application/test_repo_load_controller/helper.py b/tests/test_application/test_repo_load_controller/helper.py index 051e48eff..8138783f7 100644 --- a/tests/test_application/test_repo_load_controller/helper.py +++ b/tests/test_application/test_repo_load_controller/helper.py @@ -28,19 +28,19 @@ def get_repo_group_delete_statement(): def get_user_delete_statement(): - return get_delete_statement("augur_operations", "users") + return get_delete_statement("collection_operations", "users") def get_user_repo_delete_statement(): - return get_delete_statement("augur_operations", "user_repos") + return get_delete_statement("collection_operations", "user_repos") def get_user_group_delete_statement(): - return get_delete_statement("augur_operations", 
"user_groups") + return get_delete_statement("collection_operations", "user_groups") def get_config_delete_statement(): - return get_delete_statement("augur_operations", "config") + return get_delete_statement("collection_operations", "config") def get_repo_related_delete_statements(table_list): """Takes a list of tables related to the RepoLoadController class and generates a delete statement. @@ -96,7 +96,7 @@ def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): def get_user_repo_insert_statement(repo_id, group_id): - return """INSERT INTO "augur_operations"."user_repos" ("repo_id", "group_id") VALUES ({}, {});""".format(repo_id, group_id) + return """INSERT INTO "collection_operations"."user_repos" ("repo_id", "group_id") VALUES ({}, {});""".format(repo_id, group_id) def get_repo_group_insert_statement(rg_id): @@ -104,14 +104,14 @@ def get_repo_group_insert_statement(rg_id): def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"): - return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) + return """INSERT INTO "collection_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) def get_user_group_insert_statement(user_id, group_name, group_id=None): if group_id: - return """INSERT INTO "augur_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) + return """INSERT INTO "collection_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) - return """INSERT INTO 
"augur_operations"."user_groups" ("user_id", "name") VALUES ({}, '{}');""".format(user_id, group_name) + return """INSERT INTO "collection_operations"."user_groups" ("user_id", "name") VALUES ({}, '{}');""".format(user_id, group_name) ######## Helper Functions to get retrieve data from tables ################# @@ -135,7 +135,7 @@ def get_repos(connection, where_string=None): def get_user_repos(connection): - return connection.execute(s.text("""SELECT * FROM "augur_operations"."user_repos";""")).fetchall() + return connection.execute(s.text("""SELECT * FROM "collection_operations"."user_repos";""")).fetchall() ######## Helper Functions to get repos in an org ################# diff --git a/tests/test_application/test_repo_load_controller/util.py b/tests/test_application/test_repo_load_controller/util.py index 1283e7580..305d9acf7 100644 --- a/tests/test_application/test_repo_load_controller/util.py +++ b/tests/test_application/test_repo_load_controller/util.py @@ -14,19 +14,19 @@ def get_repo_group_delete_statement(): def get_user_delete_statement(): - return get_delete_statement("augur_operations", "users") + return get_delete_statement("collection_operations", "users") def get_user_repo_delete_statement(): - return get_delete_statement("augur_operations", "user_repos") + return get_delete_statement("collection_operations", "user_repos") def get_user_group_delete_statement(): - return get_delete_statement("augur_operations", "user_groups") + return get_delete_statement("collection_operations", "user_groups") def get_config_delete_statement(): - return get_delete_statement("augur_operations", "config") + return get_delete_statement("collection_operations", "config") def get_repo_related_delete_statements(table_list): """Takes a list of tables related to the RepoLoadController class and generates a delete statement. 
@@ -86,14 +86,14 @@ def get_repo_group_insert_statement(rg_id): def get_user_insert_statement(user_id): - return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, 'bil', 'pass', 'b@gmil.com', 'bill', 'bob', false);""".format(user_id) + return """INSERT INTO "collection_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, 'bil', 'pass', 'b@gmil.com', 'bill', 'bob', false);""".format(user_id) def get_user_group_insert_statement(user_id, group_name, group_id=None): if group_id: - return """INSERT INTO "augur_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) + return """INSERT INTO "collection_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) - return """INSERT INTO "augur_operations"."user_groups" (user_id", "name") VALUES (1, 'default');""".format(user_id, group_name) + return """INSERT INTO "collection_operations"."user_groups" (user_id", "name") VALUES (1, 'default');""".format(user_id, group_name) ######## Helper Functions to get retrieve data from tables ################# @@ -117,7 +117,7 @@ def get_repos(connection, where_string=None): def get_user_repos(connection): - return connection.execute(s.text("""SELECT * FROM "augur_operations"."user_repos";""")).fetchall() + return connection.execute(s.text("""SELECT * FROM "collection_operations"."user_repos";""")).fetchall() ######## Helper Functions to get repos in an org ################# diff --git a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py index 98008a8a3..54849ebbc 100644 --- a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py +++ 
b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py @@ -33,7 +33,7 @@ def test_get_config_key(key_handler, test_db_engine): data = {"github_api_key": "asdfdfkey"} with test_db_engine.connect() as connection: - query = text("""INSERT INTO "augur_operations"."config" ("id", "section_name", "setting_name", "value", "type") VALUES (3, 'Keys', 'github_api_key', :github_api_key, 'str');""") + query = text("""INSERT INTO "collection_operations"."config" ("id", "section_name", "setting_name", "value", "type") VALUES (3, 'Keys', 'github_api_key', :github_api_key, 'str');""") connection.execute(query, **data) @@ -64,7 +64,7 @@ def test_get_api_keys_from_database(key_handler, test_db_engine): for value in data: - query = text("""INSERT INTO "augur_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") + query = text("""INSERT INTO "collection_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") connection.execute(query, **value) @@ -101,7 +101,7 @@ def test_get_api_keys(key_handler, test_db_engine): for value in data: - query = text("""INSERT INTO "augur_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") + query = text("""INSERT INTO "collection_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") connection.execute(query, **value) From 7218d74aeae54b1b4ec8394bb154fa644d7829de Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 
2026 16:01:05 -0400 Subject: [PATCH 072/165] update docs references to ops schema by name Signed-off-by: Adrian Edwards --- docs/source/getting-started/collecting-data.rst | 2 +- docs/source/getting-started/command-line-interface/db.rst | 4 ++-- docs/source/schema/overview.rst | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting-started/collecting-data.rst b/docs/source/getting-started/collecting-data.rst index 78b421f0f..5e7297ae3 100644 --- a/docs/source/getting-started/collecting-data.rst +++ b/docs/source/getting-started/collecting-data.rst @@ -60,7 +60,7 @@ There are many collection jobs that ship ready to collect out of the box: - ``collectoss.tasks.github.releases.tasks`` (collects release data from the GitHub API) - ``collectoss.tasks.data_analysis.insight_worker.tasks`` (queries CollectOSS's metrics API to find interesting anomalies in the collected data) -All worker configuration options are found in the config table generated when collectoss was installed. The config table is located in the augur_operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. +All worker configuration options are found in the config table generated when collectoss was installed. The config table is located in the collection_operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. 
Worker-specific configuration options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/getting-started/command-line-interface/db.rst b/docs/source/getting-started/command-line-interface/db.rst index 853cd2a66..9b8079f11 100644 --- a/docs/source/getting-started/command-line-interface/db.rst +++ b/docs/source/getting-started/command-line-interface/db.rst @@ -175,7 +175,7 @@ Example usage\: > ADD COLUMN "repo_archived_date_collected" timestamptz(0), > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; > ALTER TABLE - > update "augur_operations"."augur_settings" set value = 17 where setting = 'augur_data_version'; + > update "collection_operations"."augur_settings" set value = 17 where setting = 'augur_data_version'; > UPDATE 1 > CLI: [db.upgrade_db_version] [INFO] Upgrading from 17 to 18 > etc... @@ -193,4 +193,4 @@ Example usage\: $ uv run collectoss db create-schema .. note:: - If this runs successfully, you should see a bunch of schema creation commands fly by pretty fast. If everything worked you should see: ``update "augur_operations"."augur_settings" set value = xx where setting = 'augur_data_version';`` at the end. + If this runs successfully, you should see a bunch of schema creation commands fly by pretty fast. If everything worked you should see: ``update "collection_operations"."augur_settings" set value = xx where setting = 'augur_data_version';`` at the end. diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst index 1322cce8d..d065b64df 100644 --- a/docs/source/schema/overview.rst +++ b/docs/source/schema/overview.rst @@ -61,7 +61,7 @@ gathered from commits, issues, and other info. CollectOSS Operations ------------------------------------------------------- -The ``augur_operations`` tables are where most of the operations tables +The ``collection_operations`` tables are where most of the operations tables exist. 
There are a few, like ``settings`` that remain in ``augur_data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. They also From 4fb395cc18615451e6f132578ae52811fa33e429 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 16:03:17 -0400 Subject: [PATCH 073/165] update database comments (and include them in the schema rename migration) Signed-off-by: Adrian Edwards --- .../application/db/models/operations.py | 6 +-- .../alembic/versions/43_rename_schema.py | 39 +++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/collectoss/application/db/models/operations.py b/collectoss/application/db/models/operations.py index 417230a50..4a05ed802 100644 --- a/collectoss/application/db/models/operations.py +++ b/collectoss/application/db/models/operations.py @@ -120,7 +120,7 @@ class Settings(Base): ), Index("repos_id,statusops", "repos_id", "status"), schema="collection_operations", - comment="For future use when we move all working tables to the augur_operations schema. ", + comment="For future use when we move all working tables to the collection_operations schema. ", ) class WorkerHistory(Base): @@ -197,7 +197,7 @@ class WorkerSettingsFacade(Base): __tablename__ = "worker_settings_facade" __table_args__ = { "schema": "collection_operations", - "comment": "For future use when we move all working tables to the augur_operations schema. ", + "comment": "For future use when we move all working tables to the collection_operations schema. ", } id = Column(Integer, primary_key=True) @@ -216,7 +216,7 @@ class WorkerSettingsFacade(Base): "working_commit", String(40), server_default=text("'NULL'::character varying") ), schema="collection_operations", - comment="For future use when we move all working tables to the augur_operations schema. ", + comment="For future use when we move all working tables to the collection_operations schema. 
", ) class BadgingDEI(Base): diff --git a/collectoss/application/schema/alembic/versions/43_rename_schema.py b/collectoss/application/schema/alembic/versions/43_rename_schema.py index c5b1a9acd..b860cd6e4 100644 --- a/collectoss/application/schema/alembic/versions/43_rename_schema.py +++ b/collectoss/application/schema/alembic/versions/43_rename_schema.py @@ -22,9 +22,48 @@ def upgrade() -> None: conn.execute(text("ALTER SCHEMA augur_data RENAME TO collection_data;")) conn.execute(text("ALTER SCHEMA augur_operations RENAME TO collection_operations;")) + op.create_table_comment( + 'repos_fetch_log', + 'For future use when we move all working tables to the collection_operations schema. ', + existing_comment='For future use when we move all working tables to the augur_operations schema. ', + schema='collection_operations' + ) + op.create_table_comment( + 'worker_settings_facade', + 'For future use when we move all working tables to the collection_operations schema. ', + existing_comment='For future use when we move all working tables to the augur_operations schema. ', + schema='collection_operations' + ) + op.create_table_comment( + 'working_commits', + 'For future use when we move all working tables to the collection_operations schema. ', + existing_comment='For future use when we move all working tables to the augur_operations schema. ', + schema='collection_operations' + ) + def downgrade() -> None: + + op.create_table_comment( + 'working_commits', + 'For future use when we move all working tables to the augur_operations schema. ', + existing_comment='For future use when we move all working tables to the collection_operations schema. ', + schema='collection_operations' + ) + op.create_table_comment( + 'worker_settings_facade', + 'For future use when we move all working tables to the augur_operations schema. ', + existing_comment='For future use when we move all working tables to the collection_operations schema. 
', + schema='collection_operations' + ) + op.create_table_comment( + 'repos_fetch_log', + 'For future use when we move all working tables to the augur_operations schema. ', + existing_comment='For future use when we move all working tables to the collection_operations schema. ', + schema='collection_operations' + ) + conn = op.get_bind() conn.execute(text("ALTER SCHEMA collection_data RENAME TO augur_data;")) conn.execute(text("ALTER SCHEMA collection_operations RENAME TO augur_operations;")) \ No newline at end of file From 9a7a0730f6d8a8479597b0046eb2021ecd09b5f1 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 16:03:38 -0400 Subject: [PATCH 074/165] update another schema reference in tests Signed-off-by: Adrian Edwards --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 38f7e7055..61e84e2a4 100644 --- a/conftest.py +++ b/conftest.py @@ -195,7 +195,7 @@ def read_only_db(empty_db): database_name = empty_db.url.database test_username = "testuser" test_password = "testpass" - schemas = ["public", "collection_data", "augur_operations"] + schemas = ["public", "collection_data", "collection_operations"] # create read-only user empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) From 508aec2967c812b0e09b95c299320acdb9079747 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 16:16:58 -0400 Subject: [PATCH 075/165] forgot to change augur_data model imports Signed-off-by: Adrian Edwards --- .../tasks/git/util/facade_worker/facade_worker/repofetch.py | 2 +- collectoss/tasks/github/repo_info/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py index 968c4c54d..dfb331c1d 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ 
b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -32,7 +32,7 @@ import sqlalchemy as s from .utilitymethods import update_repo_log, get_absolute_repo_path from sqlalchemy.orm.exc import NoResultFound -from collectoss.application.db.models.augur_data import * +from collectoss.application.db.models.data import * from collectoss.application.db.models.operations import CollectionStatus from collectoss.application.db.util import execute_session_query, convert_orm_list_to_dict_list from collectoss.application.db.lib import execute_sql, get_repo_by_repo_git diff --git a/collectoss/tasks/github/repo_info/core.py b/collectoss/tasks/github/repo_info/core.py index 25b1b25d1..582a5ed45 100644 --- a/collectoss/tasks/github/repo_info/core.py +++ b/collectoss/tasks/github/repo_info/core.py @@ -9,7 +9,7 @@ from collectoss.application.db.models import * from collectoss.application.db.lib import execute_sql from collectoss.tasks.github.util.github_task_session import * -from collectoss.application.db.models.augur_data import RepoBadging +from collectoss.application.db.models.data import RepoBadging from urllib.parse import quote def query_committers_count(key_auth, logger, owner, repo): From 0169f416e251348acd2461b8905b03d4a3671d8c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 17:17:26 -0400 Subject: [PATCH 076/165] rename a schema reflection ref that was crashing gunicorn Signed-off-by: Adrian Edwards --- collectoss/util/repo_load_controller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/util/repo_load_controller.py b/collectoss/util/repo_load_controller.py index 2fde93bb0..5dbef272a 100644 --- a/collectoss/util/repo_load_controller.py +++ b/collectoss/util/repo_load_controller.py @@ -19,7 +19,7 @@ with DatabaseEngine() as engine: - augur_data_schema = MetaData(schema = "augur_data") + augur_data_schema = MetaData(schema = "collection_data") augur_data_schema.reflect(bind = engine, views = True) 
commits_materialized_view: Table = augur_data_schema.tables["collection_data.api_get_all_repos_commits"] From 0f06f1b0d142d9343b92bbd7bf0cd7f260dde403 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 17:17:55 -0400 Subject: [PATCH 077/165] rename an augur_data variable Signed-off-by: Adrian Edwards --- collectoss/util/repo_load_controller.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/collectoss/util/repo_load_controller.py b/collectoss/util/repo_load_controller.py index 5dbef272a..1f05c2bca 100644 --- a/collectoss/util/repo_load_controller.py +++ b/collectoss/util/repo_load_controller.py @@ -19,11 +19,11 @@ with DatabaseEngine() as engine: - augur_data_schema = MetaData(schema = "collection_data") - augur_data_schema.reflect(bind = engine, views = True) + data_schema = MetaData(schema = "collection_data") + data_schema.reflect(bind = engine, views = True) - commits_materialized_view: Table = augur_data_schema.tables["collection_data.api_get_all_repos_commits"] - issues_materialized_view: Table = augur_data_schema.tables["collection_data.api_get_all_repos_issues"] + commits_materialized_view: Table = data_schema.tables["collection_data.api_get_all_repos_commits"] + issues_materialized_view: Table = data_schema.tables["collection_data.api_get_all_repos_issues"] class RepoLoadController: From 359e0dee2719f812aa971d8082d5a657b99f78e5 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 17:19:39 -0400 Subject: [PATCH 078/165] change augur_data references in test files Signed-off-by: Adrian Edwards --- tests/test_application/test_db/test_session.py | 18 +++++++++--------- .../test_repo_load_controller/helper.py | 10 +++++----- .../test_repo_load_controller/util.py | 10 +++++----- .../test_github_tasks/test_pull_requests.py | 14 +++++++------- .../test_endpoints.py | 2 +- tests/test_workers/test_set_up_fixtures.py | 2 +- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git 
a/tests/test_application/test_db/test_session.py b/tests/test_application/test_db/test_session.py index 856a3f194..3c661136d 100644 --- a/tests/test_application/test_db/test_session.py +++ b/tests/test_application/test_db/test_session.py @@ -26,7 +26,7 @@ def test_execute_sql(test_db_engine): for data in all_data: - statement = s.sql.text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', 
'2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data) @@ -106,7 +106,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: - statement = s.sql.text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", 
"cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", 
"gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data_1) @@ -172,7 +172,7 @@ def test_insert_data_partial_update(test_db_engine): try: with test_db_engine.connect() as connection: - statement = s.sql.text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", 
"gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", 
"tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data_1) @@ -210,11 +210,11 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): # insert the cntrb_id and cntrb_login into the contributors table so the contributor is present. 
# This is so we don't get a foreign key error on the cntrb_id when we insert the prs query = s.sql.text(""" - DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; - INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); + DELETE FROM "collection_data"."repo"; + DELETE FROM "collection_data"."repo_groups"; + INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); - INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); + INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES 
(1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); """) connection.execute(query) @@ -243,8 +243,8 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): connection.execute(""" DELETE FROM collection_data.issues; - DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; + DELETE FROM "collection_data"."repo"; + DELETE FROM "collection_data"."repo_groups"; """) diff --git a/tests/test_application/test_repo_load_controller/helper.py b/tests/test_application/test_repo_load_controller/helper.py index 8138783f7..da23932ae 100644 --- a/tests/test_application/test_repo_load_controller/helper.py +++ b/tests/test_application/test_repo_load_controller/helper.py @@ -20,11 +20,11 @@ def get_delete_statement(schema, table): def get_repo_delete_statement(): - return get_delete_statement("augur_data", "repo") + return get_delete_statement("collection_data", "repo") def get_repo_group_delete_statement(): - return get_delete_statement("augur_data", "repo_groups") + return get_delete_statement("collection_data", "repo_groups") def get_user_delete_statement(): @@ -92,7 +92,7 @@ def add_keys_to_test_db(test_db_engine): def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): - return """INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) + return """INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", 
"repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) def get_user_repo_insert_statement(repo_id, group_id): @@ -100,7 +100,7 @@ def get_user_repo_insert_statement(repo_id, group_id): def get_repo_group_insert_statement(rg_id): - return """INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) + return """INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"): @@ -119,7 +119,7 @@ def get_user_group_insert_statement(user_id, group_name, group_id=None): def get_repos(connection, where_string=None): query_list = [] - query_list.append('SELECT * FROM "augur_data"."repo"') + query_list.append('SELECT * FROM "collection_data"."repo"') if where_string: if where_string.endswith(";"): diff --git 
a/tests/test_application/test_repo_load_controller/util.py b/tests/test_application/test_repo_load_controller/util.py index 305d9acf7..887dbf617 100644 --- a/tests/test_application/test_repo_load_controller/util.py +++ b/tests/test_application/test_repo_load_controller/util.py @@ -6,11 +6,11 @@ def get_delete_statement(schema, table): def get_repo_delete_statement(): - return get_delete_statement("augur_data", "repo") + return get_delete_statement("collection_data", "repo") def get_repo_group_delete_statement(): - return get_delete_statement("augur_data", "repo_groups") + return get_delete_statement("collection_data", "repo_groups") def get_user_delete_statement(): @@ -78,11 +78,11 @@ def add_keys_to_test_db(test_db_engine): def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): - return """INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) + return """INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) def get_repo_group_insert_statement(rg_id): - return """INSERT INTO 
"augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) + return """INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) def get_user_insert_statement(user_id): @@ -101,7 +101,7 @@ def get_user_group_insert_statement(user_id, group_name, group_id=None): def get_repos(connection, where_string=None): query_list = [] - query_list.append('SELECT * FROM "augur_data"."repo"') + query_list.append('SELECT * FROM "collection_data"."repo"') if where_string: if where_string.endswith(";"): diff --git a/tests/test_tasks/test_github_tasks/test_pull_requests.py b/tests/test_tasks/test_github_tasks/test_pull_requests.py index 83751ea22..0f70a64b0 100644 --- a/tests/test_tasks/test_github_tasks/test_pull_requests.py +++ b/tests/test_tasks/test_github_tasks/test_pull_requests.py @@ -312,13 +312,13 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): # insert the cntrb_id and cntrb_login into the contributors table so the contributor is present. 
# This is so we don't get a foreign key error on the cntrb_id when we insert the prs - query = text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, :gh_login, 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46', :cntrb_id); + query = text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", 
"cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, :gh_login, 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46', :cntrb_id); - DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; - INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 
15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); + DELETE FROM "collection_data"."repo"; + DELETE FROM "collection_data"."repo_groups"; + INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); - INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); + INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); """) connection.execute(query, **contributor) @@ -354,8 +354,8 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: connection.execute(f"DELETE FROM collection_data.pull_requests;") - connection.execute("""DELETE FROM "augur_data"."repo"; - DELETE FROM 
"augur_data"."repo_groups"; + connection.execute("""DELETE FROM "collection_data"."repo"; + DELETE FROM "collection_data"."repo_groups"; """) connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") diff --git a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py index 14bd8bfe5..43b1a9524 100644 --- a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py +++ b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py @@ -13,7 +13,7 @@ def set_up_repo_groups(database_connection): repo_group_IDs = df['repo_group_id'].values.tolist() insert_repo_group_sql = s.sql.text(""" - INSERT INTO "augur_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "collection_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """) with open("tests/test_workers/test_facade/test_facade_contributor_interface/test_repo_groups.csv") as create_repo_groups_file: diff --git a/tests/test_workers/test_set_up_fixtures.py b/tests/test_workers/test_set_up_fixtures.py index fa4496cd8..82787ca97 100644 --- a/tests/test_workers/test_set_up_fixtures.py +++ b/tests/test_workers/test_set_up_fixtures.py @@ -12,7 +12,7 @@ def poll_database_connection(database_string): print("Attempting to create db 
engine") db = s.create_engine(database_string, poolclass=s.pool.NullPool, - connect_args={'options': '-csearch_path={}'.format('augur_data')}) + connect_args={'options': '-csearch_path={}'.format('collection_data')}) return db From 477b16a9e43b36c0f23a505239b69aeabf4c2266 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 17:19:47 -0400 Subject: [PATCH 079/165] change augur_data references in docs files Signed-off-by: Adrian Edwards --- docs/source/getting-started/command-line-interface/db.rst | 4 ++-- docs/source/schema/overview.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting-started/command-line-interface/db.rst b/docs/source/getting-started/command-line-interface/db.rst index 9b8079f11..6401658d1 100644 --- a/docs/source/getting-started/command-line-interface/db.rst +++ b/docs/source/getting-started/command-line-interface/db.rst @@ -167,10 +167,10 @@ Example usage\: > [INFO] Config file loaded successfully > CLI: [db.check_pgpass_credentials] [INFO] Credentials found in $HOME/.pgpass > CLI: [db.upgrade_db_version] [INFO] Upgrading from 16 to 17 - > ALTER TABLE "augur_data"."repo" + > ALTER TABLE "collection_data"."repo" > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; > ALTER TABLE - > ALTER TABLE "augur_data"."repo" + > ALTER TABLE "collection_data"."repo" > ADD COLUMN "repo_archived" int4, > ADD COLUMN "repo_archived_date_collected" timestamptz(0), > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst index d065b64df..d82d83b70 100644 --- a/docs/source/schema/overview.rst +++ b/docs/source/schema/overview.rst @@ -35,7 +35,7 @@ Schema Overview CollectOSS Data ------------------------------------------------------- -The ``augur_data`` schema contains *most* of the information analyzed +The ``collection_data`` schema contains *most* of the information analyzed and constructed by 
CollectOSS. The origin’s of the data inside of collectoss are from data collection tasks and populate this schema.: @@ -63,7 +63,7 @@ CollectOSS Operations The ``collection_operations`` tables are where most of the operations tables exist. There are a few, like ``settings`` that remain in -``augur_data`` for now, but will be moved. They keep records related to +``collection_data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. They also store information including API keys. From b5e8d7012745d8890abeba42675aaf52d074b543 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 17:19:53 -0400 Subject: [PATCH 080/165] change augur_data references in the CLI Signed-off-by: Adrian Edwards --- collectoss/application/cli/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index 10e830735..d2514eb65 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -186,7 +186,7 @@ def add_repo_groups(ctx: click.Context, filename: str) -> None: insert_repo_group_sql = s.sql.text( """ - INSERT INTO "augur_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "collectoss_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """ ) From 8140509c80701b62d86c8738058a49b4afaf4411 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 
29 May 2026 11:51:12 -0400 Subject: [PATCH 081/165] fix typo (its collection_data schema) Signed-off-by: Adrian Edwards --- collectoss/application/cli/db.py | 2 +- docs/source/quick-start.rst | 2 +- docs/source/schema/regularly_used_data.rst | 2 +- tests/test_helpers.py | 2 +- tests/test_workers/test_set_up_fixtures.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index d2514eb65..bff4f859f 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -186,7 +186,7 @@ def add_repo_groups(ctx: click.Context, filename: str) -> None: insert_repo_group_sql = s.sql.text( """ - INSERT INTO "collectoss_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "collection_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """ ) diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst index a5466272f..b30101902 100644 --- a/docs/source/quick-start.rst +++ b/docs/source/quick-start.rst @@ -139,7 +139,7 @@ http://servername-or-ip:port_number Note: CollectOSS will run on port 5000 by default (you probably need to -change that in collectoss_operations.config for OSX) +change that in collection_operations.config for OSX) Stopping your CollectOSS Instance --------------------------------- diff --git a/docs/source/schema/regularly_used_data.rst b/docs/source/schema/regularly_used_data.rst index 
14cdcb1f8..979c204c0 100644 --- a/docs/source/schema/regularly_used_data.rst +++ b/docs/source/schema/regularly_used_data.rst @@ -347,7 +347,7 @@ Repo_meta Repo_sbom_scans --------------- - This table links the collectoss_data schema to the collectoss_spdx schema to keep a list of repositories that need licenses scanned. (These are for file level license declarations, which are common in Linux Foundation projects, but otherwise not in wide use). + This table links the collection_data schema to the collectoss_spdx schema to keep a list of repositories that need licenses scanned. (These are for file level license declarations, which are common in Linux Foundation projects, but otherwise not in wide use). .. image:: images/repo_sbom_scans.png :width: 200 diff --git a/tests/test_helpers.py b/tests/test_helpers.py index dd850a23f..8ba765018 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -14,7 +14,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,collectoss_data,collectoss_operations,spdx") + cursor.execute("SET SESSION search_path=public,collection_data,collection_operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit diff --git a/tests/test_workers/test_set_up_fixtures.py b/tests/test_workers/test_set_up_fixtures.py index 82787ca97..584c16745 100644 --- a/tests/test_workers/test_set_up_fixtures.py +++ b/tests/test_workers/test_set_up_fixtures.py @@ -153,7 +153,7 @@ def initialize_database_connections(self): "augur", "augur", "172.17.0.1", 5400, "test" ) - self.db_schema = 'collectoss_data' + self.db_schema = 'collection_data' self.helper_schema = 'collection_operations' self.helper_db = s.create_engine(DB_STR, poolclass=s.pool.NullPool, From f6aa44a8b5e72d47e96031b270477e2b785b2b62 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 
2026 12:10:59 -0400 Subject: [PATCH 082/165] hardcode schema in migrations Signed-off-by: Adrian Edwards --- .../alembic/versions/31_update_pr_events_unique.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py b/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py index f6aeeca20..d903ae8fd 100644 --- a/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py +++ b/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py @@ -20,7 +20,7 @@ # conn = op.get_bind() # conn.execute(text(""" - # UPDATE pull_request_events + # UPDATE augur_data.pull_request_events # SET issue_event_src_id = substring(node_url FROM '.*/([0-9]+)$')::BIGINT; # """)) @@ -32,7 +32,7 @@ def upgrade(): with engine.connect() as conn: - result = conn.execute(text("SELECT COUNT(*) FROM pull_request_events WHERE issue_event_src_id=pr_platform_event_id")) + result = conn.execute(text("SELECT COUNT(*) FROM augur_data.pull_request_events WHERE issue_event_src_id=pr_platform_event_id")) total_rows = result.scalar() if total_rows != 0: print(f"Rows needing updated: {total_rows}") @@ -43,14 +43,14 @@ def upgrade(): result = conn.execute(text(""" WITH cte AS ( SELECT pr_event_id - FROM pull_request_events + FROM augur_data.pull_request_events WHERE issue_event_src_id=pr_platform_event_id LIMIT 250000 ) - UPDATE pull_request_events + UPDATE augur_data.pull_request_events SET issue_event_src_id = substring(node_url FROM '.*/([0-9]+)$')::BIGINT FROM cte - WHERE pull_request_events.pr_event_id = cte.pr_event_id + WHERE augur_data.pull_request_events.pr_event_id = cte.pr_event_id RETURNING 1; """)) @@ -77,7 +77,7 @@ def downgrade(): print("Please run in background. 
This downgrade will take a very *very* long time") conn = op.get_bind() conn.execute(text(""" - UPDATE pull_request_events + UPDATE augur_data.pull_request_events SET issue_event_src_id = pr_platform_event_id WHERE issue_event_src_id <> pr_platform_event_id; """)) \ No newline at end of file From 04753962d8d8d735dc21bdce164c37898d603c28 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 4 Jun 2026 10:34:19 -0400 Subject: [PATCH 083/165] rename collection_data schema to data Signed-off-by: Adrian Edwards --- collectoss/api/metrics/commit.py | 2 +- collectoss/api/metrics/deps.py | 32 +- collectoss/api/metrics/message.py | 20 +- collectoss/api/metrics/pull_request.py | 6 +- collectoss/api/metrics/repo_meta.py | 12 +- collectoss/api/metrics/toss.py | 2 +- collectoss/api/routes/collection_status.py | 18 +- collectoss/api/routes/complexity.py | 144 ++--- collectoss/api/routes/metadata.py | 2 +- collectoss/application/cli/backend.py | 2 +- collectoss/application/cli/collection.py | 2 +- collectoss/application/cli/db.py | 6 +- collectoss/application/db/engine.py | 2 +- collectoss/application/db/models/data.py | 606 +++++++++--------- .../application/db/models/operations.py | 8 +- .../data_analysis/clustering_worker/tasks.py | 26 +- .../data_analysis/discourse_analysis/tasks.py | 8 +- .../data_analysis/message_insights/tasks.py | 54 +- .../pull_request_analysis_worker/tasks.py | 22 +- .../tasks/db/refresh_materialized_views.py | 28 +- .../tasks/github/facade_github/tasks.py | 12 +- collectoss/tasks/util/collection_util.py | 2 +- collectoss/util/repo_load_controller.py | 6 +- conftest.py | 2 +- .../command-line-interface/db.rst | 4 +- docs/source/schema/overview.rst | 4 +- docs/source/schema/regularly_used_data.rst | 2 +- .../test_application/test_db/test_session.py | 42 +- .../test_repo_load_controller/helper.py | 10 +- .../test_repo_load_controller/util.py | 10 +- tests/test_helpers.py | 2 +- .../test_github_tasks/test_pull_requests.py | 24 +- .../test_endpoints.py | 
8 +- tests/test_workers/test_set_up_fixtures.py | 4 +- 34 files changed, 567 insertions(+), 567 deletions(-) diff --git a/collectoss/api/metrics/commit.py b/collectoss/api/metrics/commit.py index 3b55a1520..8b4227f71 100644 --- a/collectoss/api/metrics/commit.py +++ b/collectoss/api/metrics/commit.py @@ -231,7 +231,7 @@ def annual_commit_count_ranked_by_repo_in_repo_group(repo_group_id, repo_id=None if timeframe == 'all': cdRgTpRankedCommitsSQL = s.sql.text(""" SELECT repo.repo_id, repo_name as name, SUM(added - removed - whitespace) as net, patches - FROM collection_data.dm_repo_annual, repo, repo_groups + FROM data.dm_repo_annual, repo, repo_groups WHERE repo.repo_group_id = :repo_group_id AND repo.repo_group_id = repo_groups.repo_group_id AND dm_repo_annual.repo_id = repo.repo_id diff --git a/collectoss/api/metrics/deps.py b/collectoss/api/metrics/deps.py index 61f34092f..5f1162fd3 100644 --- a/collectoss/api/metrics/deps.py +++ b/collectoss/api/metrics/deps.py @@ -33,13 +33,13 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No depsSQL = s.sql.text(""" SELECT - collection_data.repo_dependencies.*, - collection_data.repo_groups.repo_group_id + data.repo_dependencies.*, + data.repo_groups.repo_group_id FROM - collection_data.repo_dependencies, - collection_data.repo_groups, - collection_data.repo, - ( SELECT MAX ( date_trunc( 'day', collection_data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies WHERE repo_id = repo_id ) C + data.repo_dependencies, + data.repo_groups, + data.repo, + ( SELECT MAX ( date_trunc( 'day', data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies WHERE repo_id = repo_id ) C WHERE repo_dependencies.repo_id = repo.repo_id AND repo.repo_group_id = repo_groups.repo_group_id @@ -54,13 +54,13 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No depsSQL = s.sql.text(""" SELECT - 
collection_data.repo_dependencies.*, - collection_data.repo_groups.repo_group_id + data.repo_dependencies.*, + data.repo_groups.repo_group_id FROM - collection_data.repo_dependencies, - collection_data.repo_groups, - collection_data.repo, - ( SELECT MAX ( date_trunc( 'day', collection_data.repo_dependencies.data_collection_date ) ) AS data_collection_date + data.repo_dependencies, + data.repo_groups, + data.repo, + ( SELECT MAX ( date_trunc( 'day', data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies, repo, repo_groups WHERE repo.repo_group_id = repo_groups.repo_group_id and repo_dependencies.repo_id = repo.repo_id and @@ -134,8 +134,8 @@ def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date f.libyear, f.data_collection_date FROM - ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM collection_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, - collection_data.repo_deps_libyear f + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, + data.repo_deps_libyear f WHERE e.data_collection_date = f.data_collection_date and e.repo_id = f.repo_id @@ -203,8 +203,8 @@ def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date f.libyear, f.data_collection_date FROM - ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM collection_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, - collection_data.repo_deps_libyear f + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, + data.repo_deps_libyear f WHERE e.data_collection_date = f.data_collection_date and e.repo_id = f.repo_id diff --git a/collectoss/api/metrics/message.py b/collectoss/api/metrics/message.py index 
26ce99cf3..15256a30c 100644 --- a/collectoss/api/metrics/message.py +++ b/collectoss/api/metrics/message.py @@ -40,12 +40,12 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en COUNT ( * ), repo_name FROM - collection_data.repo, - collection_data.message + data.repo, + data.message WHERE - collection_data.repo.repo_id = collection_data.message.repo_id + data.repo.repo_id = data.message.repo_id AND - collection_data.repo.repo_id = :repo_id + data.repo.repo_id = :repo_id AND message.msg_timestamp BETWEEN :begin_date AND :end_date GROUP BY @@ -69,14 +69,14 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en COUNT ( * ), rg_name FROM - collection_data.repo, - collection_data.repo_groups, - collection_data.message + data.repo, + data.repo_groups, + data.message WHERE - collection_data.repo.repo_id = collection_data.message.repo_id - AND collection_data.repo_groups.repo_group_id = repo.repo_group_id + data.repo.repo_id = data.message.repo_id + AND data.repo_groups.repo_group_id = repo.repo_group_id AND - collection_data.repo_groups.repo_group_id = :repo_group_id + data.repo_groups.repo_group_id = :repo_group_id AND message.msg_timestamp BETWEEN :begin_date AND :end_date GROUP BY diff --git a/collectoss/api/metrics/pull_request.py b/collectoss/api/metrics/pull_request.py index 8516ec999..7f98ccee6 100644 --- a/collectoss/api/metrics/pull_request.py +++ b/collectoss/api/metrics/pull_request.py @@ -787,8 +787,8 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo pr_merged_at, pr_closed_at, pr_created_at - FROM collection_data.pull_request_commits, collection_data.pull_request_meta,collection_data.repo_groups, - collection_data.pull_requests JOIN repo ON pull_requests.repo_id = repo.repo_id + FROM data.pull_request_commits, data.pull_request_meta,data.repo_groups, + data.pull_requests JOIN repo ON pull_requests.repo_id = repo.repo_id WHERE pull_requests.repo_id IN (SELECT repo_id 
FROM repo WHERE repo_group_id = :repo_group_id) AND pull_requests.pull_request_id = pull_request_commits.pull_request_id @@ -821,7 +821,7 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo pr_merged_at, pr_closed_at, pr_created_at - FROM collection_data.pull_request_commits, collection_data.pull_requests, collection_data.pull_request_meta + FROM data.pull_request_commits, data.pull_requests, data.pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id AND pull_requests.repo_id = :repo_id diff --git a/collectoss/api/metrics/repo_meta.py b/collectoss/api/metrics/repo_meta.py index a609066ab..c9c5f8905 100644 --- a/collectoss/api/metrics/repo_meta.py +++ b/collectoss/api/metrics/repo_meta.py @@ -190,7 +190,7 @@ def sbom_download(repo_group_id, repo_id=None): :return: dosocs sbom """ dosocs_SQL = s.sql.text(""" - select * from collection_data.repo_sbom_scans + select * from data.repo_sbom_scans where repo_id = :repo_id; """) @@ -313,7 +313,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): if not repo_id: cii_best_practices_badge_SQL = s.sql.text(""" SELECT data - FROM collection_data.repo_badging + FROM data.repo_badging WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) ORDER BY created_at DESC LIMIT 1 @@ -321,7 +321,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): else: cii_best_practices_badge_SQL = s.sql.text(""" SELECT data - FROM collection_data.repo_badging + FROM data.repo_badging WHERE repo_id = :repo_id ORDER BY created_at DESC LIMIT 1 @@ -1270,7 +1270,7 @@ def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): clone_data_timestamp AS date, count_clones AS total_clones, unique_clones - FROM collection_data.repo_clones_data + FROM data.repo_clones_data WHERE repo_id = :repo_id AND clone_data_timestamp BETWEEN :begin_date AND :end_date ORDER BY 
clone_data_timestamp @@ -1289,9 +1289,9 @@ def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): clone_data_timestamp AS date, count_clones AS total_clones, unique_clones - FROM collection_data.repo_clones_data + FROM data.repo_clones_data WHERE repo_id IN ( - SELECT repo_id FROM collection_data.repo WHERE repo_group_id = :repo_group_id + SELECT repo_id FROM data.repo WHERE repo_group_id = :repo_group_id ) AND clone_data_timestamp BETWEEN :begin_date AND :end_date ORDER BY repo_id, clone_data_timestamp diff --git a/collectoss/api/metrics/toss.py b/collectoss/api/metrics/toss.py index 69597da66..620b79935 100644 --- a/collectoss/api/metrics/toss.py +++ b/collectoss/api/metrics/toss.py @@ -114,7 +114,7 @@ def toss_repo_info(repo_id): repo_info.default_branch, repo.repo_git FROM - collection_data.repo_info + data.repo_info JOIN repo ON repo.repo_id = repo_info.repo_id WHERE repo_info.repo_id = :repo_id diff --git a/collectoss/api/routes/collection_status.py b/collectoss/api/routes/collection_status.py index ba8373440..2a5e42675 100644 --- a/collectoss/api/routes/collection_status.py +++ b/collectoss/api/routes/collection_status.py @@ -61,10 +61,10 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? 
( CAST (( COUNT ( * )) +1 AS DOUBLE PRECISION ) / CAST ( b.issues_count + 1 AS DOUBLE PRECISION )) AS ratio_issues FROM - collection_data.repo A, - collection_data.issues d, - collection_data.repo_info b, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM collection_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, + data.repo A, + data.issues d, + data.repo_info b, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, ( SELECT repo_id, MAX ( data_collection_date ) AS most_recently_collected_issue FROM issues GROUP BY repo_id ORDER BY repo_id ) f WHERE A.repo_id = b.repo_id @@ -135,11 +135,11 @@ def pull_request_collection_status(): # TODO: make this name automatic - wrappe ABS ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_abs, ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_issues FROM - collection_data.repo A, - collection_data.pull_requests d, - collection_data.repo_info b, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM collection_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM collection_data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f + data.repo A, + data.pull_requests d, + data.repo_info b, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f WHERE A.repo_id = b.repo_id AND LOWER ( A.repo_git ) LIKE'%github.com%' diff --git a/collectoss/api/routes/complexity.py b/collectoss/api/routes/complexity.py index 1b1a2c6a5..8cfe799de 100644 --- a/collectoss/api/routes/complexity.py +++ 
b/collectoss/api/routes/complexity.py @@ -17,13 +17,13 @@ def get_project_languages(): project_languages_sql = s.sql.text(""" SELECT e.repo_id, - collection_data.repo.repo_git, - collection_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.programming_language, e.code_lines, e.files FROM - collection_data.repo, + data.repo, (SELECT d.repo_id, d.programming_language, @@ -31,22 +31,22 @@ def get_project_languages(): COUNT(*)::int AS files FROM (SELECT - collection_data.repo_labor.repo_id, - collection_data.repo_labor.programming_language, - collection_data.repo_labor.code_lines + data.repo_labor.repo_id, + data.repo_labor.programming_language, + data.repo_labor.code_lines FROM - collection_data.repo_labor, + data.repo_labor, ( SELECT - collection_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - collection_data.repo_labor - GROUP BY collection_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - collection_data.repo_labor.repo_id = recent.repo_id - AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id, d.programming_language) e - WHERE collection_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) @@ -62,30 +62,30 @@ def get_project_files(): project_files_sql = s.sql.text(""" SELECT e.repo_id, - collection_data.repo.repo_git, - collection_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.files FROM - collection_data.repo, + data.repo, (SELECT d.repo_id, count(*) AS files FROM (SELECT - collection_data.repo_labor.repo_id + data.repo_labor.repo_id FROM - collection_data.repo_labor, + data.repo_labor, ( SELECT - collection_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( 
data_collection_date ) AS last_collected FROM - collection_data.repo_labor - GROUP BY collection_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - collection_data.repo_labor.repo_id = recent.repo_id - AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE collection_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) @@ -103,33 +103,33 @@ def get_project_lines(): project_lines_sql = s.sql.text(""" SELECT e.repo_id, - collection_data.repo.repo_git, - collection_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.total_lines, e.average_lines FROM - collection_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.total_lines) AS total_lines, AVG(d.total_lines)::INT AS average_lines FROM (SELECT - collection_data.repo_labor.repo_id, - collection_data.repo_labor.total_lines + data.repo_labor.repo_id, + data.repo_labor.total_lines FROM - collection_data.repo_labor, + data.repo_labor, ( SELECT - collection_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - collection_data.repo_labor - GROUP BY collection_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - collection_data.repo_labor.repo_id = recent.repo_id - AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE collection_data.repo.repo_id = e.repo_id and collection_data.repo.repo_id = :repo_id_param + WHERE data.repo.repo_id = e.repo_id and data.repo.repo_id = :repo_id_param ORDER 
BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -147,33 +147,33 @@ def get_project_comment_lines(): comment_lines_sql = s.sql.text(""" SELECT e.repo_id, - collection_data.repo.repo_git, - collection_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.comment_lines, e.avg_comment_lines FROM - collection_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.comment_lines) AS comment_lines, AVG(d.comment_lines)::INT AS avg_comment_lines FROM (SELECT - collection_data.repo_labor.repo_id, - collection_data.repo_labor.comment_lines + data.repo_labor.repo_id, + data.repo_labor.comment_lines FROM - collection_data.repo_labor, + data.repo_labor, ( SELECT - collection_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - collection_data.repo_labor - GROUP BY collection_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - collection_data.repo_labor.repo_id = recent.repo_id - AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE collection_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id AND e.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -192,33 +192,33 @@ def get_project_blank_lines(): blank_lines_sql = s.sql.text(""" SELECT e.repo_id, - collection_data.repo.repo_git, - collection_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.blank_lines, e.avg_blank_lines FROM - collection_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.blank_lines) AS blank_lines, AVG(d.blank_lines)::int AS avg_blank_lines FROM (SELECT - collection_data.repo_labor.repo_id, - collection_data.repo_labor.blank_lines + data.repo_labor.repo_id, + data.repo_labor.blank_lines FROM - 
collection_data.repo_labor, + data.repo_labor, ( SELECT - collection_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - collection_data.repo_labor - GROUP BY collection_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - collection_data.repo_labor.repo_id = recent.repo_id - AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE collection_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id AND e.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -236,33 +236,33 @@ def get_project_file_complexity(): project_file_complexity_sql = s.sql.text(""" SELECT e.repo_id, - collection_data.repo.repo_git, - collection_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.sum_code_complexity, e.average_code_complexity FROM - collection_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.code_complexity) AS sum_code_complexity, AVG(d.code_complexity)::int AS average_code_complexity FROM (SELECT - collection_data.repo_labor.repo_id, - collection_data.repo_labor.code_complexity + data.repo_labor.repo_id, + data.repo_labor.code_complexity FROM - collection_data.repo_labor, + data.repo_labor, ( SELECT - collection_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - collection_data.repo_labor - GROUP BY collection_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - collection_data.repo_labor.repo_id = recent.repo_id - AND collection_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND 
data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE collection_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) diff --git a/collectoss/api/routes/metadata.py b/collectoss/api/routes/metadata.py index 7b09cfabe..bbdc94f62 100644 --- a/collectoss/api/routes/metadata.py +++ b/collectoss/api/routes/metadata.py @@ -31,7 +31,7 @@ def get_repo_info(): FROM repo_info, repo, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM collection_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM data.repo_info GROUP BY repo_id ORDER BY repo_id ) e WHERE repo_info.repo_id = repo.repo_id AND e.repo_id = repo_info.repo_id diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 378a8a0fa..00eb2d44e 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -386,7 +386,7 @@ def repo_reset(backend_app): UPDATE collection_operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; - TRUNCATE collection_data.commits CASCADE; + TRUNCATE data.commits CASCADE; """)) logger.info("Repos successfully reset") diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index 471606d62..59ede7b2f 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -211,7 +211,7 @@ def repo_reset(ctx): UPDATE collection_operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; - TRUNCATE collection_data.commits CASCADE; + TRUNCATE data.commits CASCADE; """)) logger.info("Repos successfully reset") diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index bff4f859f..223f0db6c 100644 --- 
a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -140,7 +140,7 @@ def get_repo_groups(ctx: click.Context) -> pd.DataFrame: with ctx.obj.engine.connect() as connection: df = pd.read_sql( s.sql.text( - "SELECT repo_group_id, rg_name, rg_description FROM collection_data.repo_groups" + "SELECT repo_group_id, rg_name, rg_description FROM data.repo_groups" ), connection, ) @@ -179,14 +179,14 @@ def add_repo_groups(ctx: click.Context, filename: str) -> None: with ctx.obj.engine.begin() as connection: # Get existing repo group IDs df = pd.read_sql( - s.sql.text("SELECT repo_group_id FROM collection_data.repo_groups"), + s.sql.text("SELECT repo_group_id FROM data.repo_groups"), connection, ) repo_group_IDs = df["repo_group_id"].values.tolist() insert_repo_group_sql = s.sql.text( """ - INSERT INTO "collection_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """ ) diff --git a/collectoss/application/db/engine.py b/collectoss/application/db/engine.py index e2ba3902e..090bc10f4 100644 --- a/collectoss/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -105,7 +105,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION 
search_path=public,collection_data,collection_operations,spdx") + cursor.execute("SET SESSION search_path=public,data,collection_operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit diff --git a/collectoss/application/db/models/data.py b/collectoss/application/db/models/data.py index 4b8d7e5a9..2fa40b71a 100644 --- a/collectoss/application/db/models/data.py +++ b/collectoss/application/db/models/data.py @@ -54,7 +54,7 @@ nullable=False, server_default=text("CURRENT_TIMESTAMP"), ), - schema="collection_data", + schema="data", ) Index('repos_id', t_analysis_log.c.repos_id) @@ -63,16 +63,16 @@ class ChaossMetricStatus(Base): __tablename__ = "chaoss_metric_status" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": "This table used to track CHAOSS Metric implementations, but due to the constantly changing location of that information, it is for the moment not actively populated. ", } cms_id = Column( BigInteger, - Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='collection_data'), + Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.chaoss_metric_status_cms_id_seq'::regclass)" + "nextval('data.chaoss_metric_status_cms_id_seq'::regclass)" ), ) cm_group = Column(String) @@ -97,14 +97,14 @@ class ChaossMetricStatus(Base): class ChaossUser(Base): __tablename__ = "chaoss_user" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} chaoss_id = Column( BigInteger, - Sequence('chaoss_user_chaoss_id_seq', start=1, schema='collection_data'), + Sequence('chaoss_user_chaoss_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.chaoss_user_chaoss_id_seq'::regclass)" + "nextval('data.chaoss_user_chaoss_id_seq'::regclass)" ), ) chaoss_login_name = Column(String) @@ -122,16 +122,16 @@ class ChaossUser(Base): class ContributorAffiliation(Base): 
__tablename__ = "contributor_affiliations" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": "This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and collectoss augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when CollectOSS is finished counting commits using the Facade Worker. ", } ca_id = Column( BigInteger, - Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='collection_data'), + Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.contributor_affiliations_ca_id_seq'::regclass)" + "nextval('data.contributor_affiliations_ca_id_seq'::regclass)" ), ) ca_domain = Column(String(64), nullable=False, unique=True) @@ -178,7 +178,7 @@ class Contributor(Base): Index("login-contributor-idx", "cntrb_login"), { - "schema": "collection_data", + "schema": "data", "comment": "For GitHub, this should be repeated from gh_login. for other systems, it should be that systems login. \nGithub now allows a user to change their login name, but their user id remains the same in this case. So, the natural key is the combination of id and login, but there should never be repeated logins. 
", }, ) @@ -337,7 +337,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): ), Index("repo_id,email_copy_1", "repo_id", "email"), Index("repo_id,affiliation_copy_1", "repo_id", "affiliation"), - schema="collection_data", + schema="data", ) @@ -363,7 +363,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): ), Index("projects_id,email_copy_1", "repo_group_id", "email"), Index("projects_id,affiliation_copy_1", "repo_group_id", "affiliation"), - schema="collection_data", + schema="data", ) @@ -394,7 +394,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index( "projects_id,year,affiliation_copy_1", "repo_group_id", "year", "affiliation" ), - schema="collection_data", + schema="data", ) @@ -423,7 +423,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("projects_id,email", "repo_group_id", "email"), Index("projects_id,year,email", "repo_group_id", "year", "email"), Index("projects_id,year,affiliation", "repo_group_id", "year", "affiliation"), - schema="collection_data", + schema="data", ) @@ -452,7 +452,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("repo_id,year,affiliation_copy_1", "repo_id", "year", "affiliation"), Index("repo_id,affiliation_copy_2", "repo_id", "affiliation"), Index("repo_id,email_copy_2", "repo_id", "email"), - schema="collection_data", + schema="data", ) @@ -481,13 +481,13 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("repo_id,email", "repo_id", "email"), Index("repo_id,year,email", "repo_id", "year", "email"), Index("repo_id,year,affiliation", "repo_id", "year", "affiliation"), - schema="collection_data", + schema="data", ) class Exclude(Base): __tablename__ = "exclude" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} id = Column(Integer, primary_key=True) projects_id = Column(Integer, nullable=False) @@ -497,14 
+497,14 @@ class Exclude(Base): class LstmAnomalyModel(Base): __tablename__ = "lstm_anomaly_models" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} model_id = Column( BigInteger, - Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='collection_data'), + Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.lstm_anomaly_models_model_id_seq'::regclass)" + "nextval('data.lstm_anomaly_models_model_id_seq'::regclass)" ), ) model_name = Column(String) @@ -525,14 +525,14 @@ class Platform(Base): __tablename__ = "platform" __table_args__ = ( Index("plat", "pltfrm_id", unique=True), - {"schema": "collection_data"} + {"schema": "data"} ) pltfrm_id = Column( BigInteger, - Sequence('platform_pltfrm_id_seq', start=25430, schema="collection_data"), + Sequence('platform_pltfrm_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.platform_pltfrm_id_seq'::regclass)"), + server_default=text("nextval('data.platform_pltfrm_id_seq'::regclass)"), ) pltfrm_name = Column(String) pltfrm_version = Column(String) @@ -548,16 +548,16 @@ class RepoGroup(Base): __table_args__ = ( Index("rgidm", "repo_group_id", unique=True), Index("rgnameindex", "rg_name"), - {"schema": "collection_data", + {"schema": "data", "comment": "rg_type is intended to be either a GitHub Organization or a User Created Repo Group. 
"}, ) repo_group_id = Column( BigInteger, - Sequence('repo_groups_repo_group_id_seq', start=25430, schema='collection_data'), + Sequence('repo_groups_repo_group_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_groups_repo_group_id_seq'::regclass)" + "nextval('data.repo_groups_repo_group_id_seq'::regclass)" ), ) rg_name = Column(String, nullable=False) @@ -622,13 +622,13 @@ def get_by_name(session, rg_name): ), Index("repos_id,status", "repos_id", "status"), Index("repos_id,statusops", "repos_id", "status"), - schema="collection_data", + schema="data", ) class Settings(Base): __tablename__ = "settings" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} id = Column(Integer, primary_key=True) setting = Column(String(32), nullable=False) @@ -640,14 +640,14 @@ class Settings(Base): class TopicWord(Base): __tablename__ = "topic_words" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} topic_words_id = Column( BigInteger, - Sequence('topic_words_topic_words_id_seq', start=1, schema='collection_data'), + Sequence('topic_words_topic_words_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.topic_words_topic_words_id_seq'::regclass)" + "nextval('data.topic_words_topic_words_id_seq'::regclass)" ), ) topic_id = Column(BigInteger) @@ -678,20 +678,20 @@ class TopicWord(Base): server_default=text("CURRENT_TIMESTAMP"), ), Index("type,projects_id", "type", "repo_group_id"), - schema="collection_data", + schema="data", ) class UnresolvedCommitEmail(Base): __tablename__ = "unresolved_commit_emails" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} email_unresolved_id = Column( BigInteger, - Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='collection_data'), + Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, 
schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" + "nextval('data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" ), ) email = Column(String, nullable=False, unique=True) @@ -706,13 +706,13 @@ class UnresolvedCommitEmail(Base): class UtilityLog(Base): __tablename__ = "utility_log" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} id = Column( BigInteger, - Sequence('utility_log_id_seq1', start=1, schema="collection_data"), + Sequence('utility_log_id_seq1', start=1, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.utility_log_id_seq1'::regclass)"), + server_default=text("nextval('data.utility_log_id_seq1'::regclass)"), ) level = Column(String(8), nullable=False) status = Column(String, nullable=False) @@ -728,7 +728,7 @@ class UtilityLog(Base): Column( "working_commit", String(40), server_default=text("'NULL'::character varying") ), - schema="collection_data", + schema="data", ) @@ -737,22 +737,22 @@ class ContributorRepo(Base): __table_args__ = ( UniqueConstraint("event_id", "tool_version"), { - "schema": "collection_data", + "schema": "data", "comment": 'Developed in Partnership with Andrew Brain.', }, ) cntrb_repo_id = Column( BigInteger, - Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='collection_data'), + Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.contributor_repo_cntrb_repo_id_seq'::regclass)" + "nextval('data.contributor_repo_cntrb_repo_id_seq'::regclass)" ), ) cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, comment="This is not null because what is the point without the contributor in this table? 
", @@ -782,22 +782,22 @@ class ContributorsAlias(Base): __table_args__ = ( UniqueConstraint("cntrb_id","alias_email", name="cntrb-email-insert-unique"), { - "schema": "collection_data", + "schema": "data", "comment": "Every open source user may have more than one email used to make contributions over time. CollectOSS selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ", }, ) cntrb_alias_id = Column( BigInteger, - Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='collection_data'), + Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.contributors_aliases_cntrb_alias_id_seq'::regclass)" + "nextval('data.contributors_aliases_cntrb_alias_id_seq'::regclass)" ), ) cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -838,19 +838,19 @@ class Repo(Base): Index("therepo", "repo_id", unique=True), { - "schema": "collection_data", + "schema": "data", "comment": "This table is a combination of the columns in Facade’s repo table and GHTorrent’s projects table. 
", }, ) repo_id = Column( BigInteger, - Sequence('repo_repo_id_seq', start=25480, schema='collection_data'), + Sequence('repo_repo_id_seq', start=25480, schema='data'), primary_key=True, - server_default=text("nextval('collection_data.repo_repo_id_seq'::regclass)"), + server_default=text("nextval('data.repo_repo_id_seq'::regclass)"), ) repo_group_id = Column( - ForeignKey("collection_data.repo_groups.repo_group_id"), nullable=False + ForeignKey("data.repo_groups.repo_group_id"), nullable=False ) repo_git = Column(String, nullable=False) @@ -1192,22 +1192,22 @@ class HistoricalRepoURLs(Base): """ __tablename__ = "historical_repo_urls" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} - repo_id = Column(ForeignKey("collection_data.repo.repo_id"), primary_key=True) + repo_id = Column(ForeignKey("data.repo.repo_id"), primary_key=True) git_url = Column(String, primary_key=True) date_collected = Column(DateTime(timezone=True), server_default=func.now(), nullable=True) class RepoTestCoverage(Base): __tablename__ = "repo_test_coverage" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} repo_id = Column( - ForeignKey("collection_data.repo.repo_id"), - Sequence('repo_test_coverage_repo_id_seq', start=1, schema='collection_data'), + ForeignKey("data.repo.repo_id"), + Sequence('repo_test_coverage_repo_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_test_coverage_repo_id_seq'::regclass)" + "nextval('data.repo_test_coverage_repo_id_seq'::regclass)" ), ) repo_clone_date = Column(TIMESTAMP(precision=0)) @@ -1231,19 +1231,19 @@ class RepoTestCoverage(Base): class RepoGroupInsight(Base): __tablename__ = "repo_group_insights" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": 'This table is output from an analytical worker. 
It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', } rgi_id = Column( BigInteger, - Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='collection_data'), + Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_group_insights_rgi_id_seq'::regclass)" + "nextval('data.repo_group_insights_rgi_id_seq'::regclass)" ), ) - repo_group_id = Column(ForeignKey("collection_data.repo_groups.repo_group_id")) + repo_group_id = Column(ForeignKey("data.repo_groups.repo_group_id")) rgi_metric = Column(String) rgi_value = Column(String) cms_id = Column(BigInteger) @@ -1266,19 +1266,19 @@ class RepoGroupsListServe(Base): __table_args__ = ( UniqueConstraint("rgls_id", "repo_group_id"), Index("lister", "rgls_id", "repo_group_id", unique=True), - {"schema": "collection_data"}, + {"schema": "data"}, ) rgls_id = Column( BigInteger, - Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='collection_data'), + Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_groups_list_serve_rgls_id_seq'::regclass)" + "nextval('data.repo_groups_list_serve_rgls_id_seq'::regclass)" ), ) repo_group_id = Column( - ForeignKey("collection_data.repo_groups.repo_group_id"), nullable=False + ForeignKey("data.repo_groups.repo_group_id"), nullable=False ) rgls_name = Column(String) rgls_description = Column(String(3000)) @@ -1319,19 +1319,19 @@ class Commit(Base): Index("repo_id,commit", "repo_id", "cmt_commit_hash"), { - "schema": "collection_data", + "schema": "data", "comment": 
"Commits.\nEach row represents changes to one FILE within a single commit. So you will encounter multiple rows per commit hash in many cases. ", }, ) cmt_id = Column( BigInteger, - Sequence('commits_cmt_id_seq', start=25430, schema="collection_data"), + Sequence('commits_cmt_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.commits_cmt_id_seq'::regclass)"), + server_default=text("nextval('data.commits_cmt_id_seq'::regclass)"), ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=False, ) cmt_commit_hash = Column(String(80), nullable=False) @@ -1355,7 +1355,7 @@ class Commit(Base): cmt_filename = Column(String, nullable=False) cmt_date_attempted = Column(TIMESTAMP(precision=0), nullable=False) cmt_ght_author_id = Column(ForeignKey( - "collection_data.contributors.cntrb_id", + "data.contributors.cntrb_id", name="cmt_ght_author_cntrb_id_fk", onupdate="CASCADE", ondelete="RESTRICT", @@ -1368,7 +1368,7 @@ class Commit(Base): cmt_author_timestamp = Column(TIMESTAMP(True, 0)) cmt_author_platform_username = Column( ForeignKey( - "collection_data.contributors.cntrb_login", + "data.contributors.cntrb_login", name="fk_commits_contributors_3", ondelete="CASCADE", onupdate="CASCADE", @@ -1376,7 +1376,7 @@ class Commit(Base): deferrable=True, ), ForeignKey( - "collection_data.contributors.cntrb_login", + "data.contributors.cntrb_login", name="fk_commits_contributors_4", ondelete="CASCADE", onupdate="CASCADE", @@ -1404,20 +1404,20 @@ class CommitMessage(Base): __table_args__ = ( UniqueConstraint("repo_id","cmt_hash", name="commit-message-insert-unique"), { - "schema": "collection_data", + "schema": "data", "comment": "This table holds commit messages", } ) cmt_msg_id = Column( BigInteger, - Sequence('commits_cmt_id_seq', start=25430, schema="collection_data"), + 
Sequence('commits_cmt_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.commits_cmt_id_seq'::regclass)"), + server_default=text("nextval('data.commits_cmt_id_seq'::regclass)"), ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=False, ) @@ -1442,20 +1442,20 @@ class Issue(Base): UniqueConstraint("repo_id", "gh_issue_id"), UniqueConstraint("issue_url", name="issue-insert-unique"), - {"schema": "collection_data"}, + {"schema": "data"}, ) issue_id = Column( BigInteger, - Sequence('issue_seq', start=31000, schema="collection_data"), + Sequence('issue_seq', start=31000, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.issue_seq'::regclass)"), + server_default=text("nextval('data.issue_seq'::regclass)"), ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), ) reporter_id = Column( - ForeignKey("collection_data.contributors.cntrb_id"), + ForeignKey("data.contributors.cntrb_id"), comment="The ID of the person who opened the issue. ", ) pull_request = Column(BigInteger) @@ -1464,7 +1464,7 @@ class Issue(Base): issue_title = Column(String) issue_body = Column(String) cntrb_id = Column( - ForeignKey("collection_data.contributors.cntrb_id"), + ForeignKey("data.contributors.cntrb_id"), comment="The ID of the person who closed the issue. 
", ) comment_count = Column(BigInteger) @@ -1509,15 +1509,15 @@ class Issue(Base): class Library(Base): __tablename__ = "libraries" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} library_id = Column( BigInteger, - Sequence('libraries_library_id_seq', start=25430, schema="collection_data"), + Sequence('libraries_library_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.libraries_library_id_seq'::regclass)"), + server_default=text("nextval('data.libraries_library_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) platform = Column(String) name = Column(String) created_timestamp = Column( @@ -1551,19 +1551,19 @@ class Library(Base): class LstmAnomalyResult(Base): __tablename__ = "lstm_anomaly_results" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} result_id = Column( BigInteger, - Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='collection_data'), + Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.lstm_anomaly_results_result_id_seq'::regclass)" + "nextval('data.lstm_anomaly_results_result_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) repo_category = Column(String) - model_id = Column(ForeignKey("collection_data.lstm_anomaly_models.model_id")) + model_id = Column(ForeignKey("data.lstm_anomaly_models.model_id")) metric = Column(String) contamination_factor = Column(Float(53)) mean_absolute_error = Column(Float(53)) @@ -1592,18 +1592,18 @@ class Message(Base): Index("msg-cntrb-id-idx", "cntrb_id"), Index("platformgrouper", "msg_id", "pltfrm_id"), Index("messagegrouper", "msg_id", "rgls_id", unique=True), - {"schema": "collection_data"}, + {"schema": "data"}, 
) msg_id = Column( BigInteger, - Sequence('message_msg_id_seq', start=25430, schema="collection_data"), + Sequence('message_msg_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.message_msg_id_seq'::regclass)"), + server_default=text("nextval('data.message_msg_id_seq'::regclass)"), ) rgls_id = Column( ForeignKey( - "collection_data.repo_groups_list_serve.rgls_id", + "data.repo_groups_list_serve.rgls_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -1612,7 +1612,7 @@ class Message(Base): platform_node_id = Column(String) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -1621,7 +1621,7 @@ class Message(Base): ) cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ), comment="Not populated for mailing lists. Populated for GitHub issues. ", ) @@ -1631,7 +1631,7 @@ class Message(Base): msg_header = Column(String) pltfrm_id = Column( ForeignKey( - "collection_data.platform.pltfrm_id", ondelete="CASCADE", onupdate="CASCADE" + "data.platform.pltfrm_id", ondelete="CASCADE", onupdate="CASCADE" ), nullable=False, ) @@ -1661,19 +1661,19 @@ class Message(Base): class MessageAnalysisSummary(Base): __tablename__ = "message_analysis_summary" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": "In a relationally perfect world, we would have a table called “message_analysis_run” the incremented the “worker_run_id” for both message_analysis and message_analysis_summary. For now, we decided this was overkill. 
", } msg_summary_id = Column( BigInteger, - Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='collection_data'), + Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.message_analysis_summary_msg_summary_id_seq'::regclass)" + "nextval('data.message_analysis_summary_msg_summary_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) worker_run_id = Column( BigInteger, comment='This value should reflect the worker_run_id for the messages summarized in the table. There is not a relation between these two tables for that purpose because its not *really*, relationaly a concept unless we create a third table for "worker_run_id", which we determined was unnecessarily complex. ', @@ -1701,19 +1701,19 @@ class MessageAnalysisSummary(Base): class MessageSentimentSummary(Base): __tablename__ = "message_sentiment_summary" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": "In a relationally perfect world, we would have a table called “message_sentiment_run” the incremented the “worker_run_id” for both message_sentiment and message_sentiment_summary. For now, we decided this was overkill. ", } msg_summary_id = Column( BigInteger, - Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='collection_data'), + Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.message_sentiment_summary_msg_summary_id_seq'::regclass)" + "nextval('data.message_sentiment_summary_msg_summary_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) worker_run_id = Column( BigInteger, comment='This value should reflect the worker_run_id for the messages summarized in the table. 
There is not a relation between these two tables for that purpose because its not *really*, relationaly a concept unless we create a third table for "worker_run_id", which we determined was unnecessarily complex. ', @@ -1749,19 +1749,19 @@ class PullRequest(Base): "pull_requests_idx_repo_id_data_datex", "repo_id", "data_collection_date" ), Index("pr_ID_prs_table", "pull_request_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) pull_request_id = Column( BigInteger, - Sequence('pull_requests_pull_request_id_seq', start=25430, schema='collection_data'), + Sequence('pull_requests_pull_request_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_requests_pull_request_id_seq'::regclass)" + "nextval('data.pull_requests_pull_request_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), server_default=text("0"), ) pr_url = Column(String) @@ -1784,7 +1784,7 @@ class PullRequest(Base): pr_src_title = Column(String) pr_augur_contributor_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), comment="This is to link to the contributor record. 
", ) @@ -1883,15 +1883,15 @@ def from_github(cls, pr, repo_id, tool_source, tool_version): class Release(Base): __tablename__ = "releases" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} release_id = Column( CHAR(256), - Sequence('releases_release_id_seq', start=1, schema="collection_data"), + Sequence('releases_release_id_seq', start=1, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.releases_release_id_seq'::regclass)"), + server_default=text("nextval('data.releases_release_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id"), nullable=False) + repo_id = Column(ForeignKey("data.repo.repo_id"), nullable=False) release_name = Column(String) release_description = Column(String) release_author = Column(String) @@ -1916,19 +1916,19 @@ class Release(Base): class RepoBadging(Base): __tablename__ = "repo_badging" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": "This will be collected from the LF’s Badging API\nhttps://bestpractices.coreinfrastructure.org/projects.json?pq=https%3A%2F%2Fgithub.com%2Fchaoss%2Faugur\n", } badge_collection_id = Column( BigInteger, - Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='collection_data'), + Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_badging_badge_collection_id_seq'::regclass)" + "nextval('data.repo_badging_badge_collection_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) created_at = Column( TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) @@ -1960,17 +1960,17 @@ def insert(session, repo_id: int, data: dict) -> dict: class RepoClusterMessage(Base): __tablename__ = "repo_cluster_messages" - __table_args__ = {"schema": "collection_data"} + __table_args__ = 
{"schema": "data"} msg_cluster_id = Column( BigInteger, - Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='collection_data'), + Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" + "nextval('data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) cluster_content = Column(Integer) cluster_mechanism = Column(Integer) tool_source = Column(String) @@ -1988,21 +1988,21 @@ class RepoDependency(Base): __table_args__ = ( UniqueConstraint("repo_id","dep_name","data_collection_date", name="deps-insert-unique"), { - "schema": "collection_data", + "schema": "data", "comment": "Contains the dependencies for a repo." }, ) repo_dependencies_id = Column( BigInteger, - Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='collection_data'), + Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_dependencies_repo_dependencies_id_seq'::regclass)" + "nextval('data.repo_dependencies_repo_dependencies_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id"), comment="Forign key for repo id. " + ForeignKey("data.repo.repo_id"), comment="Forign key for repo id. " ) dep_name = Column(String, comment="Name of the dependancy found in project. ") dep_count = Column(Integer, comment="Number of times the dependancy was found. 
") @@ -2021,18 +2021,18 @@ class RepoDepsLibyear(Base): __tablename__ = "repo_deps_libyear" __table_args__ = ( UniqueConstraint("repo_id","name", "data_collection_date", name="deps-libyear-insert-unique"), - {"schema": "collection_data"} + {"schema": "data"} ) repo_deps_libyear_id = Column( BigInteger, - Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='collection_data'), + Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" + "nextval('data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) name = Column(String) requirement = Column(String) type = Column(String) @@ -2056,18 +2056,18 @@ class RepoDepsScorecard(Base): __tablename__ = "repo_deps_scorecard" __table_args__ = ( UniqueConstraint("repo_id","name", "data_collection_date", name="deps_scorecard_new_unique"), - {"schema": "collection_data"} + {"schema": "data"} ) repo_deps_scorecard_id = Column( BigInteger, - Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='collection_data'), + Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" + "nextval('data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) name = Column(String) #status = Column(String) scorecard_check_details = Column(JSONB) @@ -2087,18 +2087,18 @@ class RepoInfo(Base): __table_args__ = ( Index("repo_info_idx_repo_id_data_date_1x", "repo_id", "data_collection_date"), Index("repo_info_idx_repo_id_data_datex", "repo_id", "data_collection_date"), - 
{"schema": "collection_data"}, + {"schema": "data"}, ) repo_info_id = Column( BigInteger, - Sequence('repo_info_repo_info_id_seq', start=25430, schema='collection_data'), + Sequence('repo_info_repo_info_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_info_repo_info_id_seq'::regclass)" + "nextval('data.repo_info_repo_info_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id"), nullable=False) + repo_id = Column(ForeignKey("data.repo.repo_id"), nullable=False) last_updated = Column( TIMESTAMP(precision=0), server_default=text("NULL::timestamp without time zone") ) @@ -2143,17 +2143,17 @@ class RepoInfo(Base): class RepoInsight(Base): __tablename__ = "repo_insights" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": 'This table is output from an analytical worker. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. 
', } ri_id = Column( BigInteger, - Sequence('repo_insights_ri_id_seq', start=25430, schema="collection_data"), + Sequence('repo_insights_ri_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('collection_data.repo_insights_ri_id_seq'::regclass)"), + server_default=text("nextval('data.repo_insights_ri_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) ri_metric = Column(String) ri_value = Column(String) ri_date = Column(TIMESTAMP(precision=0)) @@ -2178,20 +2178,20 @@ class RepoInsightsRecord(Base): __tablename__ = "repo_insights_records" __table_args__ = ( Index("dater", "ri_date"), - {"schema": "collection_data"} + {"schema": "data"} ) ri_id = Column( BigInteger, - Sequence('repo_insights_records_ri_id_seq', start=1, schema='collection_data'), + Sequence('repo_insights_records_ri_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_insights_records_ri_id_seq'::regclass)" + "nextval('data.repo_insights_records_ri_id_seq'::regclass)" ), comment="Primary key. ", ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="SET NULL", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="SET NULL", onupdate="CASCADE"), comment="Refers to repo table primary key. Will have a foreign key", ) ri_metric = Column(String, comment="The metric endpoint") @@ -2223,20 +2223,20 @@ class RepoLabor(Base): __table_args__ = ( UniqueConstraint("repo_id", "rl_analysis_date", "file_path", "file_name"), { - "schema": "collection_data", + "schema": "data", "comment": "repo_labor is a derivative of tables used to store scc code and complexity counting statistics that are inputs to labor analysis, which are components of CHAOSS value metric calculations. 
", }, ) repo_labor_id = Column( BigInteger, - Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='collection_data'), + Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_labor_repo_labor_id_seq'::regclass)" + "nextval('data.repo_labor_repo_labor_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) repo_clone_date = Column(TIMESTAMP(precision=0)) rl_analysis_date = Column(TIMESTAMP(precision=0)) programming_language = Column(String) @@ -2261,17 +2261,17 @@ class RepoLabor(Base): class RepoMeta(Base): __tablename__ = "repo_meta" - __table_args__ = {"schema": "collection_data", "comment": "Project Languages"} + __table_args__ = {"schema": "data", "comment": "Project Languages"} repo_id = Column( - ForeignKey("collection_data.repo.repo_id"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id"), primary_key=True, nullable=False ) rmeta_id = Column( BigInteger, - Sequence('repo_meta_rmeta_id_seq', start=25430, schema="collection_data"), + Sequence('repo_meta_rmeta_id_seq', start=25430, schema="data"), primary_key=True, nullable=False, - server_default=text("nextval('collection_data.repo_meta_rmeta_id_seq'::regclass)"), + server_default=text("nextval('data.repo_meta_rmeta_id_seq'::regclass)"), ) rmeta_name = Column(String) rmeta_value = Column(String, server_default=text("0")) @@ -2285,18 +2285,18 @@ class RepoMeta(Base): class RepoSbomScan(Base): __tablename__ = "repo_sbom_scans" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} rsb_id = Column( BigInteger, - Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='collection_data'), + Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_sbom_scans_rsb_id_seq'::regclass)" + 
"nextval('data.repo_sbom_scans_rsb_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE") ) sbom_scan = Column(JSON) @@ -2305,17 +2305,17 @@ class RepoSbomScan(Base): class RepoStat(Base): __tablename__ = "repo_stats" - __table_args__ = {"schema": "collection_data", "comment": "Project Watchers"} + __table_args__ = {"schema": "data", "comment": "Project Watchers"} repo_id = Column( - ForeignKey("collection_data.repo.repo_id"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id"), primary_key=True, nullable=False ) rstat_id = Column( BigInteger, - Sequence('repo_stats_rstat_id_seq', start=25430, schema="collection_data"), + Sequence('repo_stats_rstat_id_seq', start=25430, schema="data"), primary_key=True, nullable=False, - server_default=text("nextval('collection_data.repo_stats_rstat_id_seq'::regclass)"), + server_default=text("nextval('data.repo_stats_rstat_id_seq'::regclass)"), ) rstat_name = Column(String(400)) rstat_value = Column(BigInteger) @@ -2329,17 +2329,17 @@ class RepoStat(Base): class RepoTopic(Base): __tablename__ = "repo_topic" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} repo_topic_id = Column( BigInteger, - Sequence('repo_topic_repo_topic_id_seq', start=1, schema='collection_data'), + Sequence('repo_topic_repo_topic_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_topic_repo_topic_id_seq'::regclass)" + "nextval('data.repo_topic_repo_topic_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) topic_id = Column(Integer) topic_prob = Column(Float(53)) tool_source = Column(String) @@ -2356,27 +2356,27 @@ class CommitCommentRef(Base): __tablename__ = "commit_comment_ref" __table_args__ = ( Index("comment_id", 
"cmt_comment_src_id", "cmt_comment_id", "msg_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) cmt_comment_id = Column( BigInteger, - Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='collection_data'), + Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.commit_comment_ref_cmt_comment_id_seq'::regclass)" + "nextval('data.commit_comment_ref_cmt_comment_id_seq'::regclass)" ), ) cmt_id = Column( ForeignKey( - "collection_data.commits.cmt_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.commits.cmt_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) repo_id = Column(BigInteger) msg_id = Column( ForeignKey( - "collection_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -2413,21 +2413,21 @@ class CommitParent(Base): __table_args__ = ( Index("commit_parents_ibfk_1", "cmt_id"), Index("commit_parents_ibfk_2", "parent_id"), - {"schema": "collection_data"} + {"schema": "data"} ) cmt_id = Column( - ForeignKey("collection_data.commits.cmt_id"), + ForeignKey("data.commits.cmt_id"), primary_key=True, nullable=False, ) parent_id = Column( - ForeignKey("collection_data.commits.cmt_id"), - Sequence('commit_parents_parent_id_seq', start=25430, schema='collection_data'), + ForeignKey("data.commits.cmt_id"), + Sequence('commit_parents_parent_id_seq', start=25430, schema='data'), primary_key=True, nullable=False, server_default=text( - "nextval('collection_data.commit_parents_parent_id_seq'::regclass)" + "nextval('data.commit_parents_parent_id_seq'::regclass)" ), ) tool_source = Column(String) @@ -2446,19 +2446,19 @@ class CommitParent(Base): class DiscourseInsight(Base): __tablename__ = "discourse_insights" __table_args__ = { - "schema": "collection_data", + "schema": "data", "comment": "This table is populated by the 
“Discourse_Analysis_Worker”. It examines sequential discourse, using computational linguistic methods, to draw statistical inferences regarding the discourse in a particular comment thread. ", } msg_discourse_id = Column( BigInteger, - Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='collection_data'), + Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.discourse_insights_msg_discourse_id_seq1'::regclass)" + "nextval('data.discourse_insights_msg_discourse_id_seq1'::regclass)" ), ) - msg_id = Column(ForeignKey("collection_data.message.msg_id")) + msg_id = Column(ForeignKey("data.message.msg_id")) discourse_act = Column(String) tool_source = Column(String) tool_version = Column(String) @@ -2475,22 +2475,22 @@ class IssueAssignee(Base): __table_args__ = ( Index("issue-cntrb-assign-idx-1", "cntrb_id"), UniqueConstraint("issue_assignee_src_id", "issue_id", name="issue-assignee-insert-unique"), - {"schema": "collection_data"} + {"schema": "data"} ) issue_assignee_id = Column( BigInteger, - Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='collection_data'), + Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.issue_assignees_issue_assignee_id_seq'::regclass)" + "nextval('data.issue_assignees_issue_assignee_id_seq'::regclass)" ), ) - issue_id = Column(ForeignKey("collection_data.issues.issue_id")) + issue_id = Column(ForeignKey("data.issues.issue_id")) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) - cntrb_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) + cntrb_id = Column(ForeignKey("data.contributors.cntrb_id")) issue_assignee_src_id = Column( BigInteger, comment="This ID comes from the source. 
In the case of GitHub, it is the id that is the first field returned from the issue events API in the issue_assignees embedded JSON object. We may discover it is an ID for the person themselves; but my hypothesis is that its not.", @@ -2535,29 +2535,29 @@ class IssueEvent(Base): Index("issue_events_ibfk_1", "issue_id"), Index("issue_events_ibfk_2", "cntrb_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) event_id = Column( BigInteger, - Sequence('issue_events_event_id_seq', start=25430, schema='collection_data'), + Sequence('issue_events_event_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.issue_events_event_id_seq'::regclass)" + "nextval('data.issue_events_event_id_seq'::regclass)" ), ) issue_id = Column( ForeignKey( - "collection_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE" + "data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE" ), nullable=False, ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ) ) action = Column(String, nullable=False) @@ -2572,7 +2572,7 @@ class IssueEvent(Base): node_url = Column(String) platform_id = Column( ForeignKey( - "collection_data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -2620,22 +2620,22 @@ class IssueLabel(Base): __tablename__ = "issue_labels" __table_args__ = ( UniqueConstraint("label_src_id", "issue_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) issue_label_id = Column( BigInteger, - Sequence('issue_labels_issue_label_id_seq', start=25430, schema='collection_data'), + 
Sequence('issue_labels_issue_label_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.issue_labels_issue_label_id_seq'::regclass)" + "nextval('data.issue_labels_issue_label_id_seq'::regclass)" ), ) issue_id = Column( - ForeignKey("collection_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE") + ForeignKey("data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE") ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) label_text = Column(String) label_description = Column(String) @@ -2677,20 +2677,20 @@ class IssueMessageRef(Base): __tablename__ = "issue_message_ref" __table_args__ = ( UniqueConstraint("issue_msg_ref_src_comment_id", "issue_id", name="issue-message-ref-insert-unique"), - {"schema": "collection_data"}, + {"schema": "data"}, ) issue_msg_ref_id = Column( BigInteger, - Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='collection_data'), + Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" + "nextval('data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" ), ) issue_id = Column( ForeignKey( - "collection_data.issues.issue_id", + "data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -2699,7 +2699,7 @@ class IssueMessageRef(Base): ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -2708,7 +2708,7 @@ class IssueMessageRef(Base): ) msg_id = Column( ForeignKey( - "collection_data.message.msg_id", + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -2739,18 +2739,18 @@ class LibraryDependency(Base): __tablename__ = "library_dependencies" 
__table_args__ = ( Index("REPO_DEP", "library_id"), - {"schema": "collection_data"} + {"schema": "data"} ) lib_dependency_id = Column( BigInteger, - Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='collection_data'), + Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.library_dependencies_lib_dependency_id_seq'::regclass)" + "nextval('data.library_dependencies_lib_dependency_id_seq'::regclass)" ), ) - library_id = Column(ForeignKey("collection_data.libraries.library_id")) + library_id = Column(ForeignKey("data.libraries.library_id")) manifest_platform = Column(String) manifest_filepath = Column( String(1000), server_default=text("NULL::character varying") @@ -2767,17 +2767,17 @@ class LibraryDependency(Base): class LibraryVersion(Base): __tablename__ = "library_version" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} library_version_id = Column( BigInteger, - Sequence('library_version_library_version_id_seq', start=25430, schema='collection_data'), + Sequence('library_version_library_version_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.library_version_library_version_id_seq'::regclass)" + "nextval('data.library_version_library_version_id_seq'::regclass)" ), ) - library_id = Column(ForeignKey("collection_data.libraries.library_id")) + library_id = Column(ForeignKey("data.libraries.library_id")) library_platform = Column(String) version_number = Column(String) version_release_date = Column( @@ -2793,17 +2793,17 @@ class LibraryVersion(Base): class MessageAnalysis(Base): __tablename__ = "message_analysis" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} msg_analysis_id = Column( BigInteger, - Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='collection_data'), + 
Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.message_analysis_msg_analysis_id_seq'::regclass)" + "nextval('data.message_analysis_msg_analysis_id_seq'::regclass)" ), ) - msg_id = Column(ForeignKey("collection_data.message.msg_id")) + msg_id = Column(ForeignKey("data.message.msg_id")) worker_run_id = Column( BigInteger, comment="This column is used to indicate analyses run by a worker during the same execution period, and is useful for grouping, and time series analysis. ", @@ -2836,17 +2836,17 @@ class MessageAnalysis(Base): class MessageSentiment(Base): __tablename__ = "message_sentiment" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} msg_analysis_id = Column( BigInteger, - Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='collection_data'), + Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.message_sentiment_msg_analysis_id_seq'::regclass)" + "nextval('data.message_sentiment_msg_analysis_id_seq'::regclass)" ), ) - msg_id = Column(ForeignKey("collection_data.message.msg_id")) + msg_id = Column(ForeignKey("data.message.msg_id")) worker_run_id = Column( BigInteger, comment="This column is used to indicate analyses run by a worker during the same execution period, and is useful for grouping, and time series analysis. 
", @@ -2881,15 +2881,15 @@ class PullRequestAnalysis(Base): pull_request_analysis_id = Column( BigInteger, - Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='collection_data'), + Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" + "nextval('data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), @@ -2915,7 +2915,7 @@ class PullRequestAnalysis(Base): __table_args__ = ( Index("pr_anal_idx", pull_request_id), Index("probability_idx", merge_probability.desc().nullslast()), - {"schema": "collection_data"} + {"schema": "data"} ) pull_request = relationship("PullRequest") @@ -2926,34 +2926,34 @@ class PullRequestAssignee(Base): __table_args__ = ( Index("pr_meta_cntrb-idx", "contrib_id"), UniqueConstraint("pull_request_id", "pr_assignee_src_id", name="assigniees-unique"), - {"schema": "collection_data"} + {"schema": "data"} ) pr_assignee_map_id = Column( BigInteger, - Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" + "nextval('data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, initially="DEFERRED", ) 
) - contrib_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) + contrib_id = Column(ForeignKey("data.contributors.cntrb_id")) pr_assignee_src_id = Column(BigInteger) tool_source = Column(String) tool_version = Column(String) @@ -2987,28 +2987,28 @@ class PullRequestCommit(Base): __table_args__ = ( UniqueConstraint("pull_request_id", "repo_id", "pr_cmt_sha"), { - "schema": "collection_data", + "schema": "data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. 
", }, ) pr_cmt_id = Column( BigInteger, - Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='collection_data'), + Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_commits_pr_cmt_id_seq'::regclass)" + "nextval('data.pull_request_commits_pr_cmt_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) pr_cmt_sha = Column( String, @@ -3019,7 +3019,7 @@ class PullRequestCommit(Base): pr_cmt_comments_url = Column(String) pr_cmt_author_cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ) ) pr_cmt_timestamp = Column(TIMESTAMP(precision=0)) @@ -3044,20 +3044,20 @@ class PullRequestEvent(Base): UniqueConstraint("repo_id", "issue_event_src_id", name="pr_events_repo_id_event_src_id_unique"), UniqueConstraint("platform_id", "node_id", name="unique-pr-event-id"), UniqueConstraint("node_id", name="pr-unqiue-event"), - {"schema": "collection_data"}, + {"schema": "data"}, ) pr_event_id = Column( BigInteger, - Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_events_pr_event_id_seq'::regclass)" + "nextval('data.pull_request_events_pr_event_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", 
), @@ -3065,7 +3065,7 @@ class PullRequestEvent(Base): ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="RESTRICT", deferrable=True, @@ -3073,7 +3073,7 @@ class PullRequestEvent(Base): ) ) cntrb_id = Column( - ForeignKey("collection_data.contributors.cntrb_id") + ForeignKey("data.contributors.cntrb_id") ) action = Column(String, nullable=False) action_commit_hash = Column(String) @@ -3091,7 +3091,7 @@ class PullRequestEvent(Base): node_url = Column(String) platform_id = Column( ForeignKey( - "collection_data.platform.pltfrm_id", + "data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="RESTRICT", deferrable=True, @@ -3142,29 +3142,29 @@ class PullRequestFile(Base): Index("pr_id_pr_files","pull_request_id"), UniqueConstraint("pull_request_id", "repo_id", "pr_file_path", name="prfiles_unique"), { - "schema": "collection_data", + "schema": "data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. 
", }, ) pr_file_id = Column( BigInteger, - Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='collection_data'), + Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_files_pr_file_id_seq'::regclass)" + "nextval('data.pull_request_files_pr_file_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3196,26 +3196,26 @@ class PullRequestLabel(Base): __tablename__ = "pull_request_labels" __table_args__ = ( UniqueConstraint("pr_src_id", "pull_request_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) pr_label_id = Column( BigInteger, - Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_labels_pr_label_id_seq'::regclass)" + "nextval('data.pull_request_labels_pr_label_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) pr_src_id = Column(BigInteger) pr_src_node_id = Column(String) @@ -3258,20 +3258,20 @@ class PullRequestMessageRef(Base): __tablename__ = "pull_request_message_ref" __table_args__ = ( UniqueConstraint("pr_message_ref_src_comment_id", "pull_request_id", name="pull-request-message-ref-insert-unique"), - {"schema": 
"collection_data"}, + {"schema": "data"}, ) pr_msg_ref_id = Column( BigInteger, - Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" + "nextval('data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -3279,11 +3279,11 @@ class PullRequestMessageRef(Base): ) ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) msg_id = Column( ForeignKey( - "collection_data.message.msg_id", + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3310,28 +3310,28 @@ class PullRequestMeta(Base): __table_args__ = ( Index("pr_meta-cntrbid-idx", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_head_or_base", 'pr_sha', name="pull-request-meta-insert-unique"), - {"schema": "collection_data", + {"schema": "data", "comment": 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. 
Similar functions exist in GitLab, though the language here is based on GitHub.'}, ) pr_repo_meta_id = Column( BigInteger, - Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" + "nextval('data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3348,7 +3348,7 @@ class PullRequestMeta(Base): ) pr_src_meta_ref = Column(String) pr_sha = Column(String) - cntrb_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) + cntrb_id = Column(ForeignKey("data.contributors.cntrb_id")) tool_source = Column(String) tool_version = Column(String) data_source = Column(String) @@ -3384,20 +3384,20 @@ class PullRequestReviewer(Base): __table_args__ = ( Index("pr-reviewers-cntrb-idx1", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_reviewer_src_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) pr_reviewer_map_id = Column( BigInteger, - Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" + "nextval('data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3409,7 
+3409,7 @@ class PullRequestReviewer(Base): repo_id = Column(BigInteger) cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ), ) pr_reviewer_src_id = Column( @@ -3446,31 +3446,31 @@ class PullRequestReview(Base): __table_args__ = ( UniqueConstraint("pr_review_src_id", name="pr_review_unique"), Index("pr_id_pr_reviews", "pull_request_id"), - {"schema": "collection_data"}, + {"schema": "data"}, ) pr_review_id = Column( BigInteger, - Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='collection_data'), + Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_reviews_pr_review_id_seq'::regclass)" + "nextval('data.pull_request_reviews_pr_review_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), nullable=False, ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) cntrb_id = Column( ForeignKey( - "collection_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -3485,7 +3485,7 @@ class PullRequestReview(Base): pr_review_commit_id = Column(String) platform_id = Column( ForeignKey( - "collection_data.platform.pltfrm_id", + "data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3515,19 +3515,19 @@ class PullRequestReview(Base): class PullRequestTeam(Base): __tablename__ = "pull_request_teams" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} pr_team_id = Column( BigInteger, - 
Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_teams_pr_team_id_seq'::regclass)" + "nextval('data.pull_request_teams_pr_team_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "collection_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3557,21 +3557,21 @@ class PullRequestRepo(Base): __tablename__ = "pull_request_repo" __table_args__ = ( Index("pr-cntrb-idx-repo", "pr_cntrb_id"), - {"schema": "collection_data", + {"schema": "data", "comment": "This table is for storing information about forks that exist as part of a pull request. Generally we do not want to track these like ordinary repositories. "}, ) pr_repo_id = Column( BigInteger, - Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='collection_data'), + Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_repo_pr_repo_id_seq'::regclass)" + "nextval('data.pull_request_repo_pr_repo_id_seq'::regclass)" ), ) pr_repo_meta_id = Column( ForeignKey( - "collection_data.pull_request_meta.pr_repo_meta_id", + "data.pull_request_meta.pr_repo_meta_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3585,7 +3585,7 @@ class PullRequestRepo(Base): pr_repo_name = Column(String) pr_repo_full_name = Column(String) pr_repo_private_bool = Column(Boolean) - pr_cntrb_id = Column(ForeignKey("collection_data.contributors.cntrb_id")) + pr_cntrb_id = Column(ForeignKey("data.contributors.cntrb_id")) tool_source = Column(String) tool_version = Column(String) data_source = Column(String) @@ -3601,20 +3601,20 @@ class PullRequestReviewMessageRef(Base): __tablename__ = "pull_request_review_message_ref" __table_args__ = ( 
UniqueConstraint("pr_review_msg_src_id", name="pull-request-review-message-ref-insert-unique"), - {"schema": "collection_data"}, + {"schema": "data"}, ) pr_review_msg_ref_id = Column( BigInteger, - Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='collection_data'), + Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" + "nextval('data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" ), ) pr_review_id = Column( ForeignKey( - "collection_data.pull_request_reviews.pr_review_id", + "data.pull_request_reviews.pr_review_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3624,7 +3624,7 @@ class PullRequestReviewMessageRef(Base): ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3633,7 +3633,7 @@ class PullRequestReviewMessageRef(Base): ) msg_id = Column( ForeignKey( - "collection_data.message.msg_id", + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3675,19 +3675,19 @@ class PullRequestReviewMessageRef(Base): class RepoClone(Base): __tablename__ = "repo_clones_data" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} repo_clone_data_id = Column( BigInteger, - Sequence('repo_clones_data_id_seq', start=1, schema='collection_data'), + Sequence('repo_clones_data_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('collection_data.repo_clones_data_id_seq'::regclass)" + "nextval('data.repo_clones_data_id_seq'::regclass)" ), ) repo_id = Column( ForeignKey( - "collection_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3704,7 +3704,7 @@ class RepoClone(Base): class 
TopicModelMeta(Base): __tablename__ = "topic_model_meta" - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} model_id = Column( UUID(as_uuid=True), @@ -3713,7 +3713,7 @@ class TopicModelMeta(Base): comment="Unique identifier for the topic model" ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id"), + ForeignKey("data.repo.repo_id"), comment="Repository this model was trained on" ) model_method = Column( @@ -3811,7 +3811,7 @@ class TopicModelEvent(Base): __table_args__ = ( Index("ix_tme_repo_ts", "repo_id", "ts"), Index("ix_tme_event", "event"), - {"schema": "collection_data"} + {"schema": "data"} ) event_id = Column( @@ -3827,14 +3827,14 @@ class TopicModelEvent(Base): ) repo_id = Column( Integer, - ForeignKey("collection_data.repo.repo_id", name="fk_tme_repo_id"), + ForeignKey("data.repo.repo_id", name="fk_tme_repo_id"), nullable=True, comment="Repository associated with this event" ) model_id = Column( UUID(as_uuid=True), ForeignKey( - "collection_data.topic_model_meta.model_id", + "data.topic_model_meta.model_id", name="fk_tme_model_id", ondelete="SET NULL" ), diff --git a/collectoss/application/db/models/operations.py b/collectoss/application/db/models/operations.py index 4a05ed802..d7ebe4bdf 100644 --- a/collectoss/application/db/models/operations.py +++ b/collectoss/application/db/models/operations.py @@ -221,13 +221,13 @@ class WorkerSettingsFacade(Base): class BadgingDEI(Base): __tablename__ = 'dei_badging' - __table_args__ = {"schema": "collection_data"} + __table_args__ = {"schema": "data"} id = Column(Integer, primary_key=True, nullable=False) badging_id = Column(Integer, nullable=False) level = Column(String, nullable=False) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) repo = relationship("Repo") @@ -749,7 +749,7 @@ class 
UserRepo(Base): ForeignKey("collection_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False ) repo_id = Column( - ForeignKey("collection_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) repo = relationship("Repo", back_populates="user_repo") @@ -1204,7 +1204,7 @@ class CollectionStatus(Base): {"schema": "collection_operations"} ) - repo_id = Column(ForeignKey("collection_data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) + repo_id = Column(ForeignKey("data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) core_data_last_collected = Column(TIMESTAMP) core_status = Column(String, nullable=False, server_default=text("'Pending'")) core_task_id = Column(String) diff --git a/collectoss/tasks/data_analysis/clustering_worker/tasks.py b/collectoss/tasks/data_analysis/clustering_worker/tasks.py index 7ec48414d..da24e6028 100644 --- a/collectoss/tasks/data_analysis/clustering_worker/tasks.py +++ b/collectoss/tasks/data_analysis/clustering_worker/tasks.py @@ -78,10 +78,10 @@ def clustering_model(repo_git: str,logger,engine) -> None: i.issue_title thread_title, M.msg_id FROM - collection_data.repo r, - collection_data.issues i, - collection_data.message M, - collection_data.issue_message_ref imr + data.repo r, + data.issues i, + data.message M, + data.issue_message_ref imr WHERE r.repo_id = i.repo_id AND imr.issue_id = i.issue_id @@ -98,10 +98,10 @@ def clustering_model(repo_git: str,logger,engine) -> None: pr.pr_src_title thread_title, M.msg_id FROM - collection_data.repo r, - collection_data.pull_requests pr, - collection_data.message M, - collection_data.pull_request_message_ref prmr + data.repo r, + data.pull_requests pr, + data.message M, + data.pull_request_message_ref prmr WHERE r.repo_id = pr.repo_id AND prmr.pull_request_id = 
pr.pull_request_id @@ -289,15 +289,15 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): get_messages_sql = s.sql.text( """ SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id - FROM collection_data.repo r, collection_data.issues i, - collection_data.message m, collection_data.issue_message_ref imr + FROM data.repo r, data.issues i, + data.message m, data.issue_message_ref imr WHERE r.repo_id=i.repo_id AND imr.issue_id=i.issue_id AND imr.msg_id=m.msg_id UNION SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id - FROM collection_data.repo r, collection_data.pull_requests pr, - collection_data.message m, collection_data.pull_request_message_ref prmr + FROM data.repo r, data.pull_requests pr, + data.message m, data.pull_request_message_ref prmr WHERE r.repo_id=pr.repo_id AND prmr.pull_request_id=pr.pull_request_id AND prmr.msg_id=m.msg_id @@ -365,7 +365,7 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): # key_sequence_words_sql = s.sql.text( # """ - # SELECT nextval('collection_data.topic_words_topic_words_id_seq'::text) + # SELECT nextval('data.topic_words_topic_words_id_seq'::text) # """ # ) diff --git a/collectoss/tasks/data_analysis/discourse_analysis/tasks.py b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py index cad3856ab..fccf169e8 100644 --- a/collectoss/tasks/data_analysis/discourse_analysis/tasks.py +++ b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py @@ -51,16 +51,16 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: get_messages_for_repo_sql = s.sql.text(""" (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id - FROM collection_data.repo r, collection_data.issues i, - collection_data.message m, 
collection_data.issue_message_ref imr + FROM data.repo r, data.issues i, + data.message m, data.issue_message_ref imr WHERE r.repo_id=i.repo_id AND imr.issue_id=i.issue_id AND imr.msg_id=m.msg_id AND r.repo_id = :repo_id) UNION (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id - FROM collection_data.repo r, collection_data.pull_requests pr, - collection_data.message m, collection_data.pull_request_message_ref prmr + FROM data.repo r, data.pull_requests pr, + data.message m, data.pull_request_message_ref prmr WHERE r.repo_id=pr.repo_id AND prmr.pull_request_id=pr.pull_request_id AND prmr.msg_id=m.msg_id diff --git a/collectoss/tasks/data_analysis/message_insights/tasks.py b/collectoss/tasks/data_analysis/message_insights/tasks.py index f01de4305..751da1ea7 100644 --- a/collectoss/tasks/data_analysis/message_insights/tasks.py +++ b/collectoss/tasks/data_analysis/message_insights/tasks.py @@ -52,7 +52,7 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Check to see if repo has been analyzed previously repo_exists_SQL = s.sql.text(""" - SELECT exists (SELECT 1 FROM collection_data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") + SELECT exists (SELECT 1 FROM data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") with engine.connect() as conn: df_rep = pd.read_sql_query(repo_exists_SQL, conn, params={'repo_id': repo_id}) @@ -66,17 +66,17 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Fetch the timestamp of last analyzed message for the repo past_SQL = s.sql.text(""" select message_analysis.msg_id, message.msg_timestamp - from collection_data.message_analysis - inner join collection_data.message on message.msg_id = message_analysis.msg_id - inner join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - inner join collection_data.pull_requests on 
pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + from data.message_analysis + inner join data.message on message.msg_id = message_analysis.msg_id + inner join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + inner join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id UNION select message_analysis.msg_id, message.msg_timestamp - from collection_data.message_analysis - inner join collection_data.message on message.msg_id = message_analysis.msg_id - inner join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - inner join collection_data.issues on issue_message_ref.issue_id = issues.issue_id + from data.message_analysis + inner join data.message on message.msg_id = message_analysis.msg_id + inner join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + inner join data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id """) @@ -97,28 +97,28 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Fetch only recent messages join_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text from collection_data.message - left outer join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id and msg_timestamp > :begin_date UNION - select message.msg_id, msg_timestamp, msg_text from collection_data.message - left outer join collection_data.issue_message_ref on 
message.msg_id = issue_message_ref.msg_id - left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id and msg_timestamp > :begin_date""") else: logger.info(f'Fetching all past messages of repo {repo_id}...') # Fetch all messages join_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text from collection_data.message - left outer join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id UNION - select message.msg_id, msg_timestamp, msg_text from collection_data.message - left outer join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id""") with engine.connect() as conn: @@ -147,14 +147,14 @@ def message_insight_model(repo_git: str,logger,engine) -> None: if not full_train: merge_SQL = s.sql.text(""" - select novelty_flag, reconstruction_error from collection_data.message_analysis - left outer join 
collection_data.pull_request_message_ref on message_analysis.msg_id = pull_request_message_ref.msg_id - left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select novelty_flag, reconstruction_error from data.message_analysis + left outer join data.pull_request_message_ref on message_analysis.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id UNION - select novelty_flag, reconstruction_error from collection_data.message_analysis - left outer join collection_data.issue_message_ref on message_analysis.msg_id = issue_message_ref.msg_id - left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id + select novelty_flag, reconstruction_error from data.message_analysis + left outer join data.issue_message_ref on message_analysis.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") with engine.connect() as conn: diff --git a/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py index 2c6e4365e..34512fbc9 100644 --- a/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -59,8 +59,8 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: pull_request_commits.pr_cmt_id, pr_augur_contributor_id, pr_src_author_association - from collection_data.pull_requests - INNER JOIN collection_data.pull_request_commits on pull_requests.pull_request_id = pull_request_commits.pull_request_id + from data.pull_requests + INNER JOIN data.pull_request_commits on pull_requests.pull_request_id = pull_request_commits.pull_request_id where pr_created_at > 
:begin_date and pull_requests.repo_id = :repo_id and pr_src_state like 'open' @@ -90,13 +90,13 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Get sentiment score of all messages relating to the PR messages_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from collection_data.message - left outer join collection_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join collection_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id + select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from data.message + left outer join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id UNION - select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from collection_data.message - left outer join collection_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join collection_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") + select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from data.message + left outer join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") with engine.connect() as conn: df_message = pd.read_sql_query(messages_SQL, conn, params={'repo_id': repo_id}) @@ -104,7 +104,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Map PR to its corresponding messages - pr_ref_sql = s.sql.text("select * from collection_data.pull_request_message_ref") + pr_ref_sql = s.sql.text("select * from 
data.pull_request_message_ref") with engine.connect() as conn: df_pr_ref = pd.read_sql_query(pr_ref_sql, conn) df_merge = pd.merge(df_pr, df_pr_ref, on='pull_request_id', how='left') @@ -142,7 +142,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: ''' # Get cntrb info from API - cntrb_sql = 'SELECT cntrb_id, gh_login FROM collection_data.contributors' + cntrb_sql = 'SELECT cntrb_id, gh_login FROM data.contributors' df_ctrb = pd.read_sql_query(cntrb_SQL, create_database_engine()) df_fin1 = pd.merge(df_fin,df_ctrb,left_on='pr_augur_contributor_id', right_on='cntrb_id', how='left') df_fin1 = df_fin1.drop(['cntrb_id'],axis=1) @@ -157,7 +157,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Get repo info repo_sql = s.sql.text(""" SELECT repo_id, pull_requests_merged, pull_request_count,watchers_count, last_updated FROM - collection_data.repo_info where repo_id = :repo_id + data.repo_info where repo_id = :repo_id """) with engine.connect() as conn: diff --git a/collectoss/tasks/db/refresh_materialized_views.py b/collectoss/tasks/db/refresh_materialized_views.py index d8eeabf97..751a97e93 100644 --- a/collectoss/tasks/db/refresh_materialized_views.py +++ b/collectoss/tasks/db/refresh_materialized_views.py @@ -19,78 +19,78 @@ def refresh_materialized_views(self): #self.logger = logging.getLogger(refresh_materialized_views.__name__) mv1_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.api_get_all_repo_prs with data; + REFRESH MATERIALIZED VIEW concurrently data.api_get_all_repo_prs with data; COMMIT; """) mv2_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.api_get_all_repos_commits with data; + REFRESH MATERIALIZED VIEW concurrently data.api_get_all_repos_commits with data; COMMIT; """) mv3_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.api_get_all_repos_issues with data; + REFRESH MATERIALIZED VIEW concurrently 
data.api_get_all_repos_issues with data; COMMIT; """) mv4_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.augur_new_contributors with data; + REFRESH MATERIALIZED VIEW concurrently data.augur_new_contributors with data; COMMIT; """) mv5_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_commits_and_committers_daily_count with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_commits_and_committers_daily_count with data; COMMIT; """) mv6_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_new_contributors with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_new_contributors with data; COMMIT; """) mv7_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_entry_list with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_entry_list with data; COMMIT; """) mv8_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_contributor_actions with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_contributor_actions with data; COMMIT; """) mv9_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_user_repos with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_user_repos with data; COMMIT; """) mv10_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_pr_response_times with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_pr_response_times with data; COMMIT; """) mv11_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_pr_assignments with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_pr_assignments with data; COMMIT; """) mv12_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_issue_assignments with data; + REFRESH MATERIALIZED VIEW concurrently 
data.explorer_issue_assignments with data; COMMIT; """) mv13_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_pr_response with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_pr_response with data; COMMIT; """) mv14_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently collection_data.explorer_repo_languages with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_repo_languages with data; COMMIT; """) diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index 732b70dcc..cc380d497 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -207,12 +207,12 @@ def insert_facade_contributors(self, repo_git): commits.cmt_commit_hash AS hash, commits.cmt_author_raw_email AS email_raw FROM - collection_data.commits + data.commits WHERE commits.repo_id = :repo_id AND commits.cmt_ght_author_id IS NULL AND commits.cmt_author_raw_email NOT IN ( - SELECT email FROM collection_data.unresolved_commit_emails + SELECT email FROM data.unresolved_commit_emails ) """).bindparams(repo_id=repo_id) @@ -253,19 +253,19 @@ def insert_facade_contributors(self, repo_git): resolve_email_to_cntrb_id_sql = s.sql.text(""" WITH email_to_contributor AS ( SELECT cntrb_email AS email, cntrb_id - FROM collection_data.contributors + FROM data.contributors WHERE cntrb_email IS NOT NULL UNION ALL SELECT cntrb_canonical AS email, cntrb_id - FROM collection_data.contributors + FROM data.contributors WHERE cntrb_canonical IS NOT NULL UNION ALL SELECT alias_email AS email, cntrb_id - FROM collection_data.contributors_aliases + FROM data.contributors_aliases WHERE alias_email IS NOT NULL ), deduplicated AS ( @@ -277,7 +277,7 @@ def insert_facade_contributors(self, repo_git): d.cntrb_id, c.cmt_author_email AS email FROM - collection_data.commits c + data.commits c INNER JOIN deduplicated d ON c.cmt_author_email = d.email 
diff --git a/collectoss/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py index 59dd22e22..6ce161278 100644 --- a/collectoss/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -74,7 +74,7 @@ def get_newly_added_repos(session, limit, hook): repo_query = s.sql.text(f""" select repo_git - from collection_operations.collection_status x, collection_data.repo y + from collection_operations.collection_status x, data.repo y where x.repo_id=y.repo_id and {condition_string} order by {order_by_field} diff --git a/collectoss/util/repo_load_controller.py b/collectoss/util/repo_load_controller.py index 1f05c2bca..6a8b41587 100644 --- a/collectoss/util/repo_load_controller.py +++ b/collectoss/util/repo_load_controller.py @@ -19,11 +19,11 @@ with DatabaseEngine() as engine: - data_schema = MetaData(schema = "collection_data") + data_schema = MetaData(schema = "data") data_schema.reflect(bind = engine, views = True) - commits_materialized_view: Table = data_schema.tables["collection_data.api_get_all_repos_commits"] - issues_materialized_view: Table = data_schema.tables["collection_data.api_get_all_repos_issues"] + commits_materialized_view: Table = data_schema.tables["data.api_get_all_repos_commits"] + issues_materialized_view: Table = data_schema.tables["data.api_get_all_repos_issues"] class RepoLoadController: diff --git a/conftest.py b/conftest.py index 61e84e2a4..3335447c9 100644 --- a/conftest.py +++ b/conftest.py @@ -195,7 +195,7 @@ def read_only_db(empty_db): database_name = empty_db.url.database test_username = "testuser" test_password = "testpass" - schemas = ["public", "collection_data", "collection_operations"] + schemas = ["public", "data", "collection_operations"] # create read-only user empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) diff --git a/docs/source/getting-started/command-line-interface/db.rst b/docs/source/getting-started/command-line-interface/db.rst index 
6401658d1..49ac0f74d 100644 --- a/docs/source/getting-started/command-line-interface/db.rst +++ b/docs/source/getting-started/command-line-interface/db.rst @@ -167,10 +167,10 @@ Example usage\: > [INFO] Config file loaded successfully > CLI: [db.check_pgpass_credentials] [INFO] Credentials found in $HOME/.pgpass > CLI: [db.upgrade_db_version] [INFO] Upgrading from 16 to 17 - > ALTER TABLE "collection_data"."repo" + > ALTER TABLE "data"."repo" > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; > ALTER TABLE - > ALTER TABLE "collection_data"."repo" + > ALTER TABLE "data"."repo" > ADD COLUMN "repo_archived" int4, > ADD COLUMN "repo_archived_date_collected" timestamptz(0), > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst index d82d83b70..7967a3640 100644 --- a/docs/source/schema/overview.rst +++ b/docs/source/schema/overview.rst @@ -35,7 +35,7 @@ Schema Overview CollectOSS Data ------------------------------------------------------- -The ``collection_data`` schema contains *most* of the information analyzed +The ``data`` schema contains *most* of the information analyzed and constructed by CollectOSS. The origin’s of the data inside of collectoss are from data collection tasks and populate this schema.: @@ -63,7 +63,7 @@ CollectOSS Operations The ``collection_operations`` tables are where most of the operations tables exist. There are a few, like ``settings`` that remain in -``collection_data`` for now, but will be moved. They keep records related to +``data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. They also store information including API keys. 
diff --git a/docs/source/schema/regularly_used_data.rst b/docs/source/schema/regularly_used_data.rst index 979c204c0..aab64a2c8 100644 --- a/docs/source/schema/regularly_used_data.rst +++ b/docs/source/schema/regularly_used_data.rst @@ -347,7 +347,7 @@ Repo_meta Repo_sbom_scans --------------- - This table links the collection_data schema to the collectoss_spdx schema to keep a list of repositories that need licenses scanned. (These are for file level license declarations, which are common in Linux Foundation projects, but otherwise not in wide use). + This table links the data schema to the collectoss_spdx schema to keep a list of repositories that need licenses scanned. (These are for file level license declarations, which are common in Linux Foundation projects, but otherwise not in wide use). .. image:: images/repo_sbom_scans.png :width: 200 diff --git a/tests/test_application/test_db/test_session.py b/tests/test_application/test_db/test_session.py index 3c661136d..f3fb5f0e8 100644 --- a/tests/test_application/test_db/test_session.py +++ b/tests/test_application/test_db/test_session.py @@ -26,7 +26,7 @@ def test_execute_sql(test_db_engine): for data in all_data: - statement = s.sql.text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") 
VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 
'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data) @@ -35,7 +35,7 @@ def test_execute_sql(test_db_engine): with DatabaseSession(logger, engine=test_db_engine) as session: cntrb_id = data['cntrb_id'] - result = session.execute_sql(f"SELECT * FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = session.execute_sql(f"SELECT * FROM data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert isinstance(result[0], s.engine.result.RowProxy) @@ -57,7 +57,7 @@ def test_execute_sql(test_db_engine): for data in all_data: cntrb_id = data["cntrb_id"] - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_duplicates(test_db_engine): @@ -79,7 +79,7 @@ def test_insert_data_with_duplicates(test_db_engine): cntrb_id = data_1['cntrb_id'] - result = session.execute_sql(f"SELECT * FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = session.execute_sql(f"SELECT * FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == 3 @@ -94,7 +94,7 @@ def test_insert_data_with_duplicates(test_db_engine): for data 
in duplicate_data_list: cntrb_id = data["cntrb_id"] - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_updates(test_db_engine): @@ -106,7 +106,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: - statement = s.sql.text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 
'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data_1) @@ -117,7 +117,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: cntrb_id 
= data_1['cntrb_id'] - result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert dict(result[0])["gh_user_id"] == 6 @@ -127,7 +127,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_bulk(test_db_engine): @@ -145,7 +145,7 @@ def test_insert_data_with_bulk(test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == 4 @@ -160,7 +160,7 @@ def test_insert_data_with_bulk(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") @@ -172,7 +172,7 @@ def test_insert_data_partial_update(test_db_engine): try: with test_db_engine.connect() as connection: - statement = s.sql.text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", 
"cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", 
"gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data_1) @@ -183,7 +183,7 @@ def test_insert_data_partial_update(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1['cntrb_id'] - result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert dict(result[0])["gh_user_id"] == 6 @@ -193,7 +193,7 @@ def test_insert_data_partial_update(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") issue_data_with_null_strings = [] @@ -210,11 
+210,11 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): # insert the cntrb_id and cntrb_login into the contributors table so the contributor is present. # This is so we don't get a foreign key error on the cntrb_id when we insert the prs query = s.sql.text(""" - DELETE FROM "collection_data"."repo"; - DELETE FROM "collection_data"."repo_groups"; - INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); + DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; + INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); - INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); + INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", 
"forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); """) connection.execute(query) @@ -232,7 +232,7 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): return_columns=issue_return_columns, string_fields=issue_string_columns) data_inserted_count = len(issue_data_with_null_strings) - result = connection.execute(f"Select * FROM collection_data.issues;").fetchall() + result = connection.execute(f"Select * FROM data.issues;").fetchall() assert issue_return_data is not None assert len(issue_return_data) == data_inserted_count @@ -242,9 +242,9 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): with test_db_engine.connect() as connection: connection.execute(""" - DELETE FROM collection_data.issues; - DELETE FROM "collection_data"."repo"; - DELETE FROM "collection_data"."repo_groups"; + DELETE FROM data.issues; + DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; """) diff --git a/tests/test_application/test_repo_load_controller/helper.py b/tests/test_application/test_repo_load_controller/helper.py index da23932ae..4a8e880e8 100644 --- a/tests/test_application/test_repo_load_controller/helper.py +++ b/tests/test_application/test_repo_load_controller/helper.py @@ -20,11 +20,11 @@ def get_delete_statement(schema, table): def get_repo_delete_statement(): - return get_delete_statement("collection_data", "repo") + return get_delete_statement("data", "repo") def get_repo_group_delete_statement(): - return get_delete_statement("collection_data", "repo_groups") + return get_delete_statement("data", "repo_groups") def get_user_delete_statement(): @@ -92,7 +92,7 @@ def add_keys_to_test_db(test_db_engine): def get_repo_insert_statement(repo_id, rg_id, 
repo_url="place holder url"): - return """INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) + return """INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) def get_user_repo_insert_statement(repo_id, group_id): @@ -100,7 +100,7 @@ def get_user_repo_insert_statement(repo_id, group_id): def get_repo_group_insert_statement(rg_id): - return """INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) + return """INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo 
Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"): @@ -119,7 +119,7 @@ def get_user_group_insert_statement(user_id, group_name, group_id=None): def get_repos(connection, where_string=None): query_list = [] - query_list.append('SELECT * FROM "collection_data"."repo"') + query_list.append('SELECT * FROM "data"."repo"') if where_string: if where_string.endswith(";"): diff --git a/tests/test_application/test_repo_load_controller/util.py b/tests/test_application/test_repo_load_controller/util.py index 887dbf617..5bacc69f1 100644 --- a/tests/test_application/test_repo_load_controller/util.py +++ b/tests/test_application/test_repo_load_controller/util.py @@ -6,11 +6,11 @@ def get_delete_statement(schema, table): def get_repo_delete_statement(): - return get_delete_statement("collection_data", "repo") + return get_delete_statement("data", "repo") def get_repo_group_delete_statement(): - return get_delete_statement("collection_data", "repo_groups") + return get_delete_statement("data", "repo_groups") def get_user_delete_statement(): @@ -78,11 +78,11 @@ def add_keys_to_test_db(test_db_engine): def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): - return """INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) + return """INSERT INTO 
"data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) def get_repo_group_insert_statement(rg_id): - return """INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) + return """INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) def get_user_insert_statement(user_id): @@ -101,7 +101,7 @@ def get_user_group_insert_statement(user_id, group_name, group_id=None): def get_repos(connection, where_string=None): query_list = [] - query_list.append('SELECT * FROM "collection_data"."repo"') + query_list.append('SELECT * FROM "data"."repo"') if where_string: if where_string.endswith(";"): diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 8ba765018..fd75b7fbf 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -14,7 +14,7 @@ def 
set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,collection_data,collection_operations,spdx") + cursor.execute("SET SESSION search_path=public,data,collection_operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit diff --git a/tests/test_tasks/test_github_tasks/test_pull_requests.py b/tests/test_tasks/test_github_tasks/test_pull_requests.py index 0f70a64b0..847522303 100644 --- a/tests/test_tasks/test_github_tasks/test_pull_requests.py +++ b/tests/test_tasks/test_github_tasks/test_pull_requests.py @@ -263,7 +263,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb with test_db_session.engine.connect() as connection: - result = connection.execute(f"SELECT * FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == len(unique_contributors) @@ -277,7 +277,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb with test_db_session.engine.connect() as connection: - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") repos = [] repos.append({"owner": "chaoss", "repo": "augur"}) @@ -312,13 +312,13 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): # insert the cntrb_id and cntrb_login into the contributors table so the contributor is present. 
# This is so we don't get a foreign key error on the cntrb_id when we insert the prs - query = text("""INSERT INTO "collection_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, :gh_login, 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46', :cntrb_id); + query = text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", 
"cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, :gh_login, 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46', :cntrb_id); - DELETE FROM "collection_data"."repo"; - DELETE FROM "collection_data"."repo_groups"; - INSERT INTO "collection_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 
15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); + DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; + INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); - INSERT INTO "collection_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); + INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); """) connection.execute(query, **contributor) @@ -336,7 +336,7 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: - result = connection.execute(f"SELECT * FROM collection_data.pull_requests;").fetchall() + result = connection.execute(f"SELECT * FROM data.pull_requests;").fetchall() assert result is 
not None assert len(result) == len(prs) == len(return_data) @@ -353,11 +353,11 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: - connection.execute(f"DELETE FROM collection_data.pull_requests;") - connection.execute("""DELETE FROM "collection_data"."repo"; - DELETE FROM "collection_data"."repo_groups"; + connection.execute(f"DELETE FROM data.pull_requests;") + connection.execute("""DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; """) - connection.execute(f"DELETE FROM collection_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") diff --git a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py index 43b1a9524..4ab662561 100644 --- a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py +++ b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py @@ -9,11 +9,11 @@ @pytest.fixture def set_up_repo_groups(database_connection): - df = pd.read_sql(s.sql.text("SELECT repo_group_id FROM collection_data.repo_groups"), database_connection) + df = pd.read_sql(s.sql.text("SELECT repo_group_id FROM data.repo_groups"), database_connection) repo_group_IDs = df['repo_group_id'].values.tolist() insert_repo_group_sql = s.sql.text(""" - INSERT INTO "collection_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "data"."repo_groups"("repo_group_id", "rg_name", 
"rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """) with open("tests/test_workers/test_facade/test_facade_contributor_interface/test_repo_groups.csv") as create_repo_groups_file: @@ -31,12 +31,12 @@ def set_up_repo_groups(database_connection): - df = database_connection.execute(s.sql.text("SELECT repo_group_id FROM collection_data.repo_groups")) + df = database_connection.execute(s.sql.text("SELECT repo_group_id FROM data.repo_groups")) repo_group_IDs = [group[0] for group in df.fetchall()] insertSQL = s.sql.text(""" - INSERT INTO collection_data.repo(repo_group_id, repo_git, + INSERT INTO data.repo(repo_group_id, repo_git, tool_source, tool_version, data_source, data_collection_date) VALUES (:repo_group_id, :repo_git, 'CLI', 1.0, 'Git', CURRENT_TIMESTAMP) """) diff --git a/tests/test_workers/test_set_up_fixtures.py b/tests/test_workers/test_set_up_fixtures.py index 584c16745..8594fd7f4 100644 --- a/tests/test_workers/test_set_up_fixtures.py +++ b/tests/test_workers/test_set_up_fixtures.py @@ -12,7 +12,7 @@ def poll_database_connection(database_string): print("Attempting to create db engine") db = s.create_engine(database_string, poolclass=s.pool.NullPool, - connect_args={'options': '-csearch_path={}'.format('collection_data')}) + connect_args={'options': '-csearch_path={}'.format('data')}) return db @@ -153,7 +153,7 @@ def initialize_database_connections(self): "augur", "augur", "172.17.0.1", 5400, "test" ) - self.db_schema = 'collection_data' + self.db_schema = 'data' self.helper_schema = 'collection_operations' self.helper_db = s.create_engine(DB_STR, poolclass=s.pool.NullPool, From 498c78124eaf4cfe28ae9b6204a057e3c3848d03 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 18:58:43 -0400 Subject: [PATCH 084/165] 
schema rename collection_operations -> operations Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 8 +-- collectoss/application/cli/collection.py | 22 +++--- collectoss/application/cli/db.py | 8 +-- collectoss/application/db/engine.py | 2 +- .../application/db/models/operations.py | 72 +++++++++---------- collectoss/application/service_manager.py | 14 ++-- collectoss/tasks/start_tasks.py | 2 +- collectoss/tasks/util/collection_util.py | 4 +- conftest.py | 2 +- .../configuration-file-reference.rst | 2 +- .../development-guide/tech-breakdown.rst | 2 +- .../getting-started/collecting-data.rst | 2 +- .../command-line-interface/db.rst | 4 +- docs/source/quick-start.rst | 2 +- docs/source/schema/overview.rst | 2 +- keyman/README.md | 2 +- .../test_config/test_config.py | 50 ++++++------- .../test_augur_operations/test_user_group.py | 10 +-- .../test_augur_operations/test_user_repo.py | 6 +- .../test_repo_load_controller/helper.py | 18 ++--- .../test_repo_load_controller/util.py | 16 ++--- tests/test_helpers.py | 4 +- .../test_github_api_key_handler.py | 12 ++-- tests/test_workers/test_set_up_fixtures.py | 2 +- 24 files changed, 134 insertions(+), 134 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 00eb2d44e..8add0ce18 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -340,7 +340,7 @@ def stop_processes(signal, logger, engine): def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM collection_operations.user_repos) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM operations.user_repos) """) repos = session.execute_sql(query).fetchall() @@ -377,13 +377,13 @@ def repo_reset(backend_app): Refresh repo collection to force data collection """ backend_app.database.execute(s.sql.text(""" - UPDATE collection_operations.collection_status + 
UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; TRUNCATE data.commits CASCADE; diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index 59ede7b2f..78b6f5d13 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -202,13 +202,13 @@ def repo_reset(ctx): """ with ctx.obj.engine.connect() as connection: connection.execute(s.sql.text(""" - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; TRUNCATE data.commits CASCADE; @@ -279,31 +279,31 @@ def cleanup_after_collection_halt(logger_instance, engine): #Make sure that database reflects collection status when processes are killed/stopped. 
def clean_collection_status(session): session.execute_sql(s.sql.text(""" - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET core_status='Success',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Success',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Update', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Success', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL WHERE facade_status='Failed Clone' OR facade_status='Initializing'; """)) @@ -311,7 +311,7 @@ def clean_collection_status(session): def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM collection_operations.user_repos) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM operations.user_repos) """) repos = session.execute_sql(query).fetchall() diff --git a/collectoss/application/cli/db.py 
b/collectoss/application/cli/db.py index 223f0db6c..25ea8a88e 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -262,7 +262,7 @@ def add_github_org(ctx, organization_name): def get_db_version(engine): db_version_sql = s.sql.text( """ - SELECT * FROM collection_operations.augur_settings WHERE setting = 'augur_data_version' + SELECT * FROM operations.augur_settings WHERE setting = 'augur_data_version' """ ) @@ -342,11 +342,11 @@ def update_api_key(ctx, api_key): """ update_api_key_sql = s.sql.text( """ - INSERT INTO collection_operations.augur_settings (setting,VALUE) VALUES ('augur_api_key','HudMhTyPW7wiaWopUKgRoGCxlIUulw4g') ON CONFLICT (setting) + INSERT INTO operations.augur_settings (setting,VALUE) VALUES ('augur_api_key','HudMhTyPW7wiaWopUKgRoGCxlIUulw4g') ON CONFLICT (setting) DO UPDATE SET VALUE='HudMhTyPW7wiaWopUKgRoGCxlIUulw4g'; - --UPDATE collection_operations.augur_settings SET VALUE = :api_key WHERE setting='augur_api_key'; + --UPDATE operations.augur_settings SET VALUE = :api_key WHERE setting='augur_api_key'; """ ) @@ -363,7 +363,7 @@ def update_api_key(ctx, api_key): def get_api_key(ctx): get_api_key_sql = s.sql.text( """ - SELECT value FROM collection_operations.augur_settings WHERE setting='augur_api_key'; + SELECT value FROM operations.augur_settings WHERE setting='augur_api_key'; """ ) diff --git a/collectoss/application/db/engine.py b/collectoss/application/db/engine.py index 090bc10f4..e00c3c992 100644 --- a/collectoss/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -105,7 +105,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,data,collection_operations,spdx") + cursor.execute("SET SESSION search_path=public,data,operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit diff --git 
a/collectoss/application/db/models/operations.py b/collectoss/application/db/models/operations.py index d7ebe4bdf..19eff3bae 100644 --- a/collectoss/application/db/models/operations.py +++ b/collectoss/application/db/models/operations.py @@ -83,23 +83,23 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: Column("Count", String), Column("WeightedComplexity", String), Column("Files", String), - schema="collection_operations", + schema="operations", ) class Settings(Base): __tablename__ = "augur_settings" __table_args__ = { - "schema": "collection_operations", + "schema": "operations", "comment": "CollectOSS settings include the schema version, and the CollectOSS API Key as of 10/25/2020. Future augur settings may be stored in this table, which has the basic structure of a name-value pair. ", } id = Column( BigInteger, - Sequence("augur_settings_id_seq", start=1, schema="collection_operations"), + Sequence("augur_settings_id_seq", start=1, schema="operations"), primary_key=True, server_default=text( - "nextval('collection_operations.augur_settings_id_seq'::regclass)" + "nextval('operations.augur_settings_id_seq'::regclass)" ), ) setting = Column(String) @@ -119,23 +119,23 @@ class Settings(Base): server_default=text("CURRENT_TIMESTAMP"), ), Index("repos_id,statusops", "repos_id", "status"), - schema="collection_operations", - comment="For future use when we move all working tables to the collection_operations schema. ", + schema="operations", + comment="For future use when we move all working tables to the operations schema. ", ) class WorkerHistory(Base): __tablename__ = "worker_history" __table_args__ = { - "schema": "collection_operations", + "schema": "operations", "comment": "This table stores the complete history of job execution, including success and failure. It is useful for troubleshooting. 
", } history_id = Column( BigInteger, - Sequence("gh_worker_history_history_id_seq", start=1, schema="collection_operations"), + Sequence("gh_worker_history_history_id_seq", start=1, schema="operations"), primary_key=True, server_default=text( - "nextval('collection_operations.gh_worker_history_history_id_seq'::regclass)" + "nextval('operations.gh_worker_history_history_id_seq'::regclass)" ), ) repo_id = Column(BigInteger) @@ -150,7 +150,7 @@ class WorkerHistory(Base): class WorkerJob(Base): __tablename__ = "worker_job" __table_args__ = { - "schema": "collection_operations", + "schema": "operations", "comment": "This table stores the jobs workers collect data for. A job is found in the code, and in the augur.config.json under the construct of a “model”. ", } @@ -172,16 +172,16 @@ class WorkerJob(Base): class WorkerOauth(Base): __tablename__ = "worker_oauth" __table_args__ = { - "schema": "collection_operations", + "schema": "operations", "comment": "This table stores credentials for retrieving data from platform API’s. Entries in this table must comply with the terms of service for each platform. ", } oauth_id = Column( BigInteger, - Sequence("worker_oauth_oauth_id_seq", start=1000, schema="collection_operations"), + Sequence("worker_oauth_oauth_id_seq", start=1000, schema="operations"), primary_key=True, server_default=text( - "nextval('collection_operations.worker_oauth_oauth_id_seq'::regclass)" + "nextval('operations.worker_oauth_oauth_id_seq'::regclass)" ), ) name = Column(String(255), nullable=False) @@ -196,8 +196,8 @@ class WorkerOauth(Base): class WorkerSettingsFacade(Base): __tablename__ = "worker_settings_facade" __table_args__ = { - "schema": "collection_operations", - "comment": "For future use when we move all working tables to the collection_operations schema. ", + "schema": "operations", + "comment": "For future use when we move all working tables to the operations schema. 
", } id = Column(Integer, primary_key=True) @@ -215,8 +215,8 @@ class WorkerSettingsFacade(Base): Column( "working_commit", String(40), server_default=text("'NULL'::character varying") ), - schema="collection_operations", - comment="For future use when we move all working tables to the collection_operations schema. ", + schema="operations", + comment="For future use when we move all working tables to the operations schema. ", ) class BadgingDEI(Base): @@ -237,7 +237,7 @@ class Config(Base): __tablename__ = 'config' __table_args__ = ( UniqueConstraint('section_name', "setting_name", name='unique-config-setting'), - {"schema": "collection_operations"} + {"schema": "operations"} ) id = Column(SmallInteger, primary_key=True, nullable=False) @@ -255,7 +255,7 @@ class User(Base): UniqueConstraint('email', name='user-unique-email'), UniqueConstraint('login_name', name='user-unique-name'), UniqueConstraint('text_phone', name='user-unique-phone'), - {"schema": "collection_operations"} + {"schema": "operations"} ) user_id = Column(Integer, primary_key=True) @@ -634,12 +634,12 @@ class UserGroup(Base): __tablename__ = 'user_groups' __table_args__ = ( UniqueConstraint('user_id', 'name', name='user_groups_user_id_name_key'), - {"schema": "collection_operations"} + {"schema": "operations"} ) group_id = Column(BigInteger, primary_key=True) user_id = Column(Integer, - ForeignKey("collection_operations.users.user_id", name="user_group_user_id_fkey"), nullable=False + ForeignKey("operations.users.user_id", name="user_group_user_id_fkey"), nullable=False ) name = Column(String, nullable=False) favorited = Column(Boolean, nullable=False, server_default=text("FALSE")) @@ -743,10 +743,10 @@ def convert_group_name_to_id(session, user_id: int, group_name: str) -> int: class UserRepo(Base): __tablename__ = "user_repos" - __table_args__ = { "schema": "collection_operations" } + __table_args__ = { "schema": "operations" } group_id = Column( - 
ForeignKey("collection_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False + ForeignKey("operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False ) repo_id = Column( ForeignKey("data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False @@ -1010,12 +1010,12 @@ def add_github_org_repos(session, url: List[str], user_id: int, group_name: int) class UserSessionToken(Base): __tablename__ = "user_session_tokens" - __table_args__ = { "schema": "collection_operations" } + __table_args__ = { "schema": "operations" } token = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("collection_operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) + user_id = Column(ForeignKey("operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) expiration = Column(BigInteger) - application_id = Column(ForeignKey("collection_operations.client_applications.id", name="user_session_token_application_id_fkey")) + application_id = Column(ForeignKey("operations.client_applications.id", name="user_session_token_application_id_fkey")) created_at = Column(BigInteger) user = relationship("User", back_populates="tokens") @@ -1048,10 +1048,10 @@ def delete_refresh_tokens(self, session): class ClientApplication(Base): __tablename__ = "client_applications" - __table_args__ = { "schema": "collection_operations" } + __table_args__ = { "schema": "operations" } id = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("collection_operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) + user_id = Column(ForeignKey("operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) name = Column(String, nullable=False) redirect_url = Column(String, nullable=False) api_key = Column(String, nullable=False) @@ -1074,7 +1074,7 @@ def 
get_by_id(session, client_id): class ForgeInstance(Base): __tablename__ = "forge_instance" - __table_args__ = { "schema": "collection_operations" } + __table_args__ = { "schema": "operations" } id = Column(Integer, primary_key=True, nullable=False, comment="Internal unique identifier for this forge instance") # platform_type stores an integer that CollectOSS maps/will map to it's internal platform identifier Enum @@ -1089,10 +1089,10 @@ class ForgeInstance(Base): class Subscription(Base): __tablename__ = "subscriptions" - __table_args__ = { "schema": "collection_operations" } + __table_args__ = { "schema": "operations" } - application_id = Column(ForeignKey("collection_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) - type_id = Column(ForeignKey("collection_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) + application_id = Column(ForeignKey("operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) + type_id = Column(ForeignKey("operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) application = relationship("ClientApplication", back_populates="subscriptions") type = relationship("SubscriptionType", back_populates="subscriptions") @@ -1101,7 +1101,7 @@ class SubscriptionType(Base): __tablename__ = "subscription_types" __table_args__ = ( UniqueConstraint('name', name='subscription_type_title_unique'), - {"schema": "collection_operations"} + {"schema": "operations"} ) @@ -1115,11 +1115,11 @@ class RefreshToken(Base): __tablename__ = "refresh_tokens" __table_args__ = ( UniqueConstraint('user_session_token', name='refresh_token_user_session_token_id_unique'), - {"schema": "collection_operations"} + {"schema": "operations"} ) id = Column(String, primary_key=True) - user_session_token = Column(ForeignKey("collection_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), 
nullable=False) + user_session_token = Column(ForeignKey("operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) user_session = relationship("UserSessionToken", back_populates="refresh_tokens") @@ -1201,7 +1201,7 @@ class CollectionStatus(Base): "NOT (core_status = 'Pending' AND secondary_status = 'Collecting')", name='core_secondary_dependency_check' ), - {"schema": "collection_operations"} + {"schema": "operations"} ) repo_id = Column(ForeignKey("data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) diff --git a/collectoss/application/service_manager.py b/collectoss/application/service_manager.py index 0ade300e8..2497f37ef 100644 --- a/collectoss/application/service_manager.py +++ b/collectoss/application/service_manager.py @@ -110,31 +110,31 @@ def clear_redis_caches(): #Make sure that database reflects collection status when processes are killed/stopped. def clean_collection_status(session): session.execute_sql(s.sql.text(""" - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET core_status='Success',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Success',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Update', 
facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Success', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - UPDATE collection_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL WHERE facade_status='Failed Clone' OR facade_status='Initializing'; """)) diff --git a/collectoss/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py index 42ebed21e..1f36dd90b 100644 --- a/collectoss/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -383,7 +383,7 @@ def create_collection_status_records(self): logger = logging.getLogger(create_collection_status_records.__name__) query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM collection_operations.collection_status) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM operations.collection_status) """) repo = execute_sql(query).first() diff --git a/collectoss/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py index 6ce161278..c0b8d1984 100644 --- a/collectoss/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -74,7 +74,7 @@ def get_newly_added_repos(session, limit, hook): repo_query = s.sql.text(f""" select repo_git - from collection_operations.collection_status x, data.repo y + from operations.collection_status x, data.repo y where x.repo_id=y.repo_id and {condition_string} order by {order_by_field} @@ -96,7 +96,7 @@ def get_repos_for_recollection(session, limit, hook, days_until_collect_again): repo_query = s.sql.text(f""" select repo_git - from collection_operations.collection_status x, repo y + from operations.collection_status x, repo y where x.repo_id = y.repo_id and {condition_string} and {hook}_data_last_collected <= NOW() - INTERVAL 
'{days_until_collect_again} DAYS' diff --git a/conftest.py b/conftest.py index 3335447c9..a142f72ff 100644 --- a/conftest.py +++ b/conftest.py @@ -195,7 +195,7 @@ def read_only_db(empty_db): database_name = empty_db.url.database test_username = "testuser" test_password = "testpass" - schemas = ["public", "data", "collection_operations"] + schemas = ["public", "data", "operations"] # create read-only user empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) diff --git a/docs/source/development-guide/configuration-file-reference.rst b/docs/source/development-guide/configuration-file-reference.rst index a2da864cd..27fe868c2 100644 --- a/docs/source/development-guide/configuration-file-reference.rst +++ b/docs/source/development-guide/configuration-file-reference.rst @@ -1,7 +1,7 @@ Configuration file reference =============================== -CollectOSS's configuration template file, which generates your locally deployed ``augur.config.json`` file, is found at ``collectoss/config.py``. You will notice a small collection of workers are turned on to start with, by examining the ``switch`` variable within the ``Workers`` block of the config file. You can also specify the number of processes to spawn for each worker using the ``workers`` command. The default is one, and we recommend you start here. If you are going to spawn multiple workers, be sure you have enough credentials cached in the ``collection_operations.worker_oath`` table for the platforms you use. +CollectOSS's configuration template file, which generates your locally deployed ``augur.config.json`` file, is found at ``collectoss/config.py``. You will notice a small collection of workers are turned on to start with, by examining the ``switch`` variable within the ``Workers`` block of the config file. You can also specify the number of processes to spawn for each worker using the ``workers`` command. The default is one, and we recommend you start here. 
If you are going to spawn multiple workers, be sure you have enough credentials cached in the ``operations.worker_oauth`` table for the platforms you use. If you have questions or would like to help please open an issue on GitHub_. diff --git a/docs/source/development-guide/tech-breakdown.rst b/docs/source/development-guide/tech-breakdown.rst index 0e002bcfa..36caa035b 100644 --- a/docs/source/development-guide/tech-breakdown.rst +++ b/docs/source/development-guide/tech-breakdown.rst @@ -127,7 +127,7 @@ Your CollectOSS instance will now be available at http://servername-or-ip:port_number Note: CollectOSS will run on port 5000 by default (you probably need to -change that in collection_operations.config for OSX) +change that in operations.config for OSX) Stopping your CollectOSS Instance --------------------------------- diff --git a/docs/source/getting-started/collecting-data.rst b/docs/source/getting-started/collecting-data.rst index 5e7297ae3..efc7980f8 100644 --- a/docs/source/getting-started/collecting-data.rst +++ b/docs/source/getting-started/collecting-data.rst @@ -60,7 +60,7 @@ There are many collection jobs that ship ready to collect out of the box: - ``collectoss.tasks.github.releases.tasks`` (collects release data from the GitHub API) - ``collectoss.tasks.data_analysis.insight_worker.tasks`` (queries CollectOSS's metrics API to find interesting anomalies in the collected data) -All worker configuration options are found in the config table generated when collectoss was installed. The config table is located in the collection_operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. 
+All worker configuration options are found in the config table generated when collectoss was installed. The config table is located in the operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. Worker-specific configuration options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/getting-started/command-line-interface/db.rst b/docs/source/getting-started/command-line-interface/db.rst index 49ac0f74d..229a942d5 100644 --- a/docs/source/getting-started/command-line-interface/db.rst +++ b/docs/source/getting-started/command-line-interface/db.rst @@ -175,7 +175,7 @@ Example usage\: > ADD COLUMN "repo_archived_date_collected" timestamptz(0), > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; > ALTER TABLE - > update "collection_operations"."augur_settings" set value = 17 where setting = 'augur_data_version'; + > update "operations"."augur_settings" set value = 17 where setting = 'augur_data_version'; > UPDATE 1 > CLI: [db.upgrade_db_version] [INFO] Upgrading from 17 to 18 > etc... @@ -193,4 +193,4 @@ Example usage\: $ uv run collectoss db create-schema .. note:: - If this runs successfully, you should see a bunch of schema creation commands fly by pretty fast. If everything worked you should see: ``update "collection_operations"."augur_settings" set value = xx where setting = 'augur_data_version';`` at the end. + If this runs successfully, you should see a bunch of schema creation commands fly by pretty fast. If everything worked you should see: ``update "operations"."augur_settings" set value = xx where setting = 'augur_data_version';`` at the end. 
diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst index b30101902..1610eaef5 100644 --- a/docs/source/quick-start.rst +++ b/docs/source/quick-start.rst @@ -139,7 +139,7 @@ http://servername-or-ip:port_number Note: CollectOSS will run on port 5000 by default (you probably need to -change that in collection_operations.config for OSX) +change that in operations.config for OSX) Stopping your CollectOSS Instance --------------------------------- diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst index 7967a3640..58f0340a9 100644 --- a/docs/source/schema/overview.rst +++ b/docs/source/schema/overview.rst @@ -61,7 +61,7 @@ gathered from commits, issues, and other info. CollectOSS Operations ------------------------------------------------------- -The ``collection_operations`` tables are where most of the operations tables +The ``operations`` tables are where most of the operations tables exist. There are a few, like ``settings`` that remain in ``data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. 
They also diff --git a/keyman/README.md b/keyman/README.md index 18622c914..2405a0a18 100644 --- a/keyman/README.md +++ b/keyman/README.md @@ -119,7 +119,7 @@ python keyman/Orchestrator.py ## Adding Keys ```sql -INSERT INTO collection_operations.worker_oauth +INSERT INTO operations.worker_oauth (name, consumer_key, consumer_secret, access_token, access_token_secret, platform) VALUES ('My GitHub Key', 'not_used', 'not_used', 'ghp_YOURTOKEN', 'not_used', 'github_rest'); diff --git a/tests/test_application/test_config/test_config.py b/tests/test_application/test_config/test_config.py index 4ed62d9c5..4f750876e 100644 --- a/tests/test_application/test_config/test_config.py +++ b/tests/test_application/test_config/test_config.py @@ -15,7 +15,7 @@ def test_config_get_value(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -26,7 +26,7 @@ def test_config_get_value(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_get_section(test_db_config, test_db_engine): @@ -43,7 +43,7 @@ def test_config_get_section(test_db_config, test_db_engine): for data in network_data: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") 
connection.execute(query, **data) @@ -62,7 +62,7 @@ def test_config_get_section(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_load_config(test_db_config, test_db_engine): @@ -82,7 +82,7 @@ def test_config_load_config(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -102,7 +102,7 @@ def test_config_load_config(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_empty(test_db_config, test_db_engine): @@ -123,7 +123,7 @@ def test_config_empty(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -132,7 +132,7 @@ def test_config_empty(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_is_section_in_config(test_db_config, test_db_engine): @@ -151,7 +151,7 @@ def test_config_is_section_in_config(test_db_config, 
test_db_engine): for data in all_data: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -163,7 +163,7 @@ def test_config_is_section_in_config(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_add_settings(test_db_config, test_db_engine): @@ -174,7 +174,7 @@ def test_config_add_settings(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() assert result is not None assert len(result) == 2 @@ -189,7 +189,7 @@ def test_config_add_settings(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_update_settings(test_db_config, test_db_engine): @@ -212,7 +212,7 @@ def test_config_update_settings(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -222,7 +222,7 @@ def test_config_update_settings(test_db_config, test_db_engine): with test_db_engine.connect() 
as connection: - result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() assert len(result) == 3 @@ -235,7 +235,7 @@ def test_config_update_settings(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_add_section_from_json(test_db_config, test_db_engine): @@ -252,7 +252,7 @@ def test_config_add_section_from_json(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM collection_operations.config""") + result = connection.execute("""SELECT * FROM operations.config""") for row in result: dict_data = dict(row) @@ -266,7 +266,7 @@ def test_config_add_section_from_json(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_load_config_file(test_db_config): @@ -312,7 +312,7 @@ def test_config_load_config_from_dict(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() for row in result: dict_data = dict(row) @@ -328,7 +328,7 @@ def test_config_load_config_from_dict(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_clear(test_db_config, test_db_engine): @@ -342,7 +342,7 @@ def test_config_clear(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO 
"collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -350,14 +350,14 @@ def test_config_clear(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() assert len(result) == 0 finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_remove_section(test_db_config, test_db_engine): @@ -377,7 +377,7 @@ def test_remove_section(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "collection_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -385,7 +385,7 @@ def test_remove_section(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM collection_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() for row in result: dict_data = dict(row) @@ -395,7 +395,7 @@ def test_remove_section(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") diff --git 
a/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py b/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py index f09582511..363773001 100644 --- a/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py +++ b/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py @@ -73,19 +73,19 @@ def test_add_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "collection_operations"."user_groups";""") + query = s.text("""SELECT * FROM "operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None assert len(result) == 3 - query = s.text("""SELECT * FROM "collection_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][0]["id"])) + query = s.text("""SELECT * FROM "operations"."user_groups" WHERE "user_id"={};""".format(data["users"][0]["id"])) result = connection.execute(query).fetchall() assert result is not None assert len(result) == 2 - query = s.text("""SELECT * FROM "collection_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][1]["id"])) + query = s.text("""SELECT * FROM "operations"."user_groups" WHERE "user_id"={};""".format(data["users"][1]["id"])) result = connection.execute(query).fetchall() assert result is not None @@ -212,7 +212,7 @@ def test_remove_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "collection_operations"."user_groups";""") + query = s.text("""SELECT * FROM "operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None @@ -226,7 +226,7 @@ def test_remove_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "collection_operations"."user_groups";""") + query = s.text("""SELECT * FROM "operations"."user_groups";""") result = 
connection.execute(query).fetchall() assert result is not None diff --git a/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py b/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py index 58b1488d9..493af0116 100644 --- a/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py +++ b/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py @@ -74,7 +74,7 @@ def test_add_repo_to_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "collection_operations"."user_repos";""") + query = s.text("""SELECT * FROM "operations"."user_repos";""") # WHERE "group_id"=:user_group_id AND "repo_id"=:repo_id result = connection.execute(query).fetchall() @@ -82,14 +82,14 @@ def test_add_repo_to_user_group(test_db_engine): assert len(result) == 4 - query = s.text("""SELECT * FROM "collection_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) + query = s.text("""SELECT * FROM "operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) result = connection.execute(query).fetchall() assert result is not None assert len(result) == 2 - query = s.text("""SELECT * FROM "collection_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) + query = s.text("""SELECT * FROM "operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) result = connection.execute(query).fetchall() assert result is not None diff --git a/tests/test_application/test_repo_load_controller/helper.py b/tests/test_application/test_repo_load_controller/helper.py index 4a8e880e8..d4373132b 100644 --- a/tests/test_application/test_repo_load_controller/helper.py +++ b/tests/test_application/test_repo_load_controller/helper.py @@ -28,19 +28,19 @@ def get_repo_group_delete_statement(): def get_user_delete_statement(): - return 
get_delete_statement("collection_operations", "users") + return get_delete_statement("operations", "users") def get_user_repo_delete_statement(): - return get_delete_statement("collection_operations", "user_repos") + return get_delete_statement("operations", "user_repos") def get_user_group_delete_statement(): - return get_delete_statement("collection_operations", "user_groups") + return get_delete_statement("operations", "user_groups") def get_config_delete_statement(): - return get_delete_statement("collection_operations", "config") + return get_delete_statement("operations", "config") def get_repo_related_delete_statements(table_list): """Takes a list of tables related to the RepoLoadController class and generates a delete statement. @@ -96,7 +96,7 @@ def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): def get_user_repo_insert_statement(repo_id, group_id): - return """INSERT INTO "collection_operations"."user_repos" ("repo_id", "group_id") VALUES ({}, {});""".format(repo_id, group_id) + return """INSERT INTO "operations"."user_repos" ("repo_id", "group_id") VALUES ({}, {});""".format(repo_id, group_id) def get_repo_group_insert_statement(rg_id): @@ -104,14 +104,14 @@ def get_repo_group_insert_statement(rg_id): def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"): - return """INSERT INTO "collection_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) + return """INSERT INTO "operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) def get_user_group_insert_statement(user_id, group_name, group_id=None): if group_id: - return 
"""INSERT INTO "collection_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) + return """INSERT INTO "operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) - return """INSERT INTO "collection_operations"."user_groups" ("user_id", "name") VALUES ({}, '{}');""".format(user_id, group_name) + return """INSERT INTO "operations"."user_groups" ("user_id", "name") VALUES ({}, '{}');""".format(user_id, group_name) ######## Helper Functions to get retrieve data from tables ################# @@ -135,7 +135,7 @@ def get_repos(connection, where_string=None): def get_user_repos(connection): - return connection.execute(s.text("""SELECT * FROM "collection_operations"."user_repos";""")).fetchall() + return connection.execute(s.text("""SELECT * FROM "operations"."user_repos";""")).fetchall() ######## Helper Functions to get repos in an org ################# diff --git a/tests/test_application/test_repo_load_controller/util.py b/tests/test_application/test_repo_load_controller/util.py index 5bacc69f1..d966a7be7 100644 --- a/tests/test_application/test_repo_load_controller/util.py +++ b/tests/test_application/test_repo_load_controller/util.py @@ -14,19 +14,19 @@ def get_repo_group_delete_statement(): def get_user_delete_statement(): - return get_delete_statement("collection_operations", "users") + return get_delete_statement("operations", "users") def get_user_repo_delete_statement(): - return get_delete_statement("collection_operations", "user_repos") + return get_delete_statement("operations", "user_repos") def get_user_group_delete_statement(): - return get_delete_statement("collection_operations", "user_groups") + return get_delete_statement("operations", "user_groups") def get_config_delete_statement(): - return get_delete_statement("collection_operations", "config") + return get_delete_statement("operations", "config") def 
get_repo_related_delete_statements(table_list): """Takes a list of tables related to the RepoLoadController class and generates a delete statement. @@ -86,14 +86,14 @@ def get_repo_group_insert_statement(rg_id): def get_user_insert_statement(user_id): - return """INSERT INTO "collection_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, 'bil', 'pass', 'b@gmil.com', 'bill', 'bob', false);""".format(user_id) + return """INSERT INTO "operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, 'bil', 'pass', 'b@gmil.com', 'bill', 'bob', false);""".format(user_id) def get_user_group_insert_statement(user_id, group_name, group_id=None): if group_id: - return """INSERT INTO "collection_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) + return """INSERT INTO "operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) - return """INSERT INTO "collection_operations"."user_groups" (user_id", "name") VALUES (1, 'default');""".format(user_id, group_name) + return """INSERT INTO "operations"."user_groups" (user_id", "name") VALUES (1, 'default');""".format(user_id, group_name) ######## Helper Functions to get retrieve data from tables ################# @@ -117,7 +117,7 @@ def get_repos(connection, where_string=None): def get_user_repos(connection): - return connection.execute(s.text("""SELECT * FROM "collection_operations"."user_repos";""")).fetchall() + return connection.execute(s.text("""SELECT * FROM "operations"."user_repos";""")).fetchall() ######## Helper Functions to get repos in an org ################# diff --git a/tests/test_helpers.py b/tests/test_helpers.py index fd75b7fbf..e73be9c9e 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -14,7 +14,7 @@ def set_search_path(dbapi_connection, 
connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,data,collection_operations,spdx") + cursor.execute("SET SESSION search_path=public,data,operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit @@ -79,4 +79,4 @@ def test_discover_config_file_env_no_exception(): -""" \ No newline at end of file +""" diff --git a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py index 54849ebbc..44d42f3f3 100644 --- a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py +++ b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py @@ -33,7 +33,7 @@ def test_get_config_key(key_handler, test_db_engine): data = {"github_api_key": "asdfdfkey"} with test_db_engine.connect() as connection: - query = text("""INSERT INTO "collection_operations"."config" ("id", "section_name", "setting_name", "value", "type") VALUES (3, 'Keys', 'github_api_key', :github_api_key, 'str');""") + query = text("""INSERT INTO "operations"."config" ("id", "section_name", "setting_name", "value", "type") VALUES (3, 'Keys', 'github_api_key', :github_api_key, 'str');""") connection.execute(query, **data) @@ -43,7 +43,7 @@ def test_get_config_key(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_get_config_key_with_none_specified(key_handler, test_db_engine): @@ -64,7 +64,7 @@ def test_get_api_keys_from_database(key_handler, test_db_engine): for value in data: - query = text("""INSERT INTO "collection_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", 
"repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") + query = text("""INSERT INTO "operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") connection.execute(query, **value) @@ -78,7 +78,7 @@ def test_get_api_keys_from_database(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.worker_oauth""") + connection.execute("""DELETE FROM operations.worker_oauth""") api_key_list = ["asdfdfkey", "jloire", "zdfdr", "asdrxer"] @pytest.mark.parametrize("api_key", api_key_list) @@ -101,7 +101,7 @@ def test_get_api_keys(key_handler, test_db_engine): for value in data: - query = text("""INSERT INTO "collection_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") + query = text("""INSERT INTO "operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") connection.execute(query, **value) @@ -112,4 +112,4 @@ def test_get_api_keys(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM collection_operations.worker_oauth""") \ No newline at end of file + connection.execute("""DELETE FROM operations.worker_oauth""") \ No newline at end of file diff --git a/tests/test_workers/test_set_up_fixtures.py b/tests/test_workers/test_set_up_fixtures.py index 8594fd7f4..9248d238a 100644 --- a/tests/test_workers/test_set_up_fixtures.py +++ b/tests/test_workers/test_set_up_fixtures.py @@ -154,7 +154,7 @@ def initialize_database_connections(self): ) self.db_schema 
= 'data' - self.helper_schema = 'collection_operations' + self.helper_schema = 'operations' self.helper_db = s.create_engine(DB_STR, poolclass=s.pool.NullPool, connect_args={'options': '-csearch_path={}'.format(self.helper_schema)}) From 00ca828a7ef450dee3779168ef4a9cf97d39dcdd Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 4 Jun 2026 10:36:56 -0400 Subject: [PATCH 085/165] update schema migration Signed-off-by: Adrian Edwards --- .../alembic/versions/43_rename_schema.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/collectoss/application/schema/alembic/versions/43_rename_schema.py b/collectoss/application/schema/alembic/versions/43_rename_schema.py index b860cd6e4..4e6692551 100644 --- a/collectoss/application/schema/alembic/versions/43_rename_schema.py +++ b/collectoss/application/schema/alembic/versions/43_rename_schema.py @@ -19,26 +19,26 @@ def upgrade() -> None: conn = op.get_bind() - conn.execute(text("ALTER SCHEMA augur_data RENAME TO collection_data;")) - conn.execute(text("ALTER SCHEMA augur_operations RENAME TO collection_operations;")) + conn.execute(text("ALTER SCHEMA augur_data RENAME TO data;")) + conn.execute(text("ALTER SCHEMA augur_operations RENAME TO operations;")) op.create_table_comment( 'repos_fetch_log', - 'For future use when we move all working tables to the collection_operations schema. ', + 'For future use when we move all working tables to the operations schema. ', existing_comment='For future use when we move all working tables to the augur_operations schema. ', - schema='collection_operations' + schema='operations' ) op.create_table_comment( 'worker_settings_facade', - 'For future use when we move all working tables to the collection_operations schema. ', + 'For future use when we move all working tables to the operations schema. ', existing_comment='For future use when we move all working tables to the augur_operations schema. 
', - schema='collection_operations' + schema='operations' ) op.create_table_comment( 'working_commits', - 'For future use when we move all working tables to the collection_operations schema. ', + 'For future use when we move all working tables to the operations schema. ', existing_comment='For future use when we move all working tables to the augur_operations schema. ', - schema='collection_operations' + schema='operations' ) @@ -48,22 +48,22 @@ def downgrade() -> None: op.create_table_comment( 'working_commits', 'For future use when we move all working tables to the augur_operations schema. ', - existing_comment='For future use when we move all working tables to the collection_operations schema. ', - schema='collection_operations' + existing_comment='For future use when we move all working tables to the operations schema. ', + schema='operations' ) op.create_table_comment( 'worker_settings_facade', 'For future use when we move all working tables to the augur_operations schema. ', - existing_comment='For future use when we move all working tables to the collection_operations schema. ', - schema='collection_operations' + existing_comment='For future use when we move all working tables to the operations schema. ', + schema='operations' ) op.create_table_comment( 'repos_fetch_log', 'For future use when we move all working tables to the augur_operations schema. ', - existing_comment='For future use when we move all working tables to the collection_operations schema. ', - schema='collection_operations' + existing_comment='For future use when we move all working tables to the operations schema. 
', + schema='operations' ) conn = op.get_bind() - conn.execute(text("ALTER SCHEMA collection_data RENAME TO augur_data;")) - conn.execute(text("ALTER SCHEMA collection_operations RENAME TO augur_operations;")) \ No newline at end of file + conn.execute(text("ALTER SCHEMA data RENAME TO augur_data;")) + conn.execute(text("ALTER SCHEMA operations RENAME TO augur_operations;")) \ No newline at end of file From 20de8da51d2fc488aea5b2e9f67dc5ed7a05608b Mon Sep 17 00:00:00 2001 From: Diptesh Roy Date: Sun, 7 Jun 2026 22:10:52 +0530 Subject: [PATCH 086/165] docs: move data sources to index.rst What is CollectOSS section Per reviewer feedback, moved the data sources list from scope.rst to the 'What is CollectOSS' section in index.rst under a new 'Where CollectOSS gets its data' subsection, which is a better fit for this content. Signed-off-by: Diptesh Roy --- docs/source/about/scope.rst | 13 +------------ docs/source/index.rst | 11 +++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/source/about/scope.rst b/docs/source/about/scope.rst index 70ccd4749..e89b319dc 100644 --- a/docs/source/about/scope.rst +++ b/docs/source/about/scope.rst @@ -7,15 +7,4 @@ The data CollectOSS collects covers more than just code contributions and extend This scope is intentionally narrower than that of the CHAOSS project as a whole to help keep the CollectOSS project sustainable with the resources available. Usecases and discussion of perspectives outside this defined scope are still welcome in the CHAOSS community, but may not be a good fit for direct contributions to CollectOSS. These usecases may work best as a complementary add-on project, new working group, or third-party addon to collectoss that depends on or extends CollectOSS functionality. -Future expansions of CollectOSS's scope may also bring in these community addons into the main codebase if new resources become available to sustain such expansion. 
- -Data Sources ------------- - -CollectOSS collects data from a variety of sources: - -1. Raw Git commit logs (commits, contributors) -2. GitHub's API (issues, pull requests, contributors, releases, repository metadata) -3. The Linux Foundation's `Core Infrastructure Initiative `_ API (repository metadata) -4. `Succinct Code Counter `_, a blazingly fast Sloc, Cloc, and Code tool that also performs COCOMO calculations -5. `OpenSSF Scorecard `_ analysis (security health metrics for open source projects) \ No newline at end of file +Future expansions of CollectOSS's scope may also bring in these community addons into the main codebase if new resources become available to sustain such expansion. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 95d14998c..bb19b1657 100755 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,17 @@ How CollectOSS works 3. It organizes this data into a standard format called a data model. 4. Then it calculates metrics that tell you about the project’s health. +Where CollectOSS gets its data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +CollectOSS collects data from a variety of sources: + +1. Raw Git commit logs (commits, contributors) +2. GitHub’s API (issues, pull requests, contributors, releases, repository metadata) +3. The Linux Foundation’s `Core Infrastructure Initiative `_ API (repository metadata) +4. `Succinct Code Counter `_, a blazingly fast Sloc, Cloc, and Code tool that also performs COCOMO calculations +5. `OpenSSF Scorecard `_ analysis (security health metrics for open source projects) + Example of a metric: Burstiness ------------------------------- - Burstiness is one of CollectOSS’s metrics. 
From c037dcc03b75bbbed652cf0382cf82522394669a Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 14:00:23 -0400 Subject: [PATCH 087/165] add a deprefix helper and some unit tests Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 26 +++++++++++++++++++ .../test_config/test_environment.py | 17 ++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 collectoss/application/environment.py create mode 100644 tests/test_application/test_config/test_environment.py diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py new file mode 100644 index 000000000..4b737912f --- /dev/null +++ b/collectoss/application/environment.py @@ -0,0 +1,26 @@ + +def _deprefix(key: str, prefixes: list[str], separator = "_") -> str: + """Remove a prefix from the provided key + + + Args: + key (str): the key to remove the prefix from + prefixes (list[str]): the prefixes to look for + separator (str, optional): the separator between elements of the key to also remove (if they would otherwise be dangling). Defaults to "_". 
+ + Returns: + str: The key value with the prefix removed if possible, otherwise returns the value of `key` + """ + unprefixed = None + for p in prefixes: + p = p.upper() + k = key.upper() + if k.startswith(p): + unprefixed = key[len(p):] + + if unprefixed.startswith(separator): + unprefixed = unprefixed[len(separator):] + return unprefixed + return key + + diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py new file mode 100644 index 000000000..bca34d92c --- /dev/null +++ b/tests/test_application/test_config/test_environment.py @@ -0,0 +1,17 @@ +from collectoss.application.environment import SystemEnv, _deprefix +import logging + +logger = logging.getLogger(__name__) + +prefixes = ["COLLECTOSS", "OTHER"] + +def test_env_deprefix(): + assert _deprefix("OTHER_DB", prefixes) == "DB" + assert _deprefix("COLLECTOSS_DB", prefixes) == "DB" + +def test_env_deprefix_default(): + assert _deprefix("SOME_DB", prefixes) == "SOME_DB" + assert _deprefix("THINGY_DB", prefixes) == "THINGY_DB" + +def test_env_deprefix_unprefixed(): + assert _deprefix("DB", prefixes) == "DB" From d62214b90ed0c8f8c7068ee206b0ca91ca79cb9b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 14:16:24 -0400 Subject: [PATCH 088/165] refactor into a better prefix extraction helper Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 21 +++++++++---------- .../test_config/test_environment.py | 19 +++++++++-------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 4b737912f..4bb02b631 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -1,7 +1,7 @@ +from typing import Optional -def _deprefix(key: str, prefixes: list[str], separator = "_") -> str: - """Remove a prefix from the provided key - +def extract_prefix(key: str, prefixes: list[str], separator = "_") 
-> Optional[str]: + """Detect and return the prefix present on the provided key Args: key (str): the key to remove the prefix from @@ -9,18 +9,17 @@ def _deprefix(key: str, prefixes: list[str], separator = "_") -> str: separator (str, optional): the separator between elements of the key to also remove (if they would otherwise be dangling). Defaults to "_". Returns: - str: The key value with the prefix removed if possible, otherwise returns the value of `key` + str: The detected prefix (including any separators) if any, otherwise None """ - unprefixed = None + prefix_len = 0 for p in prefixes: p = p.upper() k = key.upper() if k.startswith(p): - unprefixed = key[len(p):] - - if unprefixed.startswith(separator): - unprefixed = unprefixed[len(separator):] - return unprefixed - return key + prefix_len += len(p) + if k[prefix_len] == separator: + prefix_len += len(separator) + return key[0:prefix_len] + return None diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py index bca34d92c..d3248412c 100644 --- a/tests/test_application/test_config/test_environment.py +++ b/tests/test_application/test_config/test_environment.py @@ -1,17 +1,18 @@ -from collectoss.application.environment import SystemEnv, _deprefix +from collectoss.application.environment import SystemEnv, extract_prefix import logging logger = logging.getLogger(__name__) prefixes = ["COLLECTOSS", "OTHER"] -def test_env_deprefix(): - assert _deprefix("OTHER_DB", prefixes) == "DB" - assert _deprefix("COLLECTOSS_DB", prefixes) == "DB" +def test_env_extract_prefix(): + assert extract_prefix("OTHER_DB", prefixes) == "OTHER_" + assert extract_prefix("COLLECTOSS_DB", prefixes) == "COLLECTOSS_" -def test_env_deprefix_default(): - assert _deprefix("SOME_DB", prefixes) == "SOME_DB" - assert _deprefix("THINGY_DB", prefixes) == "THINGY_DB" +def test_env_extract_prefix_default(): + assert extract_prefix("SOME_DB", prefixes) is None + assert 
extract_prefix("THINGY_DB", prefixes) is None -def test_env_deprefix_unprefixed(): - assert _deprefix("DB", prefixes) == "DB" + +def test_env_extract_prefix_unprefixed(): + assert extract_prefix("DB", prefixes) is None From 4a3d431733320f6474f6e99aad1fb71cd2219375 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 14:21:42 -0400 Subject: [PATCH 089/165] add first pass SystemEnv Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 37 +++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 4bb02b631..3bf9fdb3e 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -1,4 +1,9 @@ from typing import Optional +import os +import warnings +import logging + +logger = logging.getLogger(__name__) def extract_prefix(key: str, prefixes: list[str], separator = "_") -> Optional[str]: """Detect and return the prefix present on the provided key @@ -23,3 +28,35 @@ def extract_prefix(key: str, prefixes: list[str], separator = "_") -> Optional[s return key[0:prefix_len] return None + +class SystemEnv: + """Centralized environment variable access + Built for enabling migration of environment variable names + """ + + _prefixes = ["COLLECTOSS", "AUGUR"] + _warn_prefixes = ["AUGUR"] + _separator = "_" + + @classmethod + def get(cls, key: str, default = None) -> Optional[str]: + # extract the suffix so we can try multiple prefixes + canonical_prefix = extract_prefix(key, cls._prefixes, cls._separator) + suffix = key[len(canonical_prefix):] if canonical_prefix is not None else key + # check prefixes in order and use the first one that has a value + for p in cls._prefixes: + check_key = f"{p}{cls._separator}{suffix}" + value = os.getenv(check_key, None) + + if value is not None: + # emit a warning if configured + if p in cls._warn_prefixes: + msg = ( + f"Environment variable '{check_key}' is deprecated. 
" + f"Use '{key}' instead. Automatic recovery may be removed in a future version" + ) + logger.warning(msg) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + return value + return default From 61a564f2315d102f529bcd75c2f17461d64431ea Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 14:39:55 -0400 Subject: [PATCH 090/165] allow different prefixes to be passed in for testing purposes Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 3bf9fdb3e..7cadc0dfc 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -39,12 +39,12 @@ class SystemEnv: _separator = "_" @classmethod - def get(cls, key: str, default = None) -> Optional[str]: + def get(cls, key: str, default = None, prefixes = _prefixes) -> Optional[str]: # extract the suffix so we can try multiple prefixes - canonical_prefix = extract_prefix(key, cls._prefixes, cls._separator) + canonical_prefix = extract_prefix(key, prefixes, cls._separator) suffix = key[len(canonical_prefix):] if canonical_prefix is not None else key # check prefixes in order and use the first one that has a value - for p in cls._prefixes: + for p in prefixes: check_key = f"{p}{cls._separator}{suffix}" value = os.getenv(check_key, None) From e8cc02fb398845ba7dc1ba082f1e0e6eec8715fc Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 14:40:08 -0400 Subject: [PATCH 091/165] update deprecation message Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 7cadc0dfc..ae9d81bb7 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -53,7 +53,7 @@ def get(cls, key: str, default = None, 
prefixes = _prefixes) -> Optional[str]: if p in cls._warn_prefixes: msg = ( f"Environment variable '{check_key}' is deprecated. " - f"Use '{key}' instead. Automatic recovery may be removed in a future version" + f"Use '{key}' instead. This automatic recovery may become a failure in a future version " ) logger.warning(msg) warnings.warn(msg, DeprecationWarning, stacklevel=2) From b2f99caa89e28cf79f716ab29d952bc39b72ab44 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 14:42:44 -0400 Subject: [PATCH 092/165] basic functionality unit tests Signed-off-by: Adrian Edwards --- .../test_config/test_environment.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py index d3248412c..11c1e65aa 100644 --- a/tests/test_application/test_config/test_environment.py +++ b/tests/test_application/test_config/test_environment.py @@ -1,5 +1,6 @@ from collectoss.application.environment import SystemEnv, extract_prefix import logging +import os logger = logging.getLogger(__name__) @@ -16,3 +17,23 @@ def test_env_extract_prefix_default(): def test_env_extract_prefix_unprefixed(): assert extract_prefix("DB", prefixes) is None + +def test_fetching_env(): + # plain + os.environ["COLLECTOSS_NAME"] = "A" + assert SystemEnv.get("COLLECTOSS_NAME") == "A" + + # fallback handling + os.environ["OTHER_THING"] = "B" + assert SystemEnv.get("COLLECTOSS_THING", None, prefixes) == "B" + + # cleanup + del os.environ["COLLECTOSS_NAME"] + del os.environ["OTHER_THING"] + +def test_fetching_env_no_value(): + assert SystemEnv.get("COLLECTOSS_MISSING", None, prefixes) is None + +def test_fetching_env_default(): + assert SystemEnv.get("COLLECTOSS_DEFAULT", "SOME", prefixes) == "SOME" + From 7a737fc8f48f6f3e575b1c3527baaf696d8f3e21 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:00:36 -0400 Subject: [PATCH 093/165] handle cases with no known 
prefix to avoid breaking stuff Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 4 ++++ tests/test_application/test_config/test_environment.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index ae9d81bb7..c2eca8b15 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -59,4 +59,8 @@ def get(cls, key: str, default = None, prefixes = _prefixes) -> Optional[str]: warnings.warn(msg, DeprecationWarning, stacklevel=2) return value + + if not canonical_prefix: + return os.getenv(key, default) + return default diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py index 11c1e65aa..3b31bf950 100644 --- a/tests/test_application/test_config/test_environment.py +++ b/tests/test_application/test_config/test_environment.py @@ -37,3 +37,8 @@ def test_fetching_env_no_value(): def test_fetching_env_default(): assert SystemEnv.get("COLLECTOSS_DEFAULT", "SOME", prefixes) == "SOME" +def test_no_known_prefix(): + # fallback handling + os.environ["THING"] = "C" + assert SystemEnv.get("THING", None, prefixes) == "C" + From f6109aa87b572f783408b6f303e5c7cab41cb808 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:10:22 -0400 Subject: [PATCH 094/165] factor out bool fetching class to deduplicate logic Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 5 ++++ .../test_config/test_environment.py | 29 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index c2eca8b15..33bb0a7f5 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -64,3 +64,8 @@ def get(cls, key: str, default = None, prefixes = _prefixes) -> Optional[str]: return os.getenv(key, default) return default + + @classmethod + def 
get_bool(cls, key:str, default: bool, prefixes = _prefixes) -> bool: + raw_val = cls.get(key, None, prefixes) + return raw_val.lower() in ('true', '1', 't', 'y', 'yes') if raw_val else default diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py index 3b31bf950..aa00bcb41 100644 --- a/tests/test_application/test_config/test_environment.py +++ b/tests/test_application/test_config/test_environment.py @@ -42,3 +42,32 @@ def test_no_known_prefix(): os.environ["THING"] = "C" assert SystemEnv.get("THING", None, prefixes) == "C" + +def test_get_bool_trues(): + + cases = ["1", "true", "True", "TRUE", "y", "Y", "yes", "Yes"] + + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == True + del os.environ["OTHER_BOOL"] + +def test_get_bool_falses(): + + cases = ["0", "false", "False", "FALSE", "n", "N", "no", "No"] + + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", True, prefixes) == False + del os.environ["OTHER_BOOL"] + +def test_get_bool_default(): + + cases = ["?", "maybe", "Stuff", "333"] + + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == False + del os.environ["OTHER_BOOL"] + + From 67ddbfd930b59a1ab1a72c3d3bf5c563c63e7e42 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:20:12 -0400 Subject: [PATCH 095/165] Test to make sure we can potentially migrate the AUGUR_ vars in the code slowly over time Signed-off-by: Adrian Edwards --- tests/test_application/test_config/test_environment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py index aa00bcb41..6b62f2ec9 100644 --- a/tests/test_application/test_config/test_environment.py +++ b/tests/test_application/test_config/test_environment.py @@ 
-31,6 +31,13 @@ def test_fetching_env(): del os.environ["COLLECTOSS_NAME"] del os.environ["OTHER_THING"] +def test_fetching_env_backwards(): + os.environ["COLLECTOSS_NAME"] = "A" + assert SystemEnv.get("OTHER_NAME", None, prefixes) == "A" + + # cleanup + del os.environ["COLLECTOSS_NAME"] + def test_fetching_env_no_value(): assert SystemEnv.get("COLLECTOSS_MISSING", None, prefixes) is None From ddb0ef55cd7a56327ab7be124104c5193ad5af70 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:20:28 -0400 Subject: [PATCH 096/165] replace all references to os.getenv with new class Signed-off-by: Adrian Edwards --- collectoss/api/gunicorn_conf.py | 8 +++++--- collectoss/application/cli/__init__.py | 8 +++++--- collectoss/application/cli/api.py | 4 +++- collectoss/application/cli/backend.py | 11 ++++++----- collectoss/application/cli/collection.py | 3 ++- collectoss/application/cli/db.py | 5 +++-- collectoss/application/config.py | 6 ++++-- collectoss/tasks/git/dependency_tasks/core.py | 3 ++- collectoss/tasks/git/scc_value_tasks/core.py | 3 ++- .../git/util/facade_worker/facade_worker/config.py | 4 +++- 10 files changed, 35 insertions(+), 20 deletions(-) diff --git a/collectoss/api/gunicorn_conf.py b/collectoss/api/gunicorn_conf.py index 22c11231a..ee7797471 100644 --- a/collectoss/api/gunicorn_conf.py +++ b/collectoss/api/gunicorn_conf.py @@ -7,6 +7,7 @@ from collectoss.application.db.lib import get_value from collectoss.application.db import dispose_database_engine +from collectoss.application.environment import SystemEnv logger = logging.getLogger(__name__) @@ -20,8 +21,8 @@ workers = multiprocessing.cpu_count() * 2 + 1 umask = 0o007 reload = True - -is_dev = os.getenv("AUGUR_DEV", 'False').lower() in ('true', '1', 't', 'y', 'yes') +# this satisfies the type checker +is_dev = SystemEnv.get_bool("AUGUR_DEV", False) if is_dev: @@ -40,7 +41,8 @@ # set the log location for gunicorn logs_directory = get_value('Logging', 'logs_directory') -is_docker = 
os.getenv("AUGUR_DOCKER_DEPLOY", 'False').lower() in ('true', '1', 't', 'y', 'yes') +# this syntax satisfies the type checker +is_docker = SystemEnv.get_bool("AUGUR_DOCKER_DEPLOY", False) accesslog = f"{logs_directory}/gunicorn.log" errorlog = f"{logs_directory}/gunicorn.log" diff --git a/collectoss/application/cli/__init__.py b/collectoss/application/cli/__init__.py index 8081d6a8e..b398614e2 100644 --- a/collectoss/application/cli/__init__.py +++ b/collectoss/application/cli/__init__.py @@ -10,7 +10,9 @@ from collectoss.application.db.engine import DatabaseEngine from collectoss.application.db import get_engine, dispose_database_engine -from sqlalchemy.exc import OperationalError +from sqlalchemy.exc import OperationalError +from collectoss.application.environment import SystemEnv + def check_connectivity(urls=["http://chaoss.community", "http://github.com", "http://gitlab.com"], timeout=10.0): @@ -65,11 +67,11 @@ def new_func(ctx, *args, **kwargs): return ctx.invoke(function_db_connection, *args, **kwargs) except OperationalError as e: - db_environment_var = os.getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("AUGUR_DB") # determine the location to print in error string if db_environment_var: - location = f"the AUGUR_DB environment variable\nAUGUR_DB={os.getenv('AUGUR_DB')}" + location = f"the AUGUR_DB environment variable\nAUGUR_DB={SystemEnv.get('AUGUR_DB')}" else: with open("db.config.json", 'r') as f: db_config = json.load(f) diff --git a/collectoss/application/cli/api.py b/collectoss/application/cli/api.py index a8bb9e53b..70fe3a6a1 100644 --- a/collectoss/application/cli/api.py +++ b/collectoss/application/cli/api.py @@ -17,6 +17,8 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext from collectoss.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages from collectoss.application.db.lib import get_value +from 
collectoss.application.environment import SystemEnv + logger = SystemLogger("collectoss", reset_logfiles=False).get_logger() @@ -142,7 +144,7 @@ def get_api_processes(): def is_api_process(process): command = ''.join(process.info['cmdline'][:]).lower() - if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: + if SystemEnv.get('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: if process.pid != os.getpid(): diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 8add0ce18..3c5f381c4 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -15,6 +15,7 @@ import requests from redis.exceptions import ConnectionError as RedisConnectionError +from collectoss.application.environment import SystemEnv from collectoss.tasks.start_tasks import collection_monitor, create_collection_status_records from collectoss.tasks.git.facade_tasks import clone_repos from collectoss.tasks.github.contributors import process_contributors @@ -31,7 +32,7 @@ from keyman.KeyClient import KeyClient, KeyPublisher -reset_logs = os.getenv("AUGUR_RESET_LOGS", 'True').lower() in ('true', '1', 't', 'y', 'yes') +reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) logger = SystemLogger("collectoss", reset_logfiles=reset_logs).get_logger() @@ -130,7 +131,7 @@ def start(ctx, disable_collection, development, pidfile, port): processes = start_celery_worker_processes((core_worker_count, secondary_worker_count, facade_worker_count), disable_collection) manager.processes = processes - celery_beat_schedule_db = os.getenv("CELERYBEAT_SCHEDULE_DB", "celerybeat-schedule.db") + celery_beat_schedule_db = SystemEnv.get("CELERYBEAT_SCHEDULE_DB", "celerybeat-schedule.db") if os.path.exists(celery_beat_schedule_db): logger.info("Deleting old task schedule") os.remove(celery_beat_schedule_db) @@ -355,10 +356,10 @@ def export_env(config): Exports your GitHub key and 
database credentials """ - export_file = open(os.getenv('AUGUR_EXPORT_FILE', 'collectoss_export_env.sh'), 'w+') + export_file = open(SystemEnv.get('AUGUR_EXPORT_FILE') or 'collectoss_export_env.sh', 'w+') export_file.write('#!/bin/bash') export_file.write('\n') - env_file = open(os.getenv('AUGUR_ENV_FILE', 'docker_env.txt'), 'w+') + env_file = open(SystemEnv.get('AUGUR_ENV_FILE') or 'docker_env.txt', 'w+') for env_var in config.get_env_config().items(): if "LOG" not in env_var[0]: @@ -403,7 +404,7 @@ def get_backend_processes(): for process in psutil.process_iter(['cmdline', 'name', 'environ']): if process.info['cmdline'] is not None and process.info['environ'] is not None: try: - if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in ''.join(process.info['cmdline'][:]).lower(): + if SystemEnv.get('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in ''.join(process.info['cmdline'][:]).lower(): if process.pid != os.getpid(): process_list.append(process) except (KeyError, FileNotFoundError): diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index 78b6f5d13..c502dc91e 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -14,6 +14,7 @@ import traceback import sqlalchemy as s +from collectoss.application.environment import SystemEnv from collectoss.tasks.start_tasks import collection_monitor, create_collection_status_records from collectoss.tasks.git.facade_tasks import clone_repos from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler @@ -237,7 +238,7 @@ def get_collection_processes(): def is_collection_process(process): command = ''.join(process.info['cmdline'][:]).lower() - if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: + if SystemEnv.get('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: if process.pid != 
os.getpid(): if "collectossbackendcollection" in command or "celery_app.celery_appbeat" in command: diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index 25ea8a88e..fe2250742 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -28,6 +28,7 @@ process_repo_csv, process_repo_group_csv, ) +from collectoss.application.environment import SystemEnv logger = logging.getLogger(__name__) @@ -379,7 +380,7 @@ def get_api_key(ctx): short_help="Check the ~/.pgpass file for CollectOSS's database credentials", ) def check_pgpass(): - db_environment_var = getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("AUGUR_DB") if db_environment_var: # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ @@ -495,7 +496,7 @@ def run_psql_command_in_database(target_type, target): logger.error("Invalid target type. Exiting...") exit(1) - db_environment_var = getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("AUGUR_DB") # db_json_file_location = os.getcwd() + "/db.config.json" # db_json_exists = os.path.exists(db_json_file_location) diff --git a/collectoss/application/config.py b/collectoss/application/config.py index 56e6c57ae..051235323 100644 --- a/collectoss/application/config.py +++ b/collectoss/application/config.py @@ -7,6 +7,8 @@ from collectoss.application.db.models import Config from collectoss.application.db.util import execute_session_query, convert_type_of_value from pathlib import Path +from collectoss.application.environment import SystemEnv + import logging def get_development_flag_from_config(): @@ -27,7 +29,7 @@ def get_development_flag_from_config(): return flag def get_development_flag(): - return os.getenv("AUGUR_DEV") or get_development_flag_from_config() or False + return SystemEnv.get("AUGUR_DEV") or get_development_flag_from_config() or False def redact_setting_value(section_name, setting_name, value): value_redacted = 
value if section_name != "Keys" else "REDACTED" @@ -167,7 +169,7 @@ def __init__(self, logger, session: DatabaseSession, config_sources: list = None JsonConfig(default_config, logger) ] - config_dir = Path(os.getenv("CONFIG_DATADIR", "./")) + config_dir = Path(SystemEnv.get("CONFIG_DATADIR") or "./") config_path = config_dir.joinpath("augur.json") if config_path.exists(): config_sources.append(JsonConfig(json.loads(config_path.read_text(encoding="UTF-8")), logger)) diff --git a/collectoss/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py index a9e74b4e1..0b713de93 100644 --- a/collectoss/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -2,6 +2,7 @@ import os from collectoss.application.db.models import * from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session +from collectoss.application.environment import SystemEnv from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from collectoss.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from collectoss.tasks.util.worker_util import parse_json_from_subprocess_call @@ -79,7 +80,7 @@ def generate_scorecard(logger, repo_git): command = '--repo=' + path #this is path where our scorecard project is located - path_to_scorecard = os.getenv('SCORECARD_DIR', os.environ['HOME'] + '/scorecard') + path_to_scorecard = SystemEnv.get('SCORECARD_DIR', os.environ['HOME'] + '/scorecard') #setting the environmental variable which is required by scorecard diff --git a/collectoss/tasks/git/scc_value_tasks/core.py b/collectoss/tasks/git/scc_value_tasks/core.py index 7c9e0bafd..a526af990 100644 --- a/collectoss/tasks/git/scc_value_tasks/core.py +++ b/collectoss/tasks/git/scc_value_tasks/core.py @@ -2,6 +2,7 @@ import os from collectoss.application.db.models import * from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value 
+from collectoss.application.environment import SystemEnv from collectoss.tasks.util.worker_util import parse_json_from_subprocess_call from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path @@ -20,7 +21,7 @@ def value_model(logger,repo_git): logger.info(f"Repo ID: {repo_id}, Path: {path}") logger.info('Running scc...') - path_to_scc = os.getenv('SCC_DIR', os.environ['HOME'] + '/scc') + path_to_scc = SystemEnv.get('SCC_DIR', (SystemEnv.get('HOME') or "~") + '/scc') required_output = parse_json_from_subprocess_call(logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py index 7da6495bd..9db7d8866 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py @@ -40,11 +40,13 @@ from collectoss.application.db.lib import execute_sql from logging import Logger +from collectoss.application.environment import SystemEnv + logger = logging.getLogger(__name__) def get_database_args_from_env(): - db_str = os.getenv("AUGUR_DB") + db_str = SystemEnv.get("AUGUR_DB") try: db_json_file_location = os.getcwd() + "/db.config.json" except FileNotFoundError: From 5b04de6ab9b28cfdeecae1f62c0ad02db3bccc81 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:20:48 -0400 Subject: [PATCH 097/165] deprecate older Environment class that is buried in the module tree Signed-off-by: Adrian Edwards --- collectoss/api/view/server/Environment.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/collectoss/api/view/server/Environment.py b/collectoss/api/view/server/Environment.py index 76b8207ca..4d35bc70a 100644 --- a/collectoss/api/view/server/Environment.py +++ b/collectoss/api/view/server/Environment.py @@ -1,4 +1,5 @@ import os +from typing_extensions import deprecated class Environment: 
""" @@ -7,16 +8,19 @@ class Environment: with subscript notation without needing to deal with the particularities of non-existent values. """ + @deprecated("use collectoss.application.environment.SystemEnv instead") def __init__(self, **kwargs): for (key, value) in kwargs.items(): self[key] = value + @deprecated("use collectoss.application.environment.SystemEnv instead") def setdefault(self, key, value): if not self[key]: self[key] = value return value return self[key] + @deprecated("use collectoss.application.environment.SystemEnv instead") def setall(self, **kwargs): result = {} for (key, value) in kwargs.items(): @@ -24,6 +28,7 @@ def setall(self, **kwargs): result[key] = self[key] self[key] = value + @deprecated("use collectoss.application.environment.SystemEnv instead") def getany(self, *args): result = {} for arg in args: @@ -31,6 +36,7 @@ def getany(self, *args): result[arg] = self[arg] return result + @deprecated("use collectoss.application.environment.SystemEnv instead") def as_type(self, type, key): if self[key]: return type(self[key]) From 827dfb9adb3307e58ddffbd8fa1ca200b4ec7f52 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:27:24 -0400 Subject: [PATCH 098/165] get_bool docstring Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 33bb0a7f5..462386f72 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -67,5 +67,7 @@ def get(cls, key: str, default = None, prefixes = _prefixes) -> Optional[str]: @classmethod def get_bool(cls, key:str, default: bool, prefixes = _prefixes) -> bool: + """gets a value from the environment and cast it to a boolean + """ raw_val = cls.get(key, None, prefixes) return raw_val.lower() in ('true', '1', 't', 'y', 'yes') if raw_val else default From 1963d3592041feea03ea233ac85ca65693560db8 Mon Sep 17 00:00:00 
2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:27:34 -0400 Subject: [PATCH 099/165] basic setter Signed-off-by: Adrian Edwards --- collectoss/application/environment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 462386f72..3a28c12a9 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -71,3 +71,10 @@ def get_bool(cls, key:str, default: bool, prefixes = _prefixes) -> bool: """ raw_val = cls.get(key, None, prefixes) return raw_val.lower() in ('true', '1', 't', 'y', 'yes') if raw_val else default + + @classmethod + def set(cls, key: str, value: str, overwrite=True) -> None: + if os.getenv(key) is not None and not overwrite: + return + + os.environ[key] = value \ No newline at end of file From 96509061c67168273220a5d8ac62d3924d101290 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:32:05 -0400 Subject: [PATCH 100/165] replace references to os.environ[] with new class Signed-off-by: Adrian Edwards --- collectoss/api/routes/auggie.py | 4 +++- collectoss/application/cli/api.py | 2 +- collectoss/application/cli/backend.py | 8 ++++---- collectoss/application/cli/collection.py | 2 +- collectoss/tasks/git/dependency_tasks/core.py | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/collectoss/api/routes/auggie.py b/collectoss/api/routes/auggie.py index 18642498f..f02122ae3 100644 --- a/collectoss/api/routes/auggie.py +++ b/collectoss/api/routes/auggie.py @@ -14,6 +14,8 @@ import requests import slack +from collectoss.application.environment import SystemEnv + from ..server import app @@ -326,7 +328,7 @@ def slack_login(): print("slack_login") r = requests.get( - url=f'https://slack.com/api/oauth.v2.access?code={body["code"]}&client_id={os.environ["AUGGIE_CLIENT_ID"]}&client_secret={os.environ["AUGGIE_CLIENT_SECRET"]}&redirect_uri=http%3A%2F%2Flocalhost%3A8080') + 
url=f'https://slack.com/api/oauth.v2.access?code={body["code"]}&client_id={SystemEnv.get("AUGGIE_CLIENT_ID")}&client_secret={SystemEnv.get("AUGGIE_CLIENT_SECRET")}&redirect_uri=http%3A%2F%2Flocalhost%3A8080') data = r.json() if (data["ok"]): diff --git a/collectoss/application/cli/api.py b/collectoss/application/cli/api.py index 70fe3a6a1..e3e4a5a55 100644 --- a/collectoss/application/cli/api.py +++ b/collectoss/application/cli/api.py @@ -48,7 +48,7 @@ def start(ctx, development, port): raise e if development: - os.environ["AUGUR_DEV"] = "1" + SystemEnv.set("AUGUR_DEV", "1") logger.info("Starting in development mode") try: diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 3c5f381c4..edffccc1f 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -72,10 +72,10 @@ def start(ctx, disable_collection, development, pidfile, port): raise e if development: - os.environ["AUGUR_DEV"] = "1" + SystemEnv.set("AUGUR_DEV", "1") logger.info("Starting in development mode") - os.environ["AUGUR_PIDFILE"] = pidfile + SystemEnv.set("AUGUR_PIDFILE", pidfile) try: gunicorn_location = os.getcwd() + "/collectoss/api/gunicorn_conf.py" @@ -87,10 +87,10 @@ def start(ctx, disable_collection, development, pidfile, port): if not port: port = get_value("Server", "port") - os.environ["AUGUR_PORT"] = str(port) + SystemEnv.set("AUGUR_PORT", str(port)) if disable_collection: - os.environ["AUGUR_DISABLE_COLLECTION"] = "1" + SystemEnv.set("AUGUR_DISABLE_COLLECTION", "1") core_worker_count = get_value("Celery", 'core_worker_count') secondary_worker_count = get_value("Celery", 'secondary_worker_count') diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index c502dc91e..369a8de40 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -76,7 +76,7 @@ def start(ctx, development): keypub.publish(key, "gitlab_rest") if 
development: - os.environ["AUGUR_DEV"] = "1" + SystemEnv.set("AUGUR_DEV", "1") logger.info("Starting in development mode") core_worker_count = get_value("Celery", 'core_worker_count') diff --git a/collectoss/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py index 0b713de93..0e5c55b9e 100644 --- a/collectoss/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -80,14 +80,14 @@ def generate_scorecard(logger, repo_git): command = '--repo=' + path #this is path where our scorecard project is located - path_to_scorecard = SystemEnv.get('SCORECARD_DIR', os.environ['HOME'] + '/scorecard') + path_to_scorecard = SystemEnv.get('SCORECARD_DIR', (SystemEnv.get('HOME') or "~") + '/scorecard') #setting the environmental variable which is required by scorecard with get_session() as session: #key_handler = GithubRandomKeyAuth(logger) key_handler = GithubApiKeyHandler(logger) - os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + SystemEnv.set('GITHUB_AUTH_TOKEN', key_handler.get_random_key()) # This seems outdated #setting the environmental variable which is required by scorecard From dc160799ad2b5c0df5fdb5404fbfc66591add081 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 15:32:24 -0400 Subject: [PATCH 101/165] remove some redundant wrapping code Signed-off-by: Adrian Edwards --- collectoss/tasks/git/dependency_tasks/core.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/collectoss/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py index 0e5c55b9e..21f24246a 100644 --- a/collectoss/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -83,16 +83,8 @@ def generate_scorecard(logger, repo_git): path_to_scorecard = SystemEnv.get('SCORECARD_DIR', (SystemEnv.get('HOME') or "~") + '/scorecard') #setting the environmental variable which is required by scorecard - - with get_session() as 
session: - #key_handler = GithubRandomKeyAuth(logger) - key_handler = GithubApiKeyHandler(logger) - SystemEnv.set('GITHUB_AUTH_TOKEN', key_handler.get_random_key()) - - # This seems outdated - #setting the environmental variable which is required by scorecard - #key_handler = GithubApiKeyHandler(session, session.logger) - #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + key_handler = GithubApiKeyHandler(logger) + SystemEnv.set('GITHUB_AUTH_TOKEN', key_handler.get_random_key()) try: required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) From 1bb1de287ce5b2155a8deb2ad6d46d3ad96b44f7 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 16:28:10 -0400 Subject: [PATCH 102/165] replace references to os.environ.get Signed-off-by: Adrian Edwards --- collectoss/api/routes/auggie.py | 6 +++--- collectoss/application/cli/api.py | 2 +- collectoss/application/cli/backend.py | 4 ++-- collectoss/application/cli/collection.py | 2 +- collectoss/tasks/init/celery_app.py | 2 +- collectoss/tasks/start_tasks.py | 5 +++-- keyman/Orchestrator.py | 5 +++-- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/collectoss/api/routes/auggie.py b/collectoss/api/routes/auggie.py index f02122ae3..6d036045a 100644 --- a/collectoss/api/routes/auggie.py +++ b/collectoss/api/routes/auggie.py @@ -254,7 +254,7 @@ def get_auggie_user(): # return Response(response=response, status=200, mimetype="application/json") ## From Method profile_name = 'collectoss' - if os.environ.get('AUGUR_IS_PROD'): + if SystemEnv.get('AUGUR_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') response = client.get_item( @@ -280,7 +280,7 @@ def update_auggie_user_tracking(): # return Response(response=response, status=200, mimetype="application/json") ## From Method profile_name = 'collectoss' - if os.environ.get('AUGUR_IS_PROD'): + if 
SystemEnv.get('AUGUR_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') response = client.update_item( @@ -342,7 +342,7 @@ def slack_login(): email = user_response["user"]["email"] profile_name = 'collectoss' - if os.environ.get('AUGUR_IS_PROD'): + if SystemEnv.get('AUGUR_IS_PROD'): profile_name = 'default' print("Making Boto3 Session") client = boto3.Session(region_name='us-east-1', diff --git a/collectoss/application/cli/api.py b/collectoss/application/cli/api.py index e3e4a5a55..4f7077a78 100644 --- a/collectoss/application/cli/api.py +++ b/collectoss/application/cli/api.py @@ -38,7 +38,7 @@ def start(ctx, development, port): """Start CollectOSS's backend server.""" try: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index edffccc1f..e163ad366 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -62,7 +62,7 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGINT, manager.shutdown_signal_handler) try: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( @@ -145,7 +145,7 @@ def start(ctx, disable_collection, development, pidfile, port): manager.keypub = keypub if not disable_collection: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": orchestrator = subprocess.Popen("python keyman/Orchestrator.py".split()) # Wait for orchestrator startup diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index 369a8de40..c84d81907 100644 --- a/collectoss/application/cli/collection.py +++ 
b/collectoss/application/cli/collection.py @@ -46,7 +46,7 @@ def start(ctx, development): """Start CollectOSS's backend server.""" try: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( diff --git a/collectoss/tasks/init/celery_app.py b/collectoss/tasks/init/celery_app.py index e14230f99..a33e1e961 100644 --- a/collectoss/tasks/init/celery_app.py +++ b/collectoss/tasks/init/celery_app.py @@ -63,7 +63,7 @@ tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks -if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": +if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": tasks += data_analysis_tasks redis_db_number, redis_conn_string = get_redis_conn_values() diff --git a/collectoss/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py index 1f36dd90b..e5939f240 100644 --- a/collectoss/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -14,7 +14,8 @@ from collectoss.tasks.github.pull_requests.tasks import * from collectoss.tasks.github.repo_info.tasks import * from collectoss.tasks.github.releases.tasks import * -if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": +from collectoss.application.environment import SystemEnv +if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": from collectoss.tasks.data_analysis import * from collectoss.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary from collectoss.tasks.github.releases.tasks import collect_releases @@ -38,7 +39,7 @@ from collectoss.application.db.lib import execute_sql, get_session from collectoss.application.config import SystemConfig -RUNNING_DOCKER = os.environ.get('AUGUR_DOCKER_DEPLOY') == "1" +RUNNING_DOCKER = SystemEnv.get('AUGUR_DOCKER_DEPLOY') == "1" CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) diff --git a/keyman/Orchestrator.py b/keyman/Orchestrator.py index 
71cfae8bb..d93a1f064 100644 --- a/keyman/Orchestrator.py +++ b/keyman/Orchestrator.py @@ -4,15 +4,16 @@ import time from keyman.KeyOrchestrationAPI import spec, WaitKeyTimeout, InvalidRequest +from collectoss.application.environment import SystemEnv -if os.environ.get("KEYMAN_DOCKER"): +if SystemEnv.get("KEYMAN_DOCKER"): import sys import redis import logging sys.path.append("/collectoss") - conn = redis.Redis.from_url(os.environ.get("REDIS_CONN_STRING")) + conn = redis.Redis.from_url(SystemEnv.get("REDIS_CONN_STRING")) # Just log to stdout if we're running in docker logger = logging.Logger("KeyOrchestrator") From f5b501923256bdb7bed333dde770136497f4897e Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 16:38:47 -0400 Subject: [PATCH 103/165] Swap out only usage of deprecated Environment class Signed-off-by: Adrian Edwards --- collectoss/api/view/init.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/collectoss/api/view/init.py b/collectoss/api/view/init.py index ab4708793..1ab68912c 100644 --- a/collectoss/api/view/init.py +++ b/collectoss/api/view/init.py @@ -1,13 +1,11 @@ import os from pathlib import Path -from .server import Environment from collectoss.application.logs import SystemLogger import secrets, yaml - -env = Environment() +from collectoss.application.environment import SystemEnv # load configuration files and initialize globals -configFile = Path(env.setdefault("CONFIG_LOCATION", "config.yml")) +configFile = Path(SystemEnv.get("CONFIG_LOCATION") or "config.yml") settings = {} From 202465ccb9be01f53510df5eb8a030cf3f651a9c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 8 May 2026 16:39:13 -0400 Subject: [PATCH 104/165] Remove no-longer-used Environment class in API Signed-off-by: Adrian Edwards --- collectoss/api/view/server/Environment.py | 58 ----------------------- collectoss/api/view/server/__init__.py | 3 +- 2 files changed, 1 insertion(+), 60 deletions(-) delete mode 100644 
collectoss/api/view/server/Environment.py diff --git a/collectoss/api/view/server/Environment.py b/collectoss/api/view/server/Environment.py deleted file mode 100644 index 4d35bc70a..000000000 --- a/collectoss/api/view/server/Environment.py +++ /dev/null @@ -1,58 +0,0 @@ -import os -from typing_extensions import deprecated - -class Environment: - """ - This class is used to make dealing with environment variables easier. It - allows you to set multiple environment variables at once, and to get items - with subscript notation without needing to deal with the particularities of - non-existent values. - """ - @deprecated("use collectoss.application.environment.SystemEnv instead") - def __init__(self, **kwargs): - for (key, value) in kwargs.items(): - self[key] = value - - @deprecated("use collectoss.application.environment.SystemEnv instead") - def setdefault(self, key, value): - if not self[key]: - self[key] = value - return value - return self[key] - - @deprecated("use collectoss.application.environment.SystemEnv instead") - def setall(self, **kwargs): - result = {} - for (key, value) in kwargs.items(): - if self[key]: - result[key] = self[key] - self[key] = value - - @deprecated("use collectoss.application.environment.SystemEnv instead") - def getany(self, *args): - result = {} - for arg in args: - if self[arg]: - result[arg] = self[arg] - return result - - @deprecated("use collectoss.application.environment.SystemEnv instead") - def as_type(self, type, key): - if self[key]: - return type(self[key]) - return None - - def __getitem__(self, key): - return os.getenv(key) - - def __setitem__(self, key, value): - os.environ[key] = str(value) - - def __len__(self)-> int: - return len(os.environ) - - def __str__(self)-> str: - return str(os.environ) - - def __iter__(self): - return (item for item in os.environ.items()) \ No newline at end of file diff --git a/collectoss/api/view/server/__init__.py b/collectoss/api/view/server/__init__.py index e919a597a..98ce903be 100644 
--- a/collectoss/api/view/server/__init__.py +++ b/collectoss/api/view/server/__init__.py @@ -1,2 +1 @@ -from .LoginException import LoginException -from .Environment import Environment \ No newline at end of file +from .LoginException import LoginException \ No newline at end of file From da765da11a6c3ebd48a99985dc4d4f825f3cb877 Mon Sep 17 00:00:00 2001 From: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Date: Fri, 29 May 2026 17:38:44 -0400 Subject: [PATCH 105/165] Refactor extract_prefix Co-authored-by: Shlok Gilda Signed-off-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> --- collectoss/application/environment.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 3a28c12a9..9c5b3cf65 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -16,16 +16,13 @@ def extract_prefix(key: str, prefixes: list[str], separator = "_") -> Optional[s Returns: str: The detected prefix (including any separators) if any, otherwise None """ - prefix_len = 0 + k = key.upper() for p in prefixes: - p = p.upper() - k = key.upper() - if k.startswith(p): - prefix_len += len(p) - - if k[prefix_len] == separator: - prefix_len += len(separator) - return key[0:prefix_len] + p_up = p.upper() + if k == p_up: + return key[:len(p)] + if k.startswith(p_up + separator): + return key[:len(p) + len(separator)] return None From b3e92a2d42807b2ffc2afdf838cd560cbb810852 Mon Sep 17 00:00:00 2001 From: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Date: Fri, 29 May 2026 17:39:06 -0400 Subject: [PATCH 106/165] refactor get_bool Co-authored-by: Shlok Gilda Signed-off-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> --- collectoss/application/environment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/collectoss/application/environment.py 
b/collectoss/application/environment.py index 9c5b3cf65..22a8c95d4 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -67,7 +67,9 @@ def get_bool(cls, key:str, default: bool, prefixes = _prefixes) -> bool: """gets a value from the environment and cast it to a boolean """ raw_val = cls.get(key, None, prefixes) - return raw_val.lower() in ('true', '1', 't', 'y', 'yes') if raw_val else default + if raw_val is None: + return default + return raw_val.lower() in ('true', '1', 't', 'y', 'yes') @classmethod def set(cls, key: str, value: str, overwrite=True) -> None: From 21a34f1a90633803b6db56f921897ce702192a34 Mon Sep 17 00:00:00 2001 From: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Date: Fri, 29 May 2026 17:39:42 -0400 Subject: [PATCH 107/165] fix ~ path expansion in default scorecard value Co-authored-by: Shlok Gilda Signed-off-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> --- collectoss/tasks/git/dependency_tasks/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py index 21f24246a..3bd2aaab2 100644 --- a/collectoss/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -80,7 +80,7 @@ def generate_scorecard(logger, repo_git): command = '--repo=' + path #this is path where our scorecard project is located - path_to_scorecard = SystemEnv.get('SCORECARD_DIR', (SystemEnv.get('HOME') or "~") + '/scorecard') + path_to_scorecard = SystemEnv.get('SCORECARD_DIR', os.path.expanduser('~/scorecard')) #setting the environmental variable which is required by scorecard key_handler = GithubApiKeyHandler(logger) From 8945ef8d55899748f57b9085265394249dca0c5a Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 16:14:40 -0400 Subject: [PATCH 108/165] replace a bunch of env var names the application accesses impact: low due to the new env 
variable interpretation layer Signed-off-by: Adrian Edwards --- collectoss/api/routes/auggie.py | 6 +++--- collectoss/application/cli/__init__.py | 4 ++-- collectoss/application/cli/api.py | 2 +- collectoss/application/cli/backend.py | 8 ++++---- collectoss/application/cli/collection.py | 2 +- collectoss/application/cli/db.py | 4 ++-- collectoss/application/config.py | 2 +- .../tasks/git/util/facade_worker/facade_worker/config.py | 2 +- collectoss/tasks/init/celery_app.py | 2 +- collectoss/tasks/start_tasks.py | 4 ++-- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/collectoss/api/routes/auggie.py b/collectoss/api/routes/auggie.py index 6d036045a..4cde77084 100644 --- a/collectoss/api/routes/auggie.py +++ b/collectoss/api/routes/auggie.py @@ -254,7 +254,7 @@ def get_auggie_user(): # return Response(response=response, status=200, mimetype="application/json") ## From Method profile_name = 'collectoss' - if SystemEnv.get('AUGUR_IS_PROD'): + if SystemEnv.get('COLLECTOSS_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') response = client.get_item( @@ -280,7 +280,7 @@ def update_auggie_user_tracking(): # return Response(response=response, status=200, mimetype="application/json") ## From Method profile_name = 'collectoss' - if SystemEnv.get('AUGUR_IS_PROD'): + if SystemEnv.get('COLLECTOSS_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') response = client.update_item( @@ -342,7 +342,7 @@ def slack_login(): email = user_response["user"]["email"] profile_name = 'collectoss' - if SystemEnv.get('AUGUR_IS_PROD'): + if SystemEnv.get('COLLECTOSS_IS_PROD'): profile_name = 'default' print("Making Boto3 Session") client = boto3.Session(region_name='us-east-1', diff --git a/collectoss/application/cli/__init__.py b/collectoss/application/cli/__init__.py index b398614e2..18fac2f0a 100644 --- 
a/collectoss/application/cli/__init__.py +++ b/collectoss/application/cli/__init__.py @@ -67,11 +67,11 @@ def new_func(ctx, *args, **kwargs): return ctx.invoke(function_db_connection, *args, **kwargs) except OperationalError as e: - db_environment_var = SystemEnv.get("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") # determine the location to print in error string if db_environment_var: - location = f"the AUGUR_DB environment variable\nAUGUR_DB={SystemEnv.get('AUGUR_DB')}" + location = f"the AUGUR_DB environment variable\nAUGUR_DB={SystemEnv.get('COLLECTOSS_DB')}" else: with open("db.config.json", 'r') as f: db_config = json.load(f) diff --git a/collectoss/application/cli/api.py b/collectoss/application/cli/api.py index 4f7077a78..0c567c590 100644 --- a/collectoss/application/cli/api.py +++ b/collectoss/application/cli/api.py @@ -38,7 +38,7 @@ def start(ctx, development, port): """Start CollectOSS's backend server.""" try: - if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index e163ad366..3526a3c2c 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -62,7 +62,7 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGINT, manager.shutdown_signal_handler) try: - if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( @@ -145,7 +145,7 @@ def start(ctx, disable_collection, development, pidfile, port): manager.keypub = keypub if not disable_collection: - if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": orchestrator = subprocess.Popen("python keyman/Orchestrator.py".split()) # Wait for orchestrator 
startup @@ -356,10 +356,10 @@ def export_env(config): Exports your GitHub key and database credentials """ - export_file = open(SystemEnv.get('AUGUR_EXPORT_FILE') or 'collectoss_export_env.sh', 'w+') + export_file = open(SystemEnv.get('COLLECTOSS_EXPORT_FILE') or 'collectoss_export_env.sh', 'w+') export_file.write('#!/bin/bash') export_file.write('\n') - env_file = open(SystemEnv.get('AUGUR_ENV_FILE') or 'docker_env.txt', 'w+') + env_file = open(SystemEnv.get('COLLECTOSS_ENV_FILE') or 'docker_env.txt', 'w+') for env_var in config.get_env_config().items(): if "LOG" not in env_var[0]: diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index c84d81907..adf4b50e8 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -46,7 +46,7 @@ def start(ctx, development): """Start CollectOSS's backend server.""" try: - if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index fe2250742..e43e472aa 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -380,7 +380,7 @@ def get_api_key(ctx): short_help="Check the ~/.pgpass file for CollectOSS's database credentials", ) def check_pgpass(): - db_environment_var = SystemEnv.get("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") if db_environment_var: # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ @@ -496,7 +496,7 @@ def run_psql_command_in_database(target_type, target): logger.error("Invalid target type. 
Exiting...") exit(1) - db_environment_var = SystemEnv.get("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") # db_json_file_location = os.getcwd() + "/db.config.json" # db_json_exists = os.path.exists(db_json_file_location) diff --git a/collectoss/application/config.py b/collectoss/application/config.py index 051235323..16f62b5ad 100644 --- a/collectoss/application/config.py +++ b/collectoss/application/config.py @@ -29,7 +29,7 @@ def get_development_flag_from_config(): return flag def get_development_flag(): - return SystemEnv.get("AUGUR_DEV") or get_development_flag_from_config() or False + return SystemEnv.get("COLLECTOSS_DEV") or get_development_flag_from_config() or False def redact_setting_value(section_name, setting_name, value): value_redacted = value if section_name != "Keys" else "REDACTED" diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py index 9db7d8866..f6d5aa465 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py @@ -46,7 +46,7 @@ def get_database_args_from_env(): - db_str = SystemEnv.get("AUGUR_DB") + db_str = SystemEnv.get("COLLECTOSS_DB") try: db_json_file_location = os.getcwd() + "/db.config.json" except FileNotFoundError: diff --git a/collectoss/tasks/init/celery_app.py b/collectoss/tasks/init/celery_app.py index a33e1e961..22fd34872 100644 --- a/collectoss/tasks/init/celery_app.py +++ b/collectoss/tasks/init/celery_app.py @@ -63,7 +63,7 @@ tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks -if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": +if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": tasks += data_analysis_tasks redis_db_number, redis_conn_string = get_redis_conn_values() diff --git a/collectoss/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py index e5939f240..51bf25cd7 100644 --- 
a/collectoss/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -15,7 +15,7 @@ from collectoss.tasks.github.repo_info.tasks import * from collectoss.tasks.github.releases.tasks import * from collectoss.application.environment import SystemEnv -if SystemEnv.get('AUGUR_DOCKER_DEPLOY') != "1": +if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": from collectoss.tasks.data_analysis import * from collectoss.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary from collectoss.tasks.github.releases.tasks import collect_releases @@ -39,7 +39,7 @@ from collectoss.application.db.lib import execute_sql, get_session from collectoss.application.config import SystemConfig -RUNNING_DOCKER = SystemEnv.get('AUGUR_DOCKER_DEPLOY') == "1" +RUNNING_DOCKER = SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') == "1" CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) From 27fd27e8186ce7903b0bf271b172af60818e363d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 20 May 2026 16:16:06 -0400 Subject: [PATCH 109/165] update env var names in RST docs Signed-off-by: Adrian Edwards --- docs/source/deployment/production.rst | 16 ++++----- .../create-a-metric/api-development.rst | 6 ++-- .../create-a-metric/metrics-steps.rst | 2 +- .../workers/creating_a_new_worker.rst | 2 +- docs/source/docker/docker-compose.rst | 12 +++---- docs/source/docker/getting-started.rst | 24 ++++++------- docs/source/docker/quick-start.rst | 28 +++++++-------- .../command-line-interface/backend.rst | 36 +++++++++---------- .../command-line-interface/configure.rst | 16 ++++----- docs/source/getting-started/using-docker.rst | 16 ++++----- 10 files changed, 79 insertions(+), 79 deletions(-) diff --git a/docs/source/deployment/production.rst b/docs/source/deployment/production.rst index e65a987f1..614737256 100644 --- a/docs/source/deployment/production.rst +++ b/docs/source/deployment/production.rst @@ -11,10 +11,10 @@ Environment Variables 
CollectOSS uses several environment variables in production. Make sure to configure the ones relevant to your deployment: -- ``AUGUR_RESET_LOGS`` : Controls automatic log reset on server startup -- ``AUGUR_DB`` : PostgreSQL database connection string (used if variable not set) +- ``COLLECTOSS_RESET_LOGS`` : Controls automatic log reset on server startup +- ``COLLECTOSS_DB`` : PostgreSQL database connection string (the Docker container database is used if this variable is not set) -AUGUR_RESET_LOGS +COLLECTOSS_RESET_LOGS ---------------- **Description:** @@ -27,7 +27,7 @@ boolean `True` : CollectOSS clears old logs at startup. **Environment Variable:** -AUGUR_RESET_LOGS +COLLECTOSS_RESET_LOGS **Notes:** If set to `False`, CollectOSS will not reset logs automatically. Administrators must ensure log rotation or cleanup is handled manually. @@ -36,9 +36,9 @@ If set to `False`, CollectOSS will not reset logs automatically. Administrators .. code-block:: bash - export AUGUR_RESET_LOGS=False + export COLLECTOSS_RESET_LOGS=False -AUGUR_DB +COLLECTOSS_DB -------- **Description:** @@ -48,10 +48,10 @@ Specifies the connection string for the PostgreSQL database used by CollectOSS. string **Default:** -Docker container database (if `AUGUR_DB` is not specified) +Docker container database (if `COLLECTOSS_DB` is not specified) **Environment Variable:** -AUGUR_DB +COLLECTOSS_DB Related Resources ----------------- diff --git a/docs/source/development-guide/create-a-metric/api-development.rst b/docs/source/development-guide/create-a-metric/api-development.rst index 8aea48aac..12a010465 100644 --- a/docs/source/development-guide/create-a-metric/api-development.rst +++ b/docs/source/development-guide/create-a-metric/api-development.rst @@ -11,13 +11,13 @@ JSON Metrics are here: .. code-block:: bash - $ AUGUR_HOME/collectoss/metrics + $ COLLECTOSS_HOME/collectoss/metrics Visualization Metrics are here: .. 
code-block:: bash - $ AUGUR_HOME/collectoss/routes + $ COLLECTOSS_HOME/collectoss/routes Existing metrics files (JSON Metric) "Standard Metrics": @@ -46,7 +46,7 @@ You can see that one of the imports is our standard metric import from the util .. code-block:: python - AUGUR_HOME/collectoss/routes/util.py + COLLECTOSS_HOME/collectoss/routes/util.py All "Standard Metrics" share declaration and a method signature diff --git a/docs/source/development-guide/create-a-metric/metrics-steps.rst b/docs/source/development-guide/create-a-metric/metrics-steps.rst index 5604c422b..a2fb24a02 100644 --- a/docs/source/development-guide/create-a-metric/metrics-steps.rst +++ b/docs/source/development-guide/create-a-metric/metrics-steps.rst @@ -11,7 +11,7 @@ There are many paths, but we usually follow something along these lines: 2. Sometimes, there are metrics endpoints that integrate, or visualize several metrics. 3. Determine what tables in the CollectOSS Schema contain the data we need to develop this metric 4. Construct a very basic query that does the work of joining those tables in a minimal way so we have a "baseline query." -5. Refine the query so that it takes the standard inputs for a "standard metric" if that's what type it is; alternatively, look at non-standard metrics as they are defined in ``AUGUR_HOME/collectoss/routes``, or one of the visualization metrics in ``AUGUR_HOME/collectoss/routes/contributor.py``, ``AUGUR_HOME/collectoss/routes/pull_requests.py`` or ``AUGUR_HOME/collectoss/routes/nonstandard_metrics.py``. (This step is explained in the next section.) +5. 
Refine the query so that it takes the standard inputs for a "standard metric" if that's what type it is; alternatively, look at non-standard metrics as they are defined in ``COLLECTOSS_HOME/collectoss/routes``, or one of the visualization metrics in ``COLLECTOSS_HOME/collectoss/routes/contributor.py``, ``COLLECTOSS_HOME/collectoss/routes/pull_requests.py`` or ``COLLECTOSS_HOME/collectoss/routes/nonstandard_metrics.py``. (This step is explained in the next section.) Example Query diff --git a/docs/source/development-guide/workers/creating_a_new_worker.rst b/docs/source/development-guide/workers/creating_a_new_worker.rst index 4e713c4ac..a34d73f4b 100644 --- a/docs/source/development-guide/workers/creating_a_new_worker.rst +++ b/docs/source/development-guide/workers/creating_a_new_worker.rst @@ -132,7 +132,7 @@ In the Worker block you need to add something like this: There should NOT be a comma after the final entry in each block. -ALSO, if you wanted to have those blocks installed with auger itself when you do the PR, you need to add them to the `$AUGUR_ROOT/collectoss/config.py` file. The recommended way is to set a port range not already in use and assign a random variable range with the others, like this `your_new_worker_p = randint(56500, 56999)` ... its totally ok to compress a couple other port ranges for this process. +ALSO, if you wanted to have those blocks installed with CollectOSS itself when you do the PR, you need to add them to the `$COLLECTOSS_ROOT/collectoss/config.py` file. The recommended way is to set a port range not already in use and assign a random variable range with the others, like this `your_new_worker_p = randint(56500, 56999)` ... it's totally OK to compress a couple other port ranges for this process. You can copy the housekeeper block verbatim from what you added to your own `augur.config.json`. 
For the worker block, in the `config.py` it would look like this: diff --git a/docs/source/docker/docker-compose.rst b/docs/source/docker/docker-compose.rst index 5c5d16a47..96e8e1c51 100644 --- a/docs/source/docker/docker-compose.rst +++ b/docs/source/docker/docker-compose.rst @@ -27,16 +27,16 @@ This section of the documentation details how to use CollectOSS's Docker Compose .. warning:: Don't forget to provide your external database credentials in a file called ``.env`` file. Make sure all the following environment variables are specified, keep placeholder values if you don't need some of them. - Don't specify AUGUR_DB if you want the docker database to be used. + Don't specify COLLECTOSS_DB if you want the docker database to be used. Example .env: .. code:: - AUGUR_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITHUB_USERNAME=usernameGithub - AUGUR_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITLAB_USERNAME=usernameGitlab - AUGUR_DB=yourDBString + COLLECTOSS_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITHUB_USERNAME=usernameGithub + COLLECTOSS_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITLAB_USERNAME=usernameGitlab + COLLECTOSS_DB=yourDBString diff --git a/docs/source/docker/getting-started.rst b/docs/source/docker/getting-started.rst index 0648236a5..db6822b79 100644 --- a/docs/source/docker/getting-started.rst +++ b/docs/source/docker/getting-started.rst @@ -31,14 +31,14 @@ with the following fields (don't remove any variable, keep placeholder values if .. 
code:: python - AUGUR_DB=collectoss - AUGUR_DB_USER=collectoss - AUGUR_DB_PASSWORD=password_here + COLLECTOSS_DB=collectoss + COLLECTOSS_DB_USER=collectoss + COLLECTOSS_DB_PASSWORD=password_here - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder Then run: @@ -98,11 +98,11 @@ You can provide your own ``.env`` file to pull from. The file should have the be .. code:: - AUGUR_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITHUB_USERNAME=usernameGithub - AUGUR_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITLAB_USERNAME=usernameGitlab - AUGUR_DB=yourDBString + COLLECTOSS_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITHUB_USERNAME=usernameGithub + COLLECTOSS_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITLAB_USERNAME=usernameGitlab + COLLECTOSS_DB=yourDBString Now that you've created your config file or are ready to generate it yourself, you're ready to `get going `_ . diff --git a/docs/source/docker/quick-start.rst b/docs/source/docker/quick-start.rst index 86b552ea3..c71d9dfa2 100644 --- a/docs/source/docker/quick-start.rst +++ b/docs/source/docker/quick-start.rst @@ -13,14 +13,14 @@ Before you get off to such a quick start, go ahead and .. code:: python - AUGUR_DB=collectoss - AUGUR_DB_USER=collectoss - AUGUR_DB_PASSWORD=password_here + COLLECTOSS_DB=collectoss + COLLECTOSS_DB_USER=collectoss + COLLECTOSS_DB_PASSWORD=password_here - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder 5. 
Build the container using one of the following commands: @@ -57,14 +57,14 @@ And collectoss should be up and running! .. code-block:: - AUGUR_DB=collectoss - AUGUR_DB_USER=collectoss - AUGUR_DB_PASSWORD=password_here + COLLECTOSS_DB=collectoss + COLLECTOSS_DB_USER=collectoss + COLLECTOSS_DB_PASSWORD=password_here - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder 4. Execute the code from the base directory of the CollectOSS repository: diff --git a/docs/source/getting-started/command-line-interface/backend.rst b/docs/source/getting-started/command-line-interface/backend.rst index d53fd36ae..2adcce0ef 100644 --- a/docs/source/getting-started/command-line-interface/backend.rst +++ b/docs/source/getting-started/command-line-interface/backend.rst @@ -145,29 +145,29 @@ Successful output looks like: .. 
code-block:: bash - > CLI: [util.export_env] [INFO] Exporting AUGUR_GITHUB_API_KEY - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_HOST - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_NAME - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_PORT - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_USER - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_PASSWORD + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_GITHUB_API_KEY + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_HOST + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_NAME + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_PORT + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_USER + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_PASSWORD # contents of collectoss_export_env.sh #!/bin/bash - export AUGUR_GITHUB_API_KEY="your_key_here" - export AUGUR_DB_HOST="your_host" - export AUGUR_DB_NAME="your_db_name" - export AUGUR_DB_PORT="your_db_port" - export AUGUR_DB_USER="your_db_user" - export AUGUR_DB_PASSWORD="your_db_password" + export COLLECTOSS_GITHUB_API_KEY="your_key_here" + export COLLECTOSS_DB_HOST="your_host" + export COLLECTOSS_DB_NAME="your_db_name" + export COLLECTOSS_DB_PORT="your_db_port" + export COLLECTOSS_DB_USER="your_db_user" + export COLLECTOSS_DB_PASSWORD="your_db_password" # contents of docker_env.txt - AUGUR_GITHUB_API_KEY="your_key_here" - AUGUR_DB_HOST="your_host" - AUGUR_DB_NAME="your_db_name" - AUGUR_DB_PORT="your_db_port" - AUGUR_DB_USER="your_db_user" - AUGUR_DB_PASSWORD="your_db_password" + COLLECTOSS_GITHUB_API_KEY="your_key_here" + COLLECTOSS_DB_HOST="your_host" + COLLECTOSS_DB_NAME="your_db_name" + COLLECTOSS_DB_PORT="your_db_port" + COLLECTOSS_DB_USER="your_db_user" + COLLECTOSS_DB_PASSWORD="your_db_password" ``repo-reset`` diff --git a/docs/source/getting-started/command-line-interface/configure.rst b/docs/source/getting-started/command-line-interface/configure.rst index 5659cf6ec..89350bc1a 100644 --- 
a/docs/source/getting-started/command-line-interface/configure.rst +++ b/docs/source/getting-started/command-line-interface/configure.rst @@ -12,19 +12,19 @@ The ``init`` command is used to create a configuration file, by default named `` Each of the available parameters is optional, and can also be configured using an existing environment variable. Below is the list of available parameters, their defaults, and the corresponding environment variable. ---db_name Database name for your data collection database. Defaults to ``augur``. Set by the ``AUGUR_DB_NAME`` environment variable +--db_name Database name for your data collection database. Defaults to ``augur``. Set by the ``COLLECTOSS_DB_NAME`` environment variable ---db_host Host for your data collection database. Defaults to ``localhost``. Set by the ``AUGUR_DB_HOST`` environment variable +--db_host Host for your data collection database. Defaults to ``localhost``. Set by the ``COLLECTOSS_DB_HOST`` environment variable ---db_user User for your data collection database. Defaults to ``augur``. Set by the ``AUGUR_DB_USER`` environment variable +--db_user User for your data collection database. Defaults to ``augur``. Set by the ``COLLECTOSS_DB_USER`` environment variable ---db_port Port for your data collection database. Defaults to ``5432``. Set by the ``AUGUR_DB_PORT`` environment variable +--db_port Port for your data collection database. Defaults to ``5432``. Set by the ``COLLECTOSS_DB_PORT`` environment variable ---db_password Password for your data collection database. Defaults to ``augur``. Set by the ``AUGUR_DB_PASSWORD`` environment variable +--db_password Password for your data collection database. Defaults to ``augur``. Set by the ``COLLECTOSS_DB_PASSWORD`` environment variable ---github_api_key GitHub API key for data collection from the GitHub API. Defaults to ``key``. Set by the ``AUGUR_GITHUB_API_KEY`` environment variable +--github_api_key GitHub API key for data collection from the GitHub API. 
Defaults to ``key``. Set by the ``COLLECTOSS_GITHUB_API_KEY`` environment variable ---facade_repo_directory The directory on this machine where Facade should store its cloned repos. Defaults to ``repos/``. Set by the ``AUGUR_FACADE_REPO_DIRECTORY`` environment variable +--facade_repo_directory The directory on this machine where Facade should store its cloned repos. Defaults to ``repos/``. Set by the ``COLLECTOSS_FACADE_REPO_DIRECTORY`` environment variable --rc-config-file Path to an existing CollectOSS config file whose values will be used as the defaults. Defaults to ``None``. This parameter does not support being set by an environment variable. @@ -41,7 +41,7 @@ Example usage\: $ uv run collectoss config init --db_name "db_name" --db_host "host" --db_port "port" --db_user "db_user" --db_password "password" --github_api_key "github_api_key" --facade_repo_directory "facade_repo_directory" # to generate an augur.config.json given all credentials and environment variables - $ uv run collectoss config init --db_name $AUGUR_DB_NAME --db_host $AUGUR_DB_HOST --db_port $AUGUR_DB_PORT --db_user $AUGUR_DB_DB_USER --db_password $AUGUR_DB_PASSWORD --github_api_key $AUGUR_GITHUB_API_KEY --facade_repo_directory $AUGUR_FACADE_REPO_DIRECTORY + $ uv run collectoss config init --db_name $COLLECTOSS_DB_NAME --db_host $COLLECTOSS_DB_HOST --db_port $COLLECTOSS_DB_PORT --db_user $COLLECTOSS_DB_USER --db_password $COLLECTOSS_DB_PASSWORD --github_api_key $COLLECTOSS_GITHUB_API_KEY --facade_repo_directory $COLLECTOSS_FACADE_REPO_DIRECTORY # successful output looks like: > CLI: [config.init] [INFO] Config written to /Users/carter/.collectoss/augur.config.json diff --git a/docs/source/getting-started/using-docker.rst b/docs/source/getting-started/using-docker.rst index 5028d5c3a..c427372b1 100644 --- a/docs/source/getting-started/using-docker.rst +++ b/docs/source/getting-started/using-docker.rst @@ -14,14 +14,14 @@ the following resources (or more). .. 
code:: python - AUGUR_DB=augur - AUGUR_DB_USER=augur - AUGUR_DB_PASSWORD=password_here - - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_DB=augur + COLLECTOSS_DB_USER=augur + COLLECTOSS_DB_PASSWORD=password_here + + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder 3. Build the container using one of the following commands: From bef5639443e88630fd27fe937241e460b92f21ee Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 11:30:34 -0400 Subject: [PATCH 110/165] update vars in example env Signed-off-by: Adrian Edwards --- environment.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/environment.txt b/environment.txt index 42d00b9c1..3d4c4a721 100644 --- a/environment.txt +++ b/environment.txt @@ -1,12 +1,12 @@ -AUGUR_DB_HOST=collectoss -AUGUR_DB_NAME=collectoss -AUGUR_DB_USER=collectoss -AUGUR_DB_PASSWORD= +COLLECTOSS_DB_HOST=collectoss +COLLECTOSS_DB_NAME=collectoss +COLLECTOSS_DB_USER=collectoss +COLLECTOSS_DB_PASSWORD= -AUGUR_GITHUB_API_KEY= -AUGUR_GITHUB_USERNAME= -AUGUR_GITLAB_API_KEY= -AUGUR_GITLAB_USERNAME= +COLLECTOSS_GITHUB_API_KEY= +COLLECTOSS_GITHUB_USERNAME= +COLLECTOSS_GITLAB_API_KEY= +COLLECTOSS_GITLAB_USERNAME= -AUGUR_RABBITMQ_USERNAME= -AUGUR_RABBITMQ_PASSWORD= +COLLECTOSS_RABBITMQ_USERNAME= +COLLECTOSS_RABBITMQ_PASSWORD= From 6fd7ee0edd825e168b4b19e35a35e73489273bce Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 27 May 2026 11:32:34 -0400 Subject: [PATCH 111/165] update variable names in docker compose Signed-off-by: Adrian Edwards --- docker-compose.yml | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 55e1127be..e1e8ed8da 100644 --- a/docker-compose.yml +++ 
b/docker-compose.yml @@ -7,11 +7,11 @@ services: restart: unless-stopped environment: - "POSTGRES_DB=augur" - - "POSTGRES_USER=${AUGUR_DB_USER:-augur}" - - "POSTGRES_PASSWORD=${AUGUR_DB_PASSWORD:-augur}" + - "POSTGRES_USER=${COLLECTOSS_DB_USER:-augur}" + - "POSTGRES_PASSWORD=${COLLECTOSS_DB_PASSWORD:-augur}" - "PGDATA=/var/lib/postgresql/data/pgdata" ports: - - "${AUGUR_DB_PORT:-5432}:5432" + - "${COLLECTOSS_DB_PORT:-5432}:5432" volumes: - augurpostgres:/var/lib/postgresql/data @@ -36,15 +36,15 @@ services: context: . dockerfile: ./docker/rabbitmq/Dockerfile args: - - RABBIT_MQ_DEFAULT_USER=${AUGUR_RABBITMQ_USERNAME:-augur} - - RABBIT_MQ_DEFAULT_PASSWORD=${AUGUR_RABBITMQ_PASSWORD:-password123} - - RABBIT_MQ_DEFAULT_VHOST=${AUGUR_RABBITMQ_VHOST:-collectoss_vhost} + - RABBIT_MQ_DEFAULT_USER=${COLLECTOSS_RABBITMQ_USERNAME:-augur} + - RABBIT_MQ_DEFAULT_PASSWORD=${COLLECTOSS_RABBITMQ_PASSWORD:-password123} + - RABBIT_MQ_DEFAULT_VHOST=${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} core: image: collectoss:latest build: context: . 
- dockerfile: ./docker/backend/${AUGUR_TARGET:-Dockerfile} + dockerfile: ./docker/backend/${COLLECTOSS_TARGET:-Dockerfile} volumes: - cache:/cache:rw - config:/config:rw @@ -56,16 +56,16 @@ services: #extra_hosts: # - "host.docker.internal:host-gateway" #Be able to ping services on the local machine environment: - - "AUGUR_DB=postgresql+psycopg2://${AUGUR_DB_USER:-augur}:${AUGUR_DB_PASSWORD:-augur}@database:5432/augur" - - "AUGUR_DB_SCHEMA_BUILD=1" - - AUGUR_FACADE_REPO_DIRECTORY=/facade - - "AUGUR_FLAGS=$AUGUR_FLAGS" - - "AUGUR_GITHUB_API_KEY=${AUGUR_GITHUB_API_KEY}" - - "AUGUR_GITLAB_API_KEY=${AUGUR_GITLAB_API_KEY}" - - "AUGUR_GITHUB_USERNAME=${AUGUR_GITHUB_USERNAME}" - - "AUGUR_GITLAB_USERNAME=${AUGUR_GITLAB_USERNAME}" + - "COLLECTOSS_DB=postgresql+psycopg2://${COLLECTOSS_DB_USER:-augur}:${COLLECTOSS_DB_PASSWORD:-augur}@database:5432/augur" + - "COLLECTOSS_DB_SCHEMA_BUILD=1" + - COLLECTOSS_FACADE_REPO_DIRECTORY=/facade + - "COLLECTOSS_FLAGS=$COLLECTOSS_FLAGS" + - "COLLECTOSS_GITHUB_API_KEY=${COLLECTOSS_GITHUB_API_KEY}" + - "COLLECTOSS_GITLAB_API_KEY=${COLLECTOSS_GITLAB_API_KEY}" + - "COLLECTOSS_GITHUB_USERNAME=${COLLECTOSS_GITHUB_USERNAME}" + - "COLLECTOSS_GITLAB_USERNAME=${COLLECTOSS_GITLAB_USERNAME}" - REDIS_CONN_STRING=redis://redis:6379 - - RABBITMQ_CONN_STRING=amqp://${AUGUR_RABBITMQ_USERNAME:-augur}:${AUGUR_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${AUGUR_RABBITMQ_VHOST:-collectoss_vhost} + - RABBITMQ_CONN_STRING=amqp://${COLLECTOSS_RABBITMQ_USERNAME:-augur}:${COLLECTOSS_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} - CONFIG_LOCATION=/config/config.yml - CONFIG_DATADIR=/config - CACHE_DATADIR=/cache @@ -92,9 +92,9 @@ services: # ports: # - 5555:5555 # environment: - # - "AUGUR_DB=postgresql+psycopg2://${AUGUR_DB_USER:-augur}:${AUGUR_DB_PASSWORD:-augur}@database:5432/augur" + # - "COLLECTOSS_DB=postgresql+psycopg2://${COLLECTOSS_DB_USER:-augur}:${COLLECTOSS_DB_PASSWORD:-augur}@database:5432/augur" # - 
REDIS_CONN_STRING=redis://redis:6379 - # - RABBITMQ_CONN_STRING=amqp://${AUGUR_RABBITMQ_USERNAME:-augur}:${AUGUR_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${AUGUR_RABBITMQ_VHOST:-collectoss_vhost} + # - RABBITMQ_CONN_STRING=amqp://${COLLECTOSS_RABBITMQ_USERNAME:-augur}:${COLLECTOSS_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} # depends_on: # - core # - database From 0aa0d8accca4ed78fc82d7411678b4a56ee82ac8 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 12:55:57 -0400 Subject: [PATCH 112/165] hard change env var prefix for CLI commands Signed-off-by: Adrian Edwards --- collectoss/application/cli/_multicommand.py | 2 +- collectoss/application/cli/config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/collectoss/application/cli/_multicommand.py b/collectoss/application/cli/_multicommand.py index 13186e7bb..06aae01de 100644 --- a/collectoss/application/cli/_multicommand.py +++ b/collectoss/application/cli/_multicommand.py @@ -11,7 +11,7 @@ from pathlib import Path # import collectoss.application -CONTEXT_SETTINGS = dict(auto_envvar_prefix='AUGUR') +CONTEXT_SETTINGS = dict(auto_envvar_prefix='COLLECTOSS') class CLIMultiCommand(click.MultiCommand): def __commands_folder(self): diff --git a/collectoss/application/cli/config.py b/collectoss/application/cli/config.py index 2a9a09320..9753f5299 100644 --- a/collectoss/application/cli/config.py +++ b/collectoss/application/cli/config.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -ENVVAR_PREFIX = "AUGUR_" +ENVVAR_PREFIX = "COLLECTOSS_" @click.group('config', short_help='Generate an augur.config.json') @click.pass_context From 110f42b6fafd871048e2ceeb1960bfdb9d822c98 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 12:56:38 -0400 Subject: [PATCH 113/165] attempt to add transitional variables for the specific existing places where env vars are explicitly needed in the CLI Signed-off-by: Adrian Edwards 
--- collectoss/application/cli/config.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/collectoss/application/cli/config.py b/collectoss/application/cli/config.py index 9753f5299..50641439e 100644 --- a/collectoss/application/cli/config.py +++ b/collectoss/application/cli/config.py @@ -18,18 +18,21 @@ ENVVAR_PREFIX = "COLLECTOSS_" +def get_transitional_envs(name: str) -> list: + return [ENVVAR_PREFIX + name, "AUGUR_" + name] + @click.group('config', short_help='Generate an augur.config.json') @click.pass_context def cli(ctx): ctx.obj = DatabaseContext() @cli.command('init') -@click.option('--github-api-key', help="GitHub API key for data collection from the GitHub API", envvar=ENVVAR_PREFIX + 'GITHUB_API_KEY') -@click.option('--facade-repo-directory', help="Directory on the database server where Facade should clone repos", envvar=ENVVAR_PREFIX + 'FACADE_REPO_DIRECTORY') -@click.option('--gitlab-api-key', help="GitLab API key for data collection from the GitLab API", envvar=ENVVAR_PREFIX + 'GITLAB_API_KEY') -@click.option('--redis-conn-string', help="String to connect to redis cache", envvar=ENVVAR_PREFIX + 'REDIS_CONN_STRING') -@click.option('--rabbitmq-conn-string', help="String to connect to rabbitmq broker", envvar=ENVVAR_PREFIX + 'RABBITMQ_CONN_STRING') -@click.option('--logs-directory', help="Directory to store logs", envvar=ENVVAR_PREFIX + 'LOGS_DIRECTORY') +@click.option('--github-api-key', help="GitHub API key for data collection from the GitHub API", envvar=get_transitional_envs('GITHUB_API_KEY')) +@click.option('--facade-repo-directory', help="Directory on the database server where Facade should clone repos", envvar=get_transitional_envs('FACADE_REPO_DIRECTORY')) +@click.option('--gitlab-api-key', help="GitLab API key for data collection from the GitLab API", envvar=get_transitional_envs('GITLAB_API_KEY')) +@click.option('--redis-conn-string', help="String to connect to redis cache", 
envvar=get_transitional_envs('REDIS_CONN_STRING')) +@click.option('--rabbitmq-conn-string', help="String to connect to rabbitmq broker", envvar=get_transitional_envs('RABBITMQ_CONN_STRING')) +@click.option('--logs-directory', help="Directory to store logs", envvar=get_transitional_envs('LOGS_DIRECTORY')) @test_connection @test_db_connection @with_database From f3ff5a34310b1eab69ea948aedcdf63c082fa3d8 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:12:30 -0400 Subject: [PATCH 114/165] factor tests into a class Signed-off-by: Adrian Edwards --- .../test_config/test_environment.py | 102 +++++++++--------- 1 file changed, 52 insertions(+), 50 deletions(-) diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_application/test_config/test_environment.py index 6b62f2ec9..587910da7 100644 --- a/tests/test_application/test_config/test_environment.py +++ b/tests/test_application/test_config/test_environment.py @@ -6,75 +6,77 @@ prefixes = ["COLLECTOSS", "OTHER"] -def test_env_extract_prefix(): - assert extract_prefix("OTHER_DB", prefixes) == "OTHER_" - assert extract_prefix("COLLECTOSS_DB", prefixes) == "COLLECTOSS_" +class TestSystemEnv: -def test_env_extract_prefix_default(): - assert extract_prefix("SOME_DB", prefixes) is None - assert extract_prefix("THINGY_DB", prefixes) is None + def test_env_extract_prefix(self): + assert extract_prefix("OTHER_DB", prefixes) == "OTHER_" + assert extract_prefix("COLLECTOSS_DB", prefixes) == "COLLECTOSS_" + def test_env_extract_prefix_default(self): + assert extract_prefix("SOME_DB", prefixes) is None + assert extract_prefix("THINGY_DB", prefixes) is None -def test_env_extract_prefix_unprefixed(): - assert extract_prefix("DB", prefixes) is None -def test_fetching_env(): - # plain - os.environ["COLLECTOSS_NAME"] = "A" - assert SystemEnv.get("COLLECTOSS_NAME") == "A" + def test_env_extract_prefix_unprefixed(self): + assert extract_prefix("DB", prefixes) is None - # fallback handling - 
os.environ["OTHER_THING"] = "B" - assert SystemEnv.get("COLLECTOSS_THING", None, prefixes) == "B" + def test_fetching_env(self): + # plain + os.environ["COLLECTOSS_NAME"] = "A" + assert SystemEnv.get("COLLECTOSS_NAME") == "A" - # cleanup - del os.environ["COLLECTOSS_NAME"] - del os.environ["OTHER_THING"] + # fallback handling + os.environ["OTHER_THING"] = "B" + assert SystemEnv.get("COLLECTOSS_THING", None, prefixes) == "B" -def test_fetching_env_backwards(): - os.environ["COLLECTOSS_NAME"] = "A" - assert SystemEnv.get("OTHER_NAME", None, prefixes) == "A" + # cleanup + del os.environ["COLLECTOSS_NAME"] + del os.environ["OTHER_THING"] - # cleanup - del os.environ["COLLECTOSS_NAME"] + def test_fetching_env_backwards(self): + os.environ["COLLECTOSS_NAME"] = "A" + assert SystemEnv.get("OTHER_NAME", None, prefixes) == "A" -def test_fetching_env_no_value(): - assert SystemEnv.get("COLLECTOSS_MISSING", None, prefixes) is None + # cleanup + del os.environ["COLLECTOSS_NAME"] -def test_fetching_env_default(): - assert SystemEnv.get("COLLECTOSS_DEFAULT", "SOME", prefixes) == "SOME" + def test_fetching_env_no_value(self): + assert SystemEnv.get("COLLECTOSS_MISSING", None, prefixes) is None -def test_no_known_prefix(): - # fallback handling - os.environ["THING"] = "C" - assert SystemEnv.get("THING", None, prefixes) == "C" + def test_fetching_env_default(self): + assert SystemEnv.get("COLLECTOSS_DEFAULT", "SOME", prefixes) == "SOME" + def test_no_known_prefix(self): + # fallback handling + os.environ["THING"] = "C" + assert SystemEnv.get("THING", None, prefixes) == "C" -def test_get_bool_trues(): - cases = ["1", "true", "True", "TRUE", "y", "Y", "yes", "Yes"] + def test_get_bool_trues(self): - for case in cases: - os.environ["OTHER_BOOL"] = case - assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == True - del os.environ["OTHER_BOOL"] + cases = ["1", "true", "True", "TRUE", "y", "Y", "yes", "Yes"] -def test_get_bool_falses(): + for case in cases: + 
os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == True + del os.environ["OTHER_BOOL"] - cases = ["0", "false", "False", "FALSE", "n", "N", "no", "No"] + def test_get_bool_falses(self): - for case in cases: - os.environ["OTHER_BOOL"] = case - assert SystemEnv.get_bool("OTHER_BOOL", True, prefixes) == False - del os.environ["OTHER_BOOL"] + cases = ["0", "false", "False", "FALSE", "n", "N", "no", "No"] -def test_get_bool_default(): + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", True, prefixes) == False + del os.environ["OTHER_BOOL"] - cases = ["?", "maybe", "Stuff", "333"] + def test_get_bool_default(self): - for case in cases: - os.environ["OTHER_BOOL"] = case - assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == False - del os.environ["OTHER_BOOL"] + cases = ["?", "maybe", "Stuff", "333"] - + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == False + del os.environ["OTHER_BOOL"] + + From 75345cba1c31be4115b0ab62150272bcd242a5dd Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:12:42 -0400 Subject: [PATCH 115/165] import SystemEnv into celery_app Signed-off-by: Adrian Edwards --- collectoss/tasks/init/celery_app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/collectoss/tasks/init/celery_app.py b/collectoss/tasks/init/celery_app.py index 22fd34872..4b10af18a 100644 --- a/collectoss/tasks/init/celery_app.py +++ b/collectoss/tasks/init/celery_app.py @@ -17,6 +17,7 @@ from collectoss.application.db import get_engine from collectoss.application.db.lib import get_session from collectoss.application.config import SystemConfig +from collectoss.application.environment import SystemEnv from collectoss.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from collectoss.application.db.models import Repo from collectoss.tasks.util.collection_state import CollectionState From 
bcff419815881cf65cde36554f046c529a16c949 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:17:51 -0400 Subject: [PATCH 116/165] move test next to the actual known good config tests Signed-off-by: Adrian Edwards --- .../test_config => test_classes}/test_environment.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_application/test_config => test_classes}/test_environment.py (100%) diff --git a/tests/test_application/test_config/test_environment.py b/tests/test_classes/test_environment.py similarity index 100% rename from tests/test_application/test_config/test_environment.py rename to tests/test_classes/test_environment.py From 3351859a90c0f51a116155a9029f43a27f687476 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:26:42 -0400 Subject: [PATCH 117/165] add more detailed failure reasons to get_bool tests Signed-off-by: Adrian Edwards --- tests/test_classes/test_environment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_classes/test_environment.py b/tests/test_classes/test_environment.py index 587910da7..38c0a99e2 100644 --- a/tests/test_classes/test_environment.py +++ b/tests/test_classes/test_environment.py @@ -58,7 +58,7 @@ def test_get_bool_trues(self): for case in cases: os.environ["OTHER_BOOL"] = case - assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == True + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == True, f"value '{case}' should resolve to True" del os.environ["OTHER_BOOL"] def test_get_bool_falses(self): @@ -67,7 +67,7 @@ def test_get_bool_falses(self): for case in cases: os.environ["OTHER_BOOL"] = case - assert SystemEnv.get_bool("OTHER_BOOL", True, prefixes) == False + assert SystemEnv.get_bool("OTHER_BOOL", True, prefixes) == False, f"value '{case}' should resolve to False" del os.environ["OTHER_BOOL"] def test_get_bool_default(self): @@ -76,7 +76,7 @@ def test_get_bool_default(self): for case in cases: 
os.environ["OTHER_BOOL"] = case - assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == False + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == False, f"value '{case}' should resolve to Default value" del os.environ["OTHER_BOOL"] From 3585e5062ba76d3e28b5e76d13ab8852db4c2182 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:29:24 -0400 Subject: [PATCH 118/165] split environment tests into two classes Signed-off-by: Adrian Edwards --- tests/test_classes/test_environment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_classes/test_environment.py b/tests/test_classes/test_environment.py index 38c0a99e2..e6621062a 100644 --- a/tests/test_classes/test_environment.py +++ b/tests/test_classes/test_environment.py @@ -6,8 +6,7 @@ prefixes = ["COLLECTOSS", "OTHER"] -class TestSystemEnv: - +class TestExtractPrefix: def test_env_extract_prefix(self): assert extract_prefix("OTHER_DB", prefixes) == "OTHER_" assert extract_prefix("COLLECTOSS_DB", prefixes) == "COLLECTOSS_" @@ -20,6 +19,8 @@ def test_env_extract_prefix_default(self): def test_env_extract_prefix_unprefixed(self): assert extract_prefix("DB", prefixes) is None +class TestSystemEnv: + def test_fetching_env(self): # plain os.environ["COLLECTOSS_NAME"] = "A" From 19984f247d74c5ed7b3589916d139010cbb241e2 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:46:37 -0400 Subject: [PATCH 119/165] apply homedir resolution fix to SCC path as well Signed-off-by: Adrian Edwards --- collectoss/tasks/git/scc_value_tasks/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/tasks/git/scc_value_tasks/core.py b/collectoss/tasks/git/scc_value_tasks/core.py index a526af990..770165522 100644 --- a/collectoss/tasks/git/scc_value_tasks/core.py +++ b/collectoss/tasks/git/scc_value_tasks/core.py @@ -21,7 +21,7 @@ def value_model(logger,repo_git): logger.info(f"Repo ID: {repo_id}, Path: {path}") logger.info('Running 
scc...') - path_to_scc = SystemEnv.get('SCC_DIR', (SystemEnv.get('HOME') or "~") + '/scc') + path_to_scc = SystemEnv.get('SCC_DIR', os.path.expanduser('~/scc')) required_output = parse_json_from_subprocess_call(logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) From c2b6215410eb5a7d88e322fa20de6f422136a760 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:51:58 -0400 Subject: [PATCH 120/165] fix docs underline lengths Signed-off-by: Adrian Edwards --- docs/source/deployment/production.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/deployment/production.rst b/docs/source/deployment/production.rst index 614737256..186a38c4c 100644 --- a/docs/source/deployment/production.rst +++ b/docs/source/deployment/production.rst @@ -15,7 +15,7 @@ to your deployment: - ``COLLECTOSS_DB`` : PostgreSQL database connection string (used if variable not set) COLLECTOSS_RESET_LOGS ----------------- +--------------------- **Description:** Controls whether CollectOSS resets its log files every time the server starts. Useful for managing log size or integrating with external log rotation systems. @@ -39,7 +39,7 @@ If set to `False`, CollectOSS will not reset logs automatically. Administrators export COLLECTOSS_RESET_LOGS=False COLLECTOSS_DB --------- +------------- **Description:** Specifies the connection string for the PostgreSQL database used by CollectOSS. If omitted, the default Docker database is used. 
From 97d443c716b35e9f82663ef275e8cbf5502aac9b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 17:59:04 -0400 Subject: [PATCH 121/165] remove unused imports Signed-off-by: Adrian Edwards --- collectoss/api/view/init.py | 1 - collectoss/tasks/git/dependency_tasks/core.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/collectoss/api/view/init.py b/collectoss/api/view/init.py index 1ab68912c..b26752af9 100644 --- a/collectoss/api/view/init.py +++ b/collectoss/api/view/init.py @@ -1,4 +1,3 @@ -import os from pathlib import Path from collectoss.application.logs import SystemLogger import secrets, yaml diff --git a/collectoss/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py index 3bd2aaab2..0648231b0 100644 --- a/collectoss/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -1,7 +1,7 @@ from datetime import datetime import os from collectoss.application.db.models import * -from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value from collectoss.application.environment import SystemEnv from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from collectoss.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc From e949c77a8538ffd6c04b54d32e30db2e87541680 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 29 May 2026 18:29:44 -0400 Subject: [PATCH 122/165] use SystemEnv for fetching database variable Signed-off-by: Adrian Edwards --- collectoss/application/db/engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collectoss/application/db/engine.py b/collectoss/application/db/engine.py index e00c3c992..5aae4466e 100644 --- a/collectoss/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -7,6 +7,7 @@ from sqlalchemy 
import create_engine, event from sqlalchemy.engine import Engine +from collectoss.application.environment import SystemEnv from collectoss.application.db.util import catch_operational_error @@ -61,7 +62,7 @@ def get_database_string() -> str: postgres database string """ - db_environment_var = os.getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") try: current_dir = os.getcwd() From 0b46bfbbb57cc152b878674be20b558cc3522580 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 08:55:44 -0400 Subject: [PATCH 123/165] update error messages surrounding DB access to refer to the new variable Signed-off-by: Adrian Edwards --- collectoss/application/cli/__init__.py | 2 +- collectoss/application/db/engine.py | 2 +- collectoss/tasks/git/util/facade_worker/facade_worker/config.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/collectoss/application/cli/__init__.py b/collectoss/application/cli/__init__.py index 18fac2f0a..444473016 100644 --- a/collectoss/application/cli/__init__.py +++ b/collectoss/application/cli/__init__.py @@ -71,7 +71,7 @@ def new_func(ctx, *args, **kwargs): # determine the location to print in error string if db_environment_var: - location = f"the AUGUR_DB environment variable\nAUGUR_DB={SystemEnv.get('COLLECTOSS_DB')}" + location = f"the COLLECTOSS_DB environment variable\nCOLLECTOSS_DB={SystemEnv.get('COLLECTOSS_DB')}" else: with open("db.config.json", 'r') as f: db_config = json.load(f) diff --git a/collectoss/application/db/engine.py b/collectoss/application/db/engine.py index 5aae4466e..884d5a61c 100644 --- a/collectoss/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -75,7 +75,7 @@ def get_database_string() -> str: if not db_environment_var and not db_json_exists: - print("ERROR no way to get connection to the database. 
\n\t\t\t\t\t\t There is no db.config.json and the AUGUR_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the AUGUR_DB environment then run make install") + print("ERROR no way to get connection to the database. \n\t\t\t\t\t\t There is no db.config.json and the COLLECTOSS_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the COLLECTOSS_DB environment then run make install") sys.exit() if db_environment_var: diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py index f6d5aa465..2b536a3a4 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py @@ -57,7 +57,7 @@ def get_database_args_from_env(): if not db_str and not db_json_exists: - logger.error("ERROR no way to get connection to the database. \n\t\t\t\t\t\t There is no db.config.json and the AUGUR_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the AUGUR_DB environment then run make install") + logger.error("ERROR no way to get connection to the database. 
\n\t\t\t\t\t\t There is no db.config.json and the COLLECTOSS_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the COLLECTOSS_DB environment then run make install") sys.exit() credentials = {} From 0b3aa2b3e460b1594d27f36cb8048f317cd0b3e0 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 18:02:00 -0400 Subject: [PATCH 124/165] allow SystemEnv to set a default value Signed-off-by: Adrian Edwards --- collectoss/api/server.py | 5 +++-- collectoss/application/environment.py | 9 ++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/collectoss/api/server.py b/collectoss/api/server.py index a4d212f58..7955cd7a1 100644 --- a/collectoss/api/server.py +++ b/collectoss/api/server.py @@ -27,6 +27,7 @@ from collectoss.application.logs import SystemLogger from collectoss.application.db.session import DatabaseSession from collectoss.application.config import SystemConfig +from collectoss.application.environment import SystemEnv from collectoss.application.db.engine import get_database_string, create_database_engine from collectoss.application.db.models import Repo, Issue, PullRequest, Message, PullRequestReview, Commit, IssueAssignee, PullRequestAssignee, PullRequestCommit, PullRequestFile, Contributor, IssueLabel, PullRequestLabel, ContributorsAlias, Release, ClientApplication @@ -300,8 +301,8 @@ def create_cache_manager() -> CacheManager: cache_config = { 'cache.type': 'file', # Allow setting cache directories via environment variables - 'cache.data_dir': Path(env.setdefault("CACHE_DATADIR", 'runtime/cache/')), - 'cache.lock_dir': Path(env.setdefault("CACHE_LOCKDIR", 'runtime/cache/')), + 'cache.data_dir': Path(SystemEnv.set_default("CACHE_DATADIR", 'runtime/cache/')), + 'cache.lock_dir': Path(SystemEnv.set_default("CACHE_LOCKDIR", 'runtime/cache/')), } if not os.path.exists(cache_config['cache.data_dir']): diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py index 
22a8c95d4..eee8942ed 100644 --- a/collectoss/application/environment.py +++ b/collectoss/application/environment.py @@ -76,4 +76,11 @@ def set(cls, key: str, value: str, overwrite=True) -> None: if os.getenv(key) is not None and not overwrite: return - os.environ[key] = value \ No newline at end of file + os.environ[key] = value + + @classmethod + def set_default(cls, key: str, value: str) -> None: + if cls.get(key) is None: + cls.set(key, value) + return value + return cls.get(key) \ No newline at end of file From 4a30c75451cc5db1469c97999b65d5b76764afda Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 09:32:19 -0400 Subject: [PATCH 125/165] create a stub of a helper function to check and init the schema Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 3 +++ collectoss/application/cli/db.py | 4 +++- collectoss/util/startup.py | 12 ++++++++++++ docker/backend/init.sh | 4 ---- 4 files changed, 18 insertions(+), 5 deletions(-) create mode 100644 collectoss/util/startup.py diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 3526a3c2c..bb180433b 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,6 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s +from collectoss.util.startup import check_init_schema from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -61,6 +62,8 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGTERM, manager.shutdown_signal_handler) signal.signal(signal.SIGINT, manager.shutdown_signal_handler) + check_init_schema() + try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index e43e472aa..1827079ec 100644 --- 
a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -29,6 +29,7 @@ process_repo_group_csv, ) from collectoss.application.environment import SystemEnv +from collectoss.util.startup import check_init_schema logger = logging.getLogger(__name__) @@ -311,7 +312,8 @@ def create_schema(): """ Create schema in the configured database """ - check_call(["alembic", "upgrade", "head"]) + # check_call(["alembic", "upgrade", "head"]) + check_init_schema() def generate_key(length): diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py new file mode 100644 index 000000000..eb9fb01c5 --- /dev/null +++ b/collectoss/util/startup.py @@ -0,0 +1,12 @@ +## Startup helpers + + +def check_init_schema(): + """Initialize the CollectOSS database schema as appropriate + """ + + pass + # does public.alembic_version exist? + # if yes, do nothing + # if no, do a sanity check to make sure the other schemas dont exist, + # then init the current db with sqlalchemy and stamp the current version with alembic diff --git a/docker/backend/init.sh b/docker/backend/init.sh index 782b8fa53..65470a403 100644 --- a/docker/backend/init.sh +++ b/docker/backend/init.sh @@ -2,10 +2,6 @@ #SPDX-License-Identifier: MIT set -e -if [[ "$AUGUR_DB_SCHEMA_BUILD" == "1" ]]; then - collectoss db create-schema -fi - if [ ! 
-v AUGUR_NO_CONFIG ]; then ./scripts/docker/config.sh docker From 4cf94bba4d66ef85794f03638ae4a0fd79d02809 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 09:40:54 -0400 Subject: [PATCH 126/165] add stub for making sure the schema is updated this was split into a separate thing Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 3 ++- collectoss/application/cli/db.py | 4 ++-- collectoss/util/startup.py | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index bb180433b..15b162ced 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,7 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema +from collectoss.util.startup import check_init_schema, check_update_schema from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -63,6 +63,7 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGINT, manager.shutdown_signal_handler) check_init_schema() + check_update_schema() try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index 1827079ec..7b6bc7c09 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -29,7 +29,7 @@ process_repo_group_csv, ) from collectoss.application.environment import SystemEnv -from collectoss.util.startup import check_init_schema +from collectoss.util.startup import check_init_schema, check_update_schema logger = logging.getLogger(__name__) @@ -292,7 +292,7 @@ def upgrade_db_version(): """ Upgrade the configured database to the latest version """ - check_call(["alembic", "upgrade", "head"]) + check_update_schema() 
@cli.command("check-for-upgrade") diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index eb9fb01c5..359054ab2 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -10,3 +10,11 @@ def check_init_schema(): # if yes, do nothing # if no, do a sanity check to make sure the other schemas dont exist, # then init the current db with sqlalchemy and stamp the current version with alembic + +def check_update_schema(): + """ensure the CollectOSS schema is on the latest version + """ + pass + # alembic upgrade head, unless theres an env var preventing automatic migration + # check_call(["alembic", "upgrade", "head"]) + From 71e74b7b8294e515e4b5557cc245df9349847907 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 11:26:33 -0400 Subject: [PATCH 127/165] move some entrypoint behavior into a variable collection helper Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 6 +++++- collectoss/util/startup.py | 29 +++++++++++++++++++++++++++ docker/backend/entrypoint.sh | 16 --------------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 15b162ced..af3e2825d 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,7 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema, check_update_schema +from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -62,6 +62,10 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGTERM, manager.shutdown_signal_handler) signal.signal(signal.SIGINT, manager.shutdown_signal_handler) + + 
collect_env_variables(logger) + + check_init_schema() check_update_schema() diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 359054ab2..bda69c00e 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -1,6 +1,9 @@ ## Startup helpers +from collectoss.application.environment import SystemEnv + + def check_init_schema(): """Initialize the CollectOSS database schema as appropriate """ @@ -18,3 +21,29 @@ def check_update_schema(): # alembic upgrade head, unless theres an env var preventing automatic migration # check_call(["alembic", "upgrade", "head"]) +def collect_env_variables(logger): + """convenience helper for assembling more complex environment variables out of smaller ones + and other environment variable convenience operations + """ + + if SystemEnv.get("COLLECTOSS_DB") is None: + names = ["COLLECTOSS_DB_HOST", "COLLECTOSS_DB_USER", "COLLECTOSS_DB_PASSWORD", "COLLECTOSS_DB_NAME"] + values = [SystemEnv.get(n) for n in names] + + if all(map(lambda p: p is not None, values)): + host, user, passwd, name = values + SystemEnv.set("COLLECTOSS_DB", f"postgresql+psycopg2://{user}:{passwd}@{host}/{name}") + else: + logger.warning("CollectOSS was unable to create your database connection string automatically") + logger.warning("The following environment variables are missing:") + for n, v in zip(names, values): + if v is None: + logger.warning(n) + + + + db_string = SystemEnv.get("COLLECTOSS_DB") + if db_string and "localhost" in db_string: + SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) + elif db_string and "127.0.0.1" in db_string: + SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) diff --git a/docker/backend/entrypoint.sh b/docker/backend/entrypoint.sh index 78eda49e2..37d5095d5 100644 --- a/docker/backend/entrypoint.sh +++ b/docker/backend/entrypoint.sh @@ -2,22 +2,6 @@ #SPDX-License-Identifier: MIT set -e -if [[ -z "$AUGUR_DB" ]]; then - # If 
AUGUR_DB is not set, check for individual environment variables and construct AUGUR_DB connection string - if [[ -n "$AUGUR_DB_HOST" ]] && [[ -n "$AUGUR_DB_USER" ]] && [[ -n "$AUGUR_DB_PASSWORD" ]] && [[ -n "$AUGUR_DB_NAME" ]]; then - export AUGUR_DB="postgresql+psycopg2://${AUGUR_DB_USER}:${AUGUR_DB_PASSWORD}@${AUGUR_DB_HOST}/${AUGUR_DB_NAME}" - fi -fi - - -if [[ "$AUGUR_DB" == *"localhost"* ]]; then - echo "localhost db connection" - export AUGUR_DB="${AUGUR_DB/localhost/host.docker.internal}" -elif [[ "$AUGUR_DB" == *"127.0.0.1"* ]]; then - echo "localhost db connection" - export AUGUR_DB="${AUGUR_DB/127.0.0.1/host.docker.internal}" -fi - export AUGUR_FACADE_REPO_DIRECTORY=${AUGUR_FACADE_REPO_DIRECTORY:-/collectoss/facade/} export AUGUR_DOCKER_DEPLOY="1" From 009e09256dc124e3a4f8767f8c4c63eb06b2458d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 11:27:14 -0400 Subject: [PATCH 128/165] Automatically populate gitlab data with nonfunctional values if they are not specified Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 7 +++++++ scripts/docker/config.sh | 42 -------------------------------------- 2 files changed, 7 insertions(+), 42 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index bda69c00e..44f7fff35 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -47,3 +47,10 @@ def collect_env_variables(logger): SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) elif db_string and "127.0.0.1" in db_string: SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) + + # if user didnt specify gitlab credentials, just inject fake ones so we can start up. 
+ if SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") is None: + SystemEnv.set("COLLECTOSS_GITLAB_API_KEY", "fake") + if SystemEnv.get("COLLECTOSS_GITLAB_USERNAME") is None: + SystemEnv.set("COLLECTOSS_GITLAB_USERNAME", "fake") + diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh index 6f92c9a36..5a279d7da 100755 --- a/scripts/docker/config.sh +++ b/scripts/docker/config.sh @@ -50,26 +50,6 @@ function get_github_api_key() { echo } -function get_gitlab_username() { - echo - echo "Please provide your username for GitLab." - echo "** This is required for CollectOSS to clone GitLab repos ***" - read -p "GitLab username: " gitlab_username - blank_confirm gitlab_username - echo -} - -function get_gitlab_api_key() { - echo - echo "Please provide a valid GitLab API key." - echo "For more information on how to create the key, visit:" - echo "https://docs.collectoss.org/en/latest/getting-started/collecting-data.html" - echo "** This is required for CollectOSS to gather data ***" - read -p "GitLab API Key: " gitlab_api_key - blank_confirm gitlab_api_key - echo -} - function get_facade_repo_path() { echo "The Facade data collection worker will clone repositories to this machine to run its analysis." 
@@ -165,28 +145,6 @@ function create_config() { echo fi - if [[ -z "${AUGUR_GITLAB_API_KEY}" ]]; then - get_gitlab_api_key - else - echo - echo "Found AUGUR_GITLAB_API_KEY environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITLAB_API_KEY if you would like to be prompted for a gitlab api key" - gitlab_api_key=$AUGUR_GITLAB_API_KEY - echo - fi - - if [[ -z "${AUGUR_GITLAB_USERNAME}" ]]; then - get_gitlab_username - else - echo - echo "Found AUGUR_GITLAB_USERNAME environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITLAB_USERNAME if you would like to be prompted for a gitlab username" - gitlab_username=$AUGUR_GITLAB_USERNAME - echo - fi - if [[ -z "${AUGUR_FACADE_REPO_DIRECTORY}" ]]; then get_facade_repo_path else From c9924ecb013db7ba4bac1fccb42bf43e7e9bb6cb Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 11:27:59 -0400 Subject: [PATCH 129/165] move docker deploy variable to the dockerfile/composefile Signed-off-by: Adrian Edwards --- docker-compose.yml | 1 + docker/backend/Dockerfile | 2 ++ docker/backend/entrypoint.sh | 1 - 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index e1e8ed8da..5673fd5e9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -64,6 +64,7 @@ services: - "COLLECTOSS_GITLAB_API_KEY=${COLLECTOSS_GITLAB_API_KEY}" - "COLLECTOSS_GITHUB_USERNAME=${COLLECTOSS_GITHUB_USERNAME}" - "COLLECTOSS_GITLAB_USERNAME=${COLLECTOSS_GITLAB_USERNAME}" + - COLLECTOSS_DOCKER_DEPLOY=1 - REDIS_CONN_STRING=redis://redis:6379 - RABBITMQ_CONN_STRING=amqp://${COLLECTOSS_RABBITMQ_USERNAME:-augur}:${COLLECTOSS_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} - CONFIG_LOCATION=/config/config.yml diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index d3ada5bf0..bcef5b22e 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -33,6 +33,8 @@ LABEL 
org.opencontainers.image.revision=${REVISION} ENV DEBIAN_FRONTEND=noninteractive ENV PATH="/usr/bin/:/usr/local/bin:/usr/lib:${PATH}" +ENV COLLECTOSS_DOCKER_DEPLOY="1" + RUN set -x \ && apt-get update \ && apt-get -y install --no-install-recommends \ diff --git a/docker/backend/entrypoint.sh b/docker/backend/entrypoint.sh index 37d5095d5..fc51cae87 100644 --- a/docker/backend/entrypoint.sh +++ b/docker/backend/entrypoint.sh @@ -3,7 +3,6 @@ set -e export AUGUR_FACADE_REPO_DIRECTORY=${AUGUR_FACADE_REPO_DIRECTORY:-/collectoss/facade/} -export AUGUR_DOCKER_DEPLOY="1" #Deal with special case where 'localhost' is the machine that started the container if [[ "$REDIS_CONN_STRING" == *"localhost"* ]] || [[ "$REDIS_CONN_STRING" == *"127.0.0.1"* ]]; then From 32033a0d9a307ce11ba4381b0c7b24f5d1ab06ee Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 12:55:22 -0400 Subject: [PATCH 130/165] provide default facade repo directory Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 4 ++++ docker/backend/entrypoint.sh | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 44f7fff35..4c5262580 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -54,3 +54,7 @@ def collect_env_variables(logger): if SystemEnv.get("COLLECTOSS_GITLAB_USERNAME") is None: SystemEnv.set("COLLECTOSS_GITLAB_USERNAME", "fake") + # provide a default value for the facade repo directory (assumes docker paths) + facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + if facade_repo_directory is None: + SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", "/collectoss/facade/") diff --git a/docker/backend/entrypoint.sh b/docker/backend/entrypoint.sh index fc51cae87..b1a45d851 100644 --- a/docker/backend/entrypoint.sh +++ b/docker/backend/entrypoint.sh @@ -2,8 +2,6 @@ #SPDX-License-Identifier: MIT set -e -export 
AUGUR_FACADE_REPO_DIRECTORY=${AUGUR_FACADE_REPO_DIRECTORY:-/collectoss/facade/} - #Deal with special case where 'localhost' is the machine that started the container if [[ "$REDIS_CONN_STRING" == *"localhost"* ]] || [[ "$REDIS_CONN_STRING" == *"127.0.0.1"* ]]; then echo "localhost redis connection" From 5fc3eccf8cce50f75a8245ac16975d576946afaa Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 12:56:40 -0400 Subject: [PATCH 131/165] ensure facade repo directory is resolveable/absolute Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 4 ++++ scripts/docker/config.sh | 6 +----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 4c5262580..6f6d1ec35 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -1,6 +1,7 @@ ## Startup helpers +from pathlib import Path from collectoss.application.environment import SystemEnv @@ -58,3 +59,6 @@ def collect_env_variables(logger): facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") if facade_repo_directory is None: SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", "/collectoss/facade/") + else: + # Check if the path is resolveable/make it absolute + SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", str(Path(facade_repo_directory).resolve(strict=True))) diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh index 5a279d7da..2dea09de4 100755 --- a/scripts/docker/config.sh +++ b/scripts/docker/config.sh @@ -57,11 +57,7 @@ function get_facade_repo_path() { echo while true; do - read -e -p "Facade worker directory: " facade_repo_directory - blank_confirm facade_repo_directory - - facade_repo_directory=$(realpath $facade_repo_directory) - echo + # if ! 
[ -w $facade_repo_directory/.git-credentials ]; then # echo "User $(whoami) does not have permission to write to that location" From 4747b8ca8ea4c9ab2a2423f3520784bbd36418a5 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 12:57:20 -0400 Subject: [PATCH 132/165] ensure facade repo directory has a trailing slash Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 6 ++++++ scripts/docker/config.sh | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 6f6d1ec35..409fa117b 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -62,3 +62,9 @@ def collect_env_variables(logger): else: # Check if the path is resolveable/make it absolute SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", str(Path(facade_repo_directory).resolve(strict=True))) + + # ensure trailing slash is present + facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + if facade_repo_directory and not facade_repo_directory.endswith("/"): + facade_repo_directory += "/" + SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", facade_repo_directory) diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh index 2dea09de4..c0e752d02 100755 --- a/scripts/docker/config.sh +++ b/scripts/docker/config.sh @@ -104,8 +104,6 @@ function get_facade_repo_path() { esac fi done - - [[ "${facade_repo_directory}" != */ ]] && facade_repo_directory="${facade_repo_directory}/" } function get_rabbitmq_broker_url() { From 1d2910baf8e2847ec398938a58fd76527ca370a8 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 12:58:04 -0400 Subject: [PATCH 133/165] oops Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 409fa117b..dfeb8e186 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -45,7 +45,7 @@ def 
collect_env_variables(logger): db_string = SystemEnv.get("COLLECTOSS_DB") if db_string and "localhost" in db_string: - SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) + SystemEnv.set("COLLECTOSS_DB", db_string.replace("localhost", "host.docker.internal")) elif db_string and "127.0.0.1" in db_string: SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) From ba802c289112c5a35af842df02ad7df6311e342d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 13:00:44 -0400 Subject: [PATCH 134/165] handle swapping localhosts for redis connection string too Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 7 +++++++ docker/backend/entrypoint.sh | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index dfeb8e186..fee1d8e1a 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -49,6 +49,13 @@ def collect_env_variables(logger): elif db_string and "127.0.0.1" in db_string: SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) + redis_string = SystemEnv.get("REDIS_CONN_STRING") + if redis_string and "localhost" in redis_string: + SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("localhost", "host.docker.internal")) + elif redis_string and "127.0.0.1" in redis_string: + SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("127.0.0.1", "host.docker.internal")) + + # if user didnt specify gitlab credentials, just inject fake ones so we can start up. 
if SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") is None: SystemEnv.set("COLLECTOSS_GITLAB_API_KEY", "fake") diff --git a/docker/backend/entrypoint.sh b/docker/backend/entrypoint.sh index b1a45d851..5a6c63fa0 100644 --- a/docker/backend/entrypoint.sh +++ b/docker/backend/entrypoint.sh @@ -2,12 +2,4 @@ #SPDX-License-Identifier: MIT set -e -#Deal with special case where 'localhost' is the machine that started the container -if [[ "$REDIS_CONN_STRING" == *"localhost"* ]] || [[ "$REDIS_CONN_STRING" == *"127.0.0.1"* ]]; then - echo "localhost redis connection" - export redis_conn_string="redis://host.docker.internal:6379" -else - export redis_conn_string=$REDIS_CONN_STRING -fi - exec "$@" From 7e5fc1881c3094fe9423c170ff6d70cb19e2f4b6 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 13:01:19 -0400 Subject: [PATCH 135/165] entrypoint is now empty Signed-off-by: Adrian Edwards --- docker/backend/Dockerfile | 1 - docker/backend/entrypoint.sh | 5 ----- 2 files changed, 6 deletions(-) delete mode 100644 docker/backend/entrypoint.sh diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index bcef5b22e..e45559b66 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -117,5 +117,4 @@ RUN ln -s /cache /collectoss/collectoss/static/cache COPY --chmod=755 ./docker/backend/entrypoint.sh / COPY --chmod=755 ./docker/backend/init.sh / RUN chmod +x /entrypoint.sh /init.sh -ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] CMD ["/init.sh"] diff --git a/docker/backend/entrypoint.sh b/docker/backend/entrypoint.sh deleted file mode 100644 index 5a6c63fa0..000000000 --- a/docker/backend/entrypoint.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -#SPDX-License-Identifier: MIT -set -e - -exec "$@" From e4c13ef231404aba232e6aaa90691b9403e7b9ef Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 13:29:16 -0400 Subject: [PATCH 136/165] provide excessive user feedback with verbose logging Signed-off-by: Adrian Edwards --- 
collectoss/util/startup.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index fee1d8e1a..ab2eb5cc9 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -33,6 +33,7 @@ def collect_env_variables(logger): if all(map(lambda p: p is not None, values)): host, user, passwd, name = values + logger.verbose(f"Assembling COLLECTOSS_DB string from provided variables") SystemEnv.set("COLLECTOSS_DB", f"postgresql+psycopg2://{user}:{passwd}@{host}/{name}") else: logger.warning("CollectOSS was unable to create your database connection string automatically") @@ -45,29 +46,37 @@ def collect_env_variables(logger): db_string = SystemEnv.get("COLLECTOSS_DB") if db_string and "localhost" in db_string: + logger.verbose(f"Swapping localhost in COLLECTOSS_DB string with docker host gateway name") SystemEnv.set("COLLECTOSS_DB", db_string.replace("localhost", "host.docker.internal")) elif db_string and "127.0.0.1" in db_string: + logger.verbose(f"Swapping 127.0.0.1 in COLLECTOSS_DB string with docker host gateway name") SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) redis_string = SystemEnv.get("REDIS_CONN_STRING") if redis_string and "localhost" in redis_string: + logger.verbose(f"Swapping localhost in REDIS_CONN_STRING with docker host gateway name") SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("localhost", "host.docker.internal")) elif redis_string and "127.0.0.1" in redis_string: + logger.verbose(f"Swapping 127.0.0.1 in REDIS_CONN_STRING with docker host gateway name") SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("127.0.0.1", "host.docker.internal")) # if user didnt specify gitlab credentials, just inject fake ones so we can start up. 
if SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") is None: + logger.verbose(f"Detected no specified gitlab key, using made up values as a workaround") SystemEnv.set("COLLECTOSS_GITLAB_API_KEY", "fake") if SystemEnv.get("COLLECTOSS_GITLAB_USERNAME") is None: + logger.verbose(f"Detected no specified gitlab username, using made up value as a workaround") SystemEnv.set("COLLECTOSS_GITLAB_USERNAME", "fake") # provide a default value for the facade repo directory (assumes docker paths) facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") if facade_repo_directory is None: + logger.verbose(f"Setting default value for COLLECTOSS_FACADE_REPO_DIRECTORY") SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", "/collectoss/facade/") else: # Check if the path is resolveable/make it absolute + logger.verbose(f"Resolving full path to COLLECTOSS_FACADE_REPO_DIRECTORY") SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", str(Path(facade_repo_directory).resolve(strict=True))) # ensure trailing slash is present From f7f7cb7a29291cd892187b5907c4a7f0e4bf15bb Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 13:31:06 -0400 Subject: [PATCH 137/165] port over the facade config file handling into python itll start as deprecated since there are already plans to make this better. 
Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 4 +- collectoss/util/startup.py | 51 ++++++++++++++++++++- scripts/docker/config.sh | 64 --------------------------- 3 files changed, 53 insertions(+), 66 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index af3e2825d..f9e83448f 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,7 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables +from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, setup_facade_directory from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -69,6 +69,8 @@ def start(ctx, disable_collection, development, pidfile, port): check_init_schema() check_update_schema() + setup_facade_directory() + try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index ab2eb5cc9..5b2befd3c 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -2,8 +2,13 @@ from pathlib import Path -from collectoss.application.environment import SystemEnv +import os +import getpass +import subprocess +from sqlalchemy.orm.attributes import get_history +from collectoss.application.environment import SystemEnv +from typing_extensions import deprecated def check_init_schema(): """Initialize the CollectOSS database schema as appropriate @@ -84,3 +89,47 @@ def collect_env_variables(logger): if facade_repo_directory and not facade_repo_directory.endswith("/"): facade_repo_directory += "/" SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", facade_repo_directory) + +@deprecated("The bulk of this function is 
handling .git-credentials, which will be replaced with pygit2 (see issue #258)", category=None) +def setup_facade_directory(logger): + """Perform permission checks and create the facade directory if it doesnt exist + """ + + facade_directory_path = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") or "/collectoss/facade/" + + facade_directory = Path(facade_directory_path) + + if not facade_directory.exists(): + logger.verbose(f"Specified facade directory {facade_directory_path} does not exist. Creating...") + facade_directory.mkdir() + + git_credentials = facade_directory.joinpath(".git-credentials") + git_credentials.touch(exist_ok=True) + + if not os.access(git_credentials, os.R_OK): + logger.error(f"User {getpass.getuser()} does not have permission to write to {git_credentials}. Please select another location") + else: + logger.verbose(f"Permission check passed for {git_credentials}") + + + credentials = [] + + gh_names = ["COLLECTOSS_GITHUB_USERNAME","COLLECTOSS_GITHUB_API_KEY"] + gh_values = [SystemEnv.get(n) for n in gh_names] + + if all(map(lambda p: p is not None, gh_values)): + user, key = gh_values + credentials.append(f"https://{user}:{key}@github.com") + + + gl_names = ["COLLECTOSS_GITLAB_USERNAME","COLLECTOSS_GITLAB_API_KEY"] + gl_values = [SystemEnv.get(n) for n in gl_names] + + if all(map(lambda p: p is not None, gl_values)): + user, key = gl_values + credentials.append(f"https://{user}:{key}@gitlab.com") + + with git_credentials.open(encoding="utf-8") as c: + c.writelines(credentials) + + subprocess.call(["git", "config", "--global", "credential.helper", "store", "--file", str(git_credentials)]) \ No newline at end of file diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh index c0e752d02..6d9477fe3 100755 --- a/scripts/docker/config.sh +++ b/scripts/docker/config.sh @@ -50,62 +50,6 @@ function get_github_api_key() { echo } -function get_facade_repo_path() { - - echo "The Facade data collection worker will clone repositories to this 
machine to run its analysis." - echo "Please select a new or existing directory for the Facade worker to use:" - echo - - while true; do - - - # if ! [ -w $facade_repo_directory/.git-credentials ]; then - # echo "User $(whoami) does not have permission to write to that location" - # echo "Please select another location" - # continue - # fi - - # Check if the file exists and create it if it doesn't - if [ ! -f "$facade_repo_directory/.git-credentials" ]; then - echo "File .git-credentials does not exist. Creating it..." - touch "$facade_repo_directory/.git-credentials" - fi - - # Check for write permissions - if ! [ -w "$facade_repo_directory/.git-credentials" ]; then - echo "User $(whoami) does not have permission to write to $facade_repo_directory/.git-credentials" - echo "Please select another location" - continue - else - echo "Permission check passed for $facade_repo_directory/.git-credentials" - fi - - if [[ -d "$facade_repo_directory" ]]; then - read -r -p "That directory already exists. Use it? [Y/n]: " facade_response - case "$facade_response" in - [nN][oO] | [nN]) - continue - ;; - *) - break - ;; - esac - else - read -r -p "That directory does not exist. Create it? [Y/n]: " facade_response - case "$facade_response" in - [nN][oO] | [nN]) - continue - ;; - *) - mkdir "$facade_repo_directory" - echo "Directory created." - break - ;; - esac - fi - done -} - function get_rabbitmq_broker_url() { echo echo "Please provide your rabbitmq broker url." 
@@ -176,14 +120,6 @@ function create_config() { else cmd=( collectoss config init --github-api-key $github_api_key --gitlab-api-key $gitlab_api_key --facade-repo-directory $facade_repo_directory --rabbitmq-conn-string $rabbitmq_conn_string ) fi - - #Create and cache credentials for github and gitlab - touch $facade_repo_directory/.git-credentials - - echo "https://$github_username:$github_api_key@github.com" > $facade_repo_directory/.git-credentials - echo "https://$gitlab_username:$gitlab_api_key@gitlab.com" >> $facade_repo_directory/.git-credentials - - git config --global credential.helper "store --file $facade_repo_directory/.git-credentials" "${cmd[@]}" } echo From ac5a57deadf0d99d183b2cbddaa67ff798b80ad0 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 13:59:34 -0400 Subject: [PATCH 138/165] refactor config merging into a separate function so it could be reused from two places in the CLI Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 1 + collectoss/application/cli/config.py | 51 +--------------- collectoss/util/startup.py | 84 ++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 49 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index f9e83448f..8a691e69f 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -71,6 +71,7 @@ def start(ctx, disable_collection, development, pidfile, port): setup_facade_directory() + merge_config(ctx.obj.engine, logger) try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) diff --git a/collectoss/application/cli/config.py b/collectoss/application/cli/config.py index 50641439e..681c9d201 100644 --- a/collectoss/application/cli/config.py +++ b/collectoss/application/cli/config.py @@ -11,8 +11,7 @@ from collectoss.application.db.session import DatabaseSession from collectoss.application.config import SystemConfig, redact_setting_value from 
collectoss.application.cli import DatabaseContext, test_connection, test_db_connection, with_database -from collectoss.util.inspect_without_import import get_phase_names_without_import -ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +from collectoss.util.startup import merge_config logger = logging.getLogger(__name__) @@ -61,52 +60,8 @@ def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redi if facade_repo_directory[-1] != "/": facade_repo_directory += "/" - - keys = {} - - keys["github_api_key"] = github_api_key - keys["gitlab_api_key"] = gitlab_api_key - - with DatabaseSession(logger, engine=ctx.obj.engine) as session: - - config = SystemConfig(logger, session) - - augmented_config = config.base_config - - phase_names = get_phase_names_without_import() - - #Add all phases as enabled by default - for name in phase_names: - - if name not in augmented_config['Task_Routine']: - augmented_config['Task_Routine'].update({name : 1}) - - #print(default_config) - if redis_conn_string: - - try: - redis_string_array = redis_conn_string.split("/") - cache_number = int(redis_string_array[-1]) - digits = len(str(cache_number)) - - redis_conn_string = redis_conn_string[:-digits] - - except ValueError: - pass - - augmented_config["Redis"]["connection_string"] = redis_conn_string - - if rabbitmq_conn_string: - augmented_config["RabbitMQ"]["connection_string"] = rabbitmq_conn_string - - augmented_config["Keys"] = keys - - augmented_config["Facade"]["repo_directory"] = facade_repo_directory - - augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_PROJECT_REPO_DIRECTORY + "/logs/") - - config.load_config_from_dict(augmented_config) - + merge_config(ctx.obj.engine, logger, github_api_key, facade_repo_directory, gitlab_api_key, redis_conn_string, rabbitmq_conn_string, logs_directory) + @cli.command('load') @click.option('--file', required=True) diff --git 
a/collectoss/util/startup.py b/collectoss/util/startup.py index 5b2befd3c..b9a66f9c9 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -7,9 +7,15 @@ import subprocess from sqlalchemy.orm.attributes import get_history +from collectoss.application.config import SystemConfig +from collectoss.application.db.session import DatabaseSession from collectoss.application.environment import SystemEnv from typing_extensions import deprecated +from collectoss.util.inspect_without_import import get_phase_names_without_import + +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) + def check_init_schema(): """Initialize the CollectOSS database schema as appropriate """ @@ -132,4 +138,80 @@ def setup_facade_directory(logger): with git_credentials.open(encoding="utf-8") as c: c.writelines(credentials) - subprocess.call(["git", "config", "--global", "credential.helper", "store", "--file", str(git_credentials)]) \ No newline at end of file + subprocess.call(["git", "config", "--global", "credential.helper", "store", "--file", str(git_credentials)]) + + +def merge_config( + engine, + logger, + github_api_key:str | None = None, + facade_repo_directory:str | None = None, + gitlab_api_key:str | None = None, + redis_conn_string:str | None = None, + rabbitmq_conn_string:str | None = None, + logs_directory:str | None = None + ): + """Merge config items provided via environment variables into a place where SystemConfig can pick them up. 
+ + Args: + engine: the database engine to connect to + logger: object to use for outputting logging messages + github_api_key (str): config value + facade_repo_directory (str): config value + gitlab_api_key (str): config value + redis_conn_string (str): config value + rabbitmq_conn_string (str): config value + logs_directory (str): config value + """ + + github_api_key = github_api_key or SystemEnv.get("COLLECTOSS_GITHUB_API_KEY") + facade_repo_directory = github_api_key or SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + gitlab_api_key = github_api_key or SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") + redis_conn_string = github_api_key or SystemEnv.get("REDIS_CONN_STRING") + rabbitmq_conn_string = github_api_key or SystemEnv.get("RABBITMQ_CONN_STRING") + logs_directory = github_api_key or SystemEnv.get("COLLECTOSS_LOGS_DIRECTORY") + + keys = {} + + keys["github_api_key"] = github_api_key + keys["gitlab_api_key"] = gitlab_api_key + + with DatabaseSession(logger, engine=engine) as session: + + config = SystemConfig(logger, session) + + augmented_config = config.base_config + + phase_names = get_phase_names_without_import() + + #Add all phases as enabled by default + for name in phase_names: + + if name not in augmented_config['Task_Routine']: + augmented_config['Task_Routine'].update({name : 1}) + + #print(default_config) + if redis_conn_string: + + try: + redis_string_array = redis_conn_string.split("/") + cache_number = int(redis_string_array[-1]) + digits = len(str(cache_number)) + + redis_conn_string = redis_conn_string[:-digits] + + except ValueError: + pass + + augmented_config["Redis"]["connection_string"] = redis_conn_string + + if rabbitmq_conn_string: + augmented_config["RabbitMQ"]["connection_string"] = rabbitmq_conn_string + + augmented_config["Keys"] = keys + + augmented_config["Facade"]["repo_directory"] = facade_repo_directory + + augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_PROJECT_REPO_DIRECTORY + "/logs/") + + 
config.load_config_from_dict(augmented_config) From a8280c27565b5e6d4de85ded0260e8659fcb65df Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 2 Jun 2026 14:00:30 -0400 Subject: [PATCH 139/165] config script no longer needed Signed-off-by: Adrian Edwards --- docker/backend/init.sh | 5 -- scripts/docker/config.sh | 132 --------------------------------------- 2 files changed, 137 deletions(-) delete mode 100755 scripts/docker/config.sh diff --git a/docker/backend/init.sh b/docker/backend/init.sh index 65470a403..ce1a11f75 100644 --- a/docker/backend/init.sh +++ b/docker/backend/init.sh @@ -2,11 +2,6 @@ #SPDX-License-Identifier: MIT set -e - -if [ ! -v AUGUR_NO_CONFIG ]; then - ./scripts/docker/config.sh docker -fi - if [[ -f /repo_groups.csv ]]; then collectoss db add-repo-groups /repo_groups.csv fi diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh deleted file mode 100755 index 6d9477fe3..000000000 --- a/scripts/docker/config.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash - -PS3=" -Please type the number corresponding to your selection and then press the Enter/Return key. -Your choice: " - -target=$1 - -function blank_confirm() { - if [ -z "${1}" ]; then - echo "Bad usage of blank_confirm at:" - caller - return - fi - - confirm_placeholder=${!1} - - while [ -z "${confirm_placeholder}" ]; do - echo "You entered a blank line, are you sure?" - read -p "enter 'yes' to continue, or enter the intended value: " confirm_placeholder - case "$confirm_placeholder" in - [yY][eE][sS] | [yY][eE] | [yY]) - return - ;; - *) - continue - ;; - esac - done - printf -v "$1" "%s" $confirm_placeholder -} - -function get_github_username() { - echo - echo "Please provide your username for Github." - echo "** This is required for CollectOSS to clone Github repos ***" - read -p "GitHub username: " github_username - blank_confirm github_username - echo -} - -function get_github_api_key() { - echo - echo "Please provide a valid GitHub API key." 
- echo "For more information on how to create the key, visit:" - echo "https://docs.collectoss.org/en/latest/getting-started/collecting-data.html" - echo "** This is required for CollectOSS to gather data ***" - read -p "GitHub API Key: " github_api_key - blank_confirm github_api_key - echo -} - -function get_rabbitmq_broker_url() { - echo - echo "Please provide your rabbitmq broker url." - echo "** This is required for CollectOSS to run all collection tasks. ***" - read -p "broker_url: " rabbitmq_conn_string - blank_confirm rabbitmq_conn_string - echo -} - -function create_config() { - - if [[ -z "${AUGUR_GITHUB_API_KEY}" ]]; then - get_github_api_key - else - echo - echo "Found AUGUR_GITHUB_API_KEY environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITHUB_API_KEY if you would like to be prompted for a github api key" - github_api_key=$AUGUR_GITHUB_API_KEY - echo - fi - - if [[ -z "${AUGUR_GITHUB_USERNAME}" ]]; then - get_github_username - else - echo - echo "Found AUGUR_GITHUB_USERNAME environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITHUB_USERNAME if you would like to be prompted for a github username" - github_username=$AUGUR_GITHUB_USERNAME - echo - fi - - if [[ -z "${AUGUR_FACADE_REPO_DIRECTORY}" ]]; then - get_facade_repo_path - else - echo - echo "Found AUGUR_FACADE_REPO_DIRECTORY environment variable with value $AUGUR_FACADE_REPO_DIRECTORY" - echo "Using it in the config" - echo "IMPORTANT NOTE: This assumes that this directory already exists" - echo "Please unset AUGUR_FACADE_REPO_DIRECTORY if you would like to be prompted for the facade repo directory" - facade_repo_directory=$AUGUR_FACADE_REPO_DIRECTORY - echo - fi - - if [[ -z "${RABBITMQ_CONN_STRING}" ]]; then - get_rabbitmq_broker_url - else - echo - echo "Found RABBITMQ_CONN_STRING environment variable with value $RABBITMQ_CONN_STRING" - echo "Using it in the config" - echo "Please unset RABBITMQ_CONN_STRING if you would like to be 
prompted for the rabbit MQ connection string" - rabbitmq_conn_string=$RABBITMQ_CONN_STRING - echo - fi - - # echo $rabbitmq_conn_string - # echo $facade_repo_directory - # echo $gitlab_username - # echo $gitlab_api_key - # echo $github_username - # echo $github_api_key - - #special case for docker entrypoint - if [ $target = "docker" ]; then - cmd=( collectoss config init --github-api-key $github_api_key --gitlab-api-key $gitlab_api_key --facade-repo-directory $facade_repo_directory --redis-conn-string $redis_conn_string --rabbitmq-conn-string $rabbitmq_conn_string --logs-directory /logs) - echo "init with redis $redis_conn_string" - else - cmd=( collectoss config init --github-api-key $github_api_key --gitlab-api-key $gitlab_api_key --facade-repo-directory $facade_repo_directory --rabbitmq-conn-string $rabbitmq_conn_string ) - fi - "${cmd[@]}" -} -echo -echo "Collecting data for config..." -create_config -echo -echo "Config created" -echo - -# config_prompt From 2bf5f8baf23f6ae5e2a4aa2bdacdd868981662f3 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 5 Jun 2026 09:30:57 -0400 Subject: [PATCH 140/165] enable the basic schema update subshell for now Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index b9a66f9c9..3a4820483 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -5,6 +5,7 @@ import os import getpass import subprocess +from subprocess import check_call from sqlalchemy.orm.attributes import get_history from collectoss.application.config import SystemConfig @@ -31,7 +32,7 @@ def check_update_schema(): """ pass # alembic upgrade head, unless theres an env var preventing automatic migration - # check_call(["alembic", "upgrade", "head"]) + check_call(["alembic", "upgrade", "head"]) def collect_env_variables(logger): """convenience helper for assembling more complex environment variables out of 
smaller ones From d6175a881d20085d967cfc851e2e01c9a14922d3 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 5 Jun 2026 11:31:32 -0400 Subject: [PATCH 141/165] add a platform information function Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 5 ++++- collectoss/util/startup.py | 9 +++++++++ docker/backend/init.sh | 5 ----- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 8a691e69f..e1c38c8a2 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,7 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, setup_facade_directory +from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, print_platform_information, setup_facade_directory from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -72,6 +72,9 @@ def start(ctx, disable_collection, development, pidfile, port): setup_facade_directory() merge_config(ctx.obj.engine, logger) + + print_platform_information() + try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 3a4820483..74edad2d2 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -6,6 +6,8 @@ import getpass import subprocess from subprocess import check_call +import platform +import sys from sqlalchemy.orm.attributes import get_history from collectoss.application.config import SystemConfig @@ -216,3 +218,10 @@ def merge_config( augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_PROJECT_REPO_DIRECTORY + "/logs/") 
config.load_config_from_dict(augmented_config) + + +def print_platform_information(): + + print(f"PATH: {os.environ.get('PATH')}") + print(f"Python executable (current): {sys.executable}") + print(f"Python version: {platform.python_version()}") \ No newline at end of file diff --git a/docker/backend/init.sh b/docker/backend/init.sh index ce1a11f75..51d3abddd 100644 --- a/docker/backend/init.sh +++ b/docker/backend/init.sh @@ -9,9 +9,4 @@ fi if [[ -f /repos.csv ]]; then collectoss db add-repos /repos.csv fi - -echo "PATH: $PATH" -echo "Python executable: $(which python)" -python --version - exec collectoss backend start --pidfile /tmp/main.pid From ad04c9062bcb71371b0e3ddc7502f96479669e88 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 8 Jun 2026 16:16:46 -0400 Subject: [PATCH 142/165] include new environment file in linted files list Signed-off-by: Adrian Edwards --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6a99aacf5..5671e21de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,7 +163,10 @@ markers = [ [tool.mypy] -files = ['collectoss/application/db/*.py'] +files = [ + 'collectoss/application/db/*.py', + 'collectoss/application/environment.py', +] ignore_missing_imports = true follow_imports = "skip" disallow_untyped_defs = false From b10041529374bbc44c2d6fec7cde9b347723b381 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 8 Jun 2026 16:59:27 -0400 Subject: [PATCH 143/165] add function to detect and warn if files are present at the previous autoimport locations Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 4 +++- collectoss/util/startup.py | 18 ++++++++++++++++++ docker/backend/init.sh | 7 ------- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index e1c38c8a2..c2dffd4cf 100644 --- a/collectoss/application/cli/backend.py +++ 
b/collectoss/application/cli/backend.py @@ -30,7 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, print_platform_information, setup_facade_directory +from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, print_platform_information, setup_facade_directory, warn_import_repos from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -73,6 +73,8 @@ def start(ctx, disable_collection, development, pidfile, port): merge_config(ctx.obj.engine, logger) + warn_import_repos() + print_platform_information() try: diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 74edad2d2..c7aa74c60 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -220,6 +220,24 @@ def merge_config( config.load_config_from_dict(augmented_config) +@deprecated("automatic import is deprecated. This is a function to warn users and help them transition") +def warn_import_repos(logger): + """We are choosing not to auto import repos and repo groups automatically + This function detects attempts to use the automatic feature and warns users to use the CLI themselves + + Args: + logger: the logger to use + """ + + if Path("/repo_groups.csv").exists(): + logger.warning("Detected /repo_groups.csv file at startup. Automatic import of repo groups is deprecated.") + logger.warning("To import repo groups from a CSV, use the CLI: collectoss db add-repo-groups /repo_groups.csv") + + if Path("/repos.csv").exists(): + logger.warning("Detected /repos.csv file at startup. 
Automatic import of repos is deprecated.") + logger.warning("To import repos from a CSV, use the CLI: collectoss db add-repos /repos.csv") + + def print_platform_information(): print(f"PATH: {os.environ.get('PATH')}") diff --git a/docker/backend/init.sh b/docker/backend/init.sh index 51d3abddd..ee624c63f 100644 --- a/docker/backend/init.sh +++ b/docker/backend/init.sh @@ -2,11 +2,4 @@ #SPDX-License-Identifier: MIT set -e -if [[ -f /repo_groups.csv ]]; then - collectoss db add-repo-groups /repo_groups.csv -fi - -if [[ -f /repos.csv ]]; then - collectoss db add-repos /repos.csv -fi exec collectoss backend start --pidfile /tmp/main.pid From fb2930eadebc3f9c76f1d9ff6862328ee1605f97 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 8 Jun 2026 17:02:33 -0400 Subject: [PATCH 144/165] move remaining command in init into the docker init directive Signed-off-by: Adrian Edwards --- docker/backend/Dockerfile | 2 +- docker/backend/init.sh | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 docker/backend/init.sh diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index e45559b66..63a81effb 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -117,4 +117,4 @@ RUN ln -s /cache /collectoss/collectoss/static/cache COPY --chmod=755 ./docker/backend/entrypoint.sh / COPY --chmod=755 ./docker/backend/init.sh / RUN chmod +x /entrypoint.sh /init.sh -CMD ["/init.sh"] +CMD ["collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] diff --git a/docker/backend/init.sh b/docker/backend/init.sh deleted file mode 100644 index ee624c63f..000000000 --- a/docker/backend/init.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -#SPDX-License-Identifier: MIT -set -e - -exec collectoss backend start --pidfile /tmp/main.pid From 62dcbc72ccea01a24357ac135a1c616d73c0c29a Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 8 Jun 2026 17:07:38 -0400 Subject: [PATCH 145/165] remove old dockerfile entrypoint and init script 
infrastructure Signed-off-by: Adrian Edwards --- docker/backend/Dockerfile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 63a81effb..65b2ccdbb 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -113,8 +113,4 @@ RUN ${SCORECARD_DIR}/scorecard version RUN mkdir -p repos/ logs/ /collectoss/facade/ RUN ln -s /cache /collectoss/collectoss/static/cache -# Copy in the entrypoint and init scripts, ensuring they are executable -COPY --chmod=755 ./docker/backend/entrypoint.sh / -COPY --chmod=755 ./docker/backend/init.sh / -RUN chmod +x /entrypoint.sh /init.sh CMD ["collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] From eab204c708da4574bda982869551907ff15342a9 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 8 Jun 2026 17:13:18 -0400 Subject: [PATCH 146/165] revert keyman orchestrator back to regular environment variables so it doesn't import CollectOSS Signed-off-by: Adrian Edwards --- keyman/Orchestrator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/keyman/Orchestrator.py b/keyman/Orchestrator.py index d93a1f064..71cfae8bb 100644 --- a/keyman/Orchestrator.py +++ b/keyman/Orchestrator.py @@ -4,16 +4,15 @@ import time from keyman.KeyOrchestrationAPI import spec, WaitKeyTimeout, InvalidRequest -from collectoss.application.environment import SystemEnv -if SystemEnv.get("KEYMAN_DOCKER"): +if os.environ.get("KEYMAN_DOCKER"): import sys import redis import logging sys.path.append("/collectoss") - conn = redis.Redis.from_url(SystemEnv.get("REDIS_CONN_STRING")) + conn = redis.Redis.from_url(os.environ.get("REDIS_CONN_STRING")) # Just log to stdout if we're running in docker logger = logging.Logger("KeyOrchestrator") From 6934d7c84523e837d01948a4bdd3d3940e124546 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 8 Jun 2026 17:34:56 -0400 Subject: [PATCH 147/165] logger.verbose -> logger.debug Signed-off-by: Adrian Edwards --- 
collectoss/util/startup.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index c7aa74c60..392df2df3 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -47,7 +47,7 @@ def collect_env_variables(logger): if all(map(lambda p: p is not None, values)): host, user, passwd, name = values - logger.verbose(f"Assembling COLLECTOSS_DB string from provided variables") + logger.debug(f"Assembling COLLECTOSS_DB string from provided variables") SystemEnv.set("COLLECTOSS_DB", f"postgresql+psycopg2://{user}:{passwd}@{host}/{name}") else: logger.warning("CollectOSS was unable to create your database connection string automatically") @@ -60,37 +60,37 @@ def collect_env_variables(logger): db_string = SystemEnv.get("COLLECTOSS_DB") if db_string and "localhost" in db_string: - logger.verbose(f"Swapping localhost in COLLECTOSS_DB string with docker host gateway name") + logger.debug(f"Swapping localhost in COLLECTOSS_DB string with docker host gateway name") SystemEnv.set("COLLECTOSS_DB", db_string.replace("localhost", "host.docker.internal")) elif db_string and "127.0.0.1" in db_string: - logger.verbose(f"Swapping 127.0.0.1 in COLLECTOSS_DB string with docker host gateway name") + logger.debug(f"Swapping 127.0.0.1 in COLLECTOSS_DB string with docker host gateway name") SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) redis_string = SystemEnv.get("REDIS_CONN_STRING") if redis_string and "localhost" in redis_string: - logger.verbose(f"Swapping localhost in REDIS_CONN_STRING with docker host gateway name") + logger.debug(f"Swapping localhost in REDIS_CONN_STRING with docker host gateway name") SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("localhost", "host.docker.internal")) elif redis_string and "127.0.0.1" in redis_string: - logger.verbose(f"Swapping 127.0.0.1 in REDIS_CONN_STRING with docker host gateway name") + 
logger.debug(f"Swapping 127.0.0.1 in REDIS_CONN_STRING with docker host gateway name") SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("127.0.0.1", "host.docker.internal")) # if user didnt specify gitlab credentials, just inject fake ones so we can start up. if SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") is None: - logger.verbose(f"Detected no specified gitlab key, using made up values as a workaround") + logger.debug(f"Detected no specified gitlab key, using made up values as a workaround") SystemEnv.set("COLLECTOSS_GITLAB_API_KEY", "fake") if SystemEnv.get("COLLECTOSS_GITLAB_USERNAME") is None: - logger.verbose(f"Detected no specified gitlab username, using made up value as a workaround") + logger.debug(f"Detected no specified gitlab username, using made up value as a workaround") SystemEnv.set("COLLECTOSS_GITLAB_USERNAME", "fake") # provide a default value for the facade repo directory (assumes docker paths) facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") if facade_repo_directory is None: - logger.verbose(f"Setting default value for COLLECTOSS_FACADE_REPO_DIRECTORY") + logger.debug(f"Setting default value for COLLECTOSS_FACADE_REPO_DIRECTORY") SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", "/collectoss/facade/") else: # Check if the path is resolveable/make it absolute - logger.verbose(f"Resolving full path to COLLECTOSS_FACADE_REPO_DIRECTORY") + logger.debug(f"Resolving full path to COLLECTOSS_FACADE_REPO_DIRECTORY") SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", str(Path(facade_repo_directory).resolve(strict=True))) # ensure trailing slash is present @@ -109,7 +109,7 @@ def setup_facade_directory(logger): facade_directory = Path(facade_directory_path) if not facade_directory.exists(): - logger.verbose(f"Specified facade directory {facade_directory_path} does not exist. Creating...") + logger.debug(f"Specified facade directory {facade_directory_path} does not exist. 
Creating...") facade_directory.mkdir() git_credentials = facade_directory.joinpath(".git-credentials") @@ -118,7 +118,7 @@ def setup_facade_directory(logger): if not os.access(git_credentials, os.R_OK): logger.error(f"User {getpass.getuser()} does not have permission to write to {git_credentials}. Please select another location") else: - logger.verbose(f"Permission check passed for {git_credentials}") + logger.debug(f"Permission check passed for {git_credentials}") credentials = [] From d1b49a87161ed8217be0b8559c66d9253d89ba12 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 09:51:12 -0400 Subject: [PATCH 148/165] ensure logger is correctly passed and all functions are imported Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index c2dffd4cf..40ae406aa 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,7 +30,7 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, print_platform_information, setup_facade_directory, warn_import_repos +from collectoss.util.startup import check_init_schema, check_update_schema, collect_env_variables, print_platform_information, setup_facade_directory, warn_import_repos, merge_config from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) @@ -69,11 +69,11 @@ def start(ctx, disable_collection, development, pidfile, port): check_init_schema() check_update_schema() - setup_facade_directory() + setup_facade_directory(logger) merge_config(ctx.obj.engine, logger) - warn_import_repos() + warn_import_repos(logger) print_platform_information() From 
8722da4579604837233c2893e497e559e5f0fbe9 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 09:51:21 -0400 Subject: [PATCH 149/165] use logger to output version information Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 2 +- collectoss/util/startup.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 40ae406aa..173bc41a7 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -75,7 +75,7 @@ def start(ctx, disable_collection, development, pidfile, port): warn_import_repos(logger) - print_platform_information() + print_platform_information(logger) try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 392df2df3..0d58bf253 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -238,8 +238,7 @@ def warn_import_repos(logger): logger.warning("To import repos from a CSV, use the CLI: collectoss db add-repos /repos.csv") -def print_platform_information(): - - print(f"PATH: {os.environ.get('PATH')}") - print(f"Python executable (current): {sys.executable}") - print(f"Python version: {platform.python_version()}") \ No newline at end of file +def print_platform_information(logger): + logger.info(f"PATH: {os.environ.get('PATH')}") + logger.info(f"Python executable (current): {sys.executable}") + logger.info(f"Python version: {platform.python_version()}") \ No newline at end of file From 3ce2b1efb9a2b75769bce733f87aeda841bff4a5 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 10:47:57 -0400 Subject: [PATCH 150/165] open facade directory file in write mode Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 0d58bf253..8921df20d 100644 
--- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -138,7 +138,7 @@ def setup_facade_directory(logger): user, key = gl_values credentials.append(f"https://{user}:{key}@gitlab.com") - with git_credentials.open(encoding="utf-8") as c: + with git_credentials.open("w", encoding="utf-8") as c: c.writelines(credentials) subprocess.call(["git", "config", "--global", "credential.helper", "store", "--file", str(git_credentials)]) From 3f7a8bec43128d5a31da0de5bd5b6405e5d01b97 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 11:35:24 -0400 Subject: [PATCH 151/165] fix config bug with new startup process github keys being put in wrong places Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 8921df20d..200691a5f 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -168,11 +168,11 @@ def merge_config( """ github_api_key = github_api_key or SystemEnv.get("COLLECTOSS_GITHUB_API_KEY") - facade_repo_directory = github_api_key or SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") - gitlab_api_key = github_api_key or SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") - redis_conn_string = github_api_key or SystemEnv.get("REDIS_CONN_STRING") - rabbitmq_conn_string = github_api_key or SystemEnv.get("RABBITMQ_CONN_STRING") - logs_directory = github_api_key or SystemEnv.get("COLLECTOSS_LOGS_DIRECTORY") + facade_repo_directory = facade_repo_directory or SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + gitlab_api_key = gitlab_api_key or SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") + redis_conn_string = redis_conn_string or SystemEnv.get("REDIS_CONN_STRING") + rabbitmq_conn_string = rabbitmq_conn_string or SystemEnv.get("RABBITMQ_CONN_STRING") + logs_directory = logs_directory or SystemEnv.get("COLLECTOSS_LOGS_DIRECTORY") keys = {} From 9618ab13ebe703af40b931a95d8fe555d4d895b3 Mon Sep 17 
00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 14:18:57 -0400 Subject: [PATCH 152/165] improve robustness of logger against missing log directories in config Signed-off-by: Adrian Edwards --- collectoss/application/logs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/collectoss/application/logs.py b/collectoss/application/logs.py index 253482877..aaf6cb5d8 100644 --- a/collectoss/application/logs.py +++ b/collectoss/application/logs.py @@ -192,8 +192,8 @@ def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,bas log_config = get_log_config() - if log_config["logs_directory"] != "": - base_log_dir=log_config["logs_directory"] + if log_config.get("logs_directory", "") != "": + base_log_dir=log_config.get("logs_directory") if reset_logfiles is True: try: From b1f52dc67786267dfc7b9f2406ead18720ec5098 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 17:55:41 -0400 Subject: [PATCH 153/165] include logs directory environment variable in default compose Signed-off-by: Adrian Edwards --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index 5673fd5e9..82268e7e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -59,6 +59,7 @@ services: - "COLLECTOSS_DB=postgresql+psycopg2://${COLLECTOSS_DB_USER:-augur}:${COLLECTOSS_DB_PASSWORD:-augur}@database:5432/augur" - "COLLECTOSS_DB_SCHEMA_BUILD=1" - COLLECTOSS_FACADE_REPO_DIRECTORY=/facade + - COLLECTOSS_LOGS_DIRECTORY=/logs - "COLLECTOSS_FLAGS=$COLLECTOSS_FLAGS" - "COLLECTOSS_GITHUB_API_KEY=${COLLECTOSS_GITHUB_API_KEY}" - "COLLECTOSS_GITLAB_API_KEY=${COLLECTOSS_GITLAB_API_KEY}" From 4dff338f29fc0bc1002a5a99d9b89ce61652057f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 9 Jun 2026 18:32:15 -0400 Subject: [PATCH 154/165] correct translation error in git config command Signed-off-by: Adrian Edwards --- collectoss/util/startup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py index 200691a5f..8fe5b2374 100644 --- a/collectoss/util/startup.py +++ b/collectoss/util/startup.py @@ -141,7 +141,7 @@ def setup_facade_directory(logger): with git_credentials.open("w", encoding="utf-8") as c: c.writelines(credentials) - subprocess.call(["git", "config", "--global", "credential.helper", "store", "--file", str(git_credentials)]) + subprocess.call(["git", "config", "--global", "credential.helper", f"store --file {str(git_credentials)}"]) def merge_config( From c04981112980e0921ede15572c3bc42065eb04af Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 11:52:30 -0400 Subject: [PATCH 155/165] lengthen timeout for docker job so that things actually can run to potential completion Temporary workaround until we have a faster DB initialization Signed-off-by: Adrian Edwards --- .github/workflows/build_docker.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index ec40bc11d..8e0da793d 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -178,7 +178,7 @@ jobs: docker compose -f docker-compose.yml up --no-build 2>&1 \ | (./scripts/ci/await_all.py /tmp/regex_matches.txt \ && docker compose -f docker-compose.yml down) - timeout-minutes: 3 + timeout-minutes: 8 env: AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} @@ -294,7 +294,7 @@ jobs: podman compose -f docker-compose.yml up --no-build 2>&1 \ | (./scripts/ci/await_all.py /tmp/regex_matches.txt \ && podman compose -f docker-compose.yml down) - timeout-minutes: 3 + timeout-minutes: 8 env: AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} From 97820fe1cbd4fc54b9ca59da3829fd5f4622f6c6 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 12:52:42 -0400 Subject: [PATCH 
156/165] move startup process to a python based preflight script this should hopefully take care of everything up front like the original scripts did, without creating issues related to the cursed import situation Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 15 --------------- docker/backend/Dockerfile | 4 +++- docker/backend/preflight.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+), 16 deletions(-) create mode 100755 docker/backend/preflight.py diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index 173bc41a7..f71059d1b 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -62,21 +62,6 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGTERM, manager.shutdown_signal_handler) signal.signal(signal.SIGINT, manager.shutdown_signal_handler) - - collect_env_variables(logger) - - - check_init_schema() - check_update_schema() - - setup_facade_directory(logger) - - merge_config(ctx.obj.engine, logger) - - warn_import_repos(logger) - - print_platform_information(logger) - try: if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 65b2ccdbb..9b333b18b 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -113,4 +113,6 @@ RUN ${SCORECARD_DIR}/scorecard version RUN mkdir -p repos/ logs/ /collectoss/facade/ RUN ln -s /cache /collectoss/collectoss/static/cache -CMD ["collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] +COPY --chmod=755 ./docker/backend/preflight.py /preflight.py +RUN chmod +x /preflight.py +CMD ["python3", "/preflight.py", "&", "collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] diff --git a/docker/backend/preflight.py b/docker/backend/preflight.py new file mode 100755 index 000000000..a6a2ccb47 --- /dev/null +++ b/docker/backend/preflight.py @@ -0,0 +1,19 
@@ +from collectoss.util.startup import collect_env_variables, check_init_schema, check_update_schema, setup_facade_directory, merge_config, warn_import_repos, print_platform_information +from collectoss.application.logs import SystemLogger + + +if __name__ == "__main__": + logger = SystemLogger("backend", reset_logfiles=False).get_logger() + + collect_env_variables(logger) + + check_init_schema() + check_update_schema() + + setup_facade_directory(logger) + + merge_config(ctx.obj.engine, logger) + + warn_import_repos(logger) + + print_platform_information(logger) From c804aa0ac4437e8575ef81a643130dedd96b7639 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 14:29:42 -0400 Subject: [PATCH 157/165] pass in correct DB context Signed-off-by: Adrian Edwards --- docker/backend/preflight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/backend/preflight.py b/docker/backend/preflight.py index a6a2ccb47..69de26e42 100755 --- a/docker/backend/preflight.py +++ b/docker/backend/preflight.py @@ -1,6 +1,6 @@ from collectoss.util.startup import collect_env_variables, check_init_schema, check_update_schema, setup_facade_directory, merge_config, warn_import_repos, print_platform_information from collectoss.application.logs import SystemLogger - +from collectoss.application.cli import DatabaseContext if __name__ == "__main__": logger = SystemLogger("backend", reset_logfiles=False).get_logger() @@ -12,7 +12,7 @@ setup_facade_directory(logger) - merge_config(ctx.obj.engine, logger) + merge_config(DatabaseContext().engine, logger) warn_import_repos(logger) From e58b343bda392db47d7a95ccd31754067266ff30 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 14:29:52 -0400 Subject: [PATCH 158/165] exit successfully so the backend can start Signed-off-by: Adrian Edwards --- docker/backend/preflight.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/backend/preflight.py b/docker/backend/preflight.py index 
69de26e42..d5271a3e9 100755 --- a/docker/backend/preflight.py +++ b/docker/backend/preflight.py @@ -1,6 +1,7 @@ from collectoss.util.startup import collect_env_variables, check_init_schema, check_update_schema, setup_facade_directory, merge_config, warn_import_repos, print_platform_information from collectoss.application.logs import SystemLogger from collectoss.application.cli import DatabaseContext +import sys if __name__ == "__main__": logger = SystemLogger("backend", reset_logfiles=False).get_logger() @@ -17,3 +18,5 @@ warn_import_repos(logger) print_platform_information(logger) + + sys.exit(0) From 666605fc56eeaa5150110eb92d33a9bf96b3901f Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 14:30:08 -0400 Subject: [PATCH 159/165] && only works with the shell form of CMD Signed-off-by: Adrian Edwards --- docker/backend/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 9b333b18b..344756a06 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -115,4 +115,5 @@ RUN ln -s /cache /collectoss/collectoss/static/cache COPY --chmod=755 ./docker/backend/preflight.py /preflight.py RUN chmod +x /preflight.py -CMD ["python3", "/preflight.py", "&", "collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] +CMD python3 /preflight.py && collectoss backend start --pidfile /tmp/main.pid +# CMD ["collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] From 5a69838446ed9db77cc5a675d87d0c5b1371d747 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 15:00:15 -0400 Subject: [PATCH 160/165] bypass SystemLogger, which requires a database connection Signed-off-by: Adrian Edwards --- docker/backend/preflight.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docker/backend/preflight.py b/docker/backend/preflight.py index d5271a3e9..4207db8b7 100755 --- a/docker/backend/preflight.py +++ 
b/docker/backend/preflight.py @@ -1,10 +1,23 @@ from collectoss.util.startup import collect_env_variables, check_init_schema, check_update_schema, setup_facade_directory, merge_config, warn_import_repos, print_platform_information -from collectoss.application.logs import SystemLogger +from collectoss.application.logs import getFormatter from collectoss.application.cli import DatabaseContext import sys +import logging if __name__ == "__main__": - logger = SystemLogger("backend", reset_logfiles=False).get_logger() + # We cannot use systemLogger here because it depends on the database + # At this point in execution, the database may not yet be initialized or + # usable for configuration. So for now we DIY it as a temporary measure + # until we can more comprehensively improve the high level configuration system + logger = logging.getLogger("collectoss.preflight") + log_level = logging.INFO + if not logger.handlers: + handler = logging.StreamHandler() + handler.setLevel(log_level) + handler.setFormatter(getFormatter(log_level)) + logger.addHandler(handler) + logger.setLevel(log_level) + logger.propagate = False collect_env_variables(logger) From 51d1f32e21411d93e8e3788c53fe0bfee05d448c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 15:28:56 -0400 Subject: [PATCH 161/165] update env vars used in CI so we are passing correct values to the application Signed-off-by: Adrian Edwards --- .github/workflows/build_docker.yml | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 8e0da793d..5cd90b0ba 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -180,10 +180,10 @@ jobs: && docker compose -f docker-compose.yml down) timeout-minutes: 8 env: - AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} - AUGUR_GITLAB_API_KEY: dummy - AUGUR_GITLAB_USERNAME: dummy 
+ COLLECTOSS_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} + COLLECTOSS_GITHUB_USERNAME: ${{ github.repository_owner }} + COLLECTOSS_GITLAB_API_KEY: dummy + COLLECTOSS_GITLAB_USERNAME: dummy - name: Dump logs # Always run this step to get logs, even if the previous step fails @@ -296,16 +296,17 @@ jobs: && podman compose -f docker-compose.yml down) timeout-minutes: 8 env: - AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} - AUGUR_GITLAB_API_KEY: dummy - AUGUR_GITLAB_USERNAME: dummy + COLLECTOSS_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} + COLLECTOSS_GITHUB_USERNAME: ${{ github.repository_owner }} + COLLECTOSS_GITLAB_API_KEY: dummy + COLLECTOSS_GITLAB_USERNAME: dummy - name: Dump logs # Always run this step to get logs, even if the previous step fails if: always() # We use tail so that we can see the name of each file as it's printed - run: "podman run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs + run: + "podman run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs tail -n +0'" push-image: From bba22afc416da91852bca714a0cf438c90c5b487 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 19 Jan 2026 14:37:36 -0500 Subject: [PATCH 162/165] add health check to postgres to ensure it is done with its thing before CollectOSS starts. 
This avoids some race conditions with the startup process that could create issues, especially on first initialization and especially in CI where we are still replaying migrations Co-Authored-By: guptapratykshh Co-Authored-By: Sukuna0007Abhi Signed-off-by: Adrian Edwards --- docker-compose.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 82268e7e5..1b455a352 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,6 +14,15 @@ services: - "${COLLECTOSS_DB_PORT:-5432}:5432" volumes: - augurpostgres:/var/lib/postgresql/data + healthcheck: + test: + [ + "CMD-SHELL", + "pg_isready -U ${COLLECTOSS_DB_USER:-augur} -d ${COLLECTOSS_DB_NAME:-augur}", + ] + interval: 10s + timeout: 5s + retries: 5 redis: image: "redis:alpine" @@ -74,10 +83,14 @@ services: - CACHE_LOCKDIR=/cache - CELERYBEAT_SCHEDULE_DB=/tmp/celerybeat-schedule.db depends_on: - - database - - redis - - keyman - - rabbitmq + database: + condition: service_healthy + redis: + condition: service_started + keyman: + condition: service_started + rabbitmq: + condition: service_started user: 2345:2345 # Run as an arbitrary non-root user post_start: # Make sure the user has access to the volumes From f8e2d4f6ba098dc1a2b2b6d04a2256f3f2332322 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 18:03:18 -0400 Subject: [PATCH 163/165] remove unused imports Signed-off-by: Adrian Edwards --- collectoss/application/cli/backend.py | 1 - 1 file changed, 1 deletion(-) diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index f71059d1b..3526a3c2c 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -30,7 +30,6 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from collectoss.util.startup import check_init_schema, check_update_schema, 
collect_env_variables, print_platform_information, setup_facade_directory, warn_import_repos, merge_config from keyman.KeyClient import KeyClient, KeyPublisher reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) From 51ce6c868e783a3978d20424fc4db4a389b63d3b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 11:12:15 -0400 Subject: [PATCH 164/165] bump metadata to 1.1rc1 Signed-off-by: Adrian Edwards --- metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata.py b/metadata.py index f93aaa8b9..8b93a183c 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "1.0.0" -__release__ = "v1.0.0" +__version__ = "1.1.0" +__release__ = "v1.1.0-RC1" __author__ = "CollectOSS Community" __license__ = "MIT" From 37000654bbffad1fa63efea15973345a949624fa Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 10 Jun 2026 11:40:04 -0400 Subject: [PATCH 165/165] update docker build tagging rules to not update the `release` tag for releases marked as prerelease Signed-off-by: Adrian Edwards --- .github/workflows/build_docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 5cd90b0ba..ce1c5a19b 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -359,7 +359,7 @@ jobs: # Releases update the *:latest tag and the *: tag tags: | type=raw,value=devel-latest,enable=${{ github.ref == 'refs/heads/main' }} - type=raw,value=latest,enable=${{ github.event_name == 'release' }} + type=raw,value=latest,enable=${{ github.event_name == 'release' && !github.event.release.prerelease }} type=raw,value=${{ github.event.release.tag_name }},enable=${{ github.event_name == 'release' }} - name: Build and push