diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml index 138b175f5..ce8b50bb7 100644 --- a/.github/workflows/bandit.yml +++ b/.github/workflows/bandit.yml @@ -13,12 +13,12 @@ name: Bandit on: push: - branches: [ "main", "release" ] + branches: ["main", "release"] pull_request: # The branches below must be a subset of the branches above - branches: [ "main" ] + branches: ["main"] schedule: - - cron: '24 2 * * 2' + - cron: "24 2 * * 2" jobs: bandit: @@ -30,7 +30,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Bandit Scan - uses: shundor/python-bandit-scan@ab1d87dfccc5a0ffab88be3aaac6ffe35c10d6cd + uses: reactive-firewall/python-bandit-scan@11a72c7c18aab77758bf6f5d9456f1018ec107b0 with: # optional arguments # exit with 0, even with results found exit_zero: true # optional, default is DEFAULT @@ -43,9 +43,9 @@ jobs: # Report only issues of a given confidence level or higher. Can be LOW, MEDIUM or HIGH. Default is UNDEFINED (everything) # confidence: # optional, default is UNDEFINED # comma-separated list of paths (glob patterns supported) to exclude from scan (note that these are in addition to the excluded paths provided in the config file) (default: .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg) - excluded_paths: tests + # excluded_paths: # comma-separated list of test IDs to skip # skips: # optional, default is DEFAULT # path to a .bandit file that supplies command line arguments # ini_path: # optional, default is DEFAULT - + config_path: pyproject.toml diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index ec40bc11d..ce1c5a19b 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -178,12 +178,12 @@ jobs: docker compose -f docker-compose.yml up --no-build 2>&1 \ | (./scripts/ci/await_all.py /tmp/regex_matches.txt \ && docker compose -f docker-compose.yml down) - timeout-minutes: 3 + timeout-minutes: 8 env: - AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - 
AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} - AUGUR_GITLAB_API_KEY: dummy - AUGUR_GITLAB_USERNAME: dummy + COLLECTOSS_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} + COLLECTOSS_GITHUB_USERNAME: ${{ github.repository_owner }} + COLLECTOSS_GITLAB_API_KEY: dummy + COLLECTOSS_GITLAB_USERNAME: dummy - name: Dump logs # Always run this step to get logs, even if the previous step fails @@ -294,18 +294,19 @@ jobs: podman compose -f docker-compose.yml up --no-build 2>&1 \ | (./scripts/ci/await_all.py /tmp/regex_matches.txt \ && podman compose -f docker-compose.yml down) - timeout-minutes: 3 + timeout-minutes: 8 env: - AUGUR_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - AUGUR_GITHUB_USERNAME: ${{ github.repository_owner }} - AUGUR_GITLAB_API_KEY: dummy - AUGUR_GITLAB_USERNAME: dummy + COLLECTOSS_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} + COLLECTOSS_GITHUB_USERNAME: ${{ github.repository_owner }} + COLLECTOSS_GITLAB_API_KEY: dummy + COLLECTOSS_GITLAB_USERNAME: dummy - name: Dump logs # Always run this step to get logs, even if the previous step fails if: always() # We use tail so that we can see the name of each file as it's printed - run: "podman run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs + run: + "podman run -t --rm -v augur_logs:/logs bash -c 'find /logs -type f | xargs tail -n +0'" push-image: @@ -358,7 +359,7 @@ jobs: # Releases update the *:latest tag and the *: tag tags: | type=raw,value=devel-latest,enable=${{ github.ref == 'refs/heads/main' }} - type=raw,value=latest,enable=${{ github.event_name == 'release' }} + type=raw,value=latest,enable=${{ github.event_name == 'release' && !github.event.release.prerelease }} type=raw,value=${{ github.event.release.tag_name }},enable=${{ github.event_name == 'release' }} - name: Build and push diff --git a/.github/workflows/functional_test.yml b/.github/workflows/functional_test.yml index eaa50adf3..5ec4dc2b9 100644 --- a/.github/workflows/functional_test.yml +++ 
b/.github/workflows/functional_test.yml @@ -27,5 +27,4 @@ jobs: - name: Run Tests run: | uv run --python ${{ matrix.env }} pytest \ - tests/test_classes \ --color=yes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3d9182b26..ad19e84e3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,7 +9,7 @@ These resources are a great way to meet the people behind the project, ask quest ## Learn about the project -If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://collectoss.readthedocs.io/en/release/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. +If you aren't already familiar with what CollectOSS is, please make sure you've read the [README](README.md) to get a primer on our project, and maybe take a look around the [documentation](https://docs.collectoss.org/en/release/) so you know what we are about. You can also hang out in Slack or join our community meetings to learn more about what we do. ## Opening an issue If you're experiencing an issue with CollectOSS you can search for your problem or question on our [issues](https://github.com/chaoss/collectoss/issues) page to see if someone else has already reported it. If you cannot find your issue, please feel free to [open a new one](https://github.com/chaoss/collectoss/issues/new/choose). 
@@ -53,7 +53,7 @@ Github has an article called [Syncing a fork](https://docs.github.com/en/pull-re ## Helpful Links -- [CollectOSS stable documentation](https://collectoss.readthedocs.io/en/release/) +- [CollectOSS stable documentation](https://docs.collectoss.org/en/release/) - [CHAOSS Getting Started page](https://chaoss.community/kb-getting-started/) **Git & GitHub** diff --git a/CREDITS.md b/CREDITS.md index 238478d63..3305fd3ca 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -13,6 +13,8 @@ The list of current CollectOSS maintainers can be found in the [MAINTAINERS](./M Augur has been supported by the University of Missouri through funding provided by the Alfred P. Sloan Foundation, Mozilla, The Reynolds Journalism Institute with contributions from VMWare, Red Hat LLC, Grace Hopper's Open Source Day, GitHub, Microsoft, Twitter, Adobe, the Gluster Project, Open Source Summit (NA/Europe), and the Linux Foundation Compliance Summit. +Augur has also been supported by the University of Nebraska at Omaha. + Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien, Remy Decausemaker, Google Summer of Code Students, and others including: ### Maintainers @@ -43,6 +45,18 @@ Significant design contributors include Kate Stewart, Dawn Foster, Duane O'Brien - [Gary P. 
White](https://github.com/garypwhite) - [Shlok Gilda](https://github.com/shlokgilda) +### Credited Copyright Holders +These names came from the LICENSE.md file in the Augur project: +- Matt Germonprez +- Sean Goggins +- Gabe Heim +- Derek Howard +- Carter Landis +- Matt Snell +- Brian Warner +- University of Nebraska at Omaha +- University of Missouri + ### GSoC 2025 Participants - [Akshat Baranwal](https://github.com/akshatb2006) - [Asish Kumar](https://github.com/officialasishkumar) diff --git a/GOVERNANCE.md b/GOVERNANCE.md index bdd6f2560..4b01e769a 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -121,7 +121,7 @@ While most business in CollectOSS is conducted by "[lazy consensus](https://comm periodically the Maintainers may need to vote on specific actions or changes. A vote can be taken on the project's public Slack channel (#wg-collectoss-8knot in the [CHAOSS Slack](https://chaoss.community/kb-getting-started/)) or the private Maintainer Slack channel for security or conduct matters. -Votes may also be taken at the biweely developer meeting. Any Maintainer may +Votes may also be taken at the biweekly developer meeting. Any Maintainer may demand a vote be taken. Most votes require a simple majority of all Maintainers to succeed, except where diff --git a/README.md b/README.md index ee4782dce..983fdfe66 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ CollectOSS's main focus is to measure the overall health and sustainability of o The data CollectOSS collects covers more than just code contributions and extends to anything that can be derived from forge data, including comments, change reviews, releases, and other project activity or interactions. This data is stored in a relational database (PostgreSQL), enabling large-scale data aggregation across any number of repositories to provide context about the way these communities evolve. -CollectOSS is part of [CHAOSS](https://chaoss.community), which is a Linux Foundation® project. 
Many of our metrics are implementations of the [metrics](https://chaoss.community/metrics/) defined by the CHAOSS community. +CollectOSS is part of [CHAOSS](https://chaoss.community), which is a Linux Foundation® project. Many of our metrics are implementations of the [metrics](https://chaoss.community/kb-metrics-and-metrics-models/) defined by the CHAOSS community. ## Versions and support CollectOSS is a Python project distributed via container images and aims to support all currently-supported versions of Python on macOS and Linux platforms. Docker is the primary supported container runtime, but Podman is also supported and used by some maintainers, although it requires configuring some extra permissions to run correctly. @@ -25,7 +25,7 @@ Basic initial setup can be completed in a few minutes as follows: 3. Copy the `environment.txt` file to a new file called `.env` and fill in values for the required variables 4. Run `docker compose up` to start the containers -Check out the [CollectOSS Documentation](https://collectoss.readthedocs.io) for more detailed setup instructions and troubleshooting steps. +Check out the [CollectOSS Documentation](https://docs.collectoss.org) for more detailed setup instructions and troubleshooting steps. ## Contributing We strongly believe that communities are what makes open source so impactful. We invite you to join our community, regardless of your experience level or coding abilities! 
diff --git a/collectoss/api/gunicorn_conf.py b/collectoss/api/gunicorn_conf.py index 22c11231a..ee7797471 100644 --- a/collectoss/api/gunicorn_conf.py +++ b/collectoss/api/gunicorn_conf.py @@ -7,6 +7,7 @@ from collectoss.application.db.lib import get_value from collectoss.application.db import dispose_database_engine +from collectoss.application.environment import SystemEnv logger = logging.getLogger(__name__) @@ -20,8 +21,8 @@ workers = multiprocessing.cpu_count() * 2 + 1 umask = 0o007 reload = True - -is_dev = os.getenv("AUGUR_DEV", 'False').lower() in ('true', '1', 't', 'y', 'yes') +# this satisfies the type checker +is_dev = SystemEnv.get_bool("AUGUR_DEV", False) if is_dev: @@ -40,7 +41,8 @@ # set the log location for gunicorn logs_directory = get_value('Logging', 'logs_directory') -is_docker = os.getenv("AUGUR_DOCKER_DEPLOY", 'False').lower() in ('true', '1', 't', 'y', 'yes') +# this syntax satisfies the type checker +is_docker = SystemEnv.get_bool("AUGUR_DOCKER_DEPLOY", False) accesslog = f"{logs_directory}/gunicorn.log" errorlog = f"{logs_directory}/gunicorn.log" diff --git a/collectoss/api/metrics/commit.py b/collectoss/api/metrics/commit.py index de2c84809..8b4227f71 100644 --- a/collectoss/api/metrics/commit.py +++ b/collectoss/api/metrics/commit.py @@ -231,7 +231,7 @@ def annual_commit_count_ranked_by_repo_in_repo_group(repo_group_id, repo_id=None if timeframe == 'all': cdRgTpRankedCommitsSQL = s.sql.text(""" SELECT repo.repo_id, repo_name as name, SUM(added - removed - whitespace) as net, patches - FROM augur_data.dm_repo_annual, repo, repo_groups + FROM data.dm_repo_annual, repo, repo_groups WHERE repo.repo_group_id = :repo_group_id AND repo.repo_group_id = repo_groups.repo_group_id AND dm_repo_annual.repo_id = repo.repo_id diff --git a/collectoss/api/metrics/deps.py b/collectoss/api/metrics/deps.py index ef13aee7d..5f1162fd3 100644 --- a/collectoss/api/metrics/deps.py +++ b/collectoss/api/metrics/deps.py @@ -33,13 +33,13 @@ def deps(repo_group_id, 
repo_id=None, period='day', begin_date=None, end_date=No depsSQL = s.sql.text(""" SELECT - augur_data.repo_dependencies.*, - augur_data.repo_groups.repo_group_id + data.repo_dependencies.*, + data.repo_groups.repo_group_id FROM - augur_data.repo_dependencies, - augur_data.repo_groups, - augur_data.repo, - ( SELECT MAX ( date_trunc( 'day', augur_data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies WHERE repo_id = repo_id ) C + data.repo_dependencies, + data.repo_groups, + data.repo, + ( SELECT MAX ( date_trunc( 'day', data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies WHERE repo_id = repo_id ) C WHERE repo_dependencies.repo_id = repo.repo_id AND repo.repo_group_id = repo_groups.repo_group_id @@ -54,13 +54,13 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No depsSQL = s.sql.text(""" SELECT - augur_data.repo_dependencies.*, - augur_data.repo_groups.repo_group_id + data.repo_dependencies.*, + data.repo_groups.repo_group_id FROM - augur_data.repo_dependencies, - augur_data.repo_groups, - augur_data.repo, - ( SELECT MAX ( date_trunc( 'day', augur_data.repo_dependencies.data_collection_date ) ) AS data_collection_date + data.repo_dependencies, + data.repo_groups, + data.repo, + ( SELECT MAX ( date_trunc( 'day', data.repo_dependencies.data_collection_date ) ) AS data_collection_date FROM repo_dependencies, repo, repo_groups WHERE repo.repo_group_id = repo_groups.repo_group_id and repo_dependencies.repo_id = repo.repo_id and @@ -134,8 +134,8 @@ def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date f.libyear, f.data_collection_date FROM - ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, - augur_data.repo_deps_libyear f + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM 
data.repo_deps_libyear WHERE repo_id = :repo_id GROUP BY repo_id, NAME ORDER BY NAME ) e, + data.repo_deps_libyear f WHERE e.data_collection_date = f.data_collection_date and e.repo_id = f.repo_id @@ -203,8 +203,8 @@ def libyear(repo_group_id, repo_id=None, period='day', begin_date=None, end_date f.libyear, f.data_collection_date FROM - ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM augur_data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, - augur_data.repo_deps_libyear f + ( SELECT repo_id, NAME, MAX ( data_collection_date ) AS data_collection_date FROM data.repo_deps_libyear GROUP BY repo_id, NAME ORDER BY NAME ) e, + data.repo_deps_libyear f WHERE e.data_collection_date = f.data_collection_date and e.repo_id = f.repo_id diff --git a/collectoss/api/metrics/message.py b/collectoss/api/metrics/message.py index f76aabd28..15256a30c 100644 --- a/collectoss/api/metrics/message.py +++ b/collectoss/api/metrics/message.py @@ -40,12 +40,12 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en COUNT ( * ), repo_name FROM - augur_data.repo, - augur_data.message + data.repo, + data.message WHERE - augur_data.repo.repo_id = augur_data.message.repo_id + data.repo.repo_id = data.message.repo_id AND - augur_data.repo.repo_id = :repo_id + data.repo.repo_id = :repo_id AND message.msg_timestamp BETWEEN :begin_date AND :end_date GROUP BY @@ -69,14 +69,14 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en COUNT ( * ), rg_name FROM - augur_data.repo, - augur_data.repo_groups, - augur_data.message + data.repo, + data.repo_groups, + data.message WHERE - augur_data.repo.repo_id = augur_data.message.repo_id - AND augur_data.repo_groups.repo_group_id = repo.repo_group_id + data.repo.repo_id = data.message.repo_id + AND data.repo_groups.repo_group_id = repo.repo_group_id AND - augur_data.repo_groups.repo_group_id = :repo_group_id + data.repo_groups.repo_group_id = :repo_group_id 
AND message.msg_timestamp BETWEEN :begin_date AND :end_date GROUP BY diff --git a/collectoss/api/metrics/pull_request.py b/collectoss/api/metrics/pull_request.py index 20d6be893..7f98ccee6 100644 --- a/collectoss/api/metrics/pull_request.py +++ b/collectoss/api/metrics/pull_request.py @@ -787,8 +787,8 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo pr_merged_at, pr_closed_at, pr_created_at - FROM augur_data.pull_request_commits, augur_data.pull_request_meta,augur_data.repo_groups, - augur_data.pull_requests JOIN repo ON pull_requests.repo_id = repo.repo_id + FROM data.pull_request_commits, data.pull_request_meta,data.repo_groups, + data.pull_requests JOIN repo ON pull_requests.repo_id = repo.repo_id WHERE pull_requests.repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) AND pull_requests.pull_request_id = pull_request_commits.pull_request_id @@ -821,7 +821,7 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo pr_merged_at, pr_closed_at, pr_created_at - FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + FROM data.pull_request_commits, data.pull_requests, data.pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id AND pull_requests.repo_id = :repo_id diff --git a/collectoss/api/metrics/repo_meta.py b/collectoss/api/metrics/repo_meta.py index 7c4129081..c9c5f8905 100644 --- a/collectoss/api/metrics/repo_meta.py +++ b/collectoss/api/metrics/repo_meta.py @@ -190,7 +190,7 @@ def sbom_download(repo_group_id, repo_id=None): :return: dosocs sbom """ dosocs_SQL = s.sql.text(""" - select * from augur_data.repo_sbom_scans + select * from data.repo_sbom_scans where repo_id = :repo_id; """) @@ -313,7 +313,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): if not repo_id: cii_best_practices_badge_SQL = s.sql.text(""" SELECT data 
- FROM augur_data.repo_badging + FROM data.repo_badging WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) ORDER BY created_at DESC LIMIT 1 @@ -321,7 +321,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): else: cii_best_practices_badge_SQL = s.sql.text(""" SELECT data - FROM augur_data.repo_badging + FROM data.repo_badging WHERE repo_id = :repo_id ORDER BY created_at DESC LIMIT 1 @@ -1270,7 +1270,7 @@ def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): clone_data_timestamp AS date, count_clones AS total_clones, unique_clones - FROM augur_data.repo_clones_data + FROM data.repo_clones_data WHERE repo_id = :repo_id AND clone_data_timestamp BETWEEN :begin_date AND :end_date ORDER BY clone_data_timestamp @@ -1289,9 +1289,9 @@ def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): clone_data_timestamp AS date, count_clones AS total_clones, unique_clones - FROM augur_data.repo_clones_data + FROM data.repo_clones_data WHERE repo_id IN ( - SELECT repo_id FROM augur_data.repo WHERE repo_group_id = :repo_group_id + SELECT repo_id FROM data.repo WHERE repo_group_id = :repo_group_id ) AND clone_data_timestamp BETWEEN :begin_date AND :end_date ORDER BY repo_id, clone_data_timestamp diff --git a/collectoss/api/metrics/toss.py b/collectoss/api/metrics/toss.py index 698b4cf31..620b79935 100644 --- a/collectoss/api/metrics/toss.py +++ b/collectoss/api/metrics/toss.py @@ -114,7 +114,7 @@ def toss_repo_info(repo_id): repo_info.default_branch, repo.repo_git FROM - augur_data.repo_info + data.repo_info JOIN repo ON repo.repo_id = repo_info.repo_id WHERE repo_info.repo_id = :repo_id diff --git a/collectoss/api/routes/auggie.py b/collectoss/api/routes/auggie.py index 18642498f..4cde77084 100644 --- a/collectoss/api/routes/auggie.py +++ b/collectoss/api/routes/auggie.py @@ -14,6 +14,8 @@ import requests import slack +from collectoss.application.environment import SystemEnv + from ..server import app @@ 
-252,7 +254,7 @@ def get_auggie_user(): # return Response(response=response, status=200, mimetype="application/json") ## From Method profile_name = 'collectoss' - if os.environ.get('AUGUR_IS_PROD'): + if SystemEnv.get('COLLECTOSS_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') response = client.get_item( @@ -278,7 +280,7 @@ def update_auggie_user_tracking(): # return Response(response=response, status=200, mimetype="application/json") ## From Method profile_name = 'collectoss' - if os.environ.get('AUGUR_IS_PROD'): + if SystemEnv.get('COLLECTOSS_IS_PROD'): profile_name = 'default' client = boto3.Session(region_name='us-east-1', profile_name=profile_name).client('dynamodb') response = client.update_item( @@ -326,7 +328,7 @@ def slack_login(): print("slack_login") r = requests.get( - url=f'https://slack.com/api/oauth.v2.access?code={body["code"]}&client_id={os.environ["AUGGIE_CLIENT_ID"]}&client_secret={os.environ["AUGGIE_CLIENT_SECRET"]}&redirect_uri=http%3A%2F%2Flocalhost%3A8080') + url=f'https://slack.com/api/oauth.v2.access?code={body["code"]}&client_id={SystemEnv.get("AUGGIE_CLIENT_ID")}&client_secret={SystemEnv.get("AUGGIE_CLIENT_SECRET")}&redirect_uri=http%3A%2F%2Flocalhost%3A8080') data = r.json() if (data["ok"]): @@ -340,7 +342,7 @@ def slack_login(): email = user_response["user"]["email"] profile_name = 'collectoss' - if os.environ.get('AUGUR_IS_PROD'): + if SystemEnv.get('COLLECTOSS_IS_PROD'): profile_name = 'default' print("Making Boto3 Session") client = boto3.Session(region_name='us-east-1', diff --git a/collectoss/api/routes/collection_status.py b/collectoss/api/routes/collection_status.py index eaa374f4c..2a5e42675 100644 --- a/collectoss/api/routes/collection_status.py +++ b/collectoss/api/routes/collection_status.py @@ -61,10 +61,10 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? 
( CAST (( COUNT ( * )) +1 AS DOUBLE PRECISION ) / CAST ( b.issues_count + 1 AS DOUBLE PRECISION )) AS ratio_issues FROM - augur_data.repo A, - augur_data.issues d, - augur_data.repo_info b, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, + data.repo A, + data.issues d, + data.repo_info b, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, ( SELECT repo_id, MAX ( data_collection_date ) AS most_recently_collected_issue FROM issues GROUP BY repo_id ORDER BY repo_id ) f WHERE A.repo_id = b.repo_id @@ -135,11 +135,11 @@ def pull_request_collection_status(): # TODO: make this name automatic - wrappe ABS ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_abs, ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_issues FROM - augur_data.repo A, - augur_data.pull_requests d, - augur_data.repo_info b, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM augur_data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f + data.repo A, + data.pull_requests d, + data.repo_info b, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM data.repo_info GROUP BY repo_id ORDER BY repo_id ) e, + ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f WHERE A.repo_id = b.repo_id AND LOWER ( A.repo_git ) LIKE'%github.com%' diff --git a/collectoss/api/routes/complexity.py b/collectoss/api/routes/complexity.py index 11fbf5ebe..8cfe799de 100644 --- a/collectoss/api/routes/complexity.py +++ b/collectoss/api/routes/complexity.py @@ -17,13 +17,13 @@ def 
get_project_languages(): project_languages_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.programming_language, e.code_lines, e.files FROM - augur_data.repo, + data.repo, (SELECT d.repo_id, d.programming_language, @@ -31,22 +31,22 @@ def get_project_languages(): COUNT(*)::int AS files FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.programming_language, - augur_data.repo_labor.code_lines + data.repo_labor.repo_id, + data.repo_labor.programming_language, + data.repo_labor.code_lines FROM - augur_data.repo_labor, + data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id, d.programming_language) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) @@ -62,30 +62,30 @@ def get_project_files(): project_files_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.files FROM - augur_data.repo, + data.repo, (SELECT d.repo_id, count(*) AS files FROM (SELECT - augur_data.repo_labor.repo_id + data.repo_labor.repo_id FROM - augur_data.repo_labor, + data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY 
data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) @@ -103,33 +103,33 @@ def get_project_lines(): project_lines_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.total_lines, e.average_lines FROM - augur_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.total_lines) AS total_lines, AVG(d.total_lines)::INT AS average_lines FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.total_lines + data.repo_labor.repo_id, + data.repo_labor.total_lines FROM - augur_data.repo_labor, + data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id and augur_data.repo.repo_id = :repo_id_param + WHERE data.repo.repo_id = e.repo_id and data.repo.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -147,33 +147,33 @@ def get_project_comment_lines(): comment_lines_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + 
data.repo.repo_git, + data.repo.repo_name, e.comment_lines, e.avg_comment_lines FROM - augur_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.comment_lines) AS comment_lines, AVG(d.comment_lines)::INT AS avg_comment_lines FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.comment_lines + data.repo_labor.repo_id, + data.repo_labor.comment_lines FROM - augur_data.repo_labor, + data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id AND e.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -192,33 +192,33 @@ def get_project_blank_lines(): blank_lines_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.blank_lines, e.avg_blank_lines FROM - augur_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.blank_lines) AS blank_lines, AVG(d.blank_lines)::int AS avg_blank_lines FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.blank_lines + data.repo_labor.repo_id, + data.repo_labor.blank_lines FROM - augur_data.repo_labor, + data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - 
augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id AND e.repo_id = :repo_id_param ORDER BY e.repo_id """).bindparams(repo_id_param=repo_id) @@ -236,33 +236,33 @@ def get_project_file_complexity(): project_file_complexity_sql = s.sql.text(""" SELECT e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, + data.repo.repo_git, + data.repo.repo_name, e.sum_code_complexity, e.average_code_complexity FROM - augur_data.repo, + data.repo, (SELECT d.repo_id, SUM(d.code_complexity) AS sum_code_complexity, AVG(d.code_complexity)::int AS average_code_complexity FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.code_complexity + data.repo_labor.repo_id, + data.repo_labor.code_complexity FROM - augur_data.repo_labor, + data.repo_labor, ( SELECT - augur_data.repo_labor.repo_id, + data.repo_labor.repo_id, MAX ( data_collection_date ) AS last_collected FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent + data.repo_labor + GROUP BY data.repo_labor.repo_id) recent WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + data.repo_labor.repo_id = recent.repo_id + AND data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE data.repo.repo_id = e.repo_id ORDER BY e.repo_id """) diff --git a/collectoss/api/routes/dei.py b/collectoss/api/routes/dei.py index 64af957bf..5e18dceb8 100644 --- a/collectoss/api/routes/dei.py +++ b/collectoss/api/routes/dei.py @@ -21,7 +21,7 @@ logger = 
logging.getLogger(__name__) from collectoss.api.routes import API_VERSION -from collectoss.application.db.models.augur_operations import FRONTEND_REPO_GROUP_NAME +from collectoss.application.db.models.operations import FRONTEND_REPO_GROUP_NAME @app.route(f"/{API_VERSION}/dei/repo/add", methods=['POST']) @ssl_required diff --git a/collectoss/api/routes/metadata.py b/collectoss/api/routes/metadata.py index edd65f595..bbdc94f62 100644 --- a/collectoss/api/routes/metadata.py +++ b/collectoss/api/routes/metadata.py @@ -31,7 +31,7 @@ def get_repo_info(): FROM repo_info, repo, - ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e + ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM data.repo_info GROUP BY repo_id ORDER BY repo_id ) e WHERE repo_info.repo_id = repo.repo_id AND e.repo_id = repo_info.repo_id diff --git a/collectoss/api/server.py b/collectoss/api/server.py index 2c71dfcd1..7955cd7a1 100644 --- a/collectoss/api/server.py +++ b/collectoss/api/server.py @@ -27,6 +27,7 @@ from collectoss.application.logs import SystemLogger from collectoss.application.db.session import DatabaseSession from collectoss.application.config import SystemConfig +from collectoss.application.environment import SystemEnv from collectoss.application.db.engine import get_database_string, create_database_engine from collectoss.application.db.models import Repo, Issue, PullRequest, Message, PullRequestReview, Commit, IssueAssignee, PullRequestAssignee, PullRequestCommit, PullRequestFile, Contributor, IssueLabel, PullRequestLabel, ContributorsAlias, Release, ClientApplication @@ -300,8 +301,8 @@ def create_cache_manager() -> CacheManager: cache_config = { 'cache.type': 'file', # Allow setting cache directories via environment variables - 'cache.data_dir': Path(env.setdefault("CACHE_DATADIR", 'runtime/cache/')), - 'cache.lock_dir': Path(env.setdefault("CACHE_LOCKDIR", 'runtime/cache/')), + 
'cache.data_dir': Path(SystemEnv.set_default("CACHE_DATADIR", 'runtime/cache/')), + 'cache.lock_dir': Path(SystemEnv.set_default("CACHE_LOCKDIR", 'runtime/cache/')), } if not os.path.exists(cache_config['cache.data_dir']): @@ -329,7 +330,7 @@ def get_server_cache(cache_manager) -> Cache: logger = SystemLogger("server").get_logger() url = get_database_string() -engine = create_database_engine(url, poolclass=StaticPool) +engine = create_database_engine(url, poolclass=StaticPool, connect_args={"application_name": f"collectoss v{code_version} api"}) db_session = DatabaseSession(logger, engine) system_config = SystemConfig(logger, db_session) diff --git a/collectoss/api/view/init.py b/collectoss/api/view/init.py index ab4708793..b26752af9 100644 --- a/collectoss/api/view/init.py +++ b/collectoss/api/view/init.py @@ -1,13 +1,10 @@ -import os from pathlib import Path -from .server import Environment from collectoss.application.logs import SystemLogger import secrets, yaml - -env = Environment() +from collectoss.application.environment import SystemEnv # load configuration files and initialize globals -configFile = Path(env.setdefault("CONFIG_LOCATION", "config.yml")) +configFile = Path(SystemEnv.get("CONFIG_LOCATION") or "config.yml") settings = {} diff --git a/collectoss/api/view/server/Environment.py b/collectoss/api/view/server/Environment.py deleted file mode 100644 index 76b8207ca..000000000 --- a/collectoss/api/view/server/Environment.py +++ /dev/null @@ -1,52 +0,0 @@ -import os - -class Environment: - """ - This class is used to make dealing with environment variables easier. It - allows you to set multiple environment variables at once, and to get items - with subscript notation without needing to deal with the particularities of - non-existent values. 
- """ - def __init__(self, **kwargs): - for (key, value) in kwargs.items(): - self[key] = value - - def setdefault(self, key, value): - if not self[key]: - self[key] = value - return value - return self[key] - - def setall(self, **kwargs): - result = {} - for (key, value) in kwargs.items(): - if self[key]: - result[key] = self[key] - self[key] = value - - def getany(self, *args): - result = {} - for arg in args: - if self[arg]: - result[arg] = self[arg] - return result - - def as_type(self, type, key): - if self[key]: - return type(self[key]) - return None - - def __getitem__(self, key): - return os.getenv(key) - - def __setitem__(self, key, value): - os.environ[key] = str(value) - - def __len__(self)-> int: - return len(os.environ) - - def __str__(self)-> str: - return str(os.environ) - - def __iter__(self): - return (item for item in os.environ.items()) \ No newline at end of file diff --git a/collectoss/api/view/server/__init__.py b/collectoss/api/view/server/__init__.py index e919a597a..98ce903be 100644 --- a/collectoss/api/view/server/__init__.py +++ b/collectoss/api/view/server/__init__.py @@ -1,2 +1 @@ -from .LoginException import LoginException -from .Environment import Environment \ No newline at end of file +from .LoginException import LoginException \ No newline at end of file diff --git a/collectoss/application/cli/__init__.py b/collectoss/application/cli/__init__.py index 8081d6a8e..444473016 100644 --- a/collectoss/application/cli/__init__.py +++ b/collectoss/application/cli/__init__.py @@ -10,7 +10,9 @@ from collectoss.application.db.engine import DatabaseEngine from collectoss.application.db import get_engine, dispose_database_engine -from sqlalchemy.exc import OperationalError +from sqlalchemy.exc import OperationalError +from collectoss.application.environment import SystemEnv + def check_connectivity(urls=["http://chaoss.community", "http://github.com", "http://gitlab.com"], timeout=10.0): @@ -65,11 +67,11 @@ def new_func(ctx, *args, **kwargs): 
return ctx.invoke(function_db_connection, *args, **kwargs) except OperationalError as e: - db_environment_var = os.getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") # determine the location to print in error string if db_environment_var: - location = f"the AUGUR_DB environment variable\nAUGUR_DB={os.getenv('AUGUR_DB')}" + location = f"the COLLECTOSS_DB environment variable\nCOLLECTOSS_DB={SystemEnv.get('COLLECTOSS_DB')}" else: with open("db.config.json", 'r') as f: db_config = json.load(f) diff --git a/collectoss/application/cli/_multicommand.py b/collectoss/application/cli/_multicommand.py index 13186e7bb..06aae01de 100644 --- a/collectoss/application/cli/_multicommand.py +++ b/collectoss/application/cli/_multicommand.py @@ -11,7 +11,7 @@ from pathlib import Path # import collectoss.application -CONTEXT_SETTINGS = dict(auto_envvar_prefix='AUGUR') +CONTEXT_SETTINGS = dict(auto_envvar_prefix='COLLECTOSS') class CLIMultiCommand(click.MultiCommand): def __commands_folder(self): diff --git a/collectoss/application/cli/api.py b/collectoss/application/cli/api.py index a8bb9e53b..0c567c590 100644 --- a/collectoss/application/cli/api.py +++ b/collectoss/application/cli/api.py @@ -17,6 +17,8 @@ from collectoss.application.cli import test_connection, test_db_connection, with_database, DatabaseContext from collectoss.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages from collectoss.application.db.lib import get_value +from collectoss.application.environment import SystemEnv + logger = SystemLogger("collectoss", reset_logfiles=False).get_logger() @@ -36,7 +38,7 @@ def start(ctx, development, port): """Start CollectOSS's backend server.""" try: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( @@ -46,7 +48,7 @@ def start(ctx, development, port): raise e 
if development: - os.environ["AUGUR_DEV"] = "1" + SystemEnv.set("AUGUR_DEV", "1") logger.info("Starting in development mode") try: @@ -142,7 +144,7 @@ def get_api_processes(): def is_api_process(process): command = ''.join(process.info['cmdline'][:]).lower() - if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: + if SystemEnv.get('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: if process.pid != os.getpid(): diff --git a/collectoss/application/cli/backend.py b/collectoss/application/cli/backend.py index a07ddf198..3526a3c2c 100644 --- a/collectoss/application/cli/backend.py +++ b/collectoss/application/cli/backend.py @@ -15,6 +15,7 @@ import requests from redis.exceptions import ConnectionError as RedisConnectionError +from collectoss.application.environment import SystemEnv from collectoss.tasks.start_tasks import collection_monitor, create_collection_status_records from collectoss.tasks.git.facade_tasks import clone_repos from collectoss.tasks.github.contributors import process_contributors @@ -31,7 +32,7 @@ from keyman.KeyClient import KeyClient, KeyPublisher -reset_logs = os.getenv("AUGUR_RESET_LOGS", 'True').lower() in ('true', '1', 't', 'y', 'yes') +reset_logs = SystemEnv.get_bool("AUGUR_RESET_LOGS", True) logger = SystemLogger("collectoss", reset_logfiles=reset_logs).get_logger() @@ -61,7 +62,7 @@ def start(ctx, disable_collection, development, pidfile, port): signal.signal(signal.SIGINT, manager.shutdown_signal_handler) try: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( @@ -71,10 +72,10 @@ def start(ctx, disable_collection, development, pidfile, port): raise e if development: - os.environ["AUGUR_DEV"] = "1" + SystemEnv.set("AUGUR_DEV", "1") logger.info("Starting in development mode") - os.environ["AUGUR_PIDFILE"] = pidfile + SystemEnv.set("AUGUR_PIDFILE", 
pidfile) try: gunicorn_location = os.getcwd() + "/collectoss/api/gunicorn_conf.py" @@ -86,10 +87,10 @@ def start(ctx, disable_collection, development, pidfile, port): if not port: port = get_value("Server", "port") - os.environ["AUGUR_PORT"] = str(port) + SystemEnv.set("AUGUR_PORT", str(port)) if disable_collection: - os.environ["AUGUR_DISABLE_COLLECTION"] = "1" + SystemEnv.set("AUGUR_DISABLE_COLLECTION", "1") core_worker_count = get_value("Celery", 'core_worker_count') secondary_worker_count = get_value("Celery", 'secondary_worker_count') @@ -130,7 +131,7 @@ def start(ctx, disable_collection, development, pidfile, port): processes = start_celery_worker_processes((core_worker_count, secondary_worker_count, facade_worker_count), disable_collection) manager.processes = processes - celery_beat_schedule_db = os.getenv("CELERYBEAT_SCHEDULE_DB", "celerybeat-schedule.db") + celery_beat_schedule_db = SystemEnv.get("CELERYBEAT_SCHEDULE_DB", "celerybeat-schedule.db") if os.path.exists(celery_beat_schedule_db): logger.info("Deleting old task schedule") os.remove(celery_beat_schedule_db) @@ -144,7 +145,7 @@ def start(ctx, disable_collection, development, pidfile, port): manager.keypub = keypub if not disable_collection: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": orchestrator = subprocess.Popen("python keyman/Orchestrator.py".split()) # Wait for orchestrator startup @@ -340,7 +341,7 @@ def stop_processes(signal, logger, engine): def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.user_repos) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM operations.user_repos) """) repos = session.execute_sql(query).fetchall() @@ -355,10 +356,10 @@ def export_env(config): Exports your GitHub key and database credentials """ - export_file = open(os.getenv('AUGUR_EXPORT_FILE', 'collectoss_export_env.sh'), 
'w+') + export_file = open(SystemEnv.get('COLLECTOSS_EXPORT_FILE') or 'collectoss_export_env.sh', 'w+') export_file.write('#!/bin/bash') export_file.write('\n') - env_file = open(os.getenv('AUGUR_ENV_FILE', 'docker_env.txt'), 'w+') + env_file = open(SystemEnv.get('COLLECTOSS_ENV_FILE') or 'docker_env.txt', 'w+') for env_var in config.get_env_config().items(): if "LOG" not in env_var[0]: @@ -377,16 +378,16 @@ def repo_reset(backend_app): Refresh repo collection to force data collection """ backend_app.database.execute(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; - TRUNCATE augur_data.commits CASCADE; + TRUNCATE data.commits CASCADE; """)) logger.info("Repos successfully reset") @@ -403,7 +404,7 @@ def get_backend_processes(): for process in psutil.process_iter(['cmdline', 'name', 'environ']): if process.info['cmdline'] is not None and process.info['environ'] is not None: try: - if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in ''.join(process.info['cmdline'][:]).lower(): + if SystemEnv.get('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in ''.join(process.info['cmdline'][:]).lower(): if process.pid != os.getpid(): process_list.append(process) except (KeyError, FileNotFoundError): diff --git a/collectoss/application/cli/collection.py b/collectoss/application/cli/collection.py index b1a93ce80..adf4b50e8 100644 --- a/collectoss/application/cli/collection.py +++ b/collectoss/application/cli/collection.py @@ -14,6 +14,7 @@ import traceback import 
sqlalchemy as s +from collectoss.application.environment import SystemEnv from collectoss.tasks.start_tasks import collection_monitor, create_collection_status_records from collectoss.tasks.git.facade_tasks import clone_repos from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler @@ -45,7 +46,7 @@ def start(ctx, development): """Start CollectOSS's backend server.""" try: - if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": raise_open_file_limit(100000) except Exception as e: logger.error( @@ -75,7 +76,7 @@ def start(ctx, development): keypub.publish(key, "gitlab_rest") if development: - os.environ["AUGUR_DEV"] = "1" + SystemEnv.set("AUGUR_DEV", "1") logger.info("Starting in development mode") core_worker_count = get_value("Celery", 'core_worker_count') @@ -202,16 +203,16 @@ def repo_reset(ctx): """ with ctx.obj.engine.connect() as connection: connection.execute(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; - TRUNCATE augur_data.commits CASCADE; + TRUNCATE data.commits CASCADE; """)) logger.info("Repos successfully reset") @@ -237,7 +238,7 @@ def get_collection_processes(): def is_collection_process(process): command = ''.join(process.info['cmdline'][:]).lower() - if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: + if SystemEnv.get('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: if process.pid != os.getpid(): if "collectossbackendcollection" 
in command or "celery_app.celery_appbeat" in command: @@ -279,31 +280,31 @@ def cleanup_after_collection_halt(logger_instance, engine): #Make sure that database reflects collection status when processes are killed/stopped. def clean_collection_status(session): session.execute_sql(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET core_status='Success',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Success',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Update', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Success', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Pending', facade_task_id=NULL WHERE facade_status='Failed Clone' OR facade_status='Initializing'; """)) @@ -311,7 +312,7 @@ def clean_collection_status(session): def assign_orphan_repos_to_default_user(session): query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.user_repos) + SELECT 
repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM operations.user_repos) """) repos = session.execute_sql(query).fetchall() diff --git a/collectoss/application/cli/config.py b/collectoss/application/cli/config.py index 2a9a09320..681c9d201 100644 --- a/collectoss/application/cli/config.py +++ b/collectoss/application/cli/config.py @@ -11,12 +11,14 @@ from collectoss.application.db.session import DatabaseSession from collectoss.application.config import SystemConfig, redact_setting_value from collectoss.application.cli import DatabaseContext, test_connection, test_db_connection, with_database -from collectoss.util.inspect_without_import import get_phase_names_without_import -ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) +from collectoss.util.startup import merge_config logger = logging.getLogger(__name__) -ENVVAR_PREFIX = "AUGUR_" +ENVVAR_PREFIX = "COLLECTOSS_" + +def get_transitional_envs(name: str) -> list: + return [ENVVAR_PREFIX + name, "AUGUR_" + name] @click.group('config', short_help='Generate an augur.config.json') @click.pass_context @@ -24,12 +26,12 @@ def cli(ctx): ctx.obj = DatabaseContext() @cli.command('init') -@click.option('--github-api-key', help="GitHub API key for data collection from the GitHub API", envvar=ENVVAR_PREFIX + 'GITHUB_API_KEY') -@click.option('--facade-repo-directory', help="Directory on the database server where Facade should clone repos", envvar=ENVVAR_PREFIX + 'FACADE_REPO_DIRECTORY') -@click.option('--gitlab-api-key', help="GitLab API key for data collection from the GitLab API", envvar=ENVVAR_PREFIX + 'GITLAB_API_KEY') -@click.option('--redis-conn-string', help="String to connect to redis cache", envvar=ENVVAR_PREFIX + 'REDIS_CONN_STRING') -@click.option('--rabbitmq-conn-string', help="String to connect to rabbitmq broker", envvar=ENVVAR_PREFIX + 'RABBITMQ_CONN_STRING') -@click.option('--logs-directory', help="Directory to store logs", 
envvar=ENVVAR_PREFIX + 'LOGS_DIRECTORY') +@click.option('--github-api-key', help="GitHub API key for data collection from the GitHub API", envvar=get_transitional_envs('GITHUB_API_KEY')) +@click.option('--facade-repo-directory', help="Directory on the database server where Facade should clone repos", envvar=get_transitional_envs('FACADE_REPO_DIRECTORY')) +@click.option('--gitlab-api-key', help="GitLab API key for data collection from the GitLab API", envvar=get_transitional_envs('GITLAB_API_KEY')) +@click.option('--redis-conn-string', help="String to connect to redis cache", envvar=get_transitional_envs('REDIS_CONN_STRING')) +@click.option('--rabbitmq-conn-string', help="String to connect to rabbitmq broker", envvar=get_transitional_envs('RABBITMQ_CONN_STRING')) +@click.option('--logs-directory', help="Directory to store logs", envvar=get_transitional_envs('LOGS_DIRECTORY')) @test_connection @test_db_connection @with_database @@ -58,52 +60,8 @@ def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redi if facade_repo_directory[-1] != "/": facade_repo_directory += "/" - - keys = {} - - keys["github_api_key"] = github_api_key - keys["gitlab_api_key"] = gitlab_api_key - - with DatabaseSession(logger, engine=ctx.obj.engine) as session: - - config = SystemConfig(logger, session) - - augmented_config = config.base_config - - phase_names = get_phase_names_without_import() - - #Add all phases as enabled by default - for name in phase_names: - - if name not in augmented_config['Task_Routine']: - augmented_config['Task_Routine'].update({name : 1}) - - #print(default_config) - if redis_conn_string: - - try: - redis_string_array = redis_conn_string.split("/") - cache_number = int(redis_string_array[-1]) - digits = len(str(cache_number)) - - redis_conn_string = redis_conn_string[:-digits] - - except ValueError: - pass - - augmented_config["Redis"]["connection_string"] = redis_conn_string - - if rabbitmq_conn_string: - 
augmented_config["RabbitMQ"]["connection_string"] = rabbitmq_conn_string - - augmented_config["Keys"] = keys - - augmented_config["Facade"]["repo_directory"] = facade_repo_directory - - augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_PROJECT_REPO_DIRECTORY + "/logs/") - - config.load_config_from_dict(augmented_config) - + merge_config(ctx.obj.engine, logger, github_api_key, facade_repo_directory, gitlab_api_key, redis_conn_string, rabbitmq_conn_string, logs_directory) + @cli.command('load') @click.option('--file', required=True) diff --git a/collectoss/application/cli/db.py b/collectoss/application/cli/db.py index fd5db52cf..7b6bc7c09 100644 --- a/collectoss/application/cli/db.py +++ b/collectoss/application/cli/db.py @@ -28,6 +28,8 @@ process_repo_csv, process_repo_group_csv, ) +from collectoss.application.environment import SystemEnv +from collectoss.util.startup import check_init_schema, check_update_schema logger = logging.getLogger(__name__) @@ -140,7 +142,7 @@ def get_repo_groups(ctx: click.Context) -> pd.DataFrame: with ctx.obj.engine.connect() as connection: df = pd.read_sql( s.sql.text( - "SELECT repo_group_id, rg_name, rg_description FROM augur_data.repo_groups" + "SELECT repo_group_id, rg_name, rg_description FROM data.repo_groups" ), connection, ) @@ -179,14 +181,14 @@ def add_repo_groups(ctx: click.Context, filename: str) -> None: with ctx.obj.engine.begin() as connection: # Get existing repo group IDs df = pd.read_sql( - s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), + s.sql.text("SELECT repo_group_id FROM data.repo_groups"), connection, ) repo_group_IDs = df["repo_group_id"].values.tolist() insert_repo_group_sql = s.sql.text( """ - INSERT INTO "augur_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, 
CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """ ) @@ -262,7 +264,7 @@ def add_github_org(ctx, organization_name): def get_db_version(engine): db_version_sql = s.sql.text( """ - SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' + SELECT * FROM operations.augur_settings WHERE setting = 'augur_data_version' """ ) @@ -290,7 +292,7 @@ def upgrade_db_version(): """ Upgrade the configured database to the latest version """ - check_call(["alembic", "upgrade", "head"]) + check_update_schema() @cli.command("check-for-upgrade") @@ -310,7 +312,8 @@ def create_schema(): """ Create schema in the configured database """ - check_call(["alembic", "upgrade", "head"]) + # check_call(["alembic", "upgrade", "head"]) + check_init_schema() def generate_key(length): @@ -342,11 +345,11 @@ def update_api_key(ctx, api_key): """ update_api_key_sql = s.sql.text( """ - INSERT INTO augur_operations.augur_settings (setting,VALUE) VALUES ('augur_api_key','HudMhTyPW7wiaWopUKgRoGCxlIUulw4g') ON CONFLICT (setting) + INSERT INTO operations.augur_settings (setting,VALUE) VALUES ('augur_api_key','HudMhTyPW7wiaWopUKgRoGCxlIUulw4g') ON CONFLICT (setting) DO UPDATE SET VALUE='HudMhTyPW7wiaWopUKgRoGCxlIUulw4g'; - --UPDATE augur_operations.augur_settings SET VALUE = :api_key WHERE setting='augur_api_key'; + --UPDATE operations.augur_settings SET VALUE = :api_key WHERE setting='augur_api_key'; """ ) @@ -363,7 +366,7 @@ def update_api_key(ctx, api_key): def get_api_key(ctx): get_api_key_sql = s.sql.text( """ - SELECT value FROM augur_operations.augur_settings WHERE setting='augur_api_key'; + SELECT 
value FROM operations.augur_settings WHERE setting='augur_api_key'; """ ) @@ -379,7 +382,7 @@ def get_api_key(ctx): short_help="Check the ~/.pgpass file for CollectOSS's database credentials", ) def check_pgpass(): - db_environment_var = getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") if db_environment_var: # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ @@ -495,7 +498,7 @@ def run_psql_command_in_database(target_type, target): logger.error("Invalid target type. Exiting...") exit(1) - db_environment_var = getenv("AUGUR_DB") + db_environment_var = SystemEnv.get("COLLECTOSS_DB") # db_json_file_location = os.getcwd() + "/db.config.json" # db_json_exists = os.path.exists(db_json_file_location) @@ -511,7 +514,7 @@ def run_psql_command_in_database(target_type, target): database_name = db_config["database_name"] db_conn_string = f"postgresql+psycopg2://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database_name']}" - engine = s.create_engine(db_conn_string) + engine = s.create_engine(db_conn_string, connect_args={"application_name": f"collectoss cli"}) check_call( [ diff --git a/collectoss/application/config.py b/collectoss/application/config.py index 56e6c57ae..16f62b5ad 100644 --- a/collectoss/application/config.py +++ b/collectoss/application/config.py @@ -7,6 +7,8 @@ from collectoss.application.db.models import Config from collectoss.application.db.util import execute_session_query, convert_type_of_value from pathlib import Path +from collectoss.application.environment import SystemEnv + import logging def get_development_flag_from_config(): @@ -27,7 +29,7 @@ def get_development_flag_from_config(): return flag def get_development_flag(): - return os.getenv("AUGUR_DEV") or get_development_flag_from_config() or False + return SystemEnv.get("COLLECTOSS_DEV") or get_development_flag_from_config() or False def 
redact_setting_value(section_name, setting_name, value): value_redacted = value if section_name != "Keys" else "REDACTED" @@ -167,7 +169,7 @@ def __init__(self, logger, session: DatabaseSession, config_sources: list = None JsonConfig(default_config, logger) ] - config_dir = Path(os.getenv("CONFIG_DATADIR", "./")) + config_dir = Path(SystemEnv.get("CONFIG_DATADIR") or "./") config_path = config_dir.joinpath("augur.json") if config_path.exists(): config_sources.append(JsonConfig(json.loads(config_path.read_text(encoding="UTF-8")), logger)) diff --git a/collectoss/application/db/__init__.py b/collectoss/application/db/__init__.py index b10b00b44..84e2d977f 100644 --- a/collectoss/application/db/__init__.py +++ b/collectoss/application/db/__init__.py @@ -12,7 +12,7 @@ def get_engine(): if engine is None: url = get_database_string() - engine = create_database_engine(url=url, poolclass=StaticPool) + engine = create_database_engine(url=url, poolclass=StaticPool, connect_args={"application_name": f"collectoss"}) Session = sessionmaker(bind=engine) return engine @@ -42,7 +42,7 @@ def get_session(): def temporary_database_engine(): url = get_database_string() - temporary_database_engine = create_database_engine(url=url, poolclass=StaticPool) + temporary_database_engine = create_database_engine(url=url, poolclass=StaticPool, connect_args={"application_name": f"collectoss temporary/testing"}) try: yield temporary_database_engine diff --git a/collectoss/application/db/engine.py b/collectoss/application/db/engine.py index ef582dbed..884d5a61c 100644 --- a/collectoss/application/db/engine.py +++ b/collectoss/application/db/engine.py @@ -7,6 +7,7 @@ from sqlalchemy import create_engine, event from sqlalchemy.engine import Engine +from collectoss.application.environment import SystemEnv from collectoss.application.db.util import catch_operational_error @@ -61,7 +62,7 @@ def get_database_string() -> str: postgres database string """ - db_environment_var = os.getenv("AUGUR_DB") + 
db_environment_var = SystemEnv.get("COLLECTOSS_DB") try: current_dir = os.getcwd() @@ -74,7 +75,7 @@ def get_database_string() -> str: if not db_environment_var and not db_json_exists: - print("ERROR no way to get connection to the database. \n\t\t\t\t\t\t There is no db.config.json and the AUGUR_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the AUGUR_DB environment then run make install") + print("ERROR no way to get connection to the database. \n\t\t\t\t\t\t There is no db.config.json and the COLLECTOSS_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the COLLECTOSS_DB environment then run make install") sys.exit() if db_environment_var: @@ -105,7 +106,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,augur_data,augur_operations,spdx") + cursor.execute("SET SESSION search_path=public,data,operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit diff --git a/collectoss/application/db/lib.py b/collectoss/application/db/lib.py index 4d719d1ac..c5394365d 100644 --- a/collectoss/application/db/lib.py +++ b/collectoss/application/db/lib.py @@ -1,24 +1,26 @@ -import re import time import random import logging import sqlalchemy as s -from sqlalchemy import func -from sqlalchemy.exc import DataError +from sqlalchemy import func from sqlalchemy.dialects import postgresql from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union +from typing_extensions import deprecated from collectoss.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup +# TODO: CollectionState should be moved to 
augur/application/db/ to eliminate +# this cross-layer dependency — same issue as the correction.py import above. from collectoss.tasks.util.collection_state import CollectionState +from collectoss.application.db.timestamp_utils import correct_timestamp from collectoss.application.db import get_session, get_engine from collectoss.application.db.util import execute_session_query, convert_type_of_value from collectoss.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") - +@deprecated("This is a legacy method. Use AugurConfig.get_value instead") def get_value(section_name: str, setting_name: str) -> Optional[Any]: """Get the value of a setting from the config. @@ -160,7 +162,7 @@ def get_working_commits_by_repo_id(repo_id): try: working_commits = fetchall_data_from_sql_text(query) - except: + except Exception: working_commits = [] return working_commits @@ -176,7 +178,7 @@ def get_missing_commit_message_hashes(repo_id): try: missing_commit_hashes = fetchall_data_from_sql_text(fetch_missing_hashes_sql) - except: + except Exception: missing_commit_hashes = [] return missing_commit_hashes @@ -217,46 +219,30 @@ def facade_bulk_insert_commits(logger, records): facade_bulk_insert_commits(logger, firsthalfRecords) facade_bulk_insert_commits(logger, secondhalfRecords) elif len(records) == 1: + # Binary search isolated the problematic record + # Try to fix invalid timestamps (rare but possible from git corruption) commit_record = records[0] - #replace incomprehensible dates with epoch. 
- #2021-10-11 11:57:46 -0500 - - # placeholder_date = "1970-01-01 00:00:15 -0500" - placeholder_date = commit_record['cmt_author_timestamp'] - - postgres_valid_timezones = { - -1200, -1100, -1000, -930, -900, -800, -700, - -600, -500, -400, -300, -230, -200, -100, 000, - 100, 200, 300, 330, 400, 430, 500, 530, 545, 600, - 630, 700, 800, 845, 900, 930, 1000, 1030, 1100, 1200, - 1245, 1300, 1400 - } - - # Reconstruct timezone portion of the date string to UTC - placeholder_date_segments = re.split(" ", placeholder_date) - tzdata = placeholder_date_segments.pop() - - if ":" in tzdata: - tzdata = tzdata.replace(":", "") - - if int(tzdata) not in postgres_valid_timezones: - tzdata = "+0000" - else: - raise e - placeholder_date_segments.append(tzdata) - - placeholder_date = " ".join(placeholder_date_segments) + # Correct both author and committer timestamps + author_corrected = correct_timestamp( + commit_record.get('cmt_author_timestamp', ''), + fallback=None, + logger=logger + ) + committer_corrected = correct_timestamp( + commit_record.get('cmt_committer_timestamp', ''), + fallback=author_corrected, + logger=logger + ) - #Check for improper utc timezone offset - #UTC timezone offset should be between -14:00 and +14:00 + commit_record['cmt_author_timestamp'] = author_corrected + commit_record['cmt_committer_timestamp'] = committer_corrected - # analyzecommit.generate_commit_record() defines the keys on the commit_record dictionary - commit_record['cmt_author_timestamp'] = placeholder_date - commit_record['cmt_committer_timestamp'] = placeholder_date - - logger.warning(f"commit with invalid timezone set to UTC: {commit_record['cmt_commit_hash']}") + logger.warning( + f"Corrected invalid timestamp(s) for commit {commit_record.get('cmt_commit_hash')}" + ) + # Retry insert with corrected timestamps session.execute( s.insert(Commit), [commit_record], diff --git a/collectoss/application/db/models/__init__.py b/collectoss/application/db/models/__init__.py index 
bed0e4c8e..80d3cf9b4 100644 --- a/collectoss/application/db/models/__init__.py +++ b/collectoss/application/db/models/__init__.py @@ -1,4 +1,4 @@ -from collectoss.application.db.models.augur_data import ( +from collectoss.application.db.models.data import ( ChaossMetricStatus, ChaossUser, ContributorAffiliation, @@ -95,7 +95,7 @@ SpdxIdentifier, ) -from collectoss.application.db.models.augur_operations import ( +from collectoss.application.db.models.operations import ( Settings, WorkerHistory, WorkerJob, diff --git a/collectoss/application/db/models/augur_data.py b/collectoss/application/db/models/data.py similarity index 88% rename from collectoss/application/db/models/augur_data.py rename to collectoss/application/db/models/data.py index 7ea85eefc..2fa40b71a 100644 --- a/collectoss/application/db/models/augur_data.py +++ b/collectoss/application/db/models/data.py @@ -54,7 +54,7 @@ nullable=False, server_default=text("CURRENT_TIMESTAMP"), ), - schema="augur_data", + schema="data", ) Index('repos_id', t_analysis_log.c.repos_id) @@ -63,16 +63,16 @@ class ChaossMetricStatus(Base): __tablename__ = "chaoss_metric_status" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": "This table used to track CHAOSS Metric implementations, but due to the constantly changing location of that information, it is for the moment not actively populated. 
", } cms_id = Column( BigInteger, - Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='augur_data'), + Sequence('chaoss_metric_status_cms_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.chaoss_metric_status_cms_id_seq'::regclass)" + "nextval('data.chaoss_metric_status_cms_id_seq'::regclass)" ), ) cm_group = Column(String) @@ -97,14 +97,14 @@ class ChaossMetricStatus(Base): class ChaossUser(Base): __tablename__ = "chaoss_user" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} chaoss_id = Column( BigInteger, - Sequence('chaoss_user_chaoss_id_seq', start=1, schema='augur_data'), + Sequence('chaoss_user_chaoss_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.chaoss_user_chaoss_id_seq'::regclass)" + "nextval('data.chaoss_user_chaoss_id_seq'::regclass)" ), ) chaoss_login_name = Column(String) @@ -122,16 +122,16 @@ class ChaossUser(Base): class ContributorAffiliation(Base): __tablename__ = "contributor_affiliations" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": "This table exists outside of relations with other tables. The purpose is to provide a dynamic, owner maintained (and collectoss augmented) list of affiliations. This table is processed in affiliation information in the DM_ tables generated when CollectOSS is finished counting commits using the Facade Worker. 
", } ca_id = Column( BigInteger, - Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='augur_data'), + Sequence('contributor_affiliations_ca_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.contributor_affiliations_ca_id_seq'::regclass)" + "nextval('data.contributor_affiliations_ca_id_seq'::regclass)" ), ) ca_domain = Column(String(64), nullable=False, unique=True) @@ -178,7 +178,7 @@ class Contributor(Base): Index("login-contributor-idx", "cntrb_login"), { - "schema": "augur_data", + "schema": "data", "comment": "For GitHub, this should be repeated from gh_login. for other systems, it should be that systems login. \nGithub now allows a user to change their login name, but their user id remains the same in this case. So, the natural key is the combination of id and login, but there should never be repeated logins. ", }, ) @@ -337,7 +337,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): ), Index("repo_id,email_copy_1", "repo_id", "email"), Index("repo_id,affiliation_copy_1", "repo_id", "affiliation"), - schema="augur_data", + schema="data", ) @@ -363,7 +363,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): ), Index("projects_id,email_copy_1", "repo_group_id", "email"), Index("projects_id,affiliation_copy_1", "repo_group_id", "affiliation"), - schema="augur_data", + schema="data", ) @@ -394,7 +394,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index( "projects_id,year,affiliation_copy_1", "repo_group_id", "year", "affiliation" ), - schema="augur_data", + schema="data", ) @@ -423,7 +423,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("projects_id,email", "repo_group_id", "email"), Index("projects_id,year,email", "repo_group_id", "year", "email"), Index("projects_id,year,affiliation", "repo_group_id", "year", "affiliation"), - schema="augur_data", + schema="data", ) @@ 
-452,7 +452,7 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("repo_id,year,affiliation_copy_1", "repo_id", "year", "affiliation"), Index("repo_id,affiliation_copy_2", "repo_id", "affiliation"), Index("repo_id,email_copy_2", "repo_id", "email"), - schema="augur_data", + schema="data", ) @@ -481,13 +481,13 @@ def from_github(cls, contributor, tool_source, tool_version, data_source): Index("repo_id,email", "repo_id", "email"), Index("repo_id,year,email", "repo_id", "year", "email"), Index("repo_id,year,affiliation", "repo_id", "year", "affiliation"), - schema="augur_data", + schema="data", ) class Exclude(Base): __tablename__ = "exclude" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} id = Column(Integer, primary_key=True) projects_id = Column(Integer, nullable=False) @@ -497,14 +497,14 @@ class Exclude(Base): class LstmAnomalyModel(Base): __tablename__ = "lstm_anomaly_models" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} model_id = Column( BigInteger, - Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='augur_data'), + Sequence('lstm_anomaly_models_model_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.lstm_anomaly_models_model_id_seq'::regclass)" + "nextval('data.lstm_anomaly_models_model_id_seq'::regclass)" ), ) model_name = Column(String) @@ -525,14 +525,14 @@ class Platform(Base): __tablename__ = "platform" __table_args__ = ( Index("plat", "pltfrm_id", unique=True), - {"schema": "augur_data"} + {"schema": "data"} ) pltfrm_id = Column( BigInteger, - Sequence('platform_pltfrm_id_seq', start=25430, schema="augur_data"), + Sequence('platform_pltfrm_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.platform_pltfrm_id_seq'::regclass)"), + server_default=text("nextval('data.platform_pltfrm_id_seq'::regclass)"), ) pltfrm_name = Column(String) pltfrm_version = 
Column(String) @@ -548,16 +548,16 @@ class RepoGroup(Base): __table_args__ = ( Index("rgidm", "repo_group_id", unique=True), Index("rgnameindex", "rg_name"), - {"schema": "augur_data", + {"schema": "data", "comment": "rg_type is intended to be either a GitHub Organization or a User Created Repo Group. "}, ) repo_group_id = Column( BigInteger, - Sequence('repo_groups_repo_group_id_seq', start=25430, schema='augur_data'), + Sequence('repo_groups_repo_group_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_groups_repo_group_id_seq'::regclass)" + "nextval('data.repo_groups_repo_group_id_seq'::regclass)" ), ) rg_name = Column(String, nullable=False) @@ -622,13 +622,13 @@ def get_by_name(session, rg_name): ), Index("repos_id,status", "repos_id", "status"), Index("repos_id,statusops", "repos_id", "status"), - schema="augur_data", + schema="data", ) class Settings(Base): __tablename__ = "settings" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} id = Column(Integer, primary_key=True) setting = Column(String(32), nullable=False) @@ -640,14 +640,14 @@ class Settings(Base): class TopicWord(Base): __tablename__ = "topic_words" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} topic_words_id = Column( BigInteger, - Sequence('topic_words_topic_words_id_seq', start=1, schema='augur_data'), + Sequence('topic_words_topic_words_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.topic_words_topic_words_id_seq'::regclass)" + "nextval('data.topic_words_topic_words_id_seq'::regclass)" ), ) topic_id = Column(BigInteger) @@ -678,20 +678,20 @@ class TopicWord(Base): server_default=text("CURRENT_TIMESTAMP"), ), Index("type,projects_id", "type", "repo_group_id"), - schema="augur_data", + schema="data", ) class UnresolvedCommitEmail(Base): __tablename__ = "unresolved_commit_emails" - __table_args__ = {"schema": "augur_data"} + 
__table_args__ = {"schema": "data"} email_unresolved_id = Column( BigInteger, - Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='augur_data'), + Sequence('unresolved_commit_emails_email_unresolved_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" + "nextval('data.unresolved_commit_emails_email_unresolved_id_seq'::regclass)" ), ) email = Column(String, nullable=False, unique=True) @@ -706,13 +706,13 @@ class UnresolvedCommitEmail(Base): class UtilityLog(Base): __tablename__ = "utility_log" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} id = Column( BigInteger, - Sequence('utility_log_id_seq1', start=1, schema="augur_data"), + Sequence('utility_log_id_seq1', start=1, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.utility_log_id_seq1'::regclass)"), + server_default=text("nextval('data.utility_log_id_seq1'::regclass)"), ) level = Column(String(8), nullable=False) status = Column(String, nullable=False) @@ -728,7 +728,7 @@ class UtilityLog(Base): Column( "working_commit", String(40), server_default=text("'NULL'::character varying") ), - schema="augur_data", + schema="data", ) @@ -737,22 +737,22 @@ class ContributorRepo(Base): __table_args__ = ( UniqueConstraint("event_id", "tool_version"), { - "schema": "augur_data", + "schema": "data", "comment": 'Developed in Partnership with Andrew Brain.', }, ) cntrb_repo_id = Column( BigInteger, - Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='augur_data'), + Sequence('contributor_repo_cntrb_repo_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.contributor_repo_cntrb_repo_id_seq'::regclass)" + "nextval('data.contributor_repo_cntrb_repo_id_seq'::regclass)" ), ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + 
"data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, comment="This is not null because what is the point without the contributor in this table? ", @@ -782,22 +782,22 @@ class ContributorsAlias(Base): __table_args__ = ( UniqueConstraint("cntrb_id","alias_email", name="cntrb-email-insert-unique"), { - "schema": "augur_data", + "schema": "data", "comment": "Every open source user may have more than one email used to make contributions over time. CollectOSS selects the first email it encounters for a user as its “canonical_email”. \n\nThe canonical_email is also added to the contributors_aliases table, with the canonical_email and alias_email being identical. Using this strategy, an email search will only need to join the alias table for basic email information, and can then more easily map the canonical email from each alias row to the same, more detailed information in the contributors table for a user. ", }, ) cntrb_alias_id = Column( BigInteger, - Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='augur_data'), + Sequence('contributors_aliases_cntrb_alias_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.contributors_aliases_cntrb_alias_id_seq'::regclass)" + "nextval('data.contributors_aliases_cntrb_alias_id_seq'::regclass)" ), ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -838,19 +838,19 @@ class Repo(Base): Index("therepo", "repo_id", unique=True), { - "schema": "augur_data", + "schema": "data", "comment": "This table is a combination of the columns in Facade’s repo table and GHTorrent’s projects table. 
", }, ) repo_id = Column( BigInteger, - Sequence('repo_repo_id_seq', start=25480, schema='augur_data'), + Sequence('repo_repo_id_seq', start=25480, schema='data'), primary_key=True, - server_default=text("nextval('augur_data.repo_repo_id_seq'::regclass)"), + server_default=text("nextval('data.repo_repo_id_seq'::regclass)"), ) repo_group_id = Column( - ForeignKey("augur_data.repo_groups.repo_group_id"), nullable=False + ForeignKey("data.repo_groups.repo_group_id"), nullable=False ) repo_git = Column(String, nullable=False) @@ -1192,22 +1192,22 @@ class HistoricalRepoURLs(Base): """ __tablename__ = "historical_repo_urls" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} - repo_id = Column(ForeignKey("augur_data.repo.repo_id"), primary_key=True) + repo_id = Column(ForeignKey("data.repo.repo_id"), primary_key=True) git_url = Column(String, primary_key=True) date_collected = Column(DateTime(timezone=True), server_default=func.now(), nullable=True) class RepoTestCoverage(Base): __tablename__ = "repo_test_coverage" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), - Sequence('repo_test_coverage_repo_id_seq', start=1, schema='augur_data'), + ForeignKey("data.repo.repo_id"), + Sequence('repo_test_coverage_repo_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_test_coverage_repo_id_seq'::regclass)" + "nextval('data.repo_test_coverage_repo_id_seq'::regclass)" ), ) repo_clone_date = Column(TIMESTAMP(precision=0)) @@ -1231,19 +1231,19 @@ class RepoTestCoverage(Base): class RepoGroupInsight(Base): __tablename__ = "repo_group_insights" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": 'This table is output from an analytical worker. 
It runs through the different metrics on a REPOSITORY_GROUP and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. ', } rgi_id = Column( BigInteger, - Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='augur_data'), + Sequence('repo_group_insights_rgi_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_group_insights_rgi_id_seq'::regclass)" + "nextval('data.repo_group_insights_rgi_id_seq'::regclass)" ), ) - repo_group_id = Column(ForeignKey("augur_data.repo_groups.repo_group_id")) + repo_group_id = Column(ForeignKey("data.repo_groups.repo_group_id")) rgi_metric = Column(String) rgi_value = Column(String) cms_id = Column(BigInteger) @@ -1266,19 +1266,19 @@ class RepoGroupsListServe(Base): __table_args__ = ( UniqueConstraint("rgls_id", "repo_group_id"), Index("lister", "rgls_id", "repo_group_id", unique=True), - {"schema": "augur_data"}, + {"schema": "data"}, ) rgls_id = Column( BigInteger, - Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='augur_data'), + Sequence('repo_groups_list_serve_rgls_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_groups_list_serve_rgls_id_seq'::regclass)" + "nextval('data.repo_groups_list_serve_rgls_id_seq'::regclass)" ), ) repo_group_id = Column( - ForeignKey("augur_data.repo_groups.repo_group_id"), nullable=False + ForeignKey("data.repo_groups.repo_group_id"), nullable=False ) rgls_name = Column(String) rgls_description = Column(String(3000)) @@ -1319,19 +1319,19 @@ class Commit(Base): Index("repo_id,commit", "repo_id", "cmt_commit_hash"), { - "schema": "augur_data", + "schema": "data", "comment": "Commits.\nEach row represents changes to one FILE 
within a single commit. So you will encounter multiple rows per commit hash in many cases. ", }, ) cmt_id = Column( BigInteger, - Sequence('commits_cmt_id_seq', start=25430, schema="augur_data"), + Sequence('commits_cmt_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), + server_default=text("nextval('data.commits_cmt_id_seq'::regclass)"), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=False, ) cmt_commit_hash = Column(String(80), nullable=False) @@ -1355,7 +1355,7 @@ class Commit(Base): cmt_filename = Column(String, nullable=False) cmt_date_attempted = Column(TIMESTAMP(precision=0), nullable=False) cmt_ght_author_id = Column(ForeignKey( - "augur_data.contributors.cntrb_id", + "data.contributors.cntrb_id", name="cmt_ght_author_cntrb_id_fk", onupdate="CASCADE", ondelete="RESTRICT", @@ -1368,7 +1368,7 @@ class Commit(Base): cmt_author_timestamp = Column(TIMESTAMP(True, 0)) cmt_author_platform_username = Column( ForeignKey( - "augur_data.contributors.cntrb_login", + "data.contributors.cntrb_login", name="fk_commits_contributors_3", ondelete="CASCADE", onupdate="CASCADE", @@ -1376,7 +1376,7 @@ class Commit(Base): deferrable=True, ), ForeignKey( - "augur_data.contributors.cntrb_login", + "data.contributors.cntrb_login", name="fk_commits_contributors_4", ondelete="CASCADE", onupdate="CASCADE", @@ -1404,20 +1404,20 @@ class CommitMessage(Base): __table_args__ = ( UniqueConstraint("repo_id","cmt_hash", name="commit-message-insert-unique"), { - "schema": "augur_data", + "schema": "data", "comment": "This table holds commit messages", } ) cmt_msg_id = Column( BigInteger, - Sequence('commits_cmt_id_seq', start=25430, schema="augur_data"), + Sequence('commits_cmt_id_seq', start=25430, schema="data"), primary_key=True, - 
server_default=text("nextval('augur_data.commits_cmt_id_seq'::regclass)"), + server_default=text("nextval('data.commits_cmt_id_seq'::regclass)"), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=False, ) @@ -1442,20 +1442,20 @@ class Issue(Base): UniqueConstraint("repo_id", "gh_issue_id"), UniqueConstraint("issue_url", name="issue-insert-unique"), - {"schema": "augur_data"}, + {"schema": "data"}, ) issue_id = Column( BigInteger, - Sequence('issue_seq', start=31000, schema="augur_data"), + Sequence('issue_seq', start=31000, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.issue_seq'::regclass)"), + server_default=text("nextval('data.issue_seq'::regclass)"), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), ) reporter_id = Column( - ForeignKey("augur_data.contributors.cntrb_id"), + ForeignKey("data.contributors.cntrb_id"), comment="The ID of the person who opened the issue. ", ) pull_request = Column(BigInteger) @@ -1464,7 +1464,7 @@ class Issue(Base): issue_title = Column(String) issue_body = Column(String) cntrb_id = Column( - ForeignKey("augur_data.contributors.cntrb_id"), + ForeignKey("data.contributors.cntrb_id"), comment="The ID of the person who closed the issue. 
", ) comment_count = Column(BigInteger) @@ -1509,15 +1509,15 @@ class Issue(Base): class Library(Base): __tablename__ = "libraries" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} library_id = Column( BigInteger, - Sequence('libraries_library_id_seq', start=25430, schema="augur_data"), + Sequence('libraries_library_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.libraries_library_id_seq'::regclass)"), + server_default=text("nextval('data.libraries_library_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) platform = Column(String) name = Column(String) created_timestamp = Column( @@ -1551,19 +1551,19 @@ class Library(Base): class LstmAnomalyResult(Base): __tablename__ = "lstm_anomaly_results" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} result_id = Column( BigInteger, - Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='augur_data'), + Sequence('lstm_anomaly_results_result_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.lstm_anomaly_results_result_id_seq'::regclass)" + "nextval('data.lstm_anomaly_results_result_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) repo_category = Column(String) - model_id = Column(ForeignKey("augur_data.lstm_anomaly_models.model_id")) + model_id = Column(ForeignKey("data.lstm_anomaly_models.model_id")) metric = Column(String) contamination_factor = Column(Float(53)) mean_absolute_error = Column(Float(53)) @@ -1592,18 +1592,18 @@ class Message(Base): Index("msg-cntrb-id-idx", "cntrb_id"), Index("platformgrouper", "msg_id", "pltfrm_id"), Index("messagegrouper", "msg_id", "rgls_id", unique=True), - {"schema": "augur_data"}, + {"schema": "data"}, ) msg_id = Column( BigInteger, - 
Sequence('message_msg_id_seq', start=25430, schema="augur_data"), + Sequence('message_msg_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.message_msg_id_seq'::regclass)"), + server_default=text("nextval('data.message_msg_id_seq'::regclass)"), ) rgls_id = Column( ForeignKey( - "augur_data.repo_groups_list_serve.rgls_id", + "data.repo_groups_list_serve.rgls_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -1612,7 +1612,7 @@ class Message(Base): platform_node_id = Column(String) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -1621,7 +1621,7 @@ class Message(Base): ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ), comment="Not populated for mailing lists. Populated for GitHub issues. ", ) @@ -1631,7 +1631,7 @@ class Message(Base): msg_header = Column(String) pltfrm_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", ondelete="CASCADE", onupdate="CASCADE" + "data.platform.pltfrm_id", ondelete="CASCADE", onupdate="CASCADE" ), nullable=False, ) @@ -1661,19 +1661,19 @@ class Message(Base): class MessageAnalysisSummary(Base): __tablename__ = "message_analysis_summary" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": "In a relationally perfect world, we would have a table called “message_analysis_run” the incremented the “worker_run_id” for both message_analysis and message_analysis_summary. For now, we decided this was overkill. 
", } msg_summary_id = Column( BigInteger, - Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='augur_data'), + Sequence('message_analysis_summary_msg_summary_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.message_analysis_summary_msg_summary_id_seq'::regclass)" + "nextval('data.message_analysis_summary_msg_summary_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) worker_run_id = Column( BigInteger, comment='This value should reflect the worker_run_id for the messages summarized in the table. There is not a relation between these two tables for that purpose because its not *really*, relationaly a concept unless we create a third table for "worker_run_id", which we determined was unnecessarily complex. ', @@ -1701,19 +1701,19 @@ class MessageAnalysisSummary(Base): class MessageSentimentSummary(Base): __tablename__ = "message_sentiment_summary" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": "In a relationally perfect world, we would have a table called “message_sentiment_run” the incremented the “worker_run_id” for both message_sentiment and message_sentiment_summary. For now, we decided this was overkill. ", } msg_summary_id = Column( BigInteger, - Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='augur_data'), + Sequence('message_sentiment_summary_msg_summary_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.message_sentiment_summary_msg_summary_id_seq'::regclass)" + "nextval('data.message_sentiment_summary_msg_summary_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) worker_run_id = Column( BigInteger, comment='This value should reflect the worker_run_id for the messages summarized in the table. 
There is not a relation between these two tables for that purpose because its not *really*, relationaly a concept unless we create a third table for "worker_run_id", which we determined was unnecessarily complex. ', @@ -1749,19 +1749,19 @@ class PullRequest(Base): "pull_requests_idx_repo_id_data_datex", "repo_id", "data_collection_date" ), Index("pr_ID_prs_table", "pull_request_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) pull_request_id = Column( BigInteger, - Sequence('pull_requests_pull_request_id_seq', start=25430, schema='augur_data'), + Sequence('pull_requests_pull_request_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_requests_pull_request_id_seq'::regclass)" + "nextval('data.pull_requests_pull_request_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE"), server_default=text("0"), ) pr_url = Column(String) @@ -1784,7 +1784,7 @@ class PullRequest(Base): pr_src_title = Column(String) pr_augur_contributor_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), comment="This is to link to the contributor record. 
", ) @@ -1883,15 +1883,15 @@ def from_github(cls, pr, repo_id, tool_source, tool_version): class Release(Base): __tablename__ = "releases" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} release_id = Column( CHAR(256), - Sequence('releases_release_id_seq', start=1, schema="augur_data"), + Sequence('releases_release_id_seq', start=1, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.releases_release_id_seq'::regclass)"), + server_default=text("nextval('data.releases_release_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id"), nullable=False) + repo_id = Column(ForeignKey("data.repo.repo_id"), nullable=False) release_name = Column(String) release_description = Column(String) release_author = Column(String) @@ -1916,19 +1916,19 @@ class Release(Base): class RepoBadging(Base): __tablename__ = "repo_badging" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": "This will be collected from the LF’s Badging API\nhttps://bestpractices.coreinfrastructure.org/projects.json?pq=https%3A%2F%2Fgithub.com%2Fchaoss%2Faugur\n", } badge_collection_id = Column( BigInteger, - Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='augur_data'), + Sequence('repo_badging_badge_collection_id_seq', start=25012, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_badging_badge_collection_id_seq'::regclass)" + "nextval('data.repo_badging_badge_collection_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) created_at = Column( TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) @@ -1960,17 +1960,17 @@ def insert(session, repo_id: int, data: dict) -> dict: class RepoClusterMessage(Base): __tablename__ = "repo_cluster_messages" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} msg_cluster_id = Column( 
BigInteger, - Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='augur_data'), + Sequence('repo_cluster_messages_msg_cluster_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" + "nextval('data.repo_cluster_messages_msg_cluster_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) cluster_content = Column(Integer) cluster_mechanism = Column(Integer) tool_source = Column(String) @@ -1988,21 +1988,21 @@ class RepoDependency(Base): __table_args__ = ( UniqueConstraint("repo_id","dep_name","data_collection_date", name="deps-insert-unique"), { - "schema": "augur_data", + "schema": "data", "comment": "Contains the dependencies for a repo." }, ) repo_dependencies_id = Column( BigInteger, - Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='augur_data'), + Sequence('repo_dependencies_repo_dependencies_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_dependencies_repo_dependencies_id_seq'::regclass)" + "nextval('data.repo_dependencies_repo_dependencies_id_seq'::regclass)" ), ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), comment="Forign key for repo id. " + ForeignKey("data.repo.repo_id"), comment="Forign key for repo id. " ) dep_name = Column(String, comment="Name of the dependancy found in project. ") dep_count = Column(Integer, comment="Number of times the dependancy was found. 
") @@ -2021,18 +2021,18 @@ class RepoDepsLibyear(Base): __tablename__ = "repo_deps_libyear" __table_args__ = ( UniqueConstraint("repo_id","name", "data_collection_date", name="deps-libyear-insert-unique"), - {"schema": "augur_data"} + {"schema": "data"} ) repo_deps_libyear_id = Column( BigInteger, - Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='augur_data'), + Sequence('repo_deps_libyear_repo_deps_libyear_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" + "nextval('data.repo_deps_libyear_repo_deps_libyear_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) name = Column(String) requirement = Column(String) type = Column(String) @@ -2056,18 +2056,18 @@ class RepoDepsScorecard(Base): __tablename__ = "repo_deps_scorecard" __table_args__ = ( UniqueConstraint("repo_id","name", "data_collection_date", name="deps_scorecard_new_unique"), - {"schema": "augur_data"} + {"schema": "data"} ) repo_deps_scorecard_id = Column( BigInteger, - Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='augur_data'), + Sequence('repo_deps_scorecard_repo_deps_scorecard_id_seq1', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" + "nextval('data.repo_deps_scorecard_repo_deps_scorecard_id_seq1'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) name = Column(String) #status = Column(String) scorecard_check_details = Column(JSONB) @@ -2087,18 +2087,18 @@ class RepoInfo(Base): __table_args__ = ( Index("repo_info_idx_repo_id_data_date_1x", "repo_id", "data_collection_date"), Index("repo_info_idx_repo_id_data_datex", "repo_id", "data_collection_date"), - {"schema": "augur_data"}, + {"schema": 
"data"}, ) repo_info_id = Column( BigInteger, - Sequence('repo_info_repo_info_id_seq', start=25430, schema='augur_data'), + Sequence('repo_info_repo_info_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_info_repo_info_id_seq'::regclass)" + "nextval('data.repo_info_repo_info_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id"), nullable=False) + repo_id = Column(ForeignKey("data.repo.repo_id"), nullable=False) last_updated = Column( TIMESTAMP(precision=0), server_default=text("NULL::timestamp without time zone") ) @@ -2143,17 +2143,17 @@ class RepoInfo(Base): class RepoInsight(Base): __tablename__ = "repo_insights" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": 'This table is output from an analytical worker. It runs through the different metrics on a repository and identifies the five to ten most “interesting” metrics as defined by some kind of delta or other factor. The algorithm is going to evolve. \n\nWorker Design Notes: The idea is that the "insight worker" will scan through a bunch of active metrics or "synthetic metrics" to list the most important insights. 
', } ri_id = Column( BigInteger, - Sequence('repo_insights_ri_id_seq', start=25430, schema="augur_data"), + Sequence('repo_insights_ri_id_seq', start=25430, schema="data"), primary_key=True, - server_default=text("nextval('augur_data.repo_insights_ri_id_seq'::regclass)"), + server_default=text("nextval('data.repo_insights_ri_id_seq'::regclass)"), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) ri_metric = Column(String) ri_value = Column(String) ri_date = Column(TIMESTAMP(precision=0)) @@ -2178,20 +2178,20 @@ class RepoInsightsRecord(Base): __tablename__ = "repo_insights_records" __table_args__ = ( Index("dater", "ri_date"), - {"schema": "augur_data"} + {"schema": "data"} ) ri_id = Column( BigInteger, - Sequence('repo_insights_records_ri_id_seq', start=1, schema='augur_data'), + Sequence('repo_insights_records_ri_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_insights_records_ri_id_seq'::regclass)" + "nextval('data.repo_insights_records_ri_id_seq'::regclass)" ), comment="Primary key. ", ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="SET NULL", onupdate="CASCADE"), + ForeignKey("data.repo.repo_id", ondelete="SET NULL", onupdate="CASCADE"), comment="Refers to repo table primary key. Will have a foreign key", ) ri_metric = Column(String, comment="The metric endpoint") @@ -2223,20 +2223,20 @@ class RepoLabor(Base): __table_args__ = ( UniqueConstraint("repo_id", "rl_analysis_date", "file_path", "file_name"), { - "schema": "augur_data", + "schema": "data", "comment": "repo_labor is a derivative of tables used to store scc code and complexity counting statistics that are inputs to labor analysis, which are components of CHAOSS value metric calculations. 
", }, ) repo_labor_id = Column( BigInteger, - Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='augur_data'), + Sequence('repo_labor_repo_labor_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_labor_repo_labor_id_seq'::regclass)" + "nextval('data.repo_labor_repo_labor_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) repo_clone_date = Column(TIMESTAMP(precision=0)) rl_analysis_date = Column(TIMESTAMP(precision=0)) programming_language = Column(String) @@ -2261,17 +2261,17 @@ class RepoLabor(Base): class RepoMeta(Base): __tablename__ = "repo_meta" - __table_args__ = {"schema": "augur_data", "comment": "Project Languages"} + __table_args__ = {"schema": "data", "comment": "Project Languages"} repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id"), primary_key=True, nullable=False ) rmeta_id = Column( BigInteger, - Sequence('repo_meta_rmeta_id_seq', start=25430, schema="augur_data"), + Sequence('repo_meta_rmeta_id_seq', start=25430, schema="data"), primary_key=True, nullable=False, - server_default=text("nextval('augur_data.repo_meta_rmeta_id_seq'::regclass)"), + server_default=text("nextval('data.repo_meta_rmeta_id_seq'::regclass)"), ) rmeta_name = Column(String) rmeta_value = Column(String, server_default=text("0")) @@ -2285,18 +2285,18 @@ class RepoMeta(Base): class RepoSbomScan(Base): __tablename__ = "repo_sbom_scans" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} rsb_id = Column( BigInteger, - Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='augur_data'), + Sequence('repo_sbom_scans_rsb_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_sbom_scans_rsb_id_seq'::regclass)" + "nextval('data.repo_sbom_scans_rsb_id_seq'::regclass)" ), ) repo_id = 
Column( - ForeignKey("augur_data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="CASCADE", onupdate="CASCADE") ) sbom_scan = Column(JSON) @@ -2305,17 +2305,17 @@ class RepoSbomScan(Base): class RepoStat(Base): __tablename__ = "repo_stats" - __table_args__ = {"schema": "augur_data", "comment": "Project Watchers"} + __table_args__ = {"schema": "data", "comment": "Project Watchers"} repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id"), primary_key=True, nullable=False ) rstat_id = Column( BigInteger, - Sequence('repo_stats_rstat_id_seq', start=25430, schema="augur_data"), + Sequence('repo_stats_rstat_id_seq', start=25430, schema="data"), primary_key=True, nullable=False, - server_default=text("nextval('augur_data.repo_stats_rstat_id_seq'::regclass)"), + server_default=text("nextval('data.repo_stats_rstat_id_seq'::regclass)"), ) rstat_name = Column(String(400)) rstat_value = Column(BigInteger) @@ -2329,17 +2329,17 @@ class RepoStat(Base): class RepoTopic(Base): __tablename__ = "repo_topic" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} repo_topic_id = Column( BigInteger, - Sequence('repo_topic_repo_topic_id_seq', start=1, schema='augur_data'), + Sequence('repo_topic_repo_topic_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_topic_repo_topic_id_seq'::regclass)" + "nextval('data.repo_topic_repo_topic_id_seq'::regclass)" ), ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id")) + repo_id = Column(ForeignKey("data.repo.repo_id")) topic_id = Column(Integer) topic_prob = Column(Float(53)) tool_source = Column(String) @@ -2356,27 +2356,27 @@ class CommitCommentRef(Base): __tablename__ = "commit_comment_ref" __table_args__ = ( Index("comment_id", "cmt_comment_src_id", "cmt_comment_id", "msg_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) cmt_comment_id = 
Column( BigInteger, - Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='augur_data'), + Sequence('commit_comment_ref_cmt_comment_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.commit_comment_ref_cmt_comment_id_seq'::regclass)" + "nextval('data.commit_comment_ref_cmt_comment_id_seq'::regclass)" ), ) cmt_id = Column( ForeignKey( - "augur_data.commits.cmt_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.commits.cmt_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) repo_id = Column(BigInteger) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -2413,21 +2413,21 @@ class CommitParent(Base): __table_args__ = ( Index("commit_parents_ibfk_1", "cmt_id"), Index("commit_parents_ibfk_2", "parent_id"), - {"schema": "augur_data"} + {"schema": "data"} ) cmt_id = Column( - ForeignKey("augur_data.commits.cmt_id"), + ForeignKey("data.commits.cmt_id"), primary_key=True, nullable=False, ) parent_id = Column( - ForeignKey("augur_data.commits.cmt_id"), - Sequence('commit_parents_parent_id_seq', start=25430, schema='augur_data'), + ForeignKey("data.commits.cmt_id"), + Sequence('commit_parents_parent_id_seq', start=25430, schema='data'), primary_key=True, nullable=False, server_default=text( - "nextval('augur_data.commit_parents_parent_id_seq'::regclass)" + "nextval('data.commit_parents_parent_id_seq'::regclass)" ), ) tool_source = Column(String) @@ -2446,19 +2446,19 @@ class CommitParent(Base): class DiscourseInsight(Base): __tablename__ = "discourse_insights" __table_args__ = { - "schema": "augur_data", + "schema": "data", "comment": "This table is populated by the “Discourse_Analysis_Worker”. It examines sequential discourse, using computational linguistic methods, to draw statistical inferences regarding the discourse in a particular comment thread. 
", } msg_discourse_id = Column( BigInteger, - Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='augur_data'), + Sequence('discourse_insights_msg_discourse_id_seq1', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.discourse_insights_msg_discourse_id_seq1'::regclass)" + "nextval('data.discourse_insights_msg_discourse_id_seq1'::regclass)" ), ) - msg_id = Column(ForeignKey("augur_data.message.msg_id")) + msg_id = Column(ForeignKey("data.message.msg_id")) discourse_act = Column(String) tool_source = Column(String) tool_version = Column(String) @@ -2475,22 +2475,22 @@ class IssueAssignee(Base): __table_args__ = ( Index("issue-cntrb-assign-idx-1", "cntrb_id"), UniqueConstraint("issue_assignee_src_id", "issue_id", name="issue-assignee-insert-unique"), - {"schema": "augur_data"} + {"schema": "data"} ) issue_assignee_id = Column( BigInteger, - Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='augur_data'), + Sequence('issue_assignees_issue_assignee_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_assignees_issue_assignee_id_seq'::regclass)" + "nextval('data.issue_assignees_issue_assignee_id_seq'::regclass)" ), ) - issue_id = Column(ForeignKey("augur_data.issues.issue_id")) + issue_id = Column(ForeignKey("data.issues.issue_id")) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) - cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + cntrb_id = Column(ForeignKey("data.contributors.cntrb_id")) issue_assignee_src_id = Column( BigInteger, comment="This ID comes from the source. In the case of GitHub, it is the id that is the first field returned from the issue events API in the issue_assignees embedded JSON object. 
We may discover it is an ID for the person themselves; but my hypothesis is that its not.", @@ -2535,29 +2535,29 @@ class IssueEvent(Base): Index("issue_events_ibfk_1", "issue_id"), Index("issue_events_ibfk_2", "cntrb_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) event_id = Column( BigInteger, - Sequence('issue_events_event_id_seq', start=25430, schema='augur_data'), + Sequence('issue_events_event_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_events_event_id_seq'::regclass)" + "nextval('data.issue_events_event_id_seq'::regclass)" ), ) issue_id = Column( ForeignKey( - "augur_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE" + "data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE" ), nullable=False, ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ) ) action = Column(String, nullable=False) @@ -2572,7 +2572,7 @@ class IssueEvent(Base): node_url = Column(String) platform_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -2620,22 +2620,22 @@ class IssueLabel(Base): __tablename__ = "issue_labels" __table_args__ = ( UniqueConstraint("label_src_id", "issue_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) issue_label_id = Column( BigInteger, - Sequence('issue_labels_issue_label_id_seq', start=25430, schema='augur_data'), + Sequence('issue_labels_issue_label_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_labels_issue_label_id_seq'::regclass)" + 
"nextval('data.issue_labels_issue_label_id_seq'::regclass)" ), ) issue_id = Column( - ForeignKey("augur_data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE") + ForeignKey("data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE") ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) label_text = Column(String) label_description = Column(String) @@ -2677,20 +2677,20 @@ class IssueMessageRef(Base): __tablename__ = "issue_message_ref" __table_args__ = ( UniqueConstraint("issue_msg_ref_src_comment_id", "issue_id", name="issue-message-ref-insert-unique"), - {"schema": "augur_data"}, + {"schema": "data"}, ) issue_msg_ref_id = Column( BigInteger, - Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='augur_data'), + Sequence('issue_message_ref_issue_msg_ref_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" + "nextval('data.issue_message_ref_issue_msg_ref_id_seq'::regclass)" ), ) issue_id = Column( ForeignKey( - "augur_data.issues.issue_id", + "data.issues.issue_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -2699,7 +2699,7 @@ class IssueMessageRef(Base): ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -2708,7 +2708,7 @@ class IssueMessageRef(Base): ) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -2739,18 +2739,18 @@ class LibraryDependency(Base): __tablename__ = "library_dependencies" __table_args__ = ( Index("REPO_DEP", "library_id"), - {"schema": "augur_data"} + {"schema": "data"} ) lib_dependency_id = Column( BigInteger, - Sequence('library_dependencies_lib_dependency_id_seq', start=25430, 
schema='augur_data'), + Sequence('library_dependencies_lib_dependency_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.library_dependencies_lib_dependency_id_seq'::regclass)" + "nextval('data.library_dependencies_lib_dependency_id_seq'::regclass)" ), ) - library_id = Column(ForeignKey("augur_data.libraries.library_id")) + library_id = Column(ForeignKey("data.libraries.library_id")) manifest_platform = Column(String) manifest_filepath = Column( String(1000), server_default=text("NULL::character varying") @@ -2767,17 +2767,17 @@ class LibraryDependency(Base): class LibraryVersion(Base): __tablename__ = "library_version" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} library_version_id = Column( BigInteger, - Sequence('library_version_library_version_id_seq', start=25430, schema='augur_data'), + Sequence('library_version_library_version_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.library_version_library_version_id_seq'::regclass)" + "nextval('data.library_version_library_version_id_seq'::regclass)" ), ) - library_id = Column(ForeignKey("augur_data.libraries.library_id")) + library_id = Column(ForeignKey("data.libraries.library_id")) library_platform = Column(String) version_number = Column(String) version_release_date = Column( @@ -2793,17 +2793,17 @@ class LibraryVersion(Base): class MessageAnalysis(Base): __tablename__ = "message_analysis" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} msg_analysis_id = Column( BigInteger, - Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='augur_data'), + Sequence('message_analysis_msg_analysis_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.message_analysis_msg_analysis_id_seq'::regclass)" + "nextval('data.message_analysis_msg_analysis_id_seq'::regclass)" ), ) - msg_id = 
Column(ForeignKey("augur_data.message.msg_id")) + msg_id = Column(ForeignKey("data.message.msg_id")) worker_run_id = Column( BigInteger, comment="This column is used to indicate analyses run by a worker during the same execution period, and is useful for grouping, and time series analysis. ", @@ -2836,17 +2836,17 @@ class MessageAnalysis(Base): class MessageSentiment(Base): __tablename__ = "message_sentiment" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} msg_analysis_id = Column( BigInteger, - Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='augur_data'), + Sequence('message_sentiment_msg_analysis_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.message_sentiment_msg_analysis_id_seq'::regclass)" + "nextval('data.message_sentiment_msg_analysis_id_seq'::regclass)" ), ) - msg_id = Column(ForeignKey("augur_data.message.msg_id")) + msg_id = Column(ForeignKey("data.message.msg_id")) worker_run_id = Column( BigInteger, comment="This column is used to indicate analyses run by a worker during the same execution period, and is useful for grouping, and time series analysis. 
", @@ -2881,15 +2881,15 @@ class PullRequestAnalysis(Base): pull_request_analysis_id = Column( BigInteger, - Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_analysis_pull_request_analysis_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" + "nextval('data.pull_request_analysis_pull_request_analysis_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), @@ -2915,7 +2915,7 @@ class PullRequestAnalysis(Base): __table_args__ = ( Index("pr_anal_idx", pull_request_id), Index("probability_idx", merge_probability.desc().nullslast()), - {"schema": "augur_data"} + {"schema": "data"} ) pull_request = relationship("PullRequest") @@ -2926,34 +2926,34 @@ class PullRequestAssignee(Base): __table_args__ = ( Index("pr_meta_cntrb-idx", "contrib_id"), UniqueConstraint("pull_request_id", "pr_assignee_src_id", name="assigniees-unique"), - {"schema": "augur_data"} + {"schema": "data"} ) pr_assignee_map_id = Column( BigInteger, - Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_assignees_pr_assignee_map_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" + "nextval('data.pull_request_assignees_pr_assignee_map_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, initially="DEFERRED", ) ) - contrib_id = 
Column(ForeignKey("augur_data.contributors.cntrb_id")) + contrib_id = Column(ForeignKey("data.contributors.cntrb_id")) pr_assignee_src_id = Column(BigInteger) tool_source = Column(String) tool_version = Column(String) @@ -2987,28 +2987,28 @@ class PullRequestCommit(Base): __table_args__ = ( UniqueConstraint("pull_request_id", "repo_id", "pr_cmt_sha"), { - "schema": "augur_data", + "schema": "data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. 
", }, ) pr_cmt_id = Column( BigInteger, - Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_commits_pr_cmt_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_commits_pr_cmt_id_seq'::regclass)" + "nextval('data.pull_request_commits_pr_cmt_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) pr_cmt_sha = Column( String, @@ -3019,7 +3019,7 @@ class PullRequestCommit(Base): pr_cmt_comments_url = Column(String) pr_cmt_author_cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ) ) pr_cmt_timestamp = Column(TIMESTAMP(precision=0)) @@ -3044,20 +3044,20 @@ class PullRequestEvent(Base): UniqueConstraint("repo_id", "issue_event_src_id", name="pr_events_repo_id_event_src_id_unique"), UniqueConstraint("platform_id", "node_id", name="unique-pr-event-id"), UniqueConstraint("node_id", name="pr-unqiue-event"), - {"schema": "augur_data"}, + {"schema": "data"}, ) pr_event_id = Column( BigInteger, - Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_events_pr_event_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_events_pr_event_id_seq'::regclass)" + "nextval('data.pull_request_events_pr_event_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), @@ -3065,7 +3065,7 @@ class 
PullRequestEvent(Base): ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="RESTRICT", deferrable=True, @@ -3073,7 +3073,7 @@ class PullRequestEvent(Base): ) ) cntrb_id = Column( - ForeignKey("augur_data.contributors.cntrb_id") + ForeignKey("data.contributors.cntrb_id") ) action = Column(String, nullable=False) action_commit_hash = Column(String) @@ -3091,7 +3091,7 @@ class PullRequestEvent(Base): node_url = Column(String) platform_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", + "data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="RESTRICT", deferrable=True, @@ -3142,29 +3142,29 @@ class PullRequestFile(Base): Index("pr_id_pr_files","pull_request_id"), UniqueConstraint("pull_request_id", "repo_id", "pr_file_path", name="prfiles_unique"), { - "schema": "augur_data", + "schema": "data", "comment": "Pull request commits are an enumeration of each commit associated with a pull request. \nNot all pull requests are from a branch or fork into master. \nThe commits table intends to count only commits that end up in the master branch (i.e., part of the deployed code base for a project).\nTherefore, there will be commit “SHA”’s in this table that are no associated with a commit SHA in the commits table. \nIn cases where the PR is to the master branch of a project, you will find a match. In cases where the PR does not involve the master branch, you will not find a corresponding commit SHA in the commits table. This is expected. 
", }, ) pr_file_id = Column( BigInteger, - Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='augur_data'), + Sequence('pull_request_files_pr_file_id_seq', start=25150, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_files_pr_file_id_seq'::regclass)" + "nextval('data.pull_request_files_pr_file_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3196,26 +3196,26 @@ class PullRequestLabel(Base): __tablename__ = "pull_request_labels" __table_args__ = ( UniqueConstraint("pr_src_id", "pull_request_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) pr_label_id = Column( BigInteger, - Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_labels_pr_label_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_labels_pr_label_id_seq'::regclass)" + "nextval('data.pull_request_labels_pr_label_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) pr_src_id = Column(BigInteger) pr_src_node_id = Column(String) @@ -3258,20 +3258,20 @@ class PullRequestMessageRef(Base): __tablename__ = "pull_request_message_ref" __table_args__ = ( UniqueConstraint("pr_message_ref_src_comment_id", "pull_request_id", name="pull-request-message-ref-insert-unique"), - {"schema": "augur_data"}, + {"schema": "data"}, ) pr_msg_ref_id = 
Column( BigInteger, - Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_message_ref_pr_msg_ref_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" + "nextval('data.pull_request_message_ref_pr_msg_ref_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", deferrable=True, @@ -3279,11 +3279,11 @@ class PullRequestMessageRef(Base): ) ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3310,28 +3310,28 @@ class PullRequestMeta(Base): __table_args__ = ( Index("pr_meta-cntrbid-idx", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_head_or_base", 'pr_sha', name="pull-request-meta-insert-unique"), - {"schema": "augur_data", + {"schema": "data", "comment": 'Pull requests contain referencing metadata. There are a few columns that are discrete. There are also head and base designations for the repo on each side of the pull request. 
Similar functions exist in GitLab, though the language here is based on GitHub.'}, ) pr_repo_meta_id = Column( BigInteger, - Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_meta_pr_repo_meta_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" + "nextval('data.pull_request_meta_pr_repo_meta_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3348,7 +3348,7 @@ class PullRequestMeta(Base): ) pr_src_meta_ref = Column(String) pr_sha = Column(String) - cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + cntrb_id = Column(ForeignKey("data.contributors.cntrb_id")) tool_source = Column(String) tool_version = Column(String) data_source = Column(String) @@ -3384,20 +3384,20 @@ class PullRequestReviewer(Base): __table_args__ = ( Index("pr-reviewers-cntrb-idx1", "cntrb_id"), UniqueConstraint("pull_request_id", "pr_reviewer_src_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) pr_reviewer_map_id = Column( BigInteger, - Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_reviewers_pr_reviewer_map_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" + "nextval('data.pull_request_reviewers_pr_reviewer_map_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3409,7 +3409,7 @@ class PullRequestReviewer(Base): 
repo_id = Column(BigInteger) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="CASCADE", onupdate="CASCADE" ), ) pr_reviewer_src_id = Column( @@ -3446,31 +3446,31 @@ class PullRequestReview(Base): __table_args__ = ( UniqueConstraint("pr_review_src_id", name="pr_review_unique"), Index("pr_id_pr_reviews", "pull_request_id"), - {"schema": "augur_data"}, + {"schema": "data"}, ) pr_review_id = Column( BigInteger, - Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_reviews_pr_review_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_reviews_pr_review_id_seq'::regclass)" + "nextval('data.pull_request_reviews_pr_review_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ), nullable=False, ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") + ForeignKey("data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE") ) cntrb_id = Column( ForeignKey( - "augur_data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" + "data.contributors.cntrb_id", ondelete="RESTRICT", onupdate="CASCADE" ), nullable=False, ) @@ -3485,7 +3485,7 @@ class PullRequestReview(Base): pr_review_commit_id = Column(String) platform_id = Column( ForeignKey( - "augur_data.platform.pltfrm_id", + "data.platform.pltfrm_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3515,19 +3515,19 @@ class PullRequestReview(Base): class PullRequestTeam(Base): __tablename__ = "pull_request_teams" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} pr_team_id = Column( BigInteger, - Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='augur_data'), + 
Sequence('pull_request_teams_pr_team_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_teams_pr_team_id_seq'::regclass)" + "nextval('data.pull_request_teams_pr_team_id_seq'::regclass)" ), ) pull_request_id = Column( ForeignKey( - "augur_data.pull_requests.pull_request_id", + "data.pull_requests.pull_request_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3557,21 +3557,21 @@ class PullRequestRepo(Base): __tablename__ = "pull_request_repo" __table_args__ = ( Index("pr-cntrb-idx-repo", "pr_cntrb_id"), - {"schema": "augur_data", + {"schema": "data", "comment": "This table is for storing information about forks that exist as part of a pull request. Generally we do not want to track these like ordinary repositories. "}, ) pr_repo_id = Column( BigInteger, - Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='augur_data'), + Sequence('pull_request_repo_pr_repo_id_seq', start=25430, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_repo_pr_repo_id_seq'::regclass)" + "nextval('data.pull_request_repo_pr_repo_id_seq'::regclass)" ), ) pr_repo_meta_id = Column( ForeignKey( - "augur_data.pull_request_meta.pr_repo_meta_id", + "data.pull_request_meta.pr_repo_meta_id", ondelete="CASCADE", onupdate="CASCADE", ) @@ -3585,7 +3585,7 @@ class PullRequestRepo(Base): pr_repo_name = Column(String) pr_repo_full_name = Column(String) pr_repo_private_bool = Column(Boolean) - pr_cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id")) + pr_cntrb_id = Column(ForeignKey("data.contributors.cntrb_id")) tool_source = Column(String) tool_version = Column(String) data_source = Column(String) @@ -3601,20 +3601,20 @@ class PullRequestReviewMessageRef(Base): __tablename__ = "pull_request_review_message_ref" __table_args__ = ( UniqueConstraint("pr_review_msg_src_id", name="pull-request-review-message-ref-insert-unique"), - {"schema": "augur_data"}, + {"schema": "data"}, ) 
pr_review_msg_ref_id = Column( BigInteger, - Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='augur_data'), + Sequence('pull_request_review_message_ref_pr_review_msg_ref_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" + "nextval('data.pull_request_review_message_ref_pr_review_msg_ref_id_seq'::regclass)" ), ) pr_review_id = Column( ForeignKey( - "augur_data.pull_request_reviews.pr_review_id", + "data.pull_request_reviews.pr_review_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3624,7 +3624,7 @@ class PullRequestReviewMessageRef(Base): ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3633,7 +3633,7 @@ class PullRequestReviewMessageRef(Base): ) msg_id = Column( ForeignKey( - "augur_data.message.msg_id", + "data.message.msg_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3675,19 +3675,19 @@ class PullRequestReviewMessageRef(Base): class RepoClone(Base): __tablename__ = "repo_clones_data" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} repo_clone_data_id = Column( BigInteger, - Sequence('repo_clones_data_id_seq', start=1, schema='augur_data'), + Sequence('repo_clones_data_id_seq', start=1, schema='data'), primary_key=True, server_default=text( - "nextval('augur_data.repo_clones_data_id_seq'::regclass)" + "nextval('data.repo_clones_data_id_seq'::regclass)" ), ) repo_id = Column( ForeignKey( - "augur_data.repo.repo_id", + "data.repo.repo_id", ondelete="RESTRICT", onupdate="CASCADE", deferrable=True, @@ -3704,7 +3704,7 @@ class RepoClone(Base): class TopicModelMeta(Base): __tablename__ = "topic_model_meta" - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} model_id = Column( UUID(as_uuid=True), @@ -3713,7 +3713,7 @@ class 
TopicModelMeta(Base): comment="Unique identifier for the topic model" ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id"), + ForeignKey("data.repo.repo_id"), comment="Repository this model was trained on" ) model_method = Column( @@ -3811,7 +3811,7 @@ class TopicModelEvent(Base): __table_args__ = ( Index("ix_tme_repo_ts", "repo_id", "ts"), Index("ix_tme_event", "event"), - {"schema": "augur_data"} + {"schema": "data"} ) event_id = Column( @@ -3827,14 +3827,14 @@ class TopicModelEvent(Base): ) repo_id = Column( Integer, - ForeignKey("augur_data.repo.repo_id", name="fk_tme_repo_id"), + ForeignKey("data.repo.repo_id", name="fk_tme_repo_id"), nullable=True, comment="Repository associated with this event" ) model_id = Column( UUID(as_uuid=True), ForeignKey( - "augur_data.topic_model_meta.model_id", + "data.topic_model_meta.model_id", name="fk_tme_model_id", ondelete="SET NULL" ), diff --git a/collectoss/application/db/models/augur_operations.py b/collectoss/application/db/models/operations.py similarity index 92% rename from collectoss/application/db/models/augur_operations.py rename to collectoss/application/db/models/operations.py index 41a4cef6b..19eff3bae 100644 --- a/collectoss/application/db/models/augur_operations.py +++ b/collectoss/application/db/models/operations.py @@ -1,5 +1,5 @@ # encoding: utf-8 -from sqlalchemy import BigInteger, SmallInteger, Column, Index, Integer, String, Table, text, UniqueConstraint, Boolean, ForeignKey, update, CheckConstraint, Sequence +from sqlalchemy import BigInteger, SmallInteger, Column, Index, Integer, String, Table, text, UniqueConstraint, Boolean, ForeignKey, update, CheckConstraint, Sequence, DateTime, func from sqlalchemy.dialects.postgresql import TIMESTAMP from sqlalchemy.orm.exc import NoResultFound, MultipleResultsFound from sqlalchemy.exc import IntegrityError @@ -83,23 +83,23 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: Column("Count", String), Column("WeightedComplexity", String), 
Column("Files", String), - schema="augur_operations", + schema="operations", ) class Settings(Base): __tablename__ = "augur_settings" __table_args__ = { - "schema": "augur_operations", + "schema": "operations", "comment": "CollectOSS settings include the schema version, and the CollectOSS API Key as of 10/25/2020. Future augur settings may be stored in this table, which has the basic structure of a name-value pair. ", } id = Column( BigInteger, - Sequence("augur_settings_id_seq", start=1, schema="augur_operations"), + Sequence("augur_settings_id_seq", start=1, schema="operations"), primary_key=True, server_default=text( - "nextval('augur_operations.augur_settings_id_seq'::regclass)" + "nextval('operations.augur_settings_id_seq'::regclass)" ), ) setting = Column(String) @@ -119,23 +119,23 @@ class Settings(Base): server_default=text("CURRENT_TIMESTAMP"), ), Index("repos_id,statusops", "repos_id", "status"), - schema="augur_operations", - comment="For future use when we move all working tables to the augur_operations schema. ", + schema="operations", + comment="For future use when we move all working tables to the operations schema. ", ) class WorkerHistory(Base): __tablename__ = "worker_history" __table_args__ = { - "schema": "augur_operations", + "schema": "operations", "comment": "This table stores the complete history of job execution, including success and failure. It is useful for troubleshooting. 
", } history_id = Column( BigInteger, - Sequence("gh_worker_history_history_id_seq", start=1, schema="augur_operations"), + Sequence("gh_worker_history_history_id_seq", start=1, schema="operations"), primary_key=True, server_default=text( - "nextval('augur_operations.gh_worker_history_history_id_seq'::regclass)" + "nextval('operations.gh_worker_history_history_id_seq'::regclass)" ), ) repo_id = Column(BigInteger) @@ -150,7 +150,7 @@ class WorkerHistory(Base): class WorkerJob(Base): __tablename__ = "worker_job" __table_args__ = { - "schema": "augur_operations", + "schema": "operations", "comment": "This table stores the jobs workers collect data for. A job is found in the code, and in the augur.config.json under the construct of a “model”. ", } @@ -172,16 +172,16 @@ class WorkerJob(Base): class WorkerOauth(Base): __tablename__ = "worker_oauth" __table_args__ = { - "schema": "augur_operations", + "schema": "operations", "comment": "This table stores credentials for retrieving data from platform API’s. Entries in this table must comply with the terms of service for each platform. ", } oauth_id = Column( BigInteger, - Sequence("worker_oauth_oauth_id_seq", start=1000, schema="augur_operations"), + Sequence("worker_oauth_oauth_id_seq", start=1000, schema="operations"), primary_key=True, server_default=text( - "nextval('augur_operations.worker_oauth_oauth_id_seq'::regclass)" + "nextval('operations.worker_oauth_oauth_id_seq'::regclass)" ), ) name = Column(String(255), nullable=False) @@ -196,8 +196,8 @@ class WorkerOauth(Base): class WorkerSettingsFacade(Base): __tablename__ = "worker_settings_facade" __table_args__ = { - "schema": "augur_operations", - "comment": "For future use when we move all working tables to the augur_operations schema. ", + "schema": "operations", + "comment": "For future use when we move all working tables to the operations schema. 
", } id = Column(Integer, primary_key=True) @@ -215,19 +215,19 @@ class WorkerSettingsFacade(Base): Column( "working_commit", String(40), server_default=text("'NULL'::character varying") ), - schema="augur_operations", - comment="For future use when we move all working tables to the augur_operations schema. ", + schema="operations", + comment="For future use when we move all working tables to the operations schema. ", ) class BadgingDEI(Base): __tablename__ = 'dei_badging' - __table_args__ = {"schema": "augur_data"} + __table_args__ = {"schema": "data"} id = Column(Integer, primary_key=True, nullable=False) badging_id = Column(Integer, nullable=False) level = Column(String, nullable=False) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) repo = relationship("Repo") @@ -237,7 +237,7 @@ class Config(Base): __tablename__ = 'config' __table_args__ = ( UniqueConstraint('section_name', "setting_name", name='unique-config-setting'), - {"schema": "augur_operations"} + {"schema": "operations"} ) id = Column(SmallInteger, primary_key=True, nullable=False) @@ -255,7 +255,7 @@ class User(Base): UniqueConstraint('email', name='user-unique-email'), UniqueConstraint('login_name', name='user-unique-name'), UniqueConstraint('text_phone', name='user-unique-phone'), - {"schema": "augur_operations"} + {"schema": "operations"} ) user_id = Column(Integer, primary_key=True) @@ -634,12 +634,12 @@ class UserGroup(Base): __tablename__ = 'user_groups' __table_args__ = ( UniqueConstraint('user_id', 'name', name='user_groups_user_id_name_key'), - {"schema": "augur_operations"} + {"schema": "operations"} ) group_id = Column(BigInteger, primary_key=True) user_id = Column(Integer, - ForeignKey("augur_operations.users.user_id", name="user_group_user_id_fkey"), nullable=False + ForeignKey("operations.users.user_id", 
name="user_group_user_id_fkey"), nullable=False ) name = Column(String, nullable=False) favorited = Column(Boolean, nullable=False, server_default=text("FALSE")) @@ -743,13 +743,13 @@ def convert_group_name_to_id(session, user_id: int, group_name: str) -> int: class UserRepo(Base): __tablename__ = "user_repos" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "operations" } group_id = Column( - ForeignKey("augur_operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False + ForeignKey("operations.user_groups.group_id", name="user_repo_group_id_fkey"), primary_key=True, nullable=False ) repo_id = Column( - ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False + ForeignKey("data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) repo = relationship("Repo", back_populates="user_repo") @@ -1010,12 +1010,12 @@ def add_github_org_repos(session, url: List[str], user_id: int, group_name: int) class UserSessionToken(Base): __tablename__ = "user_session_tokens" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "operations" } token = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("augur_operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) + user_id = Column(ForeignKey("operations.users.user_id", name="user_session_token_user_id_fkey"), nullable=False) expiration = Column(BigInteger) - application_id = Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey")) + application_id = Column(ForeignKey("operations.client_applications.id", name="user_session_token_application_id_fkey")) created_at = Column(BigInteger) user = relationship("User", back_populates="tokens") @@ -1048,10 +1048,10 @@ def delete_refresh_tokens(self, session): class ClientApplication(Base): __tablename__ = 
"client_applications" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "operations" } id = Column(String, primary_key=True, nullable=False) - user_id = Column(ForeignKey("augur_operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) + user_id = Column(ForeignKey("operations.users.user_id", name="client_application_user_id_fkey"), nullable=False) name = Column(String, nullable=False) redirect_url = Column(String, nullable=False) api_key = Column(String, nullable=False) @@ -1072,13 +1072,27 @@ def get_by_id(session, client_id): session.rollback() raise e +class ForgeInstance(Base): + __tablename__ = "forge_instance" + __table_args__ = { "schema": "operations" } + + id = Column(Integer, primary_key=True, nullable=False, comment="Internal unique identifier for this forge instance") + # platform_type stores an integer that CollectOSS maps/will map to it's internal platform identifier Enum + # (as used in ContributorUUID) for identifying the API endpoints and tasks to use for collection + platform_type = Column(Integer, nullable=False, comment="Type specifier identifying the relevant platform API interface to CollectOSS") + name = Column(String, nullable=False, comment="User-specified name for this forge instance") + # https://stackoverflow.com/a/54800233 + date_added = Column(DateTime(timezone=True), nullable=False, default=func.now()) + domain_name = Column(String, nullable=False, comment="The base domain name (without the scheme) where this instance is hosted") + enabled = Column(Boolean, default=True, nullable=False, comment="denotes whether collection should run for this instance") + class Subscription(Base): __tablename__ = "subscriptions" - __table_args__ = { "schema": "augur_operations" } + __table_args__ = { "schema": "operations" } - application_id = Column(ForeignKey("augur_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) - type_id = 
Column(ForeignKey("augur_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) + application_id = Column(ForeignKey("operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) + type_id = Column(ForeignKey("operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) application = relationship("ClientApplication", back_populates="subscriptions") type = relationship("SubscriptionType", back_populates="subscriptions") @@ -1087,7 +1101,7 @@ class SubscriptionType(Base): __tablename__ = "subscription_types" __table_args__ = ( UniqueConstraint('name', name='subscription_type_title_unique'), - {"schema": "augur_operations"} + {"schema": "operations"} ) @@ -1101,11 +1115,11 @@ class RefreshToken(Base): __tablename__ = "refresh_tokens" __table_args__ = ( UniqueConstraint('user_session_token', name='refresh_token_user_session_token_id_unique'), - {"schema": "augur_operations"} + {"schema": "operations"} ) id = Column(String, primary_key=True) - user_session_token = Column(ForeignKey("augur_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) + user_session_token = Column(ForeignKey("operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) user_session = relationship("UserSessionToken", back_populates="refresh_tokens") @@ -1187,10 +1201,10 @@ class CollectionStatus(Base): "NOT (core_status = 'Pending' AND secondary_status = 'Collecting')", name='core_secondary_dependency_check' ), - {"schema": "augur_operations"} + {"schema": "operations"} ) - repo_id = Column(ForeignKey("augur_data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) + repo_id = Column(ForeignKey("data.repo.repo_id", name="collection_status_repo_id_fk"), primary_key=True) core_data_last_collected = Column(TIMESTAMP) core_status = Column(String, nullable=False, server_default=text("'Pending'")) 
core_task_id = Column(String) diff --git a/collectoss/application/db/timestamp_utils.py b/collectoss/application/db/timestamp_utils.py new file mode 100644 index 000000000..3e969f957 --- /dev/null +++ b/collectoss/application/db/timestamp_utils.py @@ -0,0 +1,130 @@ +""" +Timestamp correction utilities for git commit data. + +This module provides functions to validate and correct timestamp strings +before database insertion, specifically handling invalid timezone offsets +that PostgreSQL cannot process. Resides in the db layer so it can be used +by db-layer bulk insert logic without crossing into the tasks layer. +""" + +import logging +from typing import List, Optional + + +# Valid PostgreSQL timezone offsets in format ±HHMM (e.g., -0500, +0530) +# Range: -12:00 to +14:00 including all real-world fractional hour offsets +POSTGRES_VALID_TIMEZONES = { + -1200, -1100, -1000, -930, -900, -800, -700, + -600, -500, -430, -400, -330, -300, -230, -200, -100, 0, + 100, 200, 300, 330, 400, 430, 500, 530, 545, 600, + 630, 700, 800, 845, 900, 930, 1000, 1030, 1100, 1130, 1200, + 1245, 1300, 1345, 1400 +} + + +def correct_timestamp( + timestamp_str: str, + fallback: Optional[str] = None, + logger: Optional[logging.Logger] = None +) -> str: + """Fix invalid timezone in timestamp string. + + Validates the timezone portion of a timestamp and corrects it if invalid. + Handles three cases: + 1. Valid timezone → return as-is + 2. Invalid timezone → replace with fallback or UTC + 3. 
Unparseable format → return fallback or default + + Args: + timestamp_str: Timestamp string in format 'YYYY-MM-DD HH:MM:SS ±HHMM' + fallback: Optional fallback timestamp to use if correction needed + logger: Optional logger for recording corrections + + Returns: + Corrected timestamp string safe for PostgreSQL insertion + """ + if not timestamp_str: + return fallback or "1970-01-01 00:00:15 +0000" + + # Split on last space to separate date/time from timezone + # Example: '2025-11-03 16:28:43 -0500' → ['2025-11-03 16:28:43', '-0500'] + parts = timestamp_str.strip().rsplit(' ', 1) + + if len(parts) != 2: + # No space found, can't parse + if logger: + logger.warning(f"Unparseable timestamp format (no space): {timestamp_str}") + return fallback or "1970-01-01 00:00:15 +0000" + + date_time, tz_string = parts + + # Validate timezone starts with + or - + if not tz_string or tz_string[0] not in ('+', '-'): + if logger: + logger.warning(f"Unparseable timezone (no sign): {tz_string}") + return fallback or "1970-01-01 00:00:15 +0000" + + # Normalize timezone: remove colons (handles both -0500 and -05:00) + tz_normalized = tz_string.replace(':', '') + + # Try to parse as integer + try: + tz_offset = int(tz_normalized) + except ValueError: + if logger: + logger.warning(f"Could not parse timezone as integer: {tz_string}") + return fallback or "1970-01-01 00:00:15 +0000" + + # Check if timezone is valid + if tz_offset in POSTGRES_VALID_TIMEZONES: + # Valid timezone, return original + return timestamp_str + + # Invalid timezone detected + if fallback: + if logger: + logger.info(f"Invalid timezone {tz_offset} in '{timestamp_str}', using fallback") + return fallback + + # No fallback, default to UTC + if logger: + logger.warning(f"Invalid timezone {tz_offset} in '{timestamp_str}', defaulting to UTC") + return f"{date_time} +0000" + + +def clean_commit_timestamps(records: List[dict], logger: logging.Logger) -> None: + """Validate and correct timestamps in commit records in-place. 
+ + Processes a batch of commit records, validating both author and committer + timestamps. For invalid committer timestamps, uses the corrected author + timestamp as a fallback before defaulting to UTC. + + This prevents PostgreSQL insertion failures due to invalid timezone offsets + (e.g., -13068837 which is outside the valid ±14:00 range). + + Args: + records: List of commit record dicts with keys: + - 'cmt_author_timestamp' + - 'cmt_committer_timestamp' + logger: Logger for recording corrections + + Returns: + None (modifies records in-place) + """ + for record in records: + author_ts = record.get('cmt_author_timestamp', '') + committer_ts = record.get('cmt_committer_timestamp', '') + + # Correct author timestamp first (no fallback, will use UTC if invalid) + author_corrected = correct_timestamp(author_ts, fallback=None, logger=logger) + + # Correct committer timestamp, using corrected author as fallback + # This minimizes data loss per issue discussion (prefer author time over UTC) + committer_corrected = correct_timestamp( + committer_ts, + fallback=author_corrected, + logger=logger + ) + + record['cmt_author_timestamp'] = author_corrected + record['cmt_committer_timestamp'] = committer_corrected diff --git a/collectoss/application/environment.py b/collectoss/application/environment.py new file mode 100644 index 000000000..eee8942ed --- /dev/null +++ b/collectoss/application/environment.py @@ -0,0 +1,86 @@ +from typing import Optional +import os +import warnings +import logging + +logger = logging.getLogger(__name__) + +def extract_prefix(key: str, prefixes: list[str], separator = "_") -> Optional[str]: + """Detect and return the prefix present on the provided key + + Args: + key (str): the key to remove the prefix from + prefixes (list[str]): the prefixes to look for + separator (str, optional): the separator between elements of the key to also remove (if they would otherwise be dangling). Defaults to "_". 
+ + Returns: + str: The detected prefix (including any separators) if any, otherwise None + """ + k = key.upper() + for p in prefixes: + p_up = p.upper() + if k == p_up: + return key[:len(p)] + if k.startswith(p_up + separator): + return key[:len(p) + len(separator)] + return None + + +class SystemEnv: + """Centralized environment variable access + Built for enabling migration of environment variable names + """ + + _prefixes = ["COLLECTOSS", "AUGUR"] + _warn_prefixes = ["AUGUR"] + _separator = "_" + + @classmethod + def get(cls, key: str, default = None, prefixes = _prefixes) -> Optional[str]: + # extract the suffix so we can try multiple prefixes + canonical_prefix = extract_prefix(key, prefixes, cls._separator) + suffix = key[len(canonical_prefix):] if canonical_prefix is not None else key + # check prefixes in order and use the first one that has a value + for p in prefixes: + check_key = f"{p}{cls._separator}{suffix}" + value = os.getenv(check_key, None) + + if value is not None: + # emit a warning if configured + if p in cls._warn_prefixes: + msg = ( + f"Environment variable '{check_key}' is deprecated. " + f"Use '{key}' instead. 
This automatic recovery may become a failure in a future version " + ) + logger.warning(msg) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + return value + + if not canonical_prefix: + return os.getenv(key, default) + + return default + + @classmethod + def get_bool(cls, key:str, default: bool, prefixes = _prefixes) -> bool: + """gets a value from the environment and cast it to a boolean + """ + raw_val = cls.get(key, None, prefixes) + if raw_val is None: + return default + return raw_val.lower() in ('true', '1', 't', 'y', 'yes') + + @classmethod + def set(cls, key: str, value: str, overwrite=True) -> None: + if os.getenv(key) is not None and not overwrite: + return + + os.environ[key] = value + + @classmethod + def set_default(cls, key: str, value: str) -> None: + if cls.get(key) is None: + cls.set(key, value) + return value + return cls.get(key) \ No newline at end of file diff --git a/collectoss/application/logs.py b/collectoss/application/logs.py index 253482877..aaf6cb5d8 100644 --- a/collectoss/application/logs.py +++ b/collectoss/application/logs.py @@ -192,8 +192,8 @@ def __init__(self, logger_name, disable_log_files=False,reset_logfiles=False,bas log_config = get_log_config() - if log_config["logs_directory"] != "": - base_log_dir=log_config["logs_directory"] + if log_config.get("logs_directory", "") != "": + base_log_dir=log_config.get("logs_directory") if reset_logfiles is True: try: diff --git a/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py b/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py index f6aeeca20..d903ae8fd 100644 --- a/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py +++ b/collectoss/application/schema/alembic/versions/31_update_pr_events_unique.py @@ -20,7 +20,7 @@ # conn = op.get_bind() # conn.execute(text(""" - # UPDATE pull_request_events + # UPDATE augur_data.pull_request_events # SET issue_event_src_id = substring(node_url FROM 
'.*/([0-9]+)$')::BIGINT; # """)) @@ -32,7 +32,7 @@ def upgrade(): with engine.connect() as conn: - result = conn.execute(text("SELECT COUNT(*) FROM pull_request_events WHERE issue_event_src_id=pr_platform_event_id")) + result = conn.execute(text("SELECT COUNT(*) FROM augur_data.pull_request_events WHERE issue_event_src_id=pr_platform_event_id")) total_rows = result.scalar() if total_rows != 0: print(f"Rows needing updated: {total_rows}") @@ -43,14 +43,14 @@ def upgrade(): result = conn.execute(text(""" WITH cte AS ( SELECT pr_event_id - FROM pull_request_events + FROM augur_data.pull_request_events WHERE issue_event_src_id=pr_platform_event_id LIMIT 250000 ) - UPDATE pull_request_events + UPDATE augur_data.pull_request_events SET issue_event_src_id = substring(node_url FROM '.*/([0-9]+)$')::BIGINT FROM cte - WHERE pull_request_events.pr_event_id = cte.pr_event_id + WHERE augur_data.pull_request_events.pr_event_id = cte.pr_event_id RETURNING 1; """)) @@ -77,7 +77,7 @@ def downgrade(): print("Please run in background. 
This downgrade will take a very *very* long time") conn = op.get_bind() conn.execute(text(""" - UPDATE pull_request_events + UPDATE augur_data.pull_request_events SET issue_event_src_id = pr_platform_event_id WHERE issue_event_src_id <> pr_platform_event_id; """)) \ No newline at end of file diff --git a/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py b/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py index f4e17a08b..ba0c4568a 100644 --- a/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py +++ b/collectoss/application/schema/alembic/versions/34_add_contrib_to_config.py @@ -20,38 +20,8 @@ logger = logging.getLogger(__name__) def upgrade(): - - with DatabaseSession(logger) as session: - config = SystemConfig(logger,session) - config_dict = config.load_config() - - #Update the missing fields of the facade section in the config - section = config_dict.get("Facade") - - #Just copy the default if section doesn't exist. 
- if section: - if 'facade_contributor_full_recollect' not in section.keys(): - section['facade_contributor_full_recollect'] = 0 - - else: - section = config.default_config["Facade"] - - config.add_section_from_json("Facade", section) + pass def downgrade(): - - conn = op.get_bind() - - conn.execute(text(f""" - DELETE FROM augur_operations.config - WHERE section_name='Facade' AND (setting_name='facade_contributor_full_recollect'); - """)) - - try: - conn.execute(text(f""" - DELETE FROM augur_operations.config - WHERE section_name='Facade' AND (setting_name='facade_contributor_full_recollect'); - """)) - except: - pass \ No newline at end of file + pass \ No newline at end of file diff --git a/collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py b/collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py new file mode 100644 index 000000000..fd80723d6 --- /dev/null +++ b/collectoss/application/schema/alembic/versions/42_introduce_empty_instances_table.py @@ -0,0 +1,37 @@ +"""introduce empty instances table + +Revision ID: 42 +Revises: 41 +Create Date: 2026-05-07 15:51:17.510641 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '42' +down_revision = '41' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('forge_instance', + sa.Column('id', sa.Integer(), nullable=False, comment='Internal unique identifier for this forge instance'), + sa.Column('platform_type', sa.Integer(), nullable=False, comment='Type specifier identifying the relevant platform API interface to CollectOSS'), + sa.Column('name', sa.String(), nullable=False, comment='User-specified name for this forge instance'), + sa.Column('date_added', sa.DateTime(timezone=True), nullable=False), + sa.Column('domain_name', sa.String(), nullable=False, comment='The base domain name (without the scheme) where this instance is hosted'), + sa.Column('enabled', sa.Boolean(), nullable=False, comment='denotes whether collection should run for this instance'), + sa.PrimaryKeyConstraint('id'), + schema='augur_operations' + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_table('forge_instance', schema='augur_operations') + # ### end Alembic commands ### diff --git a/collectoss/application/schema/alembic/versions/43_rename_schema.py b/collectoss/application/schema/alembic/versions/43_rename_schema.py new file mode 100644 index 000000000..4e6692551 --- /dev/null +++ b/collectoss/application/schema/alembic/versions/43_rename_schema.py @@ -0,0 +1,69 @@ +"""rename schema + +Revision ID: 43 +Revises: 42 +Create Date: 2026-05-27 15:28:12.439500 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '43' +down_revision = '42' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + conn = op.get_bind() + conn.execute(text("ALTER SCHEMA augur_data RENAME TO data;")) + conn.execute(text("ALTER SCHEMA augur_operations RENAME TO operations;")) + + op.create_table_comment( + 'repos_fetch_log', + 'For future use when we move all working tables to the operations schema. 
', + existing_comment='For future use when we move all working tables to the augur_operations schema. ', + schema='operations' + ) + op.create_table_comment( + 'worker_settings_facade', + 'For future use when we move all working tables to the operations schema. ', + existing_comment='For future use when we move all working tables to the augur_operations schema. ', + schema='operations' + ) + op.create_table_comment( + 'working_commits', + 'For future use when we move all working tables to the operations schema. ', + existing_comment='For future use when we move all working tables to the augur_operations schema. ', + schema='operations' + ) + + + +def downgrade() -> None: + + op.create_table_comment( + 'working_commits', + 'For future use when we move all working tables to the augur_operations schema. ', + existing_comment='For future use when we move all working tables to the operations schema. ', + schema='operations' + ) + op.create_table_comment( + 'worker_settings_facade', + 'For future use when we move all working tables to the augur_operations schema. ', + existing_comment='For future use when we move all working tables to the operations schema. ', + schema='operations' + ) + op.create_table_comment( + 'repos_fetch_log', + 'For future use when we move all working tables to the augur_operations schema. ', + existing_comment='For future use when we move all working tables to the operations schema. 
', + schema='operations' + ) + + conn = op.get_bind() + conn.execute(text("ALTER SCHEMA data RENAME TO augur_data;")) + conn.execute(text("ALTER SCHEMA operations RENAME TO augur_operations;")) \ No newline at end of file diff --git a/collectoss/application/service_manager.py b/collectoss/application/service_manager.py index 3cebb4d34..2497f37ef 100644 --- a/collectoss/application/service_manager.py +++ b/collectoss/application/service_manager.py @@ -110,31 +110,31 @@ def clear_redis_caches(): #Make sure that database reflects collection status when processes are killed/stopped. def clean_collection_status(session): session.execute_sql(s.sql.text(""" - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET core_status='Pending',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET core_status='Success',core_task_id = NULL WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Pending',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET secondary_status='Success',secondary_task_id = NULL WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Update', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; - UPDATE augur_operations.collection_status + UPDATE operations.collection_status SET facade_status='Success', facade_task_id=NULL WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; - UPDATE augur_operations.collection_status + UPDATE 
operations.collection_status SET facade_status='Pending', facade_task_id=NULL WHERE facade_status='Failed Clone' OR facade_status='Initializing'; """)) diff --git a/collectoss/tasks/data_analysis/clustering_worker/tasks.py b/collectoss/tasks/data_analysis/clustering_worker/tasks.py index c9e269e5f..da24e6028 100644 --- a/collectoss/tasks/data_analysis/clustering_worker/tasks.py +++ b/collectoss/tasks/data_analysis/clustering_worker/tasks.py @@ -78,10 +78,10 @@ def clustering_model(repo_git: str,logger,engine) -> None: i.issue_title thread_title, M.msg_id FROM - augur_data.repo r, - augur_data.issues i, - augur_data.message M, - augur_data.issue_message_ref imr + data.repo r, + data.issues i, + data.message M, + data.issue_message_ref imr WHERE r.repo_id = i.repo_id AND imr.issue_id = i.issue_id @@ -98,10 +98,10 @@ def clustering_model(repo_git: str,logger,engine) -> None: pr.pr_src_title thread_title, M.msg_id FROM - augur_data.repo r, - augur_data.pull_requests pr, - augur_data.message M, - augur_data.pull_request_message_ref prmr + data.repo r, + data.pull_requests pr, + data.message M, + data.pull_request_message_ref prmr WHERE r.repo_id = pr.repo_id AND prmr.pull_request_id = pr.pull_request_id @@ -289,15 +289,15 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): get_messages_sql = s.sql.text( """ SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.issues i, - augur_data.message m, augur_data.issue_message_ref imr + FROM data.repo r, data.issues i, + data.message m, data.issue_message_ref imr WHERE r.repo_id=i.repo_id AND imr.issue_id=i.issue_id AND imr.msg_id=m.msg_id UNION SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.pull_requests pr, - augur_data.message m, 
augur_data.pull_request_message_ref prmr + FROM data.repo r, data.pull_requests pr, + data.message m, data.pull_request_message_ref prmr WHERE r.repo_id=pr.repo_id AND prmr.pull_request_id=pr.pull_request_id AND prmr.msg_id=m.msg_id @@ -365,7 +365,7 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): # key_sequence_words_sql = s.sql.text( # """ - # SELECT nextval('augur_data.topic_words_topic_words_id_seq'::text) + # SELECT nextval('data.topic_words_topic_words_id_seq'::text) # """ # ) diff --git a/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 108326b50..4c8aec067 100644 --- a/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/collectoss/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -113,11 +113,11 @@ def contributor_breadth_model(self) -> None: if len(cntrb_events) == 0: logger.info("There are no cntrb events, or new events for this user.\n") continue - except UrlNotFoundException as e: - logger.warning(e) + logger.warning( + f"UrlNotFoundException while processing contributor {cntrb['gh_login']}: {e}" + ) continue - events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) logger.info(f"Inserting {len(events)} events") diff --git a/collectoss/tasks/data_analysis/discourse_analysis/tasks.py b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py index a95756b8c..fccf169e8 100644 --- a/collectoss/tasks/data_analysis/discourse_analysis/tasks.py +++ b/collectoss/tasks/data_analysis/discourse_analysis/tasks.py @@ -51,16 +51,16 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: get_messages_for_repo_sql = s.sql.text(""" (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id - FROM 
augur_data.repo r, augur_data.issues i, - augur_data.message m, augur_data.issue_message_ref imr + FROM data.repo r, data.issues i, + data.message m, data.issue_message_ref imr WHERE r.repo_id=i.repo_id AND imr.issue_id=i.issue_id AND imr.msg_id=m.msg_id AND r.repo_id = :repo_id) UNION (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id - FROM augur_data.repo r, augur_data.pull_requests pr, - augur_data.message m, augur_data.pull_request_message_ref prmr + FROM data.repo r, data.pull_requests pr, + data.message m, data.pull_request_message_ref prmr WHERE r.repo_id=pr.repo_id AND prmr.pull_request_id=pr.pull_request_id AND prmr.msg_id=m.msg_id diff --git a/collectoss/tasks/data_analysis/message_insights/tasks.py b/collectoss/tasks/data_analysis/message_insights/tasks.py index 7913a5d13..751da1ea7 100644 --- a/collectoss/tasks/data_analysis/message_insights/tasks.py +++ b/collectoss/tasks/data_analysis/message_insights/tasks.py @@ -52,7 +52,7 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Check to see if repo has been analyzed previously repo_exists_SQL = s.sql.text(""" - SELECT exists (SELECT 1 FROM augur_data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") + SELECT exists (SELECT 1 FROM data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") with engine.connect() as conn: df_rep = pd.read_sql_query(repo_exists_SQL, conn, params={'repo_id': repo_id}) @@ -66,17 +66,17 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Fetch the timestamp of last analyzed message for the repo past_SQL = s.sql.text(""" select message_analysis.msg_id, message.msg_timestamp - from augur_data.message_analysis - inner join augur_data.message on message.msg_id = message_analysis.msg_id - inner join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - inner join augur_data.pull_requests on 
pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + from data.message_analysis + inner join data.message on message.msg_id = message_analysis.msg_id + inner join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + inner join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id UNION select message_analysis.msg_id, message.msg_timestamp - from augur_data.message_analysis - inner join augur_data.message on message.msg_id = message_analysis.msg_id - inner join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - inner join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + from data.message_analysis + inner join data.message on message.msg_id = message_analysis.msg_id + inner join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + inner join data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id """) @@ -97,28 +97,28 @@ def message_insight_model(repo_git: str,logger,engine) -> None: # Fetch only recent messages join_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id and msg_timestamp > :begin_date UNION - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left 
outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id and msg_timestamp > :begin_date""") else: logger.info(f'Fetching all past messages of repo {repo_id}...') # Fetch all messages join_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where message.repo_id = :repo_id UNION - select message.msg_id, msg_timestamp, msg_text from augur_data.message - left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + select message.msg_id, msg_timestamp, msg_text from data.message + left outer join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = :repo_id""") with engine.connect() as conn: @@ -147,14 +147,14 @@ def message_insight_model(repo_git: str,logger,engine) -> None: if not full_train: merge_SQL = s.sql.text(""" - select novelty_flag, reconstruction_error from augur_data.message_analysis - left outer join augur_data.pull_request_message_ref on message_analysis.msg_id = pull_request_message_ref.msg_id - left outer 
join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + select novelty_flag, reconstruction_error from data.message_analysis + left outer join data.pull_request_message_ref on message_analysis.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id UNION - select novelty_flag, reconstruction_error from augur_data.message_analysis - left outer join augur_data.issue_message_ref on message_analysis.msg_id = issue_message_ref.msg_id - left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id + select novelty_flag, reconstruction_error from data.message_analysis + left outer join data.issue_message_ref on message_analysis.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") with engine.connect() as conn: diff --git a/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py index aa8d5a0a0..34512fbc9 100644 --- a/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/collectoss/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -59,8 +59,8 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: pull_request_commits.pr_cmt_id, pr_augur_contributor_id, pr_src_author_association - from augur_data.pull_requests - INNER JOIN augur_data.pull_request_commits on pull_requests.pull_request_id = pull_request_commits.pull_request_id + from data.pull_requests + INNER JOIN data.pull_request_commits on pull_requests.pull_request_id = pull_request_commits.pull_request_id where pr_created_at > :begin_date and pull_requests.repo_id = :repo_id and pr_src_state like 'open' @@ -90,13 +90,13 @@ def pull_request_analysis_model(repo_git: 
str,logger,engine) -> None: # Get sentiment score of all messages relating to the PR messages_SQL = s.sql.text(""" - select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from augur_data.message - left outer join augur_data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id - left outer join augur_data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id + select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from data.message + left outer join data.pull_request_message_ref on message.msg_id = pull_request_message_ref.msg_id + left outer join data.pull_requests on pull_request_message_ref.pull_request_id = pull_requests.pull_request_id where pull_request_message_ref.repo_id = :repo_id UNION - select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from augur_data.message - left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id - left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") + select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from data.message + left outer join data.issue_message_ref on message.msg_id = issue_message_ref.msg_id + left outer join data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") with engine.connect() as conn: df_message = pd.read_sql_query(messages_SQL, conn, params={'repo_id': repo_id}) @@ -104,7 +104,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Map PR to its corresponding messages - pr_ref_sql = s.sql.text("select * from augur_data.pull_request_message_ref") + pr_ref_sql = s.sql.text("select * from data.pull_request_message_ref") with engine.connect() as conn: df_pr_ref = pd.read_sql_query(pr_ref_sql, conn) df_merge = pd.merge(df_pr, df_pr_ref, on='pull_request_id', how='left') @@ -142,7 
+142,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: ''' # Get cntrb info from API - cntrb_sql = 'SELECT cntrb_id, gh_login FROM augur_data.contributors' + cntrb_sql = 'SELECT cntrb_id, gh_login FROM data.contributors' df_ctrb = pd.read_sql_query(cntrb_SQL, create_database_engine()) df_fin1 = pd.merge(df_fin,df_ctrb,left_on='pr_augur_contributor_id', right_on='cntrb_id', how='left') df_fin1 = df_fin1.drop(['cntrb_id'],axis=1) @@ -157,7 +157,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: # Get repo info repo_sql = s.sql.text(""" SELECT repo_id, pull_requests_merged, pull_request_count,watchers_count, last_updated FROM - augur_data.repo_info where repo_id = :repo_id + data.repo_info where repo_id = :repo_id """) with engine.connect() as conn: diff --git a/collectoss/tasks/db/refresh_materialized_views.py b/collectoss/tasks/db/refresh_materialized_views.py index 95f169722..751a97e93 100644 --- a/collectoss/tasks/db/refresh_materialized_views.py +++ b/collectoss/tasks/db/refresh_materialized_views.py @@ -19,78 +19,78 @@ def refresh_materialized_views(self): #self.logger = logging.getLogger(refresh_materialized_views.__name__) mv1_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repo_prs with data; + REFRESH MATERIALIZED VIEW concurrently data.api_get_all_repo_prs with data; COMMIT; """) mv2_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repos_commits with data; + REFRESH MATERIALIZED VIEW concurrently data.api_get_all_repos_commits with data; COMMIT; """) mv3_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repos_issues with data; + REFRESH MATERIALIZED VIEW concurrently data.api_get_all_repos_issues with data; COMMIT; """) mv4_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.augur_new_contributors with data; + REFRESH MATERIALIZED VIEW concurrently 
data.augur_new_contributors with data; COMMIT; """) mv5_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_commits_and_committers_daily_count with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_commits_and_committers_daily_count with data; COMMIT; """) mv6_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_new_contributors with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_new_contributors with data; COMMIT; """) mv7_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_entry_list with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_entry_list with data; COMMIT; """) mv8_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_contributor_actions with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_contributor_actions with data; COMMIT; """) mv9_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_user_repos with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_user_repos with data; COMMIT; """) mv10_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response_times with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_pr_response_times with data; COMMIT; """) mv11_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_assignments with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_pr_assignments with data; COMMIT; """) mv12_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_issue_assignments with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_issue_assignments with data; COMMIT; """) mv13_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_pr_response with data; COMMIT; """) 
mv14_refresh = s.sql.text(""" - REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_repo_languages with data; + REFRESH MATERIALIZED VIEW concurrently data.explorer_repo_languages with data; COMMIT; """) diff --git a/collectoss/tasks/frontend.py b/collectoss/tasks/frontend.py index 4ed2e24aa..d78fc1e1d 100644 --- a/collectoss/tasks/frontend.py +++ b/collectoss/tasks/frontend.py @@ -10,7 +10,7 @@ from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from collectoss.application.db.lib import get_group_by_name, get_repo_by_repo_git, get_github_repo_by_src_id, get_gitlab_repo_by_src_id from collectoss.tasks.github.util.util import get_owner_repo -from collectoss.application.db.models.augur_operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus +from collectoss.application.db.models.operations import retrieve_owner_repos, FRONTEND_REPO_GROUP_NAME, RepoGroup, CollectionStatus from collectoss.tasks.github.util.github_paginator import hit_api from collectoss.application.db.models import UserRepo, Repo diff --git a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py index 46304490f..752582d64 100644 --- a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py +++ b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py @@ -117,6 +117,5 @@ def get_libyear(current_version, current_release_date, latest_version, latest_re latest_release_date = dateutil.parser.parse(latest_release_date) libdays = (latest_release_date - current_release_date).days - print(libdays) libyear = libdays/365 return libyear \ No newline at end of file diff --git a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 372e64c82..0a74492f2 100644 --- 
a/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/collectoss/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -95,7 +95,6 @@ def get_libyear(current_version, current_release_date, latest_version, latest_re latest_release_date = dateutil.parser.parse(latest_release_date) libdays = (latest_release_date - current_release_date).days - print(libdays) libyear = libdays/365 return libyear diff --git a/collectoss/tasks/git/dependency_tasks/core.py b/collectoss/tasks/git/dependency_tasks/core.py index a9e74b4e1..0648231b0 100644 --- a/collectoss/tasks/git/dependency_tasks/core.py +++ b/collectoss/tasks/git/dependency_tasks/core.py @@ -1,7 +1,8 @@ from datetime import datetime import os from collectoss.application.db.models import * -from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session +from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value +from collectoss.application.environment import SystemEnv from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from collectoss.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from collectoss.tasks.util.worker_util import parse_json_from_subprocess_call @@ -79,19 +80,11 @@ def generate_scorecard(logger, repo_git): command = '--repo=' + path #this is path where our scorecard project is located - path_to_scorecard = os.getenv('SCORECARD_DIR', os.environ['HOME'] + '/scorecard') + path_to_scorecard = SystemEnv.get('SCORECARD_DIR', os.path.expanduser('~/scorecard')) #setting the environmental variable which is required by scorecard - - with get_session() as session: - #key_handler = GithubRandomKeyAuth(logger) - key_handler = GithubApiKeyHandler(logger) - os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - - # This seems outdated - #setting the environmental variable which is required by scorecard - #key_handler = GithubApiKeyHandler(session, 
session.logger) - #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + key_handler = GithubApiKeyHandler(logger) + SystemEnv.set('GITHUB_AUTH_TOKEN', key_handler.get_random_key()) try: required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) diff --git a/collectoss/tasks/git/scc_value_tasks/core.py b/collectoss/tasks/git/scc_value_tasks/core.py index 7c9e0bafd..770165522 100644 --- a/collectoss/tasks/git/scc_value_tasks/core.py +++ b/collectoss/tasks/git/scc_value_tasks/core.py @@ -2,6 +2,7 @@ import os from collectoss.application.db.models import * from collectoss.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value +from collectoss.application.environment import SystemEnv from collectoss.tasks.util.worker_util import parse_json_from_subprocess_call from collectoss.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path @@ -20,7 +21,7 @@ def value_model(logger,repo_git): logger.info(f"Repo ID: {repo_id}, Path: {path}") logger.info('Running scc...') - path_to_scc = os.getenv('SCC_DIR', os.environ['HOME'] + '/scc') + path_to_scc = SystemEnv.get('SCC_DIR', os.path.expanduser('~/scc')) required_output = parse_json_from_subprocess_call(logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py index 7da6495bd..2b536a3a4 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/config.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/config.py @@ -40,11 +40,13 @@ from collectoss.application.db.lib import execute_sql from logging import Logger +from collectoss.application.environment import SystemEnv + logger = logging.getLogger(__name__) def get_database_args_from_env(): - db_str = os.getenv("AUGUR_DB") + db_str = SystemEnv.get("COLLECTOSS_DB") try: 
db_json_file_location = os.getcwd() + "/db.config.json" except FileNotFoundError: @@ -55,7 +57,7 @@ def get_database_args_from_env(): if not db_str and not db_json_exists: - logger.error("ERROR no way to get connection to the database. \n\t\t\t\t\t\t There is no db.config.json and the AUGUR_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the AUGUR_DB environment then run make install") + logger.error("ERROR no way to get connection to the database. \n\t\t\t\t\t\t There is no db.config.json and the COLLECTOSS_DB environment variable is not set\n\t\t\t\t\t\t Please run make install or set the COLLECTOSS_DB environment then run make install") sys.exit() credentials = {} diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py index 658ddc1d0..dfb331c1d 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -32,10 +32,11 @@ import sqlalchemy as s from .utilitymethods import update_repo_log, get_absolute_repo_path from sqlalchemy.orm.exc import NoResultFound -from collectoss.application.db.models.augur_data import * -from collectoss.application.db.models.augur_operations import CollectionStatus +from collectoss.application.db.models.data import * +from collectoss.application.db.models.operations import CollectionStatus from collectoss.application.db.util import execute_session_query, convert_orm_list_to_dict_list from collectoss.application.db.lib import execute_sql, get_repo_by_repo_git +from typing_extensions import deprecated class GitCloneError(Exception): pass @@ -174,8 +175,7 @@ def git_repo_initialize(facade_helper, session, repo_git): facade_helper.log_activity('Info', f"Fetching new repos (complete)") -# Deprecated functionality. 
No longer used -# Should be re-purposed in start_tasks when tasks are being scheduled +@deprecated("Deprecated functionality. No longer used. Should be re-purposed in start_tasks when tasks are being scheduled") def check_for_repo_updates(session, repo_git): # Check the last time a repo was updated and if it has been longer than the @@ -244,7 +244,7 @@ def check_for_repo_updates(session, repo_git): # Deprecated. No longer used. - +@deprecated("This functionality is deprecated and won't work with present facade versions") def force_repo_updates(session, repo_git): raise NotImplementedError( "This functionality is deprecated and won't work with present facade versions") @@ -263,7 +263,7 @@ def force_repo_updates(session, repo_git): # Deprecated. No longer used. - +@deprecated("This functionality is deprecated and won't work with present facade versions") def force_repo_analysis(session, repo_git): raise NotImplementedError( "This functionality is deprecated and won't work with present facade versions") diff --git a/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 513390d07..afba70fa2 100644 --- a/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/collectoss/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -35,6 +35,7 @@ from collectoss.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session from collectoss.application.db.util import execute_session_query #from collectoss.tasks.git.util.facade_worker.facade +from typing_extensions import deprecated def update_repo_log(logger, facade_helper, repos_id,status): @@ -176,6 +177,7 @@ def get_repo_commit_count(logger, facade_helper, repo_git): return commit_count +@deprecated("This method of scheduling is legacy and should be removed") def 
get_facade_weight_time_factor(repo_git): with get_session() as session: @@ -194,15 +196,16 @@ def get_facade_weight_time_factor(repo_git): return time_factor +@deprecated("This method of scheduling is legacy and should be removed") def get_facade_weight_with_commit_count(repo_git, commit_count): return commit_count - get_facade_weight_time_factor(repo_git) - +@deprecated("This method of scheduling is legacy and should be removed") def get_repo_weight_by_commit(logger, repo_git): facade_helper = FacadeHelper(logger) return get_repo_commit_count(logger, facade_helper, repo_git) - get_facade_weight_time_factor(repo_git) - +@deprecated("This method of scheduling is legacy and should be removed") def update_facade_scheduling_fields(repo_git, weight, commit_count): repo = get_repo_by_repo_git(repo_git) diff --git a/collectoss/tasks/github/__init__.py b/collectoss/tasks/github/__init__.py index de3f37bd8..e69de29bb 100644 --- a/collectoss/tasks/github/__init__.py +++ b/collectoss/tasks/github/__init__.py @@ -1,7 +0,0 @@ -from collectoss.tasks.github.contributors import * -from collectoss.tasks.github.events import * -from collectoss.tasks.github.issues import * -from collectoss.tasks.github.messages import * -from collectoss.tasks.github.pull_requests.tasks import * -from collectoss.tasks.github.repo_info.tasks import * -from collectoss.tasks.github.releases.tasks import * diff --git a/collectoss/tasks/github/detect_move/core.py b/collectoss/tasks/github/detect_move/core.py index 1c0d7dba8..5adcc83fa 100644 --- a/collectoss/tasks/github/detect_move/core.py +++ b/collectoss/tasks/github/detect_move/core.py @@ -110,7 +110,7 @@ def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='c repo_update_dict = { 'repo_git': f"https://github.com/{owner}/{name}", 'repo_path': None, - 'repo_name': None, + 'repo_name': name, 'description': f"(Originally hosted at {url}) {old_description}" } diff --git 
a/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 1e064f033..b1b163a2b 100644 --- a/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/collectoss/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -8,7 +8,7 @@ # Debugger from collectoss.tasks.github.util.github_paginator import GithubApiResult from collectoss.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id - +from typing_extensions import deprecated ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -27,7 +27,7 @@ def clean_dict(d): return {k: ("" if v is None else v) for k, v in d.items()} -# deprecated in favor of GithubDataAcess.get_resource() +@deprecated("Please use GithubDataAcess.get_resource() instead") def request_dict_from_endpoint(logger, session, url, timeout_wait=10): """Hit the endpoint specified by the url and return the json that it returns if it returns a dict. @@ -107,7 +107,7 @@ def request_dict_from_endpoint(logger, session, url, timeout_wait=10): return response_data - +@deprecated("Please use GithubDataAcess.endpoint_url() instead") def create_endpoint_from_email(email): # Note: I added "+type:user" to avoid having user owned organizations be returned # Also stopped splitting per note above. 
@@ -117,7 +117,7 @@ def create_endpoint_from_email(email): return url - +@deprecated("Please use GithubDataAcess.endpoint_url() instead") def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): logger.debug( f"Trying to create endpoint from commit hash: {commit_sha}") diff --git a/collectoss/tasks/github/facade_github/tasks.py b/collectoss/tasks/github/facade_github/tasks.py index ab7a18eab..cc380d497 100644 --- a/collectoss/tasks/github/facade_github/tasks.py +++ b/collectoss/tasks/github/facade_github/tasks.py @@ -207,12 +207,12 @@ def insert_facade_contributors(self, repo_git): commits.cmt_commit_hash AS hash, commits.cmt_author_raw_email AS email_raw FROM - augur_data.commits + data.commits WHERE commits.repo_id = :repo_id AND commits.cmt_ght_author_id IS NULL AND commits.cmt_author_raw_email NOT IN ( - SELECT email FROM augur_data.unresolved_commit_emails + SELECT email FROM data.unresolved_commit_emails ) """).bindparams(repo_id=repo_id) @@ -253,19 +253,19 @@ def insert_facade_contributors(self, repo_git): resolve_email_to_cntrb_id_sql = s.sql.text(""" WITH email_to_contributor AS ( SELECT cntrb_email AS email, cntrb_id - FROM augur_data.contributors + FROM data.contributors WHERE cntrb_email IS NOT NULL UNION ALL SELECT cntrb_canonical AS email, cntrb_id - FROM augur_data.contributors + FROM data.contributors WHERE cntrb_canonical IS NOT NULL UNION ALL SELECT alias_email AS email, cntrb_id - FROM augur_data.contributors_aliases + FROM data.contributors_aliases WHERE alias_email IS NOT NULL ), deduplicated AS ( @@ -277,7 +277,7 @@ def insert_facade_contributors(self, repo_git): d.cntrb_id, c.cmt_author_email AS email FROM - augur_data.commits c + data.commits c INNER JOIN deduplicated d ON c.cmt_author_email = d.email diff --git a/collectoss/tasks/github/pull_requests/tasks.py b/collectoss/tasks/github/pull_requests/tasks.py index f8966ee6e..3efaddf3b 100644 --- a/collectoss/tasks/github/pull_requests/tasks.py +++ 
b/collectoss/tasks/github/pull_requests/tasks.py @@ -75,10 +75,15 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[ logger.debug(f"Collecting pull requests for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated" - github_data_access = GithubDataAccess(key_auth, logger) + search_args = {"state": "all", "direction": "desc", "sort": "updated"} + url = github_data_access.endpoint_url(f"repos/{owner}/{repo}/pulls", search_args) + + if not github_data_access.check_prs_enabled(owner, repo): + logger.info(f"{owner}/{repo}: Pull requests appear to be disabled for this repo. Skipping.") + return + num_pages = github_data_access.get_resource_page_count(url) logger.debug(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests") diff --git a/collectoss/tasks/github/repo_info/core.py b/collectoss/tasks/github/repo_info/core.py index 55b1def2a..582a5ed45 100644 --- a/collectoss/tasks/github/repo_info/core.py +++ b/collectoss/tasks/github/repo_info/core.py @@ -9,7 +9,7 @@ from collectoss.application.db.models import * from collectoss.application.db.lib import execute_sql from collectoss.tasks.github.util.github_task_session import * -from collectoss.application.db.models.augur_data import RepoBadging +from collectoss.application.db.models.data import RepoBadging from urllib.parse import quote def query_committers_count(key_auth, logger, owner, repo): @@ -282,6 +282,10 @@ def badges_model(logger,repo_git,repo_id,db): #Hit cii api with no api key. 
response = hit_api(None, url, logger) + if not response: + logger.error(f"An error occurred fetching data from {url} in badges_model") + return + try: response_data = response.json() except: diff --git a/collectoss/tasks/github/util/github_data_access.py b/collectoss/tasks/github/util/github_data_access.py index 18256fe68..0fc017124 100644 --- a/collectoss/tasks/github/util/github_data_access.py +++ b/collectoss/tasks/github/util/github_data_access.py @@ -44,6 +44,27 @@ def __init__(self, key_manager, logger: logging.Logger, feature="rest"): self.key = None self.expired_keys_for_request = [] + def endpoint_url(self, path: str, params: dict = None) -> str: + """Build a URL for a github endpoint using the specified path and query parameters + + Args: + path (str): the path to use (i.e. "/users/MoralCode") + params (dict): optional query parameters to add to the url, as a dict + + Returns: + str: the full URL to the specified resource. + """ + # using pythons url processing library helps handle accidental + # inclusion of query parameters in the path string, ensuring all query + # parameters are properly encoded and escaped + + if not path.startswith("/"): + path = "/" + path + + url = "https://api.github.com" + path + + return self.__add_query_params(url, params or {}) + def get_resource_count(self, url): # set per_page to 100 explicitly so we know each page is 100 long @@ -60,6 +81,20 @@ def get_resource_count(self, url): return (100 * (num_pages -1)) + len(data) + def check_prs_enabled(self, owner: str, repo: str,) -> bool: + """ + Checks whether pull requests are enabled for a repository. + Returns False if PRs are disabled (404 on /pulls) and true if there are PRs. + """ + try: + url = self.endpoint_url(f"repos/{owner}/{repo}/pulls", {"per_page": "1"}) + self.get_resource_page_count(url) + return True + except UrlNotFoundException: + self.logger.info(f"{owner}/{repo}: Pull requests are disabled. 
Skipping PR collection.") + return False + + def paginate_resource(self, url): response = self.make_request_with_retries(url) diff --git a/collectoss/tasks/github/util/github_paginator.py b/collectoss/tasks/github/util/github_paginator.py index 990bc4f73..4a5469552 100644 --- a/collectoss/tasks/github/util/github_paginator.py +++ b/collectoss/tasks/github/util/github_paginator.py @@ -7,8 +7,9 @@ from typing import Optional from enum import Enum +from typing_extensions import deprecated - +@deprecated("Deprecated. Use GithubDataAccess class instead") def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', follow_redirects=True) -> Optional[httpx.Response]: """Ping the api and get the data back for the page. diff --git a/collectoss/tasks/github/util/github_random_key_auth.py b/collectoss/tasks/github/util/github_random_key_auth.py index 6797ba785..1dbbd2d65 100644 --- a/collectoss/tasks/github/util/github_random_key_auth.py +++ b/collectoss/tasks/github/util/github_random_key_auth.py @@ -3,7 +3,9 @@ from collectoss.tasks.util.random_key_auth import RandomKeyAuth from collectoss.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from sqlalchemy.orm import Session +from typing_extensions import deprecated +@deprecated("This class is deprecated. 
Use the KeyClient interface to the Keymanager process instead.") class GithubRandomKeyAuth(RandomKeyAuth): """Defines a github specific RandomKeyAuth class so github collections can have a class randomly selects an api key for each request diff --git a/collectoss/tasks/github/util/util.py b/collectoss/tasks/github/util/util.py index a0f009855..c25c738d9 100644 --- a/collectoss/tasks/github/util/util.py +++ b/collectoss/tasks/github/util/util.py @@ -8,6 +8,7 @@ from collectoss.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from collectoss.application.db.lib import get_repo_by_repo_git from collectoss.tasks.util.worker_util import calculate_date_weight_from_timestamps +from typing_extensions import deprecated def get_repo_src_id(owner, repo, logger): @@ -87,6 +88,7 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic logger.warning(f"invalid return. Response was: {response.text}. Exception: {e}") return json.loads(json.dumps(response.text)) +@deprecated("This method of scheduling is legacy and should be removed") def get_repo_weight_by_issue(logger,repo_git): """ Retrieve the sum of the number of issues and prs in a repository from a graphql query. 
@@ -111,6 +113,7 @@ def get_repo_weight_by_issue(logger,repo_git): return number_of_issues_and_prs #Get the weight for each repo for the core collection hook +@deprecated("This method of scheduling is legacy and should be removed") def get_repo_weight_core(logger,repo_git): repo = get_repo_by_repo_git(repo_git) diff --git a/collectoss/tasks/init/celery_app.py b/collectoss/tasks/init/celery_app.py index e14230f99..4b10af18a 100644 --- a/collectoss/tasks/init/celery_app.py +++ b/collectoss/tasks/init/celery_app.py @@ -17,6 +17,7 @@ from collectoss.application.db import get_engine from collectoss.application.db.lib import get_session from collectoss.application.config import SystemConfig +from collectoss.application.environment import SystemEnv from collectoss.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from collectoss.application.db.models import Repo from collectoss.tasks.util.collection_state import CollectionState @@ -63,7 +64,7 @@ tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks -if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": +if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": tasks += data_analysis_tasks redis_db_number, redis_conn_string = get_redis_conn_values() diff --git a/collectoss/tasks/start_tasks.py b/collectoss/tasks/start_tasks.py index 644b6cbc4..51bf25cd7 100644 --- a/collectoss/tasks/start_tasks.py +++ b/collectoss/tasks/start_tasks.py @@ -7,8 +7,15 @@ import sqlalchemy as s -from collectoss.tasks.github import * -if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": +from collectoss.tasks.github.contributors import * +from collectoss.tasks.github.events import * +from collectoss.tasks.github.issues import * +from collectoss.tasks.github.messages import * +from collectoss.tasks.github.pull_requests.tasks import * +from collectoss.tasks.github.repo_info.tasks import * +from collectoss.tasks.github.releases.tasks import * +from collectoss.application.environment import 
SystemEnv +if SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') != "1": from collectoss.tasks.data_analysis import * from collectoss.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary from collectoss.tasks.github.releases.tasks import collect_releases @@ -32,7 +39,7 @@ from collectoss.application.db.lib import execute_sql, get_session from collectoss.application.config import SystemConfig -RUNNING_DOCKER = os.environ.get('AUGUR_DOCKER_DEPLOY') == "1" +RUNNING_DOCKER = SystemEnv.get('COLLECTOSS_DOCKER_DEPLOY') == "1" CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -377,7 +384,7 @@ def create_collection_status_records(self): logger = logging.getLogger(create_collection_status_records.__name__) query = s.sql.text(""" - SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM operations.collection_status) """) repo = execute_sql(query).first() diff --git a/collectoss/tasks/util/collection_util.py b/collectoss/tasks/util/collection_util.py index 18009d207..c0b8d1984 100644 --- a/collectoss/tasks/util/collection_util.py +++ b/collectoss/tasks/util/collection_util.py @@ -17,7 +17,7 @@ from collectoss.tasks.util.collection_state import CollectionState from collectoss.application.db.session import DatabaseSession from collectoss.application.config import SystemConfig - +from typing_extensions import deprecated class CollectionRequest: def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab_phases=None): @@ -74,7 +74,7 @@ def get_newly_added_repos(session, limit, hook): repo_query = s.sql.text(f""" select repo_git - from augur_operations.collection_status x, augur_data.repo y + from operations.collection_status x, data.repo y where x.repo_id=y.repo_id and {condition_string} order by {order_by_field} @@ -96,7 +96,7 @@ def get_repos_for_recollection(session, limit, hook, 
days_until_collect_again): repo_query = s.sql.text(f""" select repo_git - from augur_operations.collection_status x, repo y + from operations.collection_status x, repo y where x.repo_id = y.repo_id and {condition_string} and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' @@ -252,6 +252,7 @@ def core_task_success_util(self, repo_git): issue_pr_task_update_weight_util([int(raw_count)],repo_git=repo_git,session=session) #Update the existing core and secondary weights as well as the raw sum of issues and prs +@deprecated("This method of scheduling is legacy and should be removed") def update_issue_pr_weights(logger,session,repo_git,raw_sum): repo = Repo.get_by_repo_git(session, repo_git) status = repo.collection_status[0] diff --git a/collectoss/tasks/util/worker_util.py b/collectoss/tasks/util/worker_util.py index 2c5943560..7f315d5b0 100644 --- a/collectoss/tasks/util/worker_util.py +++ b/collectoss/tasks/util/worker_util.py @@ -11,6 +11,7 @@ import json import subprocess +from typing_extensions import deprecated from collectoss.tasks.util.metadata_exception import MetadataException @@ -109,6 +110,7 @@ def remove_duplicate_naturals(data, natural_keys): def date_weight_factor(days_since_last_collection,domain_shift=0): return (days_since_last_collection - domain_shift) ** 4 +@deprecated("This method of scheduling is legacy and should be removed") def calculate_date_weight_from_timestamps(added,last_collection,domain_start_days=30): #Get the time since last collection as well as when the repo was added. 
if last_collection is None: diff --git a/collectoss/util/repo_load_controller.py b/collectoss/util/repo_load_controller.py index 5455411e4..6a8b41587 100644 --- a/collectoss/util/repo_load_controller.py +++ b/collectoss/util/repo_load_controller.py @@ -6,7 +6,7 @@ from collectoss.application.db.engine import DatabaseEngine from collectoss.application.db.models import Repo, UserRepo, RepoGroup, UserGroup, User, CollectionStatus -from collectoss.application.db.models.augur_operations import retrieve_owner_repos +from collectoss.application.db.models.operations import retrieve_owner_repos from collectoss.application.db.util import execute_session_query from sqlalchemy import Column, Table, MetaData, or_ @@ -19,11 +19,11 @@ with DatabaseEngine() as engine: - augur_data_schema = MetaData(schema = "augur_data") - augur_data_schema.reflect(bind = engine, views = True) + data_schema = MetaData(schema = "data") + data_schema.reflect(bind = engine, views = True) - commits_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_commits"] - issues_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_issues"] + commits_materialized_view: Table = data_schema.tables["data.api_get_all_repos_commits"] + issues_materialized_view: Table = data_schema.tables["data.api_get_all_repos_issues"] class RepoLoadController: diff --git a/collectoss/util/startup.py b/collectoss/util/startup.py new file mode 100644 index 000000000..8fe5b2374 --- /dev/null +++ b/collectoss/util/startup.py @@ -0,0 +1,244 @@ +## Startup helpers + + +from pathlib import Path +import os +import getpass +import subprocess +from subprocess import check_call +import platform +import sys + +from sqlalchemy.orm.attributes import get_history +from collectoss.application.config import SystemConfig +from collectoss.application.db.session import DatabaseSession +from collectoss.application.environment import SystemEnv +from typing_extensions import deprecated + +from 
collectoss.util.inspect_without_import import get_phase_names_without_import + +ROOT_PROJECT_REPO_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) + +def check_init_schema(): + """Initialize the CollectOSS database schema as appropriate + """ + + pass + # does public.alembic_version exist? + # if yes, do nothing + # if no, do a sanity check to make sure the other schemas dont exist, + # then init the current db with sqlalchemy and stamp the current version with alembic + +def check_update_schema(): + """ensure the CollectOSS schema is on the latest version + """ + pass + # alembic upgrade head, unless theres an env var preventing automatic migration + check_call(["alembic", "upgrade", "head"]) + +def collect_env_variables(logger): + """convenience helper for assembling more complex environment variables out of smaller ones + and other environment variable convenience operations + """ + + if SystemEnv.get("COLLECTOSS_DB") is None: + names = ["COLLECTOSS_DB_HOST", "COLLECTOSS_DB_USER", "COLLECTOSS_DB_PASSWORD", "COLLECTOSS_DB_NAME"] + values = [SystemEnv.get(n) for n in names] + + if all(map(lambda p: p is not None, values)): + host, user, passwd, name = values + logger.debug(f"Assembling COLLECTOSS_DB string from provided variables") + SystemEnv.set("COLLECTOSS_DB", f"postgresql+psycopg2://{user}:{passwd}@{host}/{name}") + else: + logger.warning("CollectOSS was unable to create your database connection string automatically") + logger.warning("The following environment variables are missing:") + for n, v in zip(names, values): + if v is None: + logger.warning(n) + + + + db_string = SystemEnv.get("COLLECTOSS_DB") + if db_string and "localhost" in db_string: + logger.debug(f"Swapping localhost in COLLECTOSS_DB string with docker host gateway name") + SystemEnv.set("COLLECTOSS_DB", db_string.replace("localhost", "host.docker.internal")) + elif db_string and "127.0.0.1" in db_string: + logger.debug(f"Swapping 
127.0.0.1 in COLLECTOSS_DB string with docker host gateway name") + SystemEnv.set("COLLECTOSS_DB", db_string.replace("127.0.0.1", "host.docker.internal")) + + redis_string = SystemEnv.get("REDIS_CONN_STRING") + if redis_string and "localhost" in redis_string: + logger.debug(f"Swapping localhost in REDIS_CONN_STRING with docker host gateway name") + SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("localhost", "host.docker.internal")) + elif redis_string and "127.0.0.1" in redis_string: + logger.debug(f"Swapping 127.0.0.1 in REDIS_CONN_STRING with docker host gateway name") + SystemEnv.set("REDIS_CONN_STRING", redis_string.replace("127.0.0.1", "host.docker.internal")) + + + # if user didnt specify gitlab credentials, just inject fake ones so we can start up. + if SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") is None: + logger.debug(f"Detected no specified gitlab key, using made up values as a workaround") + SystemEnv.set("COLLECTOSS_GITLAB_API_KEY", "fake") + if SystemEnv.get("COLLECTOSS_GITLAB_USERNAME") is None: + logger.debug(f"Detected no specified gitlab username, using made up value as a workaround") + SystemEnv.set("COLLECTOSS_GITLAB_USERNAME", "fake") + + # provide a default value for the facade repo directory (assumes docker paths) + facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + if facade_repo_directory is None: + logger.debug(f"Setting default value for COLLECTOSS_FACADE_REPO_DIRECTORY") + SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", "/collectoss/facade/") + else: + # Check if the path is resolveable/make it absolute + logger.debug(f"Resolving full path to COLLECTOSS_FACADE_REPO_DIRECTORY") + SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", str(Path(facade_repo_directory).resolve(strict=True))) + + # ensure trailing slash is present + facade_repo_directory = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + if facade_repo_directory and not facade_repo_directory.endswith("/"): + facade_repo_directory += "/" + 
SystemEnv.set("COLLECTOSS_FACADE_REPO_DIRECTORY", facade_repo_directory) + +@deprecated("The bulk of this function is handling .git-credentials, which will be replaced with pygit2 (see issue #258)", category=None) +def setup_facade_directory(logger): + """Perform permission checks and create the facade directory if it doesnt exist + """ + + facade_directory_path = SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") or "/collectoss/facade/" + + facade_directory = Path(facade_directory_path) + + if not facade_directory.exists(): + logger.debug(f"Specified facade directory {facade_directory_path} does not exist. Creating...") + facade_directory.mkdir() + + git_credentials = facade_directory.joinpath(".git-credentials") + git_credentials.touch(exist_ok=True) + + if not os.access(git_credentials, os.R_OK): + logger.error(f"User {getpass.getuser()} does not have permission to write to {git_credentials}. Please select another location") + else: + logger.debug(f"Permission check passed for {git_credentials}") + + + credentials = [] + + gh_names = ["COLLECTOSS_GITHUB_USERNAME","COLLECTOSS_GITHUB_API_KEY"] + gh_values = [SystemEnv.get(n) for n in gh_names] + + if all(map(lambda p: p is not None, gh_values)): + user, key = gh_values + credentials.append(f"https://{user}:{key}@github.com") + + + gl_names = ["COLLECTOSS_GITLAB_USERNAME","COLLECTOSS_GITLAB_API_KEY"] + gl_values = [SystemEnv.get(n) for n in gl_names] + + if all(map(lambda p: p is not None, gl_values)): + user, key = gl_values + credentials.append(f"https://{user}:{key}@gitlab.com") + + with git_credentials.open("w", encoding="utf-8") as c: + c.writelines(credentials) + + subprocess.call(["git", "config", "--global", "credential.helper", f"store --file {str(git_credentials)}"]) + + +def merge_config( + engine, + logger, + github_api_key:str | None = None, + facade_repo_directory:str | None = None, + gitlab_api_key:str | None = None, + redis_conn_string:str | None = None, + rabbitmq_conn_string:str | None = None, + 
logs_directory:str | None = None + ): + """Merge config items provided via environment variables into a place where SystemConfig can pick them up. + + Args: + engine: the database engine to connect to + logger: object to use for outputting logging messages + github_api_key (str): config value + facade_repo_directory (str): config value + gitlab_api_key (str): config value + redis_conn_string (str): config value + rabbitmq_conn_string (str): config value + logs_directory (str): config value + """ + + github_api_key = github_api_key or SystemEnv.get("COLLECTOSS_GITHUB_API_KEY") + facade_repo_directory = facade_repo_directory or SystemEnv.get("COLLECTOSS_FACADE_REPO_DIRECTORY") + gitlab_api_key = gitlab_api_key or SystemEnv.get("COLLECTOSS_GITLAB_API_KEY") + redis_conn_string = redis_conn_string or SystemEnv.get("REDIS_CONN_STRING") + rabbitmq_conn_string = rabbitmq_conn_string or SystemEnv.get("RABBITMQ_CONN_STRING") + logs_directory = logs_directory or SystemEnv.get("COLLECTOSS_LOGS_DIRECTORY") + + keys = {} + + keys["github_api_key"] = github_api_key + keys["gitlab_api_key"] = gitlab_api_key + + with DatabaseSession(logger, engine=engine) as session: + + config = SystemConfig(logger, session) + + augmented_config = config.base_config + + phase_names = get_phase_names_without_import() + + #Add all phases as enabled by default + for name in phase_names: + + if name not in augmented_config['Task_Routine']: + augmented_config['Task_Routine'].update({name : 1}) + + #print(default_config) + if redis_conn_string: + + try: + redis_string_array = redis_conn_string.split("/") + cache_number = int(redis_string_array[-1]) + digits = len(str(cache_number)) + + redis_conn_string = redis_conn_string[:-digits] + + except ValueError: + pass + + augmented_config["Redis"]["connection_string"] = redis_conn_string + + if rabbitmq_conn_string: + augmented_config["RabbitMQ"]["connection_string"] = rabbitmq_conn_string + + augmented_config["Keys"] = keys + + 
augmented_config["Facade"]["repo_directory"] = facade_repo_directory + + augmented_config["Logging"]["logs_directory"] = logs_directory or (ROOT_PROJECT_REPO_DIRECTORY + "/logs/") + + config.load_config_from_dict(augmented_config) + + +@deprecated("automatic import is deprecated. This is a function to warn users and help them transition") +def warn_import_repos(logger): + """We are choosing not to auto import repos and repo groups automatically + This function detects attempts to use the automatic feature and warns users to use the CLI themselves + + Args: + logger: the logger to use + """ + + if Path("/repo_groups.csv").exists(): + logger.warning("Detected /repo_groups.csv file at startup. Automatic import of repo groups is deprecated.") + logger.warning("To import repo groups from a CSV, use the CLI: collectoss db add-repo-groups /repo_groups.csv") + + if Path("/repos.csv").exists(): + logger.warning("Detected /repos.csv file at startup. Automatic import of repos is deprecated.") + logger.warning("To import repos from a CSV, use the CLI: collectoss db add-repos /repos.csv") + + +def print_platform_information(logger): + logger.info(f"PATH: {os.environ.get('PATH')}") + logger.info(f"Python executable (current): {sys.executable}") + logger.info(f"Python version: {platform.python_version()}") \ No newline at end of file diff --git a/conftest.py b/conftest.py index 55eae98b5..a142f72ff 100644 --- a/conftest.py +++ b/conftest.py @@ -14,7 +14,6 @@ from collectoss.application.config import SystemConfig from collectoss.application.db.engine import get_database_string, create_database_engine, parse_database_string, execute_sql_file - logger = logging.getLogger(__name__) default_repo_id = "25430" @@ -104,7 +103,8 @@ def generate_db_from_template(template_name): create_database(conn, cursor, test_db_name, template_name) # create engine to connect to db - engine = create_database_engine(test_db_string, poolclass=StaticPool) + engine = create_database_engine(test_db_string, 
poolclass=StaticPool, connect_args={"application_name": f"collectoss tests"}) + yield engine @@ -195,7 +195,7 @@ def read_only_db(empty_db): database_name = empty_db.url.database test_username = "testuser" test_password = "testpass" - schemas = ["public", "augur_data", "augur_operations"] + schemas = ["public", "data", "operations"] # create read-only user empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) diff --git a/docker-compose.yml b/docker-compose.yml index 55e1127be..1b455a352 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,13 +7,22 @@ services: restart: unless-stopped environment: - "POSTGRES_DB=augur" - - "POSTGRES_USER=${AUGUR_DB_USER:-augur}" - - "POSTGRES_PASSWORD=${AUGUR_DB_PASSWORD:-augur}" + - "POSTGRES_USER=${COLLECTOSS_DB_USER:-augur}" + - "POSTGRES_PASSWORD=${COLLECTOSS_DB_PASSWORD:-augur}" - "PGDATA=/var/lib/postgresql/data/pgdata" ports: - - "${AUGUR_DB_PORT:-5432}:5432" + - "${COLLECTOSS_DB_PORT:-5432}:5432" volumes: - augurpostgres:/var/lib/postgresql/data + healthcheck: + test: + [ + "CMD-SHELL", + "pg_isready -U ${COLLECTOSS_DB_USER:-augur} -d ${COLLECTOSS_DB_NAME:-augur}", + ] + interval: 10s + timeout: 5s + retries: 5 redis: image: "redis:alpine" @@ -36,15 +45,15 @@ services: context: . dockerfile: ./docker/rabbitmq/Dockerfile args: - - RABBIT_MQ_DEFAULT_USER=${AUGUR_RABBITMQ_USERNAME:-augur} - - RABBIT_MQ_DEFAULT_PASSWORD=${AUGUR_RABBITMQ_PASSWORD:-password123} - - RABBIT_MQ_DEFAULT_VHOST=${AUGUR_RABBITMQ_VHOST:-collectoss_vhost} + - RABBIT_MQ_DEFAULT_USER=${COLLECTOSS_RABBITMQ_USERNAME:-augur} + - RABBIT_MQ_DEFAULT_PASSWORD=${COLLECTOSS_RABBITMQ_PASSWORD:-password123} + - RABBIT_MQ_DEFAULT_VHOST=${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} core: image: collectoss:latest build: context: . 
- dockerfile: ./docker/backend/${AUGUR_TARGET:-Dockerfile} + dockerfile: ./docker/backend/${COLLECTOSS_TARGET:-Dockerfile} volumes: - cache:/cache:rw - config:/config:rw @@ -56,26 +65,32 @@ services: #extra_hosts: # - "host.docker.internal:host-gateway" #Be able to ping services on the local machine environment: - - "AUGUR_DB=postgresql+psycopg2://${AUGUR_DB_USER:-augur}:${AUGUR_DB_PASSWORD:-augur}@database:5432/augur" - - "AUGUR_DB_SCHEMA_BUILD=1" - - AUGUR_FACADE_REPO_DIRECTORY=/facade - - "AUGUR_FLAGS=$AUGUR_FLAGS" - - "AUGUR_GITHUB_API_KEY=${AUGUR_GITHUB_API_KEY}" - - "AUGUR_GITLAB_API_KEY=${AUGUR_GITLAB_API_KEY}" - - "AUGUR_GITHUB_USERNAME=${AUGUR_GITHUB_USERNAME}" - - "AUGUR_GITLAB_USERNAME=${AUGUR_GITLAB_USERNAME}" + - "COLLECTOSS_DB=postgresql+psycopg2://${COLLECTOSS_DB_USER:-augur}:${COLLECTOSS_DB_PASSWORD:-augur}@database:5432/augur" + - "COLLECTOSS_DB_SCHEMA_BUILD=1" + - COLLECTOSS_FACADE_REPO_DIRECTORY=/facade + - COLLECTOSS_LOGS_DIRECTORY=/logs + - "COLLECTOSS_FLAGS=$COLLECTOSS_FLAGS" + - "COLLECTOSS_GITHUB_API_KEY=${COLLECTOSS_GITHUB_API_KEY}" + - "COLLECTOSS_GITLAB_API_KEY=${COLLECTOSS_GITLAB_API_KEY}" + - "COLLECTOSS_GITHUB_USERNAME=${COLLECTOSS_GITHUB_USERNAME}" + - "COLLECTOSS_GITLAB_USERNAME=${COLLECTOSS_GITLAB_USERNAME}" + - COLLECTOSS_DOCKER_DEPLOY=1 - REDIS_CONN_STRING=redis://redis:6379 - - RABBITMQ_CONN_STRING=amqp://${AUGUR_RABBITMQ_USERNAME:-augur}:${AUGUR_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${AUGUR_RABBITMQ_VHOST:-collectoss_vhost} + - RABBITMQ_CONN_STRING=amqp://${COLLECTOSS_RABBITMQ_USERNAME:-augur}:${COLLECTOSS_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} - CONFIG_LOCATION=/config/config.yml - CONFIG_DATADIR=/config - CACHE_DATADIR=/cache - CACHE_LOCKDIR=/cache - CELERYBEAT_SCHEDULE_DB=/tmp/celerybeat-schedule.db depends_on: - - database - - redis - - keyman - - rabbitmq + database: + condition: service_healthy + redis: + condition: service_started + keyman: + condition: 
service_started + rabbitmq: + condition: service_started user: 2345:2345 # Run as an arbitrary non-root user post_start: # Make sure the user has access to the volumes @@ -92,9 +107,9 @@ services: # ports: # - 5555:5555 # environment: - # - "AUGUR_DB=postgresql+psycopg2://${AUGUR_DB_USER:-augur}:${AUGUR_DB_PASSWORD:-augur}@database:5432/augur" + # - "COLLECTOSS_DB=postgresql+psycopg2://${COLLECTOSS_DB_USER:-augur}:${COLLECTOSS_DB_PASSWORD:-augur}@database:5432/augur" # - REDIS_CONN_STRING=redis://redis:6379 - # - RABBITMQ_CONN_STRING=amqp://${AUGUR_RABBITMQ_USERNAME:-augur}:${AUGUR_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${AUGUR_RABBITMQ_VHOST:-collectoss_vhost} + # - RABBITMQ_CONN_STRING=amqp://${COLLECTOSS_RABBITMQ_USERNAME:-augur}:${COLLECTOSS_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${COLLECTOSS_RABBITMQ_VHOST:-collectoss_vhost} # depends_on: # - core # - database diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 98e9f06ef..344756a06 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -22,7 +22,7 @@ FROM python:3.11-slim-bullseye LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} @@ -33,6 +33,8 @@ LABEL org.opencontainers.image.revision=${REVISION} ENV DEBIAN_FRONTEND=noninteractive ENV PATH="/usr/bin/:/usr/local/bin:/usr/lib:${PATH}" +ENV COLLECTOSS_DOCKER_DEPLOY="1" + RUN set -x \ && apt-get update \ && apt-get -y install --no-install-recommends \ @@ -111,9 +113,7 @@ RUN ${SCORECARD_DIR}/scorecard version RUN mkdir -p repos/ logs/ /collectoss/facade/ RUN ln -s /cache /collectoss/collectoss/static/cache -# Copy in the entrypoint and init scripts, 
ensuring they are executable -COPY --chmod=755 ./docker/backend/entrypoint.sh / -COPY --chmod=755 ./docker/backend/init.sh / -RUN chmod +x /entrypoint.sh /init.sh -ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] -CMD ["/init.sh"] +COPY --chmod=755 ./docker/backend/preflight.py /preflight.py +RUN chmod +x /preflight.py +CMD python3 /preflight.py && collectoss backend start --pidfile /tmp/main.pid +# CMD ["collectoss", "backend", "start", "--pidfile", "/tmp/main.pid"] diff --git a/docker/backend/entrypoint.sh b/docker/backend/entrypoint.sh deleted file mode 100644 index 78eda49e2..000000000 --- a/docker/backend/entrypoint.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -#SPDX-License-Identifier: MIT -set -e - -if [[ -z "$AUGUR_DB" ]]; then - # If AUGUR_DB is not set, check for individual environment variables and construct AUGUR_DB connection string - if [[ -n "$AUGUR_DB_HOST" ]] && [[ -n "$AUGUR_DB_USER" ]] && [[ -n "$AUGUR_DB_PASSWORD" ]] && [[ -n "$AUGUR_DB_NAME" ]]; then - export AUGUR_DB="postgresql+psycopg2://${AUGUR_DB_USER}:${AUGUR_DB_PASSWORD}@${AUGUR_DB_HOST}/${AUGUR_DB_NAME}" - fi -fi - - -if [[ "$AUGUR_DB" == *"localhost"* ]]; then - echo "localhost db connection" - export AUGUR_DB="${AUGUR_DB/localhost/host.docker.internal}" -elif [[ "$AUGUR_DB" == *"127.0.0.1"* ]]; then - echo "localhost db connection" - export AUGUR_DB="${AUGUR_DB/127.0.0.1/host.docker.internal}" -fi - -export AUGUR_FACADE_REPO_DIRECTORY=${AUGUR_FACADE_REPO_DIRECTORY:-/collectoss/facade/} -export AUGUR_DOCKER_DEPLOY="1" - -#Deal with special case where 'localhost' is the machine that started the container -if [[ "$REDIS_CONN_STRING" == *"localhost"* ]] || [[ "$REDIS_CONN_STRING" == *"127.0.0.1"* ]]; then - echo "localhost redis connection" - export redis_conn_string="redis://host.docker.internal:6379" -else - export redis_conn_string=$REDIS_CONN_STRING -fi - -exec "$@" diff --git a/docker/backend/init.sh b/docker/backend/init.sh deleted file mode 100644 index 782b8fa53..000000000 --- 
a/docker/backend/init.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -#SPDX-License-Identifier: MIT -set -e - -if [[ "$AUGUR_DB_SCHEMA_BUILD" == "1" ]]; then - collectoss db create-schema -fi - - -if [ ! -v AUGUR_NO_CONFIG ]; then - ./scripts/docker/config.sh docker -fi - -if [[ -f /repo_groups.csv ]]; then - collectoss db add-repo-groups /repo_groups.csv -fi - -if [[ -f /repos.csv ]]; then - collectoss db add-repos /repos.csv -fi - -echo "PATH: $PATH" -echo "Python executable: $(which python)" -python --version - -exec collectoss backend start --pidfile /tmp/main.pid diff --git a/docker/backend/preflight.py b/docker/backend/preflight.py new file mode 100755 index 000000000..4207db8b7 --- /dev/null +++ b/docker/backend/preflight.py @@ -0,0 +1,35 @@ +from collectoss.util.startup import collect_env_variables, check_init_schema, check_update_schema, setup_facade_directory, merge_config, warn_import_repos, print_platform_information +from collectoss.application.logs import getFormatter +from collectoss.application.cli import DatabaseContext +import sys +import logging + +if __name__ == "__main__": + # We cannot use systemLogger here because it depends on the database + # At this point in execution, the database may not yet be initialized or + # usable for configuration. 
So for now we DIY it as a temporary measure + # until we can more comprehensively improve the high level configuration system + logger = logging.getLogger("collectoss.preflight") + log_level = logging.INFO + if not logger.handlers: + handler = logging.StreamHandler() + handler.setLevel(log_level) + handler.setFormatter(getFormatter(log_level)) + logger.addHandler(handler) + logger.setLevel(log_level) + logger.propagate = False + + collect_env_variables(logger) + + check_init_schema() + check_update_schema() + + setup_facade_directory(logger) + + merge_config(DatabaseContext().engine, logger) + + warn_import_repos(logger) + + print_platform_information(logger) + + sys.exit(0) diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index aa769649c..c239db9b6 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -4,7 +4,7 @@ FROM postgres:16 LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/docker/keyman/Dockerfile b/docker/keyman/Dockerfile index 93de9fc28..33413a680 100644 --- a/docker/keyman/Dockerfile +++ b/docker/keyman/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.11.12-alpine LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 
6e8916f32..aea2806ac 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -3,7 +3,7 @@ FROM rabbitmq:4.1-management-alpine LABEL org.opencontainers.image.authors="CHAOSS https://chaoss.community" LABEL org.opencontainers.image.licenses="MIT" LABEL org.opencontainers.image.source="https://github.com/chaoss/collectoss" -LABEL org.opencontainers.image.documentation="https://collectoss.readthedocs.io" +LABEL org.opencontainers.image.documentation="https://docs.collectoss.org" ARG VERSION LABEL org.opencontainers.image.version=${VERSION} diff --git a/docker/rabbitmq/collectoss.conf b/docker/rabbitmq/collectoss.conf index d31435d02..ee8ed92c2 100644 --- a/docker/rabbitmq/collectoss.conf +++ b/docker/rabbitmq/collectoss.conf @@ -5,7 +5,3 @@ default_permissions.read = .* default_permissions.write = .* default_user_tags.administrator = true -default_user_tags.augur = true -default_user_tags.augurTag = true -default_user_tags.collectoss = true -default_user_tags.collectossTag = true diff --git a/docs/source/deployment/production.rst b/docs/source/deployment/production.rst index e65a987f1..186a38c4c 100644 --- a/docs/source/deployment/production.rst +++ b/docs/source/deployment/production.rst @@ -11,11 +11,11 @@ Environment Variables CollectOSS uses several environment variables in production. Make sure to configure the ones relevant to your deployment: -- ``AUGUR_RESET_LOGS`` : Controls automatic log reset on server startup -- ``AUGUR_DB`` : PostgreSQL database connection string (used if variable not set) +- ``COLLECTOSS_RESET_LOGS`` : Controls automatic log reset on server startup +- ``COLLECTOSS_DB`` : PostgreSQL database connection string (used if variable not set) -AUGUR_RESET_LOGS ----------------- +COLLECTOSS_RESET_LOGS +--------------------- **Description:** Controls whether CollectOSS resets its log files every time the server starts. Useful for managing log size or integrating with external log rotation systems. 
@@ -27,7 +27,7 @@ boolean `True` : CollectOSS clears old logs at startup. **Environment Variable:** -AUGUR_RESET_LOGS +COLLECTOSS_RESET_LOGS **Notes:** If set to `False`, CollectOSS will not reset logs automatically. Administrators must ensure log rotation or cleanup is handled manually. @@ -36,10 +36,10 @@ If set to `False`, CollectOSS will not reset logs automatically. Administrators .. code-block:: bash - export AUGUR_RESET_LOGS=False + export COLLECTOSS_RESET_LOGS=False -AUGUR_DB --------- +COLLECTOSS_DB +------------- **Description:** Specifies the connection string for the PostgreSQL database used by CollectOSS. If omitted, the default Docker database is used. @@ -48,10 +48,10 @@ Specifies the connection string for the PostgreSQL database used by CollectOSS. string **Default:** -Docker container database (if `AUGUR_DB` is not specified) +Docker container database (if `COLLECTOSS_DB` is not specified) **Environment Variable:** -AUGUR_DB +COLLECTOSS_DB Related Resources ----------------- diff --git a/docs/source/development-guide/configuration-file-reference.rst b/docs/source/development-guide/configuration-file-reference.rst index ecca79590..27fe868c2 100644 --- a/docs/source/development-guide/configuration-file-reference.rst +++ b/docs/source/development-guide/configuration-file-reference.rst @@ -1,7 +1,7 @@ Configuration file reference =============================== -CollectOSS's configuration template file, which generates your locally deployed ``augur.config.json`` file, is found at ``collectoss/config.py``. You will notice a small collection of workers are turned on to start with, by examining the ``switch`` variable within the ``Workers`` block of the config file. You can also specify the number of processes to spawn for each worker using the ``workers`` command. The default is one, and we recommend you start here. 
If you are going to spawn multiple workers, be sure you have enough credentials cached in the ``augur_operations.worker_oath`` table for the platforms you use. +CollectOSS's configuration template file, which generates your locally deployed ``augur.config.json`` file, is found at ``collectoss/config.py``. You will notice a small collection of workers are turned on to start with, by examining the ``switch`` variable within the ``Workers`` block of the config file. You can also specify the number of processes to spawn for each worker using the ``workers`` command. The default is one, and we recommend you start here. If you are going to spawn multiple workers, be sure you have enough credentials cached in the ``operations.worker_oath`` table for the platforms you use. If you have questions or would like to help please open an issue on GitHub_. diff --git a/docs/source/development-guide/create-a-metric/api-development.rst b/docs/source/development-guide/create-a-metric/api-development.rst index 8aea48aac..12a010465 100644 --- a/docs/source/development-guide/create-a-metric/api-development.rst +++ b/docs/source/development-guide/create-a-metric/api-development.rst @@ -11,13 +11,13 @@ JSON Metrics are here: .. code-block:: bash - $ AUGUR_HOME/collectoss/metrics + $ COLLECTOSS_HOME/collectoss/metrics Visualization Metrics are here: .. code-block:: bash - $ AUGUR_HOME/collectoss/routes + $ COLLECTOSS_HOME/collectoss/routes Existing metrics files (JSON Metric) "Standard Metrics": @@ -46,7 +46,7 @@ You can see that one of the imports is our standard metric import from the util .. 
code-block:: python - AUGUR_HOME/collectoss/routes/util.py + COLLECTOSS_HOME/collectoss/routes/util.py All "Standard Metrics" share declaration and a method signature diff --git a/docs/source/development-guide/create-a-metric/metrics-steps.rst b/docs/source/development-guide/create-a-metric/metrics-steps.rst index 5604c422b..a2fb24a02 100644 --- a/docs/source/development-guide/create-a-metric/metrics-steps.rst +++ b/docs/source/development-guide/create-a-metric/metrics-steps.rst @@ -11,7 +11,7 @@ There are many paths, but we usually follow something along these lines: 2. Sometimes, there are metrics endpoints that integrate, or visualize several metrics. 3. Determine what tables in the CollectOSS Schema contain the data we need to develop this metric 4. Construct a very basic query that does the work of joining those tables in a minimal way so we have a "baseline query." -5. Refine the query so that it takes the standard inputs for a "standard metric" if that's what type it is; alternatively, look at non-standard metrics as they are defined in ``AUGUR_HOME/collectoss/routes``, or one of the visualization metrics in ``AUGUR_HOME/collectoss/routes/contributor.py``, ``AUGUR_HOME/collectoss/routes/pull_requests.py`` or ``AUGUR_HOME/collectoss/routes/nonstandard_metrics.py``. (This step is explained in the next section.) +5. Refine the query so that it takes the standard inputs for a "standard metric" if that's what type it is; alternatively, look at non-standard metrics as they are defined in ``COLLECTOSS_HOME/collectoss/routes``, or one of the visualization metrics in ``COLLECTOSS_HOME/collectoss/routes/contributor.py``, ``COLLECTOSS_HOME/collectoss/routes/pull_requests.py`` or ``COLLECTOSS_HOME/collectoss/routes/nonstandard_metrics.py``. (This step is explained in the next section.) 
Example Query diff --git a/docs/source/development-guide/make/development.rst b/docs/source/development-guide/make/development.rst deleted file mode 100644 index e2be118e1..000000000 --- a/docs/source/development-guide/make/development.rst +++ /dev/null @@ -1,10 +0,0 @@ -Development -============ - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -These commands are used to control CollectOSS's backend and frontend servers simultaneously. diff --git a/docs/source/development-guide/make/documentation.rst b/docs/source/development-guide/make/documentation.rst deleted file mode 100644 index dc8ff0c14..000000000 --- a/docs/source/development-guide/make/documentation.rst +++ /dev/null @@ -1,71 +0,0 @@ -Documentation -============= - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -These commands are used to build and view CollectOSS's documentation. -Before making any documentation changes, please read the `documentation guide <../documentation.html>`_. - --------------------------- - -``make docs`` --------------- -Generate both library and API documentation. - -Example\: - -.. code-block:: bash - - $ make docs - --------------------------- - -``make library-docs`` ----------------------- -Generate the library documentation (the documentation you're reading). - -Example\: - -.. code-block:: bash - - $ make library-docs - --------------------------- - -``make library-docs-view`` --------------------------- -Generate the library documentation, and automatically open a new browser tab to view it. - -Example\: - -.. code-block:: bash - - $ make library-docs-view - --------------------------- - -``make api-docs`` ------------------- -Generate the API documentation. - -Example\: - -.. 
code-block:: bash - - $ make api-docs - --------------------------- - -``make api-docs-view`` ------------------------ -Generate the API documentation, and automatically open a new browser tab to view it. - -Example\: - -.. code-block:: bash - - $ make api-docs-view diff --git a/docs/source/development-guide/make/installation.rst b/docs/source/development-guide/make/installation.rst deleted file mode 100644 index 647ea00f5..000000000 --- a/docs/source/development-guide/make/installation.rst +++ /dev/null @@ -1,74 +0,0 @@ -Installation -============= - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/toss/issues - -This section explicitly explains the commands that are used to manage the installation of CollectOSS locally. - ---------------- - -``make install`` ------------------ -This command installs the project dependencies, sets up the default configuration file, and gathers database credentials. - -Example\: - -.. code-block:: bash - - $ make install - ---------------- - -``make install-dev`` ---------------------- -The same as ``make install``, except it installs the additional developer dependencies and installs the packages in editable mode. - -Example\: - -.. code-block:: bash - - $ make install-dev - ---------------- - -``make clean`` ----------------- -Removes logs, caches, and some other cruft that can get annoying. This command is used when things aren't building properly or you think an old version of collectoss is getting in the way. - -Example\: - -.. code-block:: bash - - $ make clean - ---------------- - -``make rebuild`` ----------------- -Used in conjunction with ``make clean`` to remove all build/compiled files and binaries and reinstall the project. Useful for upgrading in place. - -Example\: - -.. 
code-block:: bash - - $ make rebuild - ---------------- - -``make rebuild-dev`` ---------------------- -The same as ``make rebuild``, except it installs the additional developer dependencies and installs the packages in editable mode. - -.. note:: - - You can still use ``make clean`` as normal if something went wrong. - -Example\: - -.. code-block:: bash - - $ make rebuild-dev diff --git a/docs/source/development-guide/make/testing.rst b/docs/source/development-guide/make/testing.rst deleted file mode 100644 index 05508154e..000000000 --- a/docs/source/development-guide/make/testing.rst +++ /dev/null @@ -1,58 +0,0 @@ -Testing -======= - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -These commands are used to run specific subsets of unit tests. We use ``pytest`` as our test runner. - --------------- - -``make test`` -------------- -This command runs ALL available tests for both the metric functions and their API endpoints. - -Example\: - -.. code-block:: bash - - $ make test - --------------- - -``make test-metrics`` ------------------------- -This command will run ALL unit tests for the metric functions. - -Example\: - -.. code-block:: bash - - $ make test-metrics - --------------- - -``make test-metrics-api`` --------------------------- -The above command runs ALL tests for the metrics API. - -Example\: - -.. code-block:: bash - - $ make test-metrics-api - --------------- - -``pytest`` ----------- -You can also run the tests directly using the ``pytest`` command. - -Example\: - -.. 
code-block:: bash - - $ uv run pytest diff --git a/docs/source/development-guide/make/toc.rst b/docs/source/development-guide/make/toc.rst deleted file mode 100644 index 75da18238..000000000 --- a/docs/source/development-guide/make/toc.rst +++ /dev/null @@ -1,16 +0,0 @@ -Make commands -=============== - -**THIS SECTION IS UNDER CONSTRUCTION.** - -If you have questions or would like to help please open an issue on GitHub_. - -.. _GitHub: https://github.com/chaoss/collectoss/issues - -.. toctree:: - :maxdepth: 1 - - installation - development - testing - documentation diff --git a/docs/source/development-guide/tech-breakdown.rst b/docs/source/development-guide/tech-breakdown.rst index ce4425877..36caa035b 100644 --- a/docs/source/development-guide/tech-breakdown.rst +++ b/docs/source/development-guide/tech-breakdown.rst @@ -127,7 +127,7 @@ Your CollectOSS instance will now be available at http://servername-or-ip:port_number Note: CollectOSS will run on port 5000 by default (you probably need to -change that in augur_operations.config for OSX) +change that in operations.config for OSX) Stopping your CollectOSS Instance --------------------------------- diff --git a/docs/source/development-guide/testing/toc.rst b/docs/source/development-guide/testing/toc.rst index cbe469805..5a973f209 100644 --- a/docs/source/development-guide/testing/toc.rst +++ b/docs/source/development-guide/testing/toc.rst @@ -1,8 +1,68 @@ Testing =============== -**THIS SECTION IS UNDER CONSTRUCTION.** -If you have questions or would like to help please open an issue on GitHub_. +CollectOSS aims to have a comprehensive set of tests to enable more rapid iteration and greater confidence that changes have not caused new breakage. -.. _GitHub: https://github.com/chaoss/collectoss/toss/issues + +Types of Testing +----------------- + +The tests of the CollectOSS app fall into one of several general types. 
+* unit tests - standalone tests that are simple to run and test single units of functionality (often individual functions or classes) +* integration tests - small subsystem tests that require bringing up additional pieces, such as redis or a database, to perform the test +* end-to-end tests - complete system tests that require running everything + +Unit Tests +~~~~~~~~~~~ + +Unit tests are implemented via pytest and tagged as ``unit`` to make them easy to run. + +To run the unit tests, clone the CollectOSS repository and run ``uv run pytest -m unit`` + + +Integration Tests +~~~~~~~~~~~~~~~~~~ +Integration tests are also implemented via pytest and tagged as ``integration``. +Because they require additional components, they are not quite as easy to run. + + +To run the integration tests you will need to start up the associated services. This can be done as follows: + +1. Enter the tests directory with ``cd tests/``, this ensures you use the correct dockerfile. +2. Bring up the associated services using the ``docker-compose.yml`` file by running ``docker compose up`` or the podman equivalent. +3. The tests can now be run in a new terminal using ``uv run pytest -m integration`` + +End to End (E2E) Tests +~~~~~~~~~~~~~~~~~~~~~~~ + +The end to end tests are currently run as part of a CI job in GitHub Actions that is run on pull request. + +The main form of end to end test is the smoke test. This test brings up and runs the full container stack for three minutes. +A script monitors the output logs and looks for specific log statements that indicate that CollectOSS is coming up and behaving as expected. + +Future end to end tests may also run CollectOSS to the point of fully collecting on some smaller repositories and validating that the database is as expected. 
+ + +Testing Standards +----------------- + +Different parts of the CollectOSS codebase are held to different standards when it comes to how thoroughly changes are expected to be tested/validated before being allowed to merge. + +An approximate, non-exhaustive list of the various levels of testing include: + +* **Code Review** - only a code review is needed to make sure things look okay (spelling/grammar, formatting etc). Typically used for README changes or changes to other simple, non-functional text files in the repo +* **Sanity Check** - a simple, automated check, such as a build job, should be run to ensure that syntax is correct and that the changes aren't causing a build failure. Typically used for documentation (what you are reading now) +* **Automated Functional Test** - A more complex automated check, such as unit tests, integration tests, E2E smoke tests, etc should be run to ensure that CollectOSS can at least start up successfully with the new code. Typically used for trivial changes to subcomponents that already have automated tests +* **Manual Functional Test Procedure** - A set of pre-defined testing steps designed to exercise the specific code/problem being changed. This will usually be derived from the reproduction steps for the bug being solved or documented in the related issue/PR before testing so others can reproduce it. Typically used to test fixes for specific bugs +* **Full Collection Test** - The change should be built and run on a small instance (with relevant repos being added to the collection set if necessary) and the instance should be allowed to run to full collection (all collection stages for all repos marked as "success" in the ``collection_status`` operations table). 
Typically used for basic/generalized behavior changes +* **Difficult Repo Test** - Either the manual functional test or the full collection test can be made more "difficult" by including one or more known-difficult repositories, such as `chaoss/jank `_ (an artificial repo intended to contain a bunch of examples of problematic git data), or any other repo demonstrating a relevant and extreme/difficult scenario (huge overall size, huge commit count, 50-100k+ commits, etc). Typically used for parsing/performance tests +* **Stress/Scale Test** - the change should be run on an instance (likely pre-existing) with at least 10k diverse repositories for one or more full cycles of the collection interval (about 1-2 weeks) to ensure that nothing breaks under load or other scaling-related conditions. Typically used for performance issues, bugs unique to large scale repos, and code that's important enough to require testing on a wide range of different repositories. + +Both the final merge decisions as well as decisions about which level of testing is appropriate for a given PR rest with the project maintainers. + + + +If you have questions about testing in CollectOSS or would like to help please reach out via the `CHAOSS Slack `_ (in the #wg-collectoss-8knot channel) or open an issue on GitHub_. + +.. _GitHub: https://github.com/chaoss/collectoss/issues diff --git a/docs/source/development-guide/toc.rst b/docs/source/development-guide/toc.rst index fc447be06..c6b10af1c 100644 --- a/docs/source/development-guide/toc.rst +++ b/docs/source/development-guide/toc.rst @@ -7,7 +7,6 @@ This is the development guide for CollectOSS. 
See our `Contributing to CollectOS :maxdepth: 1 installation - make/toc logging documentation workers/toc diff --git a/docs/source/development-guide/workers/creating_a_new_worker.rst b/docs/source/development-guide/workers/creating_a_new_worker.rst index 4e713c4ac..a34d73f4b 100644 --- a/docs/source/development-guide/workers/creating_a_new_worker.rst +++ b/docs/source/development-guide/workers/creating_a_new_worker.rst @@ -132,7 +132,7 @@ In the Worker block you need to add something like this: There should NOT be a comma after the final entry in each block. -ALSO, if you wanted to have those blocks installed with auger itself when you do the PR, you need to add them to the `$AUGUR_ROOT/collectoss/config.py` file. The recommended way is to set a port range not already in use and assign a random variable range with the others, like this `your_new_worker_p = randint(56500, 56999)` ... its totally ok to compress a couple other port ranges for this process. +ALSO, if you wanted to have those blocks installed with CollectOSS itself when you do the PR, you need to add them to the `$COLLECTOSS_ROOT/collectoss/config.py` file. The recommended way is to set a port range not already in use and assign a random variable range with the others, like this `your_new_worker_p = randint(56500, 56999)` ... it's totally OK to compress a couple other port ranges for this process. You can copy the housekeeper block verbatim from what you added to your own `augur.config.json`. For the worker block, in the `config.py` it would look like this: diff --git a/docs/source/docker/docker-compose.rst b/docs/source/docker/docker-compose.rst index ae38fab6c..96e8e1c51 100644 --- a/docs/source/docker/docker-compose.rst +++ b/docs/source/docker/docker-compose.rst @@ -27,16 +27,16 @@ This section of the documentation details how to use CollectOSS's Docker Compose .. warning:: Don't forget to provide your external database credentials in a file called ``.env`` file. 
Make sure all the following environment variables are specified, keep placeholder values if you don't need some of them. - Don't specify AUGUR_DB if you want the docker database to be used. + Don't specify COLLECTOSS_DB if you want the docker database to be used. Example .env: .. code:: - AUGUR_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITHUB_USERNAME=usernameGithub - AUGUR_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITLAB_USERNAME=usernameGitlab - AUGUR_DB=yourDBString + COLLECTOSS_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITHUB_USERNAME=usernameGithub + COLLECTOSS_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITLAB_USERNAME=usernameGitlab + COLLECTOSS_DB=yourDBString @@ -50,7 +50,7 @@ To run CollectOSS **with** the database container: .. code-block:: bash - docker compose -f docker-compose.yml -f database-compose.yml up + docker compose up Stopping the containers diff --git a/docs/source/docker/getting-started.rst b/docs/source/docker/getting-started.rst index 0648236a5..db6822b79 100644 --- a/docs/source/docker/getting-started.rst +++ b/docs/source/docker/getting-started.rst @@ -31,14 +31,14 @@ with the following fields (don't remove any variable, keep placeholder values if .. code:: python - AUGUR_DB=collectoss - AUGUR_DB_USER=collectoss - AUGUR_DB_PASSWORD=password_here + COLLECTOSS_DB=collectoss + COLLECTOSS_DB_USER=collectoss + COLLECTOSS_DB_PASSWORD=password_here - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder Then run: @@ -98,11 +98,11 @@ You can provide your own ``.env`` file to pull from. The file should have the be .. 
code:: - AUGUR_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITHUB_USERNAME=usernameGithub - AUGUR_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx - AUGUR_GITLAB_USERNAME=usernameGitlab - AUGUR_DB=yourDBString + COLLECTOSS_GITHUB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITHUB_USERNAME=usernameGithub + COLLECTOSS_GITLAB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxx + COLLECTOSS_GITLAB_USERNAME=usernameGitlab + COLLECTOSS_DB=yourDBString Now that you've created your config file or are ready to generate it yourself, you're ready to `get going `_ . diff --git a/docs/source/docker/quick-start.rst b/docs/source/docker/quick-start.rst index 86b552ea3..c71d9dfa2 100644 --- a/docs/source/docker/quick-start.rst +++ b/docs/source/docker/quick-start.rst @@ -13,14 +13,14 @@ Before you get off to such a quick start, go ahead and .. code:: python - AUGUR_DB=collectoss - AUGUR_DB_USER=collectoss - AUGUR_DB_PASSWORD=password_here + COLLECTOSS_DB=collectoss + COLLECTOSS_DB_USER=collectoss + COLLECTOSS_DB_PASSWORD=password_here - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder 5. Build the container using one of the following commands: @@ -57,14 +57,14 @@ And collectoss should be up and running! .. code-block:: - AUGUR_DB=collectoss - AUGUR_DB_USER=collectoss - AUGUR_DB_PASSWORD=password_here + COLLECTOSS_DB=collectoss + COLLECTOSS_DB_USER=collectoss + COLLECTOSS_DB_PASSWORD=password_here - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder 4. 
Execute the code from the base directory of the CollectOSS repository: diff --git a/docs/source/getting-started/collecting-data.rst b/docs/source/getting-started/collecting-data.rst index 78b421f0f..efc7980f8 100644 --- a/docs/source/getting-started/collecting-data.rst +++ b/docs/source/getting-started/collecting-data.rst @@ -60,7 +60,7 @@ There are many collection jobs that ship ready to collect out of the box: - ``collectoss.tasks.github.releases.tasks`` (collects release data from the GitHub API) - ``collectoss.tasks.data_analysis.insight_worker.tasks`` (queries CollectOSS's metrics API to find interesting anomalies in the collected data) -All worker configuration options are found in the config table generated when collectoss was installed. The config table is located in the augur_operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. +All worker configuration options are found in the config table generated when collectoss was installed. The config table is located in the operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. 
Worker-specific configuration options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/getting-started/command-line-interface/backend.rst b/docs/source/getting-started/command-line-interface/backend.rst index d53fd36ae..2adcce0ef 100644 --- a/docs/source/getting-started/command-line-interface/backend.rst +++ b/docs/source/getting-started/command-line-interface/backend.rst @@ -145,29 +145,29 @@ Successful output looks like: .. code-block:: bash - > CLI: [util.export_env] [INFO] Exporting AUGUR_GITHUB_API_KEY - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_HOST - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_NAME - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_PORT - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_USER - > CLI: [util.export_env] [INFO] Exporting AUGUR_DB_PASSWORD + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_GITHUB_API_KEY + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_HOST + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_NAME + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_PORT + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_USER + > CLI: [util.export_env] [INFO] Exporting COLLECTOSS_DB_PASSWORD # contents of collectoss_export_env.sh #!/bin/bash - export AUGUR_GITHUB_API_KEY="your_key_here" - export AUGUR_DB_HOST="your_host" - export AUGUR_DB_NAME="your_db_name" - export AUGUR_DB_PORT="your_db_port" - export AUGUR_DB_USER="your_db_user" - export AUGUR_DB_PASSWORD="your_db_password" + export COLLECTOSS_GITHUB_API_KEY="your_key_here" + export COLLECTOSS_DB_HOST="your_host" + export COLLECTOSS_DB_NAME="your_db_name" + export COLLECTOSS_DB_PORT="your_db_port" + export COLLECTOSS_DB_USER="your_db_user" + export COLLECTOSS_DB_PASSWORD="your_db_password" # contents of docker_env.txt - AUGUR_GITHUB_API_KEY="your_key_here" - AUGUR_DB_HOST="your_host" - AUGUR_DB_NAME="your_db_name" - AUGUR_DB_PORT="your_db_port" - AUGUR_DB_USER="your_db_user" - 
AUGUR_DB_PASSWORD="your_db_password" + COLLECTOSS_GITHUB_API_KEY="your_key_here" + COLLECTOSS_DB_HOST="your_host" + COLLECTOSS_DB_NAME="your_db_name" + COLLECTOSS_DB_PORT="your_db_port" + COLLECTOSS_DB_USER="your_db_user" + COLLECTOSS_DB_PASSWORD="your_db_password" ``repo-reset`` diff --git a/docs/source/getting-started/command-line-interface/configure.rst b/docs/source/getting-started/command-line-interface/configure.rst index 5659cf6ec..89350bc1a 100644 --- a/docs/source/getting-started/command-line-interface/configure.rst +++ b/docs/source/getting-started/command-line-interface/configure.rst @@ -12,19 +12,19 @@ The ``init`` command is used to create a configuration file, by default named `` Each of the available parameters is optional, and can also be configured using an existing environment variable. Below is the list of available parameters, their defaults, and the corresponding environment variable. ---db_name Database name for your data collection database. Defaults to ``augur``. Set by the ``AUGUR_DB_NAME`` environment variable +--db_name Database name for your data collection database. Defaults to ``augur``. Set by the ``COLLECTOSS_DB_NAME`` environment variable ---db_host Host for your data collection database. Defaults to ``localhost``. Set by the ``AUGUR_DB_HOST`` environment variable +--db_host Host for your data collection database. Defaults to ``localhost``. Set by the ``COLLECTOSS_DB_HOST`` environment variable ---db_user User for your data collection database. Defaults to ``augur``. Set by the ``AUGUR_DB_USER`` environment variable +--db_user User for your data collection database. Defaults to ``augur``. Set by the ``COLLECTOSS_DB_USER`` environment variable ---db_port Port for your data collection database. Defaults to ``5432``. Set by the ``AUGUR_DB_PORT`` environment variable +--db_port Port for your data collection database. Defaults to ``5432``. 
Set by the ``COLLECTOSS_DB_PORT`` environment variable ---db_password Password for your data collection database. Defaults to ``augur``. Set by the ``AUGUR_DB_PASSWORD`` environment variable +--db_password Password for your data collection database. Defaults to ``augur``. Set by the ``COLLECTOSS_DB_PASSWORD`` environment variable ---github_api_key GitHub API key for data collection from the GitHub API. Defaults to ``key``. Set by the ``AUGUR_GITHUB_API_KEY`` environment variable +--github_api_key GitHub API key for data collection from the GitHub API. Defaults to ``key``. Set by the ``COLLECTOSS_GITHUB_API_KEY`` environment variable ---facade_repo_directory The directory on this machine where Facade should store its cloned repos. Defaults to ``repos/``. Set by the ``AUGUR_FACADE_REPO_DIRECTORY`` environment variable +--facade_repo_directory The directory on this machine where Facade should store its cloned repos. Defaults to ``repos/``. Set by the ``COLLECTOSS_FACADE_REPO_DIRECTORY`` environment variable --rc-config-file Path to an existing CollectOSS config file whose values will be used as the defaults. Defaults to ``None``. This parameter does not support being set by an environment variable. 
@@ -41,7 +41,7 @@ Example usage\: $ uv run collectoss config init --db_name "db_name" --db_host "host" --db_port "port" --db_user "db_user" --db_password "password" --github_api_key "github_api_key" --facade_repo_directory "facade_repo_directory" # to generate an augur.config.json given all credentials and environment variables - $ uv run collectoss config init --db_name $AUGUR_DB_NAME --db_host $AUGUR_DB_HOST --db_port $AUGUR_DB_PORT --db_user $AUGUR_DB_DB_USER --db_password $AUGUR_DB_PASSWORD --github_api_key $AUGUR_GITHUB_API_KEY --facade_repo_directory $AUGUR_FACADE_REPO_DIRECTORY + $ uv run collectoss config init --db_name $COLLECTOSS_DB_NAME --db_host $COLLECTOSS_DB_HOST --db_port $COLLECTOSS_DB_PORT --db_user $COLLECTOSS_DB_USER --db_password $COLLECTOSS_DB_PASSWORD --github_api_key $COLLECTOSS_GITHUB_API_KEY --facade_repo_directory $COLLECTOSS_FACADE_REPO_DIRECTORY # successful output looks like: > CLI: [config.init] [INFO] Config written to /Users/carter/.collectoss/augur.config.json diff --git a/docs/source/getting-started/command-line-interface/db.rst b/docs/source/getting-started/command-line-interface/db.rst index 853cd2a66..229a942d5 100644 --- a/docs/source/getting-started/command-line-interface/db.rst +++ b/docs/source/getting-started/command-line-interface/db.rst @@ -167,15 +167,15 @@ Example usage\: > [INFO] Config file loaded successfully > CLI: [db.check_pgpass_credentials] [INFO] Credentials found in $HOME/.pgpass > CLI: [db.upgrade_db_version] [INFO] Upgrading from 16 to 17 - > ALTER TABLE "augur_data"."repo" + > ALTER TABLE "data"."repo" > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; > ALTER TABLE - > ALTER TABLE "augur_data"."repo" + > ALTER TABLE "data"."repo" > ADD COLUMN "repo_archived" int4, > ADD COLUMN "repo_archived_date_collected" timestamptz(0), > ALTER COLUMN "forked_from" TYPE varchar USING "forked_from"::varchar; > ALTER TABLE - > update "augur_operations"."augur_settings" set value = 17 where setting
= 'augur_data_version'; + > update "operations"."augur_settings" set value = 17 where setting = 'augur_data_version'; > UPDATE 1 > CLI: [db.upgrade_db_version] [INFO] Upgrading from 17 to 18 > etc... @@ -193,4 +193,4 @@ Example usage\: $ uv run collectoss db create-schema .. note:: - If this runs successfully, you should see a bunch of schema creation commands fly by pretty fast. If everything worked you should see: ``update "augur_operations"."augur_settings" set value = xx where setting = 'augur_data_version';`` at the end. + If this runs successfully, you should see a bunch of schema creation commands fly by pretty fast. If everything worked you should see: ``update "operations"."augur_settings" set value = xx where setting = 'augur_data_version';`` at the end. diff --git a/docs/source/getting-started/using-docker.rst b/docs/source/getting-started/using-docker.rst index 5028d5c3a..c427372b1 100644 --- a/docs/source/getting-started/using-docker.rst +++ b/docs/source/getting-started/using-docker.rst @@ -14,14 +14,14 @@ the following resources (or more). .. code:: python - AUGUR_DB=augur - AUGUR_DB_USER=augur - AUGUR_DB_PASSWORD=password_here - - AUGUR_GITHUB_API_KEY=ghp_value_here - AUGUR_GITHUB_USERNAME=gh_username - AUGUR_GITLAB_API_KEY=placeholder - AUGUR_GITLAB_USERNAME=placeholder + COLLECTOSS_DB=augur + COLLECTOSS_DB_USER=augur + COLLECTOSS_DB_PASSWORD=password_here + + COLLECTOSS_GITHUB_API_KEY=ghp_value_here + COLLECTOSS_GITHUB_USERNAME=gh_username + COLLECTOSS_GITLAB_API_KEY=placeholder + COLLECTOSS_GITLAB_USERNAME=placeholder 3. Build the container using one of the following commands: diff --git a/docs/source/index.rst b/docs/source/index.rst index 95d14998c..bb19b1657 100755 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -44,6 +44,17 @@ How CollectOSS works 3. It organizes this data into a standard format called a data model. 4. Then it calculates metrics that tell you about the project’s health. 
+Where CollectOSS gets its data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +CollectOSS collects data from a variety of sources: + +1. Raw Git commit logs (commits, contributors) +2. GitHub’s API (issues, pull requests, contributors, releases, repository metadata) +3. The Linux Foundation’s `Core Infrastructure Initiative `_ API (repository metadata) +4. `Succinct Code Counter `_, a blazingly fast Sloc, Cloc, and Code tool that also performs COCOMO calculations +5. `OpenSSF Scorecard `_ analysis (security health metrics for open source projects) + Example of a metric: Burstiness ------------------------------- - Burstiness is one of CollectOSS’s metrics. diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst index a5466272f..1610eaef5 100644 --- a/docs/source/quick-start.rst +++ b/docs/source/quick-start.rst @@ -139,7 +139,7 @@ http://servername-or-ip:port_number Note: CollectOSS will run on port 5000 by default (you probably need to -change that in collectoss_operations.config for OSX) +change that in operations.config for OSX) Stopping your CollectOSS Instance --------------------------------- diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst index 1322cce8d..58f0340a9 100644 --- a/docs/source/schema/overview.rst +++ b/docs/source/schema/overview.rst @@ -35,7 +35,7 @@ Schema Overview CollectOSS Data ------------------------------------------------------- -The ``augur_data`` schema contains *most* of the information analyzed +The ``data`` schema contains *most* of the information analyzed and constructed by CollectOSS. The origin’s of the data inside of collectoss are from data collection tasks and populate this schema.: @@ -61,9 +61,9 @@ gathered from commits, issues, and other info. CollectOSS Operations ------------------------------------------------------- -The ``augur_operations`` tables are where most of the operations tables +The ``operations`` tables are where most of the operations tables exist. 
There are a few, like ``settings`` that remain in -``augur_data`` for now, but will be moved. They keep records related to +``data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. They also store information including API keys. diff --git a/docs/source/schema/regularly_used_data.rst b/docs/source/schema/regularly_used_data.rst index 14cdcb1f8..aab64a2c8 100644 --- a/docs/source/schema/regularly_used_data.rst +++ b/docs/source/schema/regularly_used_data.rst @@ -347,7 +347,7 @@ Repo_meta Repo_sbom_scans --------------- - This table links the collectoss_data schema to the collectoss_spdx schema to keep a list of repositories that need licenses scanned. (These are for file level license declarations, which are common in Linux Foundation projects, but otherwise not in wide use). + This table links the data schema to the collectoss_spdx schema to keep a list of repositories that need licenses scanned. (These are for file level license declarations, which are common in Linux Foundation projects, but otherwise not in wide use). .. 
image:: images/repo_sbom_scans.png :width: 200 diff --git a/environment.txt b/environment.txt index 42d00b9c1..3d4c4a721 100644 --- a/environment.txt +++ b/environment.txt @@ -1,12 +1,12 @@ -AUGUR_DB_HOST=collectoss -AUGUR_DB_NAME=collectoss -AUGUR_DB_USER=collectoss -AUGUR_DB_PASSWORD= +COLLECTOSS_DB_HOST=collectoss +COLLECTOSS_DB_NAME=collectoss +COLLECTOSS_DB_USER=collectoss +COLLECTOSS_DB_PASSWORD= -AUGUR_GITHUB_API_KEY= -AUGUR_GITHUB_USERNAME= -AUGUR_GITLAB_API_KEY= -AUGUR_GITLAB_USERNAME= +COLLECTOSS_GITHUB_API_KEY= +COLLECTOSS_GITHUB_USERNAME= +COLLECTOSS_GITLAB_API_KEY= +COLLECTOSS_GITLAB_USERNAME= -AUGUR_RABBITMQ_USERNAME= -AUGUR_RABBITMQ_PASSWORD= +COLLECTOSS_RABBITMQ_USERNAME= +COLLECTOSS_RABBITMQ_PASSWORD= diff --git a/keyman/README.md b/keyman/README.md index 1deb1b8b9..2405a0a18 100644 --- a/keyman/README.md +++ b/keyman/README.md @@ -119,7 +119,7 @@ python keyman/Orchestrator.py ## Adding Keys ```sql -INSERT INTO augur_operations.worker_oauth +INSERT INTO operations.worker_oauth (name, consumer_key, consumer_secret, access_token, access_token_secret, platform) VALUES ('My GitHub Key', 'not_used', 'not_used', 'ghp_YOURTOKEN', 'not_used', 'github_rest'); diff --git a/metadata.py b/metadata.py index f93aaa8b9..8b93a183c 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "1.0.0" -__release__ = "v1.0.0" +__version__ = "1.1.0" +__release__ = "v1.1.0-RC1" __author__ = "CollectOSS Community" __license__ = "MIT" diff --git a/pyproject.toml b/pyproject.toml index 6445d832e..5671e21de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,7 +110,7 @@ collectoss = "collectoss.application.cli._multicommand:run" [project.urls] Homepage = "https://github.com/chaoss/collectoss" -Documentation = "https://collectoss.readthedocs.io/en/latest/" +Documentation = "https://docs.collectoss.org/en/latest/" 
############################################################ @@ -146,7 +146,9 @@ addopts = "-ra -s" testpaths = [ "tests/test_classes", "tests/test_application/test_cli/test_csv_utils.py", - # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of CollectOSS + "tests/test_tasks/test_task_utilities/test_util/", + "tests/test_application/test_db/test_timestamp_utils.py", + # "tests/test_routes", # runs, but needs a fixture for connecting to the web interface of Augur # "tests/test_metrics", # "tests/test_tasks", # "tests/test_application", @@ -154,9 +156,17 @@ testpaths = [ # "tests/test_workers/worker_persistence/", # "tests/test_routes/runner.py" ] +markers = [ + "unit: pure logic tests with no external dependencies", + "integration: tests requiring a database, Redis, or network access", +] + [tool.mypy] -files = ['collectoss/application/db/*.py'] +files = [ + 'collectoss/application/db/*.py', + 'collectoss/application/environment.py', +] ignore_missing_imports = true follow_imports = "skip" disallow_untyped_defs = false diff --git a/scripts/docker/config.sh b/scripts/docker/config.sh deleted file mode 100755 index 6170ea57e..000000000 --- a/scripts/docker/config.sh +++ /dev/null @@ -1,244 +0,0 @@ -#!/bin/bash - -PS3=" -Please type the number corresponding to your selection and then press the Enter/Return key. -Your choice: " - -target=$1 - -function blank_confirm() { - if [ -z "${1}" ]; then - echo "Bad usage of blank_confirm at:" - caller - return - fi - - confirm_placeholder=${!1} - - while [ -z "${confirm_placeholder}" ]; do - echo "You entered a blank line, are you sure?" - read -p "enter 'yes' to continue, or enter the intended value: " confirm_placeholder - case "$confirm_placeholder" in - [yY][eE][sS] | [yY][eE] | [yY]) - return - ;; - *) - continue - ;; - esac - done - printf -v "$1" "%s" $confirm_placeholder -} - -function get_github_username() { - echo - echo "Please provide your username for Github." 
- echo "** This is required for CollectOSS to clone Github repos ***" - read -p "GitHub username: " github_username - blank_confirm github_username - echo -} - -function get_github_api_key() { - echo - echo "Please provide a valid GitHub API key." - echo "For more information on how to create the key, visit:" - echo "https://collectoss.readthedocs.io/en/latest/getting-started/installation.html#backend" - echo "** This is required for CollectOSS to gather data ***" - read -p "GitHub API Key: " github_api_key - blank_confirm github_api_key - echo -} - -function get_gitlab_username() { - echo - echo "Please provide your username for GitLab." - echo "** This is required for CollectOSS to clone GitLab repos ***" - read -p "GitLab username: " gitlab_username - blank_confirm gitlab_username - echo -} - -function get_gitlab_api_key() { - echo - echo "Please provide a valid GitLab API key." - echo "For more information on how to create the key, visit:" - echo "https://collectoss.readthedocs.io/en/latest/getting-started/installation.html#backend" - echo "** This is required for CollectOSS to gather data ***" - read -p "GitLab API Key: " gitlab_api_key - blank_confirm gitlab_api_key - echo -} - -function get_facade_repo_path() { - - echo "The Facade data collection worker will clone repositories to this machine to run its analysis." - echo "Please select a new or existing directory for the Facade worker to use:" - echo - - while true; do - read -e -p "Facade worker directory: " facade_repo_directory - blank_confirm facade_repo_directory - - facade_repo_directory=$(realpath $facade_repo_directory) - echo - - # if ! [ -w $facade_repo_directory/.git-credentials ]; then - # echo "User $(whoami) does not have permission to write to that location" - # echo "Please select another location" - # continue - # fi - - # Check if the file exists and create it if it doesn't - if [ ! -f "$facade_repo_directory/.git-credentials" ]; then - echo "File .git-credentials does not exist. 
Creating it..." - touch "$facade_repo_directory/.git-credentials" - fi - - # Check for write permissions - if ! [ -w "$facade_repo_directory/.git-credentials" ]; then - echo "User $(whoami) does not have permission to write to $facade_repo_directory/.git-credentials" - echo "Please select another location" - continue - else - echo "Permission check passed for $facade_repo_directory/.git-credentials" - fi - - if [[ -d "$facade_repo_directory" ]]; then - read -r -p "That directory already exists. Use it? [Y/n]: " facade_response - case "$facade_response" in - [nN][oO] | [nN]) - continue - ;; - *) - break - ;; - esac - else - read -r -p "That directory does not exist. Create it? [Y/n]: " facade_response - case "$facade_response" in - [nN][oO] | [nN]) - continue - ;; - *) - mkdir "$facade_repo_directory" - echo "Directory created." - break - ;; - esac - fi - done - - [[ "${facade_repo_directory}" != */ ]] && facade_repo_directory="${facade_repo_directory}/" -} - -function get_rabbitmq_broker_url() { - echo - echo "Please provide your rabbitmq broker url." - echo "** This is required for CollectOSS to run all collection tasks. 
***" - read -p "broker_url: " rabbitmq_conn_string - blank_confirm rabbitmq_conn_string - echo -} - -function create_config() { - - if [[ -z "${AUGUR_GITHUB_API_KEY}" ]]; then - get_github_api_key - else - echo - echo "Found AUGUR_GITHUB_API_KEY environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITHUB_API_KEY if you would like to be prompted for a github api key" - github_api_key=$AUGUR_GITHUB_API_KEY - echo - fi - - if [[ -z "${AUGUR_GITHUB_USERNAME}" ]]; then - get_github_username - else - echo - echo "Found AUGUR_GITHUB_USERNAME environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITHUB_USERNAME if you would like to be prompted for a github username" - github_username=$AUGUR_GITHUB_USERNAME - echo - fi - - if [[ -z "${AUGUR_GITLAB_API_KEY}" ]]; then - get_gitlab_api_key - else - echo - echo "Found AUGUR_GITLAB_API_KEY environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITLAB_API_KEY if you would like to be prompted for a gitlab api key" - gitlab_api_key=$AUGUR_GITLAB_API_KEY - echo - fi - - if [[ -z "${AUGUR_GITLAB_USERNAME}" ]]; then - get_gitlab_username - else - echo - echo "Found AUGUR_GITLAB_USERNAME environment variable" - echo "Using it in the config" - echo "Please unset AUGUR_GITLAB_USERNAME if you would like to be prompted for a gitlab username" - gitlab_username=$AUGUR_GITLAB_USERNAME - echo - fi - - if [[ -z "${AUGUR_FACADE_REPO_DIRECTORY}" ]]; then - get_facade_repo_path - else - echo - echo "Found AUGUR_FACADE_REPO_DIRECTORY environment variable with value $AUGUR_FACADE_REPO_DIRECTORY" - echo "Using it in the config" - echo "IMPORTANT NOTE: This assumes that this directory already exists" - echo "Please unset AUGUR_FACADE_REPO_DIRECTORY if you would like to be prompted for the facade repo directory" - facade_repo_directory=$AUGUR_FACADE_REPO_DIRECTORY - echo - fi - - if [[ -z "${RABBITMQ_CONN_STRING}" ]]; then - get_rabbitmq_broker_url - else - echo - echo 
"Found RABBITMQ_CONN_STRING environment variable with value $RABBITMQ_CONN_STRING" - echo "Using it in the config" - echo "Please unset RABBITMQ_CONN_STRING if you would like to be prompted for the rabbit MQ connection string" - rabbitmq_conn_string=$RABBITMQ_CONN_STRING - echo - fi - - # echo $rabbitmq_conn_string - # echo $facade_repo_directory - # echo $gitlab_username - # echo $gitlab_api_key - # echo $github_username - # echo $github_api_key - - #special case for docker entrypoint - if [ $target = "docker" ]; then - cmd=( collectoss config init --github-api-key $github_api_key --gitlab-api-key $gitlab_api_key --facade-repo-directory $facade_repo_directory --redis-conn-string $redis_conn_string --rabbitmq-conn-string $rabbitmq_conn_string --logs-directory /logs) - echo "init with redis $redis_conn_string" - else - cmd=( collectoss config init --github-api-key $github_api_key --gitlab-api-key $gitlab_api_key --facade-repo-directory $facade_repo_directory --rabbitmq-conn-string $rabbitmq_conn_string ) - fi - - #Create and cache credentials for github and gitlab - touch $facade_repo_directory/.git-credentials - - echo "https://$github_username:$github_api_key@github.com" > $facade_repo_directory/.git-credentials - echo "https://$gitlab_username:$gitlab_api_key@gitlab.com" >> $facade_repo_directory/.git-credentials - - git config --global credential.helper "store --file $facade_repo_directory/.git-credentials" - "${cmd[@]}" -} -echo -echo "Collecting data for config..." 
-create_config -echo -echo "Config created" -echo - -# config_prompt diff --git a/tests/test_application/test_cli/test_csv_utils.py b/tests/test_application/test_cli/test_csv_utils.py index 395ed0936..d15a7f04b 100644 --- a/tests/test_application/test_cli/test_csv_utils.py +++ b/tests/test_application/test_cli/test_csv_utils.py @@ -14,7 +14,7 @@ MAX_FILE_SIZE_BYTES, ) - +@pytest.mark.unit class TestValidateGitUrl: """Tests for validate_git_url function""" @@ -40,7 +40,7 @@ def test_whitespace_handling(self): """Test that whitespace is properly stripped""" assert validate_git_url(" https://github.com/chaoss/collectoss ") - +@pytest.mark.unit class TestValidatePositiveInt: """Tests for validate_positive_int function""" @@ -71,7 +71,7 @@ def test_whitespace_handling(self): """Test that whitespace is properly stripped""" assert validate_positive_int(" 42 ") - +@pytest.mark.unit class TestDetectColumnOrder: """Tests for detect_column_order function""" @@ -153,7 +153,7 @@ def test_no_match_found_raises_error(self): with pytest.raises(ValueError, match="Could not detect column"): detect_column_order(sample_rows, validators) - +@pytest.mark.unit class TestProcessCsv: """Tests for process_csv function""" @@ -252,7 +252,7 @@ def test_whitespace_in_values(self, tmp_path): result = process_csv(str(csv_file), validators) assert result[0] == {"repo_url": "https://github.com/chaoss/collectoss", "repo_group_id": "10"} - +@pytest.mark.unit class TestProcessRepoCsv: """Tests for process_repo_csv function""" @@ -275,6 +275,7 @@ def test_process_repo_csv_without_headers(self, tmp_path): assert len(result) == 2 +@pytest.mark.unit class TestProcessRepoGroupCsv: """Tests for process_repo_group_csv function""" @@ -310,6 +311,7 @@ def test_empty_group_name_invalid(self, tmp_path): assert len(result) >= 1 +@pytest.mark.unit class TestEdgeCases: """Tests for edge cases and error conditions""" diff --git a/tests/test_application/test_config/test_config.py 
b/tests/test_application/test_config/test_config.py index b03db89c6..4f750876e 100644 --- a/tests/test_application/test_config/test_config.py +++ b/tests/test_application/test_config/test_config.py @@ -15,7 +15,7 @@ def test_config_get_value(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -26,7 +26,7 @@ def test_config_get_value(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_get_section(test_db_config, test_db_engine): @@ -43,7 +43,7 @@ def test_config_get_section(test_db_config, test_db_engine): for data in network_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -62,7 +62,7 @@ def test_config_get_section(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_load_config(test_db_config, test_db_engine): @@ -82,7 +82,7 @@ def test_config_load_config(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES 
(:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -102,7 +102,7 @@ def test_config_load_config(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_empty(test_db_config, test_db_engine): @@ -123,7 +123,7 @@ def test_config_empty(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -132,7 +132,7 @@ def test_config_empty(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_is_section_in_config(test_db_config, test_db_engine): @@ -151,7 +151,7 @@ def test_config_is_section_in_config(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -163,7 +163,7 @@ def test_config_is_section_in_config(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM 
augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_add_settings(test_db_config, test_db_engine): @@ -174,7 +174,7 @@ def test_config_add_settings(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() assert result is not None assert len(result) == 2 @@ -189,7 +189,7 @@ def test_config_add_settings(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_update_settings(test_db_config, test_db_engine): @@ -212,7 +212,7 @@ def test_config_update_settings(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -222,7 +222,7 @@ def test_config_update_settings(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() assert len(result) == 3 @@ -235,7 +235,7 @@ def test_config_update_settings(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_add_section_from_json(test_db_config, test_db_engine): @@ -252,7 +252,7 @@ def 
test_config_add_section_from_json(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""") + result = connection.execute("""SELECT * FROM operations.config""") for row in result: dict_data = dict(row) @@ -266,7 +266,7 @@ def test_config_add_section_from_json(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_load_config_file(test_db_config): @@ -312,7 +312,7 @@ def test_config_load_config_from_dict(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() for row in result: dict_data = dict(row) @@ -328,7 +328,7 @@ def test_config_load_config_from_dict(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_config_clear(test_db_config, test_db_engine): @@ -342,7 +342,7 @@ def test_config_clear(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -350,14 +350,14 @@ def test_config_clear(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM 
operations.config""").fetchall() assert len(result) == 0 finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def test_remove_section(test_db_config, test_db_engine): @@ -377,7 +377,7 @@ def test_remove_section(test_db_config, test_db_engine): for data in all_data: - query = text("""INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") + query = text("""INSERT INTO "operations"."config" ("section_name", "setting_name", "value", "type") VALUES (:section_name, :setting_name, :value, 'str');""") connection.execute(query, **data) @@ -385,7 +385,7 @@ def test_remove_section(test_db_config, test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute("""SELECT * FROM augur_operations.config""").fetchall() + result = connection.execute("""SELECT * FROM operations.config""").fetchall() for row in result: dict_data = dict(row) @@ -395,7 +395,7 @@ def test_remove_section(test_db_config, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") diff --git a/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py b/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py index 1eb7c7492..363773001 100644 --- a/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py +++ b/tests/test_application/test_db/test_models/test_augur_operations/test_user_group.py @@ -73,19 +73,19 @@ def test_add_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_groups";""") + query = s.text("""SELECT * FROM "operations"."user_groups";""") result = 
connection.execute(query).fetchall() assert result is not None assert len(result) == 3 - query = s.text("""SELECT * FROM "augur_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][0]["id"])) + query = s.text("""SELECT * FROM "operations"."user_groups" WHERE "user_id"={};""".format(data["users"][0]["id"])) result = connection.execute(query).fetchall() assert result is not None assert len(result) == 2 - query = s.text("""SELECT * FROM "augur_operations"."user_groups" WHERE "user_id"={};""".format(data["users"][1]["id"])) + query = s.text("""SELECT * FROM "operations"."user_groups" WHERE "user_id"={};""".format(data["users"][1]["id"])) result = connection.execute(query).fetchall() assert result is not None @@ -212,7 +212,7 @@ def test_remove_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_groups";""") + query = s.text("""SELECT * FROM "operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None @@ -226,7 +226,7 @@ def test_remove_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_groups";""") + query = s.text("""SELECT * FROM "operations"."user_groups";""") result = connection.execute(query).fetchall() assert result is not None diff --git a/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py b/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py index ee7abf4c1..493af0116 100644 --- a/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py +++ b/tests/test_application/test_db/test_models/test_augur_operations/test_user_repo.py @@ -74,7 +74,7 @@ def test_add_repo_to_user_group(test_db_engine): with test_db_engine.connect() as connection: - query = s.text("""SELECT * FROM "augur_operations"."user_repos";""") + query = s.text("""SELECT * FROM 
"operations"."user_repos";""") # WHERE "group_id"=:user_group_id AND "repo_id"=:repo_id result = connection.execute(query).fetchall() @@ -82,14 +82,14 @@ def test_add_repo_to_user_group(test_db_engine): assert len(result) == 4 - query = s.text("""SELECT * FROM "augur_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) + query = s.text("""SELECT * FROM "operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) result = connection.execute(query).fetchall() assert result is not None assert len(result) == 2 - query = s.text("""SELECT * FROM "augur_operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) + query = s.text("""SELECT * FROM "operations"."user_repos" WHERE "group_id"={};""".format(data["user_group_ids"][0])) result = connection.execute(query).fetchall() assert result is not None diff --git a/tests/test_application/test_db/test_session.py b/tests/test_application/test_db/test_session.py index 36698b217..f3fb5f0e8 100644 --- a/tests/test_application/test_db/test_session.py +++ b/tests/test_application/test_db/test_session.py @@ -26,7 +26,7 @@ def test_execute_sql(test_db_engine): for data in all_data: - statement = s.sql.text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", 
"data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 
'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data) @@ -35,7 +35,7 @@ def test_execute_sql(test_db_engine): with DatabaseSession(logger, engine=test_db_engine) as session: cntrb_id = data['cntrb_id'] - result = session.execute_sql(f"SELECT * FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = session.execute_sql(f"SELECT * FROM data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert isinstance(result[0], s.engine.result.RowProxy) @@ -57,7 +57,7 @@ def test_execute_sql(test_db_engine): for data in all_data: cntrb_id = data["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_duplicates(test_db_engine): @@ -79,7 +79,7 @@ def test_insert_data_with_duplicates(test_db_engine): cntrb_id = data_1['cntrb_id'] - result = session.execute_sql(f"SELECT * FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = session.execute_sql(f"SELECT * FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == 3 @@ -94,7 +94,7 @@ def 
test_insert_data_with_duplicates(test_db_engine): for data in duplicate_data_list: cntrb_id = data["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_updates(test_db_engine): @@ -106,7 +106,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: - statement = s.sql.text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 
'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data_1) @@ -117,7 +117,7 @@ def 
test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1['cntrb_id'] - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert dict(result[0])["gh_user_id"] == 6 @@ -127,7 +127,7 @@ def test_insert_data_with_updates(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id='{cntrb_id}';") def test_insert_data_with_bulk(test_db_engine): @@ -145,7 +145,7 @@ def test_insert_data_with_bulk(test_db_engine): with test_db_engine.connect() as connection: - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == 4 @@ -160,7 +160,7 @@ def test_insert_data_with_bulk(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") @@ -172,7 +172,7 @@ def test_insert_data_partial_update(test_db_engine): try: with test_db_engine.connect() as connection: - statement = s.sql.text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", 
"cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") + statement = s.sql.text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", 
"gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, NULL, NULL, NULL, NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, :gh_user_id, :gh_login, 'https://api.github.com/users/ivanayov', 'https://github.com/ivanayov', 'MDQ6VXNlcjQxNjAxMzM=', 'https://avatars.githubusercontent.com/u/4160133?v=4', '', 'https://api.github.com/users/ivanayov/followers', 'https://api.github.com/users/ivanayov/following{/other_user}', 'https://api.github.com/users/ivanayov/gists{/gist_id}', 'https://api.github.com/users/ivanayov/starred{/owner}{/repo}', 'https://api.github.com/users/ivanayov/subscriptions', 'https://api.github.com/users/ivanayov/orgs', 'https://api.github.com/users/ivanayov/repos', 'https://api.github.com/users/ivanayov/events{/privacy}', 'https://api.github.com/users/ivanayov/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'Pr Task', '2.0', 'Github API', '2022-08-05 09:06:39', :cntrb_id);""") connection.execute(statement, **data_1) @@ -183,7 +183,7 @@ def test_insert_data_partial_update(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1['cntrb_id'] - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id='{cntrb_id}'").fetchall() assert result is not None assert dict(result[0])["gh_user_id"] == 6 @@ -193,7 +193,7 @@ def test_insert_data_partial_update(test_db_engine): with test_db_engine.connect() as connection: cntrb_id = data_1["cntrb_id"] - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id='{cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE 
cntrb_id='{cntrb_id}';") issue_data_with_null_strings = [] @@ -210,11 +210,11 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): # insert the cntrb_id and cntrb_login into the contributors table so the contributor is present. # This is so we don't get a foreign key error on the cntrb_id when we insert the prs query = s.sql.text(""" - DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; - INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); + DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; + INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); - INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); + INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", 
"description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); """) connection.execute(query) @@ -232,7 +232,7 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): return_columns=issue_return_columns, string_fields=issue_string_columns) data_inserted_count = len(issue_data_with_null_strings) - result = connection.execute(f"Select * FROM augur_data.issues;").fetchall() + result = connection.execute(f"Select * FROM data.issues;").fetchall() assert issue_return_data is not None assert len(issue_return_data) == data_inserted_count @@ -242,9 +242,9 @@ def test_insert_issue_data_with_invalid_strings(test_db_engine): with test_db_engine.connect() as connection: connection.execute(""" - DELETE FROM augur_data.issues; - DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; + DELETE FROM data.issues; + DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; """) diff --git a/tests/test_application/test_db/test_timestamp_utils.py b/tests/test_application/test_db/test_timestamp_utils.py new file mode 100644 index 000000000..76b1706c6 --- /dev/null +++ b/tests/test_application/test_db/test_timestamp_utils.py @@ -0,0 +1,169 @@ +""" +Unit tests for git commit timestamp correction functions. + +Tests the timestamp_utils module which validates and corrects invalid +timezone offsets in git commit timestamps before PostgreSQL insertion. 
+""" + +import pytest +import logging +from collectoss.application.db.timestamp_utils import ( + correct_timestamp, + clean_commit_timestamps, + POSTGRES_VALID_TIMEZONES +) + + +@pytest.fixture +def test_logger(): + """Provide a basic logger for tests.""" + return logging.getLogger("test_correction") + + +class TestCorrectTimestamp: + """Tests for the correct_timestamp function.""" + + def test_valid_timestamp_unchanged(self, test_logger): + """Valid timestamp should pass through unchanged.""" + valid_ts = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(valid_ts, logger=test_logger) + assert result == valid_ts + + def test_valid_utc_timestamp(self, test_logger): + """UTC timestamp (offset 0) should pass through unchanged.""" + utc_ts = "2025-11-03 16:28:43 +0000" + result = correct_timestamp(utc_ts, logger=test_logger) + assert result == utc_ts + + def test_invalid_timezone_uses_fallback(self, test_logger): + """Invalid timezone should use fallback timestamp.""" + invalid_ts = "2106-02-07 06:28:23 -13068837" + fallback_ts = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(invalid_ts, fallback=fallback_ts, logger=test_logger) + assert result == fallback_ts + + def test_invalid_timezone_uses_utc_if_no_fallback(self, test_logger): + """Invalid timezone without fallback should default to UTC.""" + invalid_ts = "2106-02-07 06:28:23 -13068837" + result = correct_timestamp(invalid_ts, fallback=None, logger=test_logger) + # Should replace timezone with +0000, keep date/time + assert result == "2106-02-07 06:28:23 +0000" + + def test_empty_string_returns_default(self, test_logger): + """Empty timestamp string should return default epoch.""" + result = correct_timestamp("", logger=test_logger) + assert result == "1970-01-01 00:00:15 +0000" + + def test_unparseable_format_returns_default(self, test_logger): + """Unparseable timestamp format should return default.""" + unparseable = "not a timestamp" + result = correct_timestamp(unparseable, 
logger=test_logger) + assert result == "1970-01-01 00:00:15 +0000" + + def test_unparseable_with_fallback_returns_fallback(self, test_logger): + """Unparseable timestamp with fallback should return fallback.""" + unparseable = "not a timestamp" + fallback = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(unparseable, fallback=fallback, logger=test_logger) + assert result == fallback + + def test_none_returns_default(self, test_logger): + """None timestamp (e.g. from record.get() with no default) should return default epoch.""" + result = correct_timestamp(None, logger=test_logger) + assert result == "1970-01-01 00:00:15 +0000" + + def test_none_with_fallback_returns_fallback(self, test_logger): + """None timestamp with fallback should return fallback.""" + fallback = "2025-11-03 16:28:43 -0500" + result = correct_timestamp(None, fallback=fallback, logger=test_logger) + assert result == fallback + + +class TestCleanCommitTimestamps: + """Tests for the clean_commit_timestamps function.""" + + def test_issue_3472_exact_case(self, test_logger): + """Reproduce the exact bug from issue #3472. + + Author timestamp has valid timezone (-0500). + Committer timestamp has invalid timezone (-13068837). + Should use author timestamp as fallback for committer. 
+ """ + records = [ + { + 'cmt_commit_hash': '5de262a839', + 'cmt_author_timestamp': '2025-11-03 16:28:43 -0500', + 'cmt_committer_timestamp': '2106-02-07 06:28:23 -13068837' + } + ] + + clean_commit_timestamps(records, test_logger) + + # Author should be unchanged (valid) + assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 -0500' + + # Committer should use author as fallback (invalid → fallback) + assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 -0500' + + def test_clean_commit_timestamps_batch(self, test_logger): + """Test batch processing of multiple commits.""" + records = [ + { + 'cmt_author_timestamp': '2025-11-03 16:28:43 -0500', # Valid + 'cmt_committer_timestamp': '2025-11-03 16:28:43 -0500' # Valid + }, + { + 'cmt_author_timestamp': '2025-11-04 10:00:00 +0000', # Valid + 'cmt_committer_timestamp': '2106-02-07 06:28:23 -99999' # Invalid + }, + { + 'cmt_author_timestamp': '2025-11-05 12:00:00 -12345', # Invalid + 'cmt_committer_timestamp': '2025-11-05 13:00:00 +0530' # Valid + } + ] + + clean_commit_timestamps(records, test_logger) + + # Record 1: Both valid, unchanged + assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 -0500' + assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 -0500' + + # Record 2: Author valid, committer invalid → use author as fallback + assert records[1]['cmt_author_timestamp'] == '2025-11-04 10:00:00 +0000' + assert records[1]['cmt_committer_timestamp'] == '2025-11-04 10:00:00 +0000' + + # Record 3: Author invalid → UTC, committer valid → unchanged + assert records[2]['cmt_author_timestamp'] == '2025-11-05 12:00:00 +0000' + assert records[2]['cmt_committer_timestamp'] == '2025-11-05 13:00:00 +0530' + + def test_both_timestamps_invalid(self, test_logger): + """When both timestamps invalid, both should default to UTC.""" + records = [ + { + 'cmt_author_timestamp': '2025-11-03 16:28:43 -99999', + 'cmt_committer_timestamp': '2106-02-07 06:28:23 -88888' + } + ] + + 
clean_commit_timestamps(records, test_logger) + + # Author invalid → UTC (no fallback) + assert records[0]['cmt_author_timestamp'] == '2025-11-03 16:28:43 +0000' + + # Committer invalid → fallback to corrected author (which is UTC) + assert records[0]['cmt_committer_timestamp'] == '2025-11-03 16:28:43 +0000' + + +class TestPostgresValidTimezones: + """Verify the POSTGRES_VALID_TIMEZONES set is correct.""" + + def test_valid_timezones_range(self): + """Valid timezones should be in range -12:00 to +14:00.""" + for tz in POSTGRES_VALID_TIMEZONES: + assert -1200 <= tz <= 1400 + + def test_common_timezones_present(self): + """Common timezone offsets should be in the set.""" + common = [0, -500, -400, -800, 100, 530, 800] # UTC, EST, EDT, PST, CET, IST, CST + for tz in common: + assert tz in POSTGRES_VALID_TIMEZONES diff --git a/tests/test_application/test_repo_load_controller/helper.py b/tests/test_application/test_repo_load_controller/helper.py index 051e48eff..d4373132b 100644 --- a/tests/test_application/test_repo_load_controller/helper.py +++ b/tests/test_application/test_repo_load_controller/helper.py @@ -20,27 +20,27 @@ def get_delete_statement(schema, table): def get_repo_delete_statement(): - return get_delete_statement("augur_data", "repo") + return get_delete_statement("data", "repo") def get_repo_group_delete_statement(): - return get_delete_statement("augur_data", "repo_groups") + return get_delete_statement("data", "repo_groups") def get_user_delete_statement(): - return get_delete_statement("augur_operations", "users") + return get_delete_statement("operations", "users") def get_user_repo_delete_statement(): - return get_delete_statement("augur_operations", "user_repos") + return get_delete_statement("operations", "user_repos") def get_user_group_delete_statement(): - return get_delete_statement("augur_operations", "user_groups") + return get_delete_statement("operations", "user_groups") def get_config_delete_statement(): - return 
get_delete_statement("augur_operations", "config") + return get_delete_statement("operations", "config") def get_repo_related_delete_statements(table_list): """Takes a list of tables related to the RepoLoadController class and generates a delete statement. @@ -92,26 +92,26 @@ def add_keys_to_test_db(test_db_engine): def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): - return """INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) + return """INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) def get_user_repo_insert_statement(repo_id, group_id): - return """INSERT INTO "augur_operations"."user_repos" ("repo_id", "group_id") VALUES ({}, {});""".format(repo_id, group_id) + return """INSERT INTO "operations"."user_repos" ("repo_id", "group_id") VALUES ({}, {});""".format(repo_id, group_id) def get_repo_group_insert_statement(rg_id): - return """INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", 
"tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) + return """INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"): - return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) + return """INSERT INTO "operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) def get_user_group_insert_statement(user_id, group_name, group_id=None): if group_id: - return """INSERT INTO "augur_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) + return """INSERT INTO "operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) - return """INSERT INTO "augur_operations"."user_groups" ("user_id", "name") VALUES ({}, '{}');""".format(user_id, group_name) + return """INSERT INTO "operations"."user_groups" ("user_id", "name") VALUES ({}, '{}');""".format(user_id, group_name) ######## 
Helper Functions to get retrieve data from tables ################# @@ -119,7 +119,7 @@ def get_user_group_insert_statement(user_id, group_name, group_id=None): def get_repos(connection, where_string=None): query_list = [] - query_list.append('SELECT * FROM "augur_data"."repo"') + query_list.append('SELECT * FROM "data"."repo"') if where_string: if where_string.endswith(";"): @@ -135,7 +135,7 @@ def get_repos(connection, where_string=None): def get_user_repos(connection): - return connection.execute(s.text("""SELECT * FROM "augur_operations"."user_repos";""")).fetchall() + return connection.execute(s.text("""SELECT * FROM "operations"."user_repos";""")).fetchall() ######## Helper Functions to get repos in an org ################# diff --git a/tests/test_application/test_repo_load_controller/util.py b/tests/test_application/test_repo_load_controller/util.py index 1283e7580..d966a7be7 100644 --- a/tests/test_application/test_repo_load_controller/util.py +++ b/tests/test_application/test_repo_load_controller/util.py @@ -6,27 +6,27 @@ def get_delete_statement(schema, table): def get_repo_delete_statement(): - return get_delete_statement("augur_data", "repo") + return get_delete_statement("data", "repo") def get_repo_group_delete_statement(): - return get_delete_statement("augur_data", "repo_groups") + return get_delete_statement("data", "repo_groups") def get_user_delete_statement(): - return get_delete_statement("augur_operations", "users") + return get_delete_statement("operations", "users") def get_user_repo_delete_statement(): - return get_delete_statement("augur_operations", "user_repos") + return get_delete_statement("operations", "user_repos") def get_user_group_delete_statement(): - return get_delete_statement("augur_operations", "user_groups") + return get_delete_statement("operations", "user_groups") def get_config_delete_statement(): - return get_delete_statement("augur_operations", "config") + return get_delete_statement("operations", "config") def 
get_repo_related_delete_statements(table_list): """Takes a list of tables related to the RepoLoadController class and generates a delete statement. @@ -78,22 +78,22 @@ def add_keys_to_test_db(test_db_engine): def get_repo_insert_statement(repo_id, rg_id, repo_url="place holder url"): - return """INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) + return """INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, {}, '{}', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07');""".format(repo_id, rg_id, repo_url) def get_repo_group_insert_statement(rg_id): - return """INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) + return """INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", 
"rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ({}, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25');""".format(rg_id) def get_user_insert_statement(user_id): - return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, 'bil', 'pass', 'b@gmil.com', 'bill', 'bob', false);""".format(user_id) + return """INSERT INTO "operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, 'bil', 'pass', 'b@gmil.com', 'bill', 'bob', false);""".format(user_id) def get_user_group_insert_statement(user_id, group_name, group_id=None): if group_id: - return """INSERT INTO "augur_operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) + return """INSERT INTO "operations"."user_groups" ("group_id", "user_id", "name") VALUES ({}, {}, '{}');""".format(group_id, user_id, group_name) - return """INSERT INTO "augur_operations"."user_groups" (user_id", "name") VALUES (1, 'default');""".format(user_id, group_name) + return """INSERT INTO "operations"."user_groups" (user_id", "name") VALUES (1, 'default');""".format(user_id, group_name) ######## Helper Functions to get retrieve data from tables ################# @@ -101,7 +101,7 @@ def get_user_group_insert_statement(user_id, group_name, group_id=None): def get_repos(connection, where_string=None): query_list = [] - query_list.append('SELECT * FROM "augur_data"."repo"') + query_list.append('SELECT * FROM "data"."repo"') if where_string: if where_string.endswith(";"): @@ -117,7 +117,7 @@ def get_repos(connection, where_string=None): def get_user_repos(connection): - return connection.execute(s.text("""SELECT * FROM 
"augur_operations"."user_repos";""")).fetchall() + return connection.execute(s.text("""SELECT * FROM "operations"."user_repos";""")).fetchall() ######## Helper Functions to get repos in an org ################# diff --git a/tests/test_classes/test_config_stores.py b/tests/test_classes/test_config_stores.py index 8c15fd020..cf23f646f 100644 --- a/tests/test_classes/test_config_stores.py +++ b/tests/test_classes/test_config_stores.py @@ -14,75 +14,128 @@ def mock_session(): return Mock() -def test_jsonconfig_readonly_flags(mock_logger): - cfg = JsonConfig({"A": {"x": 1}}, mock_logger) - assert cfg.writable is False - assert cfg.empty is False - - -def test_jsonconfig_empty_true_false(mock_logger): - assert JsonConfig({}, mock_logger).empty is True - assert JsonConfig({"A": {}}, mock_logger).empty is False - - -def test_jsonconfig_write_protection(mock_logger): - # JsonConfig should be not writeable by default, so we should be unable to change - # its values, even by abusing references - - data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} - cfg = JsonConfig(data, mock_logger) - - # mutation via input - data["Alpha"]["a"] = 2 - - config_test = cfg.retrieve_dict() - assert config_test != data # the data in the config should not change - - # mutation via output - config_test["Alpha"]["a"] = 3 - - config_test = cfg.retrieve_dict() - assert config_test != data # the data in the config should not change - -def test_jsonconfig_retrieve_has_get(mock_logger): - data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} - cfg = JsonConfig(data, mock_logger) - - # retrieve full dict - assert cfg.retrieve_dict() == data - - # has/get section - assert cfg.has_section("Alpha") is True - assert cfg.has_section("Missing") is False - assert cfg.get_section("Alpha") == {"a": 1, "b": "str"} - assert cfg.get_section("Missing") is None - - # has/get value - assert cfg.has_value("Alpha", "a") is True - assert cfg.has_value("Alpha", "missing") is False - assert cfg.has_value("Missing", "a") is 
False - assert cfg.get_value("Alpha", "a") == 1 - assert cfg.get_value("Alpha", "missing") is None - assert cfg.get_value("Missing", "a") is None - - -@pytest.mark.parametrize( - "callable_name, args, kwargs", - [ - ("load_dict", ({"X": {"y": 2}},), {"ignore_existing": False}), - ("clear", tuple(), {}), - ("remove_section", ("X",), {}), - ("create_section", ("X", {"y": 2}), {"ignore_existing": False}), - ("remove_value", ("X", "y"), {}), - ("add_value", ("X", "y", 2), {"ignore_existing": False}), - ], -) -def test_jsonconfig_mutations_raise_not_writable(mock_logger, callable_name, args, kwargs): - cfg = JsonConfig({"A": {"x": 1}}, mock_logger) - with pytest.raises(NotWriteableException): - getattr(cfg, callable_name)(*args, **kwargs) +class TestJSONConfig: + def test_jsonconfig_readonly_flags(self, mock_logger): + cfg = JsonConfig({"A": {"x": 1}}, mock_logger) + assert cfg.writable is False + assert cfg.empty is False + + def test_jsonconfig_empty_true_false(self, mock_logger): + assert JsonConfig({}, mock_logger).empty is True + assert JsonConfig({"A": {}}, mock_logger).empty is False + + + def test_jsonconfig_write_protection(self, mock_logger): + # JsonConfig should be not writeable by default, so we should be unable to change + # its values, even by abusing references + + data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} + cfg = JsonConfig(data, mock_logger) + + # mutation via input + data["Alpha"]["a"] = 2 + + config_test = cfg.retrieve_dict() + assert config_test != data # the data in the config should not change + + # mutation via output + config_test["Alpha"]["a"] = 3 + + config_test = cfg.retrieve_dict() + assert config_test != data # the data in the config should not change + + def test_jsonconfig_retrieve_has_get(self, mock_logger): + data = {"Alpha": {"a": 1, "b": "str"}, "Beta": {}} + cfg = JsonConfig(data, mock_logger) + + # retrieve full dict + assert cfg.retrieve_dict() == data + + # has/get section + assert cfg.has_section("Alpha") is True + 
assert cfg.has_section("Missing") is False + assert cfg.get_section("Alpha") == {"a": 1, "b": "str"} + assert cfg.get_section("Missing") is None + + # has/get value + assert cfg.has_value("Alpha", "a") is True + assert cfg.has_value("Alpha", "missing") is False + assert cfg.has_value("Missing", "a") is False + assert cfg.get_value("Alpha", "a") == 1 + assert cfg.get_value("Alpha", "missing") is None + assert cfg.get_value("Missing", "a") is None + + + @pytest.mark.parametrize( + "callable_name, args, kwargs", + [ + ("load_dict", ({"X": {"y": 2}},), {"ignore_existing": False}), + ("clear", tuple(), {}), + ("remove_section", ("X",), {}), + ("create_section", ("X", {"y": 2}), {"ignore_existing": False}), + ("remove_value", ("X", "y"), {}), + ("add_value", ("X", "y", 2), {"ignore_existing": False}), + ], + ) + def test_jsonconfig_mutations_raise_not_writable(self, mock_logger, callable_name, args, kwargs): + cfg = JsonConfig({"A": {"x": 1}}, mock_logger) + with pytest.raises(NotWriteableException): + getattr(cfg, callable_name)(*args, **kwargs) + + + def test_fetching_real_defaults(self, mock_logger, mock_session): + cfg = SystemConfig(mock_logger, mock_session) + cfg.config_sources = [JsonConfig(default_config, mock_logger)] + + assert cfg.get_value("Redis", "cache_group") == 0 + + + def test_load_config_utilizes_hierarchy(self): + + default_dict = { + "Section1": {"alpha": 1, "beta": "x"}, + "Section2": {"gamma": False, "delta": 3.14}, + } + + override_dict = { + "Section1": {"beta": "y"}, + "Section2": {"Epsilon": True, "delta": 6.28}, + "Section3": {"hi": "there"} + } + + cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) + + expected_dict = { + "Section1": {"alpha": 1, "beta": "y"}, + "Section2": {"gamma": False, "Epsilon": True, "delta": 6.28}, + "Section3": {"hi": "there"} # test that new sections are accounted for too + } + + assert cfg.load_config() == expected_dict + + + def 
test_get_section_incorporates_hierarchy(self): + + default_dict = { + "Section1": {"alpha": 1, "beta": "x"}, + "Section2": {"gamma": False, "delta": 3.14}, + } + + override_dict = { + "Section1": {"beta": "y"}, + "Section2": {"gamma": False, "delta": 3.14}, + } + + cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) + + expected_dict = {"alpha": 1, "beta": "y"} + + assert cfg.get_section("Section1") == expected_dict + + +@pytest.mark.unit def test_dict_to_config_table_happy_path(): input_dict = { "Section1": {"alpha": 1, "beta": "x"}, @@ -122,53 +175,3 @@ def test_dict_to_config_table_happy_path(): assert rows == expected - -def test_fetching_real_defaults(mock_logger, mock_session): - cfg = SystemConfig(mock_logger, mock_session) - cfg.config_sources = [JsonConfig(default_config, mock_logger)] - - assert cfg.get_value("Redis", "cache_group") == 0 - - -def test_load_config_utilizes_hierarchy(): - - default_dict = { - "Section1": {"alpha": 1, "beta": "x"}, - "Section2": {"gamma": False, "delta": 3.14}, - } - - override_dict = { - "Section1": {"beta": "y"}, - "Section2": {"Epsilon": True, "delta": 6.28}, - "Section3": {"hi": "there"} - } - - cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) - - expected_dict = { - "Section1": {"alpha": 1, "beta": "y"}, - "Section2": {"gamma": False, "Epsilon": True, "delta": 6.28}, - "Section3": {"hi": "there"} # test that new sections are accounted for too - } - - assert cfg.load_config() == expected_dict - - -def test_get_section_incorporates_hierarchy(): - - default_dict = { - "Section1": {"alpha": 1, "beta": "x"}, - "Section2": {"gamma": False, "delta": 3.14}, - } - - override_dict = { - "Section1": {"beta": "y"}, - "Section2": {"gamma": False, "delta": 3.14}, - } - - cfg = SystemConfig(None, None, [JsonConfig(default_dict, mock_logger), JsonConfig(override_dict, mock_logger)]) - - expected_dict = {"alpha": 1, 
"beta": "y"} - - assert cfg.get_section("Section1") == expected_dict - diff --git a/tests/test_classes/test_environment.py b/tests/test_classes/test_environment.py new file mode 100644 index 000000000..e6621062a --- /dev/null +++ b/tests/test_classes/test_environment.py @@ -0,0 +1,83 @@ +from collectoss.application.environment import SystemEnv, extract_prefix +import logging +import os + +logger = logging.getLogger(__name__) + +prefixes = ["COLLECTOSS", "OTHER"] + +class TestExtractPrefix: + def test_env_extract_prefix(self): + assert extract_prefix("OTHER_DB", prefixes) == "OTHER_" + assert extract_prefix("COLLECTOSS_DB", prefixes) == "COLLECTOSS_" + + def test_env_extract_prefix_default(self): + assert extract_prefix("SOME_DB", prefixes) is None + assert extract_prefix("THINGY_DB", prefixes) is None + + + def test_env_extract_prefix_unprefixed(self): + assert extract_prefix("DB", prefixes) is None + +class TestSystemEnv: + + def test_fetching_env(self): + # plain + os.environ["COLLECTOSS_NAME"] = "A" + assert SystemEnv.get("COLLECTOSS_NAME") == "A" + + # fallback handling + os.environ["OTHER_THING"] = "B" + assert SystemEnv.get("COLLECTOSS_THING", None, prefixes) == "B" + + # cleanup + del os.environ["COLLECTOSS_NAME"] + del os.environ["OTHER_THING"] + + def test_fetching_env_backwards(self): + os.environ["COLLECTOSS_NAME"] = "A" + assert SystemEnv.get("OTHER_NAME", None, prefixes) == "A" + + # cleanup + del os.environ["COLLECTOSS_NAME"] + + def test_fetching_env_no_value(self): + assert SystemEnv.get("COLLECTOSS_MISSING", None, prefixes) is None + + def test_fetching_env_default(self): + assert SystemEnv.get("COLLECTOSS_DEFAULT", "SOME", prefixes) == "SOME" + + def test_no_known_prefix(self): + # fallback handling + os.environ["THING"] = "C" + assert SystemEnv.get("THING", None, prefixes) == "C" + + + def test_get_bool_trues(self): + + cases = ["1", "true", "True", "TRUE", "y", "Y", "yes", "Yes"] + + for case in cases: + os.environ["OTHER_BOOL"] = case + assert 
SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == True, f"value '{case}' should resolve to True" + del os.environ["OTHER_BOOL"] + + def test_get_bool_falses(self): + + cases = ["0", "false", "False", "FALSE", "n", "N", "no", "No"] + + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", True, prefixes) == False, f"value '{case}' should resolve to False" + del os.environ["OTHER_BOOL"] + + def test_get_bool_default(self): + + cases = ["?", "maybe", "Stuff", "333"] + + for case in cases: + os.environ["OTHER_BOOL"] = case + assert SystemEnv.get_bool("OTHER_BOOL", False, prefixes) == False, f"value '{case}' should resolve to Default value" + del os.environ["OTHER_BOOL"] + + diff --git a/tests/test_classes/test_github_data_access.py b/tests/test_classes/test_github_data_access.py new file mode 100644 index 000000000..3ebd4db79 --- /dev/null +++ b/tests/test_classes/test_github_data_access.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: MIT +import pytest +from unittest.mock import Mock, patch + +from collectoss.tasks.github.util.github_data_access import GithubDataAccess + + +@pytest.fixture +def mock_logger(): + return Mock() + + +@pytest.fixture +def mock_key_manager(): + return Mock() + + +@pytest.fixture +def gda(mock_key_manager, mock_logger): + with patch("collectoss.tasks.github.util.github_data_access.KeyClient"): + return GithubDataAccess(mock_key_manager, mock_logger) + +@pytest.mark.unit +class TestEndpointUrl: + + def test_basic_path(self, gda): + result = gda.endpoint_url("/users/MoralCode") + assert result == "https://api.github.com/users/MoralCode" + + def test_path_without_leading_slash(self, gda): + result = gda.endpoint_url("repos/owner/repo") + assert result == "https://api.github.com/repos/owner/repo" + + def test_with_single_param(self, gda): + result = gda.endpoint_url("/users/MoralCode", {"per_page": "100"}) + assert "per_page=100" in result + assert 
result.startswith("https://api.github.com/users/MoralCode") + + def test_with_multiple_params(self, gda): + result = gda.endpoint_url("/repos/owner/repo/pulls", {"per_page": "50", "state": "open"}) + assert "per_page=50" in result + assert "state=open" in result + assert result.startswith("https://api.github.com/repos/owner/repo/pulls") + + def test_none_params_produces_no_query_string(self, gda): + result = gda.endpoint_url("/users/MoralCode", None) + assert result == "https://api.github.com/users/MoralCode" + + def test_empty_params_produces_no_query_string(self, gda): + result = gda.endpoint_url("/users/MoralCode", {}) + assert result == "https://api.github.com/users/MoralCode" + + def test_path_with_existing_query_params(self, gda): + result = gda.endpoint_url("/search/repositories?q=python", {"per_page": "10"}) + assert "q=python" in result + assert "per_page=10" in result + assert result.startswith("https://api.github.com/search/repositories") diff --git a/tests/test_helpers.py b/tests/test_helpers.py index a0401f369..e73be9c9e 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -14,7 +14,7 @@ def set_search_path(dbapi_connection, connection_record): existing_autocommit = dbapi_connection.autocommit dbapi_connection.autocommit = True cursor = dbapi_connection.cursor() - cursor.execute("SET SESSION search_path=public,augur_data,augur_operations,spdx") + cursor.execute("SET SESSION search_path=public,data,operations,spdx") cursor.close() dbapi_connection.autocommit = existing_autocommit @@ -79,4 +79,4 @@ def test_discover_config_file_env_no_exception(): -""" \ No newline at end of file +""" diff --git a/tests/test_tasks/test_git/__init__.py b/tests/test_tasks/test_git/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_tasks/test_github_tasks/test_pull_requests.py b/tests/test_tasks/test_github_tasks/test_pull_requests.py index 4dc2c9e73..847522303 100644 --- 
a/tests/test_tasks/test_github_tasks/test_pull_requests.py +++ b/tests/test_tasks/test_github_tasks/test_pull_requests.py @@ -263,7 +263,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb with test_db_session.engine.connect() as connection: - result = connection.execute(f"SELECT * FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() + result = connection.execute(f"SELECT * FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}'").fetchall() assert result is not None assert len(result) == len(unique_contributors) @@ -277,7 +277,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb with test_db_session.engine.connect() as connection: - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") repos = [] repos.append({"owner": "chaoss", "repo": "augur"}) @@ -312,13 +312,13 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): # insert the cntrb_id and cntrb_login into the contributors table so the contributor is present. 
# This is so we don't get a foreign key error on the cntrb_id when we insert the prs - query = text("""INSERT INTO "augur_data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, :gh_login, 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46', :cntrb_id); + query = text("""INSERT INTO "data"."contributors" ("cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", 
"cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date", "cntrb_id") VALUES (:cntrb_login, 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, :gh_login, 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46', :cntrb_id); - DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; - INSERT INTO "augur_data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 
'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); + DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; + INSERT INTO "data"."repo_groups" ("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2019-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); - INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); + INSERT INTO "data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "repo_archived_date_collected", "repo_archived", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (1, 1, 'https://github.com/chaoss/collectoss', NULL, NULL, '2022-08-15 21:08:07', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 'CLI', '1.0', 'Git', '2022-08-15 21:08:07'); """) connection.execute(query, **contributor) @@ -336,7 +336,7 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: - result = connection.execute(f"SELECT * FROM augur_data.pull_requests;").fetchall() + result = connection.execute(f"SELECT * FROM data.pull_requests;").fetchall() assert result is not None assert 
len(result) == len(prs) == len(return_data) @@ -353,11 +353,11 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): with test_db_session.engine.connect() as connection: - connection.execute(f"DELETE FROM augur_data.pull_requests;") - connection.execute("""DELETE FROM "augur_data"."repo"; - DELETE FROM "augur_data"."repo_groups"; + connection.execute(f"DELETE FROM data.pull_requests;") + connection.execute("""DELETE FROM "data"."repo"; + DELETE FROM "data"."repo_groups"; """) - connection.execute(f"DELETE FROM augur_data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") + connection.execute(f"DELETE FROM data.contributors WHERE cntrb_id!='{not_provided_cntrb_id}' AND cntrb_id!='{nan_cntrb_id}';") diff --git a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py index edf5ac3cf..44d42f3f3 100644 --- a/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py +++ b/tests/test_tasks/test_task_utilities/test_key_handler/test_github_api_key_handler.py @@ -33,7 +33,7 @@ def test_get_config_key(key_handler, test_db_engine): data = {"github_api_key": "asdfdfkey"} with test_db_engine.connect() as connection: - query = text("""INSERT INTO "augur_operations"."config" ("id", "section_name", "setting_name", "value", "type") VALUES (3, 'Keys', 'github_api_key', :github_api_key, 'str');""") + query = text("""INSERT INTO "operations"."config" ("id", "section_name", "setting_name", "value", "type") VALUES (3, 'Keys', 'github_api_key', :github_api_key, 'str');""") connection.execute(query, **data) @@ -43,7 +43,7 @@ def test_get_config_key(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.config""") + connection.execute("""DELETE FROM operations.config""") def 
test_get_config_key_with_none_specified(key_handler, test_db_engine): @@ -64,7 +64,7 @@ def test_get_api_keys_from_database(key_handler, test_db_engine): for value in data: - query = text("""INSERT INTO "augur_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") + query = text("""INSERT INTO "operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") connection.execute(query, **value) @@ -78,7 +78,7 @@ def test_get_api_keys_from_database(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.worker_oauth""") + connection.execute("""DELETE FROM operations.worker_oauth""") api_key_list = ["asdfdfkey", "jloire", "zdfdr", "asdrxer"] @pytest.mark.parametrize("api_key", api_key_list) @@ -101,7 +101,7 @@ def test_get_api_keys(key_handler, test_db_engine): for value in data: - query = text("""INSERT INTO "augur_operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") + query = text("""INSERT INTO "operations"."worker_oauth" ("name", "consumer_key", "consumer_secret", "access_token", "access_token_secret", "repo_directory", "platform") VALUES ('test_key', '0', '0', :api_key, '0', NULL, 'github');""") connection.execute(query, **value) @@ -112,4 +112,4 @@ def test_get_api_keys(key_handler, test_db_engine): finally: with test_db_engine.connect() as connection: - connection.execute("""DELETE FROM augur_operations.worker_oauth""") \ No newline at end of file + connection.execute("""DELETE FROM operations.worker_oauth""") \ No newline at end of file diff 
--git a/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py b/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py index a8ea375f3..ec0c27745 100644 --- a/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py +++ b/tests/test_tasks/test_task_utilities/test_paginators/test_github_paginator.py @@ -92,7 +92,7 @@ def test_github_paginator_len(key_auth): assert len_contributors_list == 0 -def test_github_paginator_get_item(key_auth): +def test_github_paginator_get_item_2(key_auth): owner = "chaoss" name = "whitepaper" diff --git a/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py b/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py new file mode 100644 index 000000000..2ae9e8807 --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_util/test_check_swapped_emails.py @@ -0,0 +1,34 @@ +import pytest +from collectoss.tasks.git.util.facade_worker.facade_worker.analyzecommit import check_swapped_emails + +def test_correct_input_unchanged(): + name, email = check_swapped_emails("John Smith", "john@gmail.com") + assert name == "John Smith" + assert email == "john@gmail.com" + +def test_swapped_input_is_corrected(): + name, email = check_swapped_emails("john@gmail.com", "John Smith") + assert name == "John Smith" + assert email == "john@gmail.com" + +def test_name_field_contains_mixed_name_and_email(): + # name field has both a name and email mixed together + name, email = check_swapped_emails("John Smith john@gmail.com", "") + assert name == "" + assert email == "John Smith john@gmail.com" + +def test_email_field_contains_mixed_name_and_email(): + # email field has both a name and email mixed together + name, email = check_swapped_emails("John Smith", "John Smith john@gmail.com") + assert name == "John Smith" + assert email == "John Smith john@gmail.com" + +def test_both_fields_contain_mixed_name_and_email(): + name, email = 
check_swapped_emails("John Smith john@gmail.com", "Jane Doe jane@gmail.com") + assert name == "John Smith john@gmail.com" + assert email == "Jane Doe jane@gmail.com" + +def test_when_both_empty_strings(): + name, email = check_swapped_emails("", "") + assert name == "" + assert email == "" diff --git a/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py b/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py new file mode 100644 index 000000000..40f5cdc27 --- /dev/null +++ b/tests/test_tasks/test_task_utilities/test_util/test_contributor_uuid.py @@ -0,0 +1,161 @@ +import pytest +import uuid +from collectoss.tasks.util.ContributorUUID import ContributorUUID, GithubUUID, GitlabUUID, UnresolvableUUID + +# ContributorUUID tests +@pytest.mark.unit +class TestContributorUUID: + # this checks whether a brand new ContributorUUID object starts as 16 zero bytes + def test_augur_uuid_initializes_with_16_zero_bytes(self): + uid = ContributorUUID() + assert len(uid.bytes) == 16 + assert all(b == 0 for b in uid.bytes) + + # checks that githubUUID sets its platform number to 1 + def test_github_uuid_platform_is_1(self): + uid = GithubUUID() + assert uid["platform"] == 1 + + # checks that gitlabUUID sets its platform number to 2 + def test_gitlab_uuid_platform_is_2(self): + uid = GitlabUUID() + assert uid["platform"] == 2 + + # checks the that you can store a value in the user field + def test_github_uuid_set_user(self): + uid = GithubUUID() + uid["user"] = 12345 + assert uid["user"] == 12345 + + # tests platform_id edge cases + def test_set_platform_id_raises_on_non_integer(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_platform_id("github") + + def test_set_platform_id_raises_on_overflow(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_platform_id(256) # too big for 1 byte + + # checks that writing to one field doesnt accidentally overwrite bytes belonging to another field + def 
test_fields_dont_overlap(self): + uid = GithubUUID() + + uid["user"] = 12345 + uid["repo"] = 99999 + + assert uid["user"] == 12345 + assert uid["repo"] == 99999 + + # checks that to_UUID returs the uuid.UUID object + def test_to_uuid_returns_valid_uuid(self): + uid = GithubUUID() + uid["user"] = 15 + result = uid.to_UUID() + assert isinstance(result, uuid.UUID) + + # checks the start_byte is within range(0, 16) for set_bytes + def test_set_bytes_raises_on_invalid_start_byte(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_bytes([1, 2, 3], 16) + + # checks that set_bytes correctly raises an error when you write more bytes that will fit in the UUID starting at a given position + def test_set_bytes_raises_on_too_many_bytes(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_bytes([1] * 10, 10) + + # checks that writeint correctly rejects a number + def test_write_int_raises_on_overflow(self): + uid = GithubUUID() + with pytest.raises(ValueError): + uid["user"] = 99999999999 # too big for 4 bytes + + def test_write_int_with_non_integer(self): + uid = GithubUUID() + + with pytest.raises(ValueError): + uid.write_int("abc", 1, 4) + + def test_write_int_and_get_int_roundtrip(self): + uid = ContributorUUID() + uid.write_int(65535, 1, 2) + assert uid.get_int(1, 2) == 65535 + + # checks __int__ method + def test_int_conversion(self): + uid = ContributorUUID() + uid.set_byte(15, 1) + assert int(uid) == 1 + + def test_get_byte_invalid_index(self): + uid = ContributorUUID() + with pytest.raises(IndexError): + uid.get_byte(20) + + # checks that set_byte correctly rejects a value that is too large + def test_set_byte_raises_on_invalid_value(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_byte(0, 256) # too big for one byte + + # checks that set_byte rejects an index that doesnt exist + def test_set_byte_raises_on_out_of_range_index(self): + uid = ContributorUUID() + with pytest.raises(IndexError): + 
uid.set_byte(16, 1) # index 16 is out of bounds + + def test_set_byte_raises_on_non_integer(self): + uid = ContributorUUID() + with pytest.raises(ValueError): + uid.set_byte(0, "hello") + + # checks that 2 UUIDs with the same values are considered equal. + def test_equality(self): + uid1 = GithubUUID() + uid2 = GithubUUID() + uid1["user"] = 100 + uid2["user"] = 100 + assert uid1 == uid2 + + # checks that 2 UUIDs with different values are not equal + def test_inequality(self): + uid1 = GithubUUID() + uid2 = GithubUUID() + uid1["user"] = 100 + uid2["user"] = 200 + assert uid1 != uid2 + + # checks that the same user produces different user IDs across platforms + def test_github_and_gitlab_different_for_same_user(self): + github_uid = GithubUUID() + gitlab_uid = GitlabUUID() + github_uid["user"] = 100 + gitlab_uid["user"] = 100 + assert github_uid != gitlab_uid + + def test_dict_representation(self): + uid = GithubUUID() + uid["user"] = 10 + + result = uid.__dict__() + + assert result["platform"] == 1 + assert result["user"] == 10 + + def test_string_representation(self): + uid = GithubUUID() + uid["user"] = 10 + + result = str(uid) + + assert "user" in result + assert "platform" in result + + def test_setting_same_field_twice(self): + uid = GithubUUID() + uid["user"] = 42 + uid["user"] = 100 # overwrite with different value + assert uid["user"] == 100 \ No newline at end of file diff --git a/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py b/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py index affd40248..410c1ef70 100644 --- a/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py +++ b/tests/test_tasks/test_task_utilities/test_util/test_worker_util.py @@ -2,11 +2,12 @@ import pytest import sqlalchemy as s -from collectoss.tasks.util.worker_util import * +from collectoss.tasks.util.worker_util import remove_duplicates_by_uniques logger = logging.getLogger(__name__) -def 
test_remove_duplicates_by_uniques(test_db_engine): +@pytest.mark.unit +def test_remove_duplicates_by_uniques(): data_1 = {"cntrb_login": "Bob", "gh_user_id": 4, "gh_login": "bob", "cntrb_id": "01003f7a-8500-0000-0000-000000000000"} data_2 = {"cntrb_login": "amazing", "gh_user_id": 1700, "gh_login": "hello", "cntrb_id": "01003f7a-8500-0000-0000-000123002000"} diff --git a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py index 980f09fb3..4ab662561 100644 --- a/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py +++ b/tests/test_workers/test_facade/test_facade_contributor_interface/test_endpoints.py @@ -9,11 +9,11 @@ @pytest.fixture def set_up_repo_groups(database_connection): - df = pd.read_sql(s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), database_connection) + df = pd.read_sql(s.sql.text("SELECT repo_group_id FROM data.repo_groups"), database_connection) repo_group_IDs = df['repo_group_id'].values.tolist() insert_repo_group_sql = s.sql.text(""" - INSERT INTO "augur_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); + INSERT INTO "data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:repo_group_id, :repo_group_name, '', '', 0, CURRENT_TIMESTAMP, 'Unknown', 'Loaded by user', '1.0', 'Git', CURRENT_TIMESTAMP); """) with open("tests/test_workers/test_facade/test_facade_contributor_interface/test_repo_groups.csv") as create_repo_groups_file: @@ -31,12 +31,12 @@ def 
set_up_repo_groups(database_connection): - df = database_connection.execute(s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups")) + df = database_connection.execute(s.sql.text("SELECT repo_group_id FROM data.repo_groups")) repo_group_IDs = [group[0] for group in df.fetchall()] insertSQL = s.sql.text(""" - INSERT INTO augur_data.repo(repo_group_id, repo_git, + INSERT INTO data.repo(repo_group_id, repo_git, tool_source, tool_version, data_source, data_collection_date) VALUES (:repo_group_id, :repo_git, 'CLI', 1.0, 'Git', CURRENT_TIMESTAMP) """) diff --git a/tests/test_workers/test_set_up_fixtures.py b/tests/test_workers/test_set_up_fixtures.py index 8d3ad70f1..9248d238a 100644 --- a/tests/test_workers/test_set_up_fixtures.py +++ b/tests/test_workers/test_set_up_fixtures.py @@ -12,7 +12,7 @@ def poll_database_connection(database_string): print("Attempting to create db engine") db = s.create_engine(database_string, poolclass=s.pool.NullPool, - connect_args={'options': '-csearch_path={}'.format('augur_data')}) + connect_args={'options': '-csearch_path={}'.format('data')}) return db @@ -153,8 +153,8 @@ def initialize_database_connections(self): "augur", "augur", "172.17.0.1", 5400, "test" ) - self.db_schema = 'augur_data' - self.helper_schema = 'augur_operations' + self.db_schema = 'data' + self.helper_schema = 'operations' self.helper_db = s.create_engine(DB_STR, poolclass=s.pool.NullPool, connect_args={'options': '-csearch_path={}'.format(self.helper_schema)})