From 798a8937cabfb59f5c95cbf69946c3d2f2f34c92 Mon Sep 17 00:00:00 2001 From: Sean Goggins Date: Thu, 4 May 2023 15:57:37 -0500 Subject: [PATCH 01/48] documentation tweak --- docs/source/quick-start.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst index 88e7ac5d46..6ae6836cf1 100644 --- a/docs/source/quick-start.rst +++ b/docs/source/quick-start.rst @@ -1,9 +1,6 @@ Quickstart =============== -Augur Setup ------------ - Ubuntu 22.x =========== From 6d47b3f37182a3cec54cdd1214285469d3459583 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 9 May 2023 10:48:17 -0500 Subject: [PATCH 02/48] mkdir Signed-off-by: Isaac Milarsky --- .../versions/18_schedule_any_old_facade_repositories_to.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py b/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py index 2b94987e8b..84f8f088b1 100644 --- a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py +++ b/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py @@ -55,7 +55,7 @@ def total_facade_reset(): shutil.rmtree(path) #Create path - path.touch() + path.mkdir() #Move credentials in shutil.move("/tmp/.git-credentials",f"{facade_base_dir}.git-credentials") From 5d7c885da3215ca8148874506001520621f62bf3 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 9 May 2023 18:23:03 -0500 Subject: [PATCH 03/48] set repo back to error when an unexpected error happened Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 52b3f28130..406e74f16d 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -353,25 +353,29 @@ def clone_repos(): try: git_repo_initialize(session, repo_git) session.commit() + + # get the commit count + commit_count = get_repo_commit_count(session, repo_git) + facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + + update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + + # set repo to update + setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) + session.commit() except GitCloneError: # continue to next repo, since we can't calculate # commit_count or weight without the repo cloned - setattr(repoStatus,"facade_status", CollectionState.FAILED_CLONE.value) session.commit() - continue - - #logger.info("GOT HERE ISAAC") - - # get the commit count - commit_count = get_repo_commit_count(session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) - - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + except Exception as e: + logger.info(f"Ran into unexpected issue when cloning repositories \n Error: {e}") + # set repo to error + setattr(repoStatus,"facade_status", CollectionState.ERROR.value) + session.commit() - # set repo to update - setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) - session.commit() + #Raise exception to activate handling before retry of task. 
+ raise e clone_repos.si().apply_async(countdown=60*5) From b08cb3cf7f360d7327afd09a93999155a28d1430 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 10 May 2023 10:20:08 -0500 Subject: [PATCH 04/48] remove raise of error Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 406e74f16d..7f9e1c9b16 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -374,9 +374,6 @@ def clone_repos(): setattr(repoStatus,"facade_status", CollectionState.ERROR.value) session.commit() - #Raise exception to activate handling before retry of task. - raise e - clone_repos.si().apply_async(countdown=60*5) From 75ef404007b6eb540b4f6ec798f272e51d44cfcb Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 10 May 2023 10:23:04 -0500 Subject: [PATCH 05/48] error log Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 7f9e1c9b16..d407011b06 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -369,7 +369,7 @@ def clone_repos(): setattr(repoStatus,"facade_status", CollectionState.FAILED_CLONE.value) session.commit() except Exception as e: - logger.info(f"Ran into unexpected issue when cloning repositories \n Error: {e}") + logger.error(f"Ran into unexpected issue when cloning repositories \n Error: {e}") # set repo to error setattr(repoStatus,"facade_status", CollectionState.ERROR.value) session.commit() From abf751399c80505e7cb9f2b72fa00700e6b9393c Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 10 May 2023 14:53:46 -0500 Subject: [PATCH 06/48] Fix success check for org and repo adding Signed-off-by: Andrew Brain --- augur/api/view/api.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 084ee4bce4..e0fd36d6f0 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -40,14 +40,14 @@ def av_add_user_repo(): # matches https://github.com/{org}/ or htts://github.com/{org} if Repo.parse_github_org_url(url): - added = current_user.add_org(group, url) + added = current_user.add_org(group, url)[0] if added: added_orgs += 1 # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: print("Adding repo") - added = current_user.add_repo(group, url) + added = current_user.add_repo(group, url)[0] if added: print("Repo added") added_repos += 1 @@ -56,7 +56,7 @@ def av_add_user_repo(): elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', url)): org, repo = match.groups() repo_url = f"https://github.com/{org}/{repo}/" - added = current_user.add_repo(group, repo_url) + added = current_user.add_repo(group, repo_url)[0] if added: added_repos += 1 @@ -64,11 +64,10 @@ def av_add_user_repo(): elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', url)): org = match.group(1) org_url = f"https://github.com/{org}/" - added = current_user.add_org(group, org_url) + added = current_user.add_org(group, org_url)[0] if added: added_orgs += 1 - if not added_orgs and not added_repos: flash(f"Unable to add any repos or orgs") else: From 4604eff05c5b274ff0f2cb564febdf50b33d3a57 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 10 May 2023 14:54:31 -0500 Subject: [PATCH 07/48] Raise exception if no valid github api keys exist 
Signed-off-by: Andrew Brain --- augur/tasks/github/util/github_api_key_handler.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index 86055d7a7f..2406ecef00 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -8,6 +8,11 @@ from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig + +class NoValidKeysError(Exception): + pass + + class GithubApiKeyHandler(): """Handles Github API key retrieval from the database and redis @@ -122,6 +127,9 @@ def get_api_keys(self) -> List[str]: # add all the keys to redis self.redis_key_list.extend(valid_keys) + if not valid_keys: + raise NoValidKeysError("No valid github api keys found in the config or worker oauth table") + return valid_keys def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: From 5cca96d20e45c02d81ae32fbefe0aefff62eedbb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 10 May 2023 14:55:12 -0500 Subject: [PATCH 08/48] Catch exception when no valid keys exists and return False Signed-off-by: Andrew Brain --- augur/application/db/models/augur_operations.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index e65363ffb9..8e60a4aaf2 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -429,9 +429,14 @@ def remove_group(self, group_name): def add_repo(self, group_name, repo_url): from augur.tasks.github.util.github_task_session import GithubTaskSession + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, self.user_id, group_name) + try: + with GithubTaskSession(logger) as session: + result = UserRepo.add(session, repo_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} return result @@ -445,9 +450,13 @@ def remove_repo(self, group_name, repo_id): def add_org(self, group_name, org_url): from augur.tasks.github.util.github_task_session import GithubTaskSession + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError - with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + try: + with GithubTaskSession(logger) as session: + result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} return result From bab1ce46a6c192dbeb63996f9da4fe57bd231c36 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 10 May 2023 16:06:18 -0500 Subject: [PATCH 09/48] Handle github 204 api status Signed-off-by: Andrew Brain --- augur/tasks/github/util/github_paginator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 1c252d8ce6..548d25b0f9 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -384,6 +384,10 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. 
timeout = timeout * 1.1 num_attempts += 1 continue + + # if api returns a status of 204 No Content then return empty list + if response.status_code == 204: + return [], response, GithubApiResult.SUCCESS page_data = parse_json_response(self.logger, response) From 337268ce902e9587aa2c87b8a99f699aee771c1e Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 12:38:03 -0500 Subject: [PATCH 10/48] default materialized view refresh for every 7 days Signed-off-by: Isaac Milarsky --- augur/tasks/init/celery_app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 60701aab6a..bb6348f619 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -3,6 +3,7 @@ import logging from typing import List, Dict import os +import datetime from enum import Enum import traceback import celery @@ -211,7 +212,7 @@ def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s()) logger.info(f"Scheduling refresh materialized view every night at 1am CDT") - sender.add_periodic_task(crontab(hour=1, minute=0), refresh_materialized_views.s()) + sender.add_periodic_task(datetime.timedelta(days=7), refresh_materialized_views.s()) logger.info(f"Scheduling update of collection weights on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) From aba50ad1ec93079f0563f0f4c09c0d81a2b9c284 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 12:52:13 -0500 Subject: [PATCH 11/48] simplify worker start logic Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 50 ++++++++++++++------------------ 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 7cf1292ca1..49756ce262 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -83,26 +83,15 @@ def start(disable_collection, development, port): logger.info('Gunicorn webserver started...') logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') - scheduling_worker_process = None - core_worker_process = None - secondary_worker_process = None + celery_beat_process = None - facade_worker_process = None if not disable_collection: if os.path.exists("celerybeat-schedule.db"): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=10 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=15 -n facade:{uuid.uuid4().hex}@%h -Q facade" - - scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) - core_worker_process = subprocess.Popen(core_worker.split(" ")) - secondary_worker_process = subprocess.Popen(secondary_worker.split(" ")) - facade_worker_process = subprocess.Popen(facade_worker.split(" ")) + processes = start_celery_worker_processes() time.sleep(5) @@ -135,21 +124,10 @@ def start(disable_collection, development, port): logger.info("Shutting down server") 
server.terminate() - if core_worker_process: - logger.info("Shutting down celery process: core") - core_worker_process.terminate() - - if scheduling_worker_process: - logger.info("Shutting down celery process: scheduling") - scheduling_worker_process.terminate() - - if secondary_worker_process: - logger.info("Shutting down celery process: secondary") - secondary_worker_process.terminate() - - if facade_worker_process: - logger.info("Shutting down celery process: facade") - facade_worker_process.terminate() + logger.info("Shutting down all celery worker processes") + for p in processes: + if p: + p.terminate() if celery_beat_process: logger.info("Shutting down celery beat process") @@ -162,6 +140,21 @@ def start(disable_collection, development, port): except RedisConnectionError: pass +def start_celery_worker_processes(): + + scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=10 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=15 -n facade:{uuid.uuid4().hex}@%h -Q facade" + + process_list = [] + process_list.append(subprocess.Popen(scheduling_worker.split(" "))) + process_list.append(subprocess.Popen(core_worker.split(" "))) + process_list.append(subprocess.Popen(secondary_worker.split(" "))) + process_list.append(subprocess.Popen(facade_worker.split(" "))) + + return process_list + @cli.command('stop') def stop(): @@ -378,7 +371,6 @@ def raise_open_file_limit(num_files): return - # def initialize_components(augur_app, disable_housekeeper): # master = None # manager = None From c8355de43ce9eface9f1551216136ca868e1b9c8 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 13:20:44 -0500 Subject: [PATCH 12/48] write method to scale celery processes based on memory Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 49756ce262..0dfda7b8bb 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -142,10 +142,22 @@ def start(disable_collection, development, port): def start_celery_worker_processes(): + #Calculate process scaling based on how much memory is available on the system in bytes. + #Each celery process takes ~500MB or 500 * 1024^2 bytes + + available_memory_in_bytes = psutil.virtual_memory().available + available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) + process_estimate = available_memory_in_megabytes // 500 + + #2 processes are always reserved as a baseline. 
scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=10 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=15 -n facade:{uuid.uuid4().hex}@%h -Q facade" + + #60% of estimate + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={round(process_estimate * .6)} -n core:{uuid.uuid4().hex}@%h" + #20% of estimate + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={round(process_estimate * .2)} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + #15% of estimate + facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={round(process_estimate * .15)} -n facade:{uuid.uuid4().hex}@%h -Q facade" process_list = [] process_list.append(subprocess.Popen(scheduling_worker.split(" "))) From 5949f138d1867133e197fc8cbcb3fe4de7c219d0 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 13:43:36 -0500 Subject: [PATCH 13/48] Cap algorithm at a maximum amount of processes to schedule Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 0dfda7b8bb..582a758d80 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -147,22 +147,38 @@ def start_celery_worker_processes(): available_memory_in_bytes = psutil.virtual_memory().available available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) - process_estimate = available_memory_in_megabytes // 500 + max_process_estimate = available_memory_in_megabytes // 500 + + #Get a subset of the maximum procesess available using a ratio, not exceeding a maximum value + def determine_worker_processes(ratio,maximum): + return min(round(max_process_estimate * ratio),maximum) #2 processes are always reserved as a baseline. 
scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - #60% of estimate - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={round(process_estimate * .6)} -n core:{uuid.uuid4().hex}@%h" - #20% of estimate - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={round(process_estimate * .2)} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" - #15% of estimate - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={round(process_estimate * .15)} -n facade:{uuid.uuid4().hex}@%h -Q facade" + core_num_processes = determine_worker_processes(.6, 45) + #60% of estimate, Maximum value of 45 + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" + + secondary_num_processes = determine_worker_processes(.2, 25) + #20% of estimate, Maximum value of 25 + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + + facade_num_processes = determine_worker_processes(.15, 20) + #15% of estimate, Maximum value of 20 + facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" process_list = [] + process_list.append(subprocess.Popen(scheduling_worker.split(" "))) + + logger.info(f"Starting core worker processes with concurrency={core_num_processes}") process_list.append(subprocess.Popen(core_worker.split(" "))) + + logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") process_list.append(subprocess.Popen(secondary_worker.split(" "))) + + logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") process_list.append(subprocess.Popen(facade_worker.split(" "))) return process_list From f97b648783fa2fd8ad546ae2aa0943ffcee183d3 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 14:26:12 -0500 Subject: [PATCH 14/48] fix indent issue due to uncaught merge error Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 8e60a4aaf2..6d714e6ca8 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -430,8 +430,6 @@ def add_repo(self, group_name, repo_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError - - with GithubTaskSession(logger) as session: try: with GithubTaskSession(logger) as session: result = UserRepo.add(session, repo_url, self.user_id, group_name) From 6b7324af2fa68c092df7be8bad7ac51592c0da16 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 14:50:56 -0500 Subject: [PATCH 15/48] add hard cap on memory usage with celery worker children Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 582a758d80..773306544a 100644 --- a/augur/application/cli/backend.py +++ 
b/augur/application/cli/backend.py @@ -145,7 +145,8 @@ def start_celery_worker_processes(): #Calculate process scaling based on how much memory is available on the system in bytes. #Each celery process takes ~500MB or 500 * 1024^2 bytes - available_memory_in_bytes = psutil.virtual_memory().available + #Cap memory usage to 30% of total virtual memory + available_memory_in_bytes = psutil.virtual_memory().total * .3 available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) max_process_estimate = available_memory_in_megabytes // 500 From b0ea69a975ac876c2101e78b1623ba882d5519f8 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 15:13:03 -0500 Subject: [PATCH 16/48] subtract 2 from max_process_estimate to ensure not to exceed 30% of total virtual memory Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 773306544a..c2380dfa3c 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -156,6 +156,7 @@ def determine_worker_processes(ratio,maximum): #2 processes are always reserved as a baseline. scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + max_process_estimate -= 2 core_num_processes = determine_worker_processes(.6, 45) #60% of estimate, Maximum value of 45 @@ -165,7 +166,7 @@ def determine_worker_processes(ratio,maximum): #20% of estimate, Maximum value of 25 secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" - facade_num_processes = determine_worker_processes(.15, 20) + facade_num_processes = determine_worker_processes(.2, 20) #15% of estimate, Maximum value of 20 facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" From de0d9b04564a637729b2c878b451563c95e11f2d Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 15:27:09 -0500 Subject: [PATCH 17/48] Make sure that each worker has a minimum process of 1 Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index c2380dfa3c..63febe3b0b 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -146,13 +146,13 @@ def start_celery_worker_processes(): #Each celery process takes ~500MB or 500 * 1024^2 bytes #Cap memory usage to 30% of total virtual memory - available_memory_in_bytes = psutil.virtual_memory().total * .3 + available_memory_in_bytes = psutil.virtual_memory().total * .4 available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) max_process_estimate = available_memory_in_megabytes // 500 #Get a subset of the maximum procesess available using a ratio, not exceeding a maximum value def determine_worker_processes(ratio,maximum): - return min(round(max_process_estimate * ratio),maximum) + return max(min(round(max_process_estimate * ratio),maximum),1) #2 processes are always reserved as a baseline. 
scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" From fc72e8e1b8b631cc05b0cdc80158af54d48181f0 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 11 May 2023 15:32:03 -0500 Subject: [PATCH 18/48] default 25% Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 63febe3b0b..0a58e8236a 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -146,7 +146,7 @@ def start_celery_worker_processes(): #Each celery process takes ~500MB or 500 * 1024^2 bytes #Cap memory usage to 30% of total virtual memory - available_memory_in_bytes = psutil.virtual_memory().total * .4 + available_memory_in_bytes = psutil.virtual_memory().total * .25 available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) max_process_estimate = available_memory_in_megabytes // 500 From 10a16cbe992de04c89ee31685f4d4851d27000b9 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 15 May 2023 12:30:09 -0500 Subject: [PATCH 19/48] Add alembic revision for new version of the config Signed-off-by: Isaac Milarsky --- augur/application/config.py | 4 +- ...add_extra_celery_options_to_the_config_.py | 50 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py diff --git a/augur/application/config.py b/augur/application/config.py index 134dd3dafd..1c59965159 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -66,7 +66,9 @@ def get_development_flag(): "log_level": "INFO", }, "Celery": { - "concurrency": 12 + "concurrency": 12, + "worker_process_vmem_cap": 0.25, + "refresh_materialized_views_interval_in_days": 7 }, "Redis": { "cache_group": 0, diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py new file mode 100644 index 0000000000..f652887ef5 --- /dev/null +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -0,0 +1,50 @@ +"""Add extra celery options to the config if they do not exist + +Revision ID: 19 +Revises: 18 +Create Date: 2023-05-15 12:03:57.171011 + +""" +from alembic import op +import sqlalchemy as sa +from augur.application.db.session import DatabaseSession +from augur.application.config import * +from sqlalchemy.sql import text + +# revision identifiers, used by Alembic. 
+revision = '19' +down_revision = '18' +branch_labels = None +depends_on = None + +logger = logging.getLogger(__name__) + +def upgrade(): + + with DatabaseSession(logger) as session: + config = AugurConfig(logger,session) + config_dict = config.load_config() + + section = config_dict.get("Celery") + + if section: + if 'worker_process_vmem_cap' not in section.keys(): + section['worker_process_vmem_cap'] = 0.25 + + if 'refresh_materialized_views_interval_in_days' not in section.keys(): + section['refresh_materialized_views_interval_in_days'] = 7 + else: + section = config.default_config["Celery"] + + config.add_section_from_json("Celery", section) + + + +def downgrade(): + + conn = op.get_bind() + + con.execute(text(f""" + DELETE FROM augur_operations.config + WHERE section_name='Celery' AND (setting_name='worker_process_vmem_cap' OR setting_name='refresh_materialized_views_interval_in_days'); + """)) \ No newline at end of file From e96bb41706d02a45794bddd79000d7aef298c020 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 15 May 2023 12:30:51 -0500 Subject: [PATCH 20/48] missing import Signed-off-by: Isaac Milarsky --- .../versions/19_add_extra_celery_options_to_the_config_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py index f652887ef5..e64a5ccb12 100644 --- a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -10,6 +10,7 @@ from augur.application.db.session import DatabaseSession from augur.application.config import * from sqlalchemy.sql import text +import logging # revision identifiers, used by Alembic. 
revision = '19' From 3b085b36d4bcc242b13742e943d268b2009c2c05 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 15 May 2023 12:31:16 -0500 Subject: [PATCH 21/48] syntax Signed-off-by: Isaac Milarsky --- .../versions/19_add_extra_celery_options_to_the_config_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py index e64a5ccb12..ddd2ff3ecf 100644 --- a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -45,7 +45,7 @@ def downgrade(): conn = op.get_bind() - con.execute(text(f""" + conn.execute(text(f""" DELETE FROM augur_operations.config WHERE section_name='Celery' AND (setting_name='worker_process_vmem_cap' OR setting_name='refresh_materialized_views_interval_in_days'); """)) \ No newline at end of file From d2bb1ce6436719163d748b23d69f74453f9d43e7 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 15 May 2023 12:38:24 -0500 Subject: [PATCH 22/48] Implement use of config values Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 7 ++++--- .../versions/19_add_extra_celery_options_to_the_config_.py | 2 ++ augur/tasks/init/celery_app.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 0a58e8236a..d97e59925a 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -75,6 +75,7 @@ def start(disable_collection, development, port): if not port: port = config.get_value("Server", "port") + worker_vmem_cap = config.get_value("Celery", 'worker_process_vmem_cap') gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app" server = subprocess.Popen(gunicorn_command.split(" ")) @@ -91,7 +92,7 @@ def start(disable_collection, development, port): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - processes = start_celery_worker_processes() + processes = start_celery_worker_processes(float(worker_vmem_cap)) time.sleep(5) @@ -140,13 +141,13 @@ def start(disable_collection, development, port): except RedisConnectionError: pass -def start_celery_worker_processes(): +def start_celery_worker_processes(vmem_cap_ratio): #Calculate process scaling based on how much memory is available on the system in bytes. 
#Each celery process takes ~500MB or 500 * 1024^2 bytes #Cap memory usage to 30% of total virtual memory - available_memory_in_bytes = psutil.virtual_memory().total * .25 + available_memory_in_bytes = psutil.virtual_memory().total * vmem_cap_ratio available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) max_process_estimate = available_memory_in_megabytes // 500 diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py index ddd2ff3ecf..4bc8e7cae1 100644 --- a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -26,8 +26,10 @@ def upgrade(): config = AugurConfig(logger,session) config_dict = config.load_config() + #Update the missing fields of the celery section in the config section = config_dict.get("Celery") + #Just copy the default if section doesn't exist. if section: if 'worker_process_vmem_cap' not in section.keys(): section['worker_process_vmem_cap'] = 0.25 diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index bb6348f619..3359177f09 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -211,8 +211,9 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes") sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s()) + mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days')) logger.info(f"Scheduling refresh materialized view every night at 1am CDT") - sender.add_periodic_task(datetime.timedelta(days=7), refresh_materialized_views.s()) + sender.add_periodic_task(datetime.timedelta(days=mat_views_interval), refresh_materialized_views.s()) logger.info(f"Scheduling update of collection weights on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) From 149117206feb3cd631eede0075ed0e5813b6be89 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 15 May 2023 16:41:08 -0500 Subject: [PATCH 23/48] remove celery concurrency option Signed-off-by: Isaac Milarsky --- augur/application/config.py | 1 - .../19_add_extra_celery_options_to_the_config_.py | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/augur/application/config.py b/augur/application/config.py index 1c59965159..2472da17e8 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -66,7 +66,6 @@ def get_development_flag(): "log_level": "INFO", }, "Celery": { - "concurrency": 12, "worker_process_vmem_cap": 0.25, "refresh_materialized_views_interval_in_days": 7 }, diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py index 4bc8e7cae1..24d7fd08c6 100644 --- a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -41,6 +41,12 @@ def upgrade(): config.add_section_from_json("Celery", section) + #delete old setting + session.execute_sql(text(f""" + DELETE FROM augur_operations.config + WHERE section_name='Celery' AND setting_name='concurrency'; + """)) + def downgrade(): @@ 
-50,4 +56,11 @@ def downgrade(): conn.execute(text(f""" DELETE FROM augur_operations.config WHERE section_name='Celery' AND (setting_name='worker_process_vmem_cap' OR setting_name='refresh_materialized_views_interval_in_days'); - """)) \ No newline at end of file + """)) + + try: + conn.execute(text(f""" + INSERT INTO augur_operations.config (section_name,setting_name,value,type) VALUES ('Celery','concurrency',12,'int'); + """)) + except: + pass \ No newline at end of file From f268c7c46b5f3e5802484e9c1745a8f7bc690252 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 17 May 2023 09:57:02 -0500 Subject: [PATCH 24/48] Add helper functions for db fixtures Signed-off-by: Andrew Brain --- conftest.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 4e9f4f8f80..8d4ddd2d15 100644 --- a/conftest.py +++ b/conftest.py @@ -30,7 +30,32 @@ def create_full_routes(routes): return full_routes @pytest.fixture -def database(): +def create_connection(dbname='postgres'): + db_string = get_database_string() + user, password, host, port, _ = parse_database_string(db_string) + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + return conn, conn.cursor() + + +def create_database(conn, cursor, db_name, template=None): + if template: + cursor.execute(sql.SQL("CREATE DATABASE {} WITH TEMPLATE {};").format(sql.Identifier(db_name), sql.Identifier(template))) + else: + cursor.execute(sql.SQL("CREATE DATABASE {};").format(sql.Identifier(db_name))) + conn.commit() + +def drop_database(cursor, db_name): + # ensure connections are removed + cursor.execute(sql.SQL("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='{}';".format(db_name))) + # drop temporary database + cursor.execute(sql.SQL("DROP DATABASE {};").format(sql.Identifier(db_name))) db_string = get_database_string() From d7618541eff5207b3a15a55c9241be3a67be0dd9 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 17 May 2023 09:58:32 -0500 Subject: [PATCH 25/48] Add fresh db fixture with each scope level Signed-off-by: Andrew Brain --- conftest.py | 79 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/conftest.py b/conftest.py index 8d4ddd2d15..aaabb1611b 100644 --- a/conftest.py +++ b/conftest.py @@ -29,7 +29,7 @@ def create_full_routes(routes): full_routes.append(route) return full_routes -@pytest.fixture + def create_connection(dbname='postgres'): db_string = get_database_string() user, password, host, port, _ = parse_database_string(db_string) @@ -57,6 +57,13 @@ def drop_database(cursor, db_name): # drop temporary database cursor.execute(sql.SQL("DROP DATABASE {};").format(sql.Identifier(db_name))) + +def create_template_db(template_name): + + import time + + start_time = time.time() + db_string = get_database_string() user, password, host, port, _ = parse_database_string(db_string) @@ -80,14 +87,7 @@ def drop_database(cursor, db_name): # remove database_name and add test_db_name test_db_string = db_string[:db_string.rfind("/")+1] + test_db_name - # create the temporary database - cursor.execute(sql.SQL("CREATE DATABASE {};").format(sql.Identifier(test_db_name))) - - # Commit changes - conn.commit() - - # Install schema - execute_sql_file("tests/entire_db.sql", test_db_name, user, password, host, port) + create_database(conn, cursor, test_db_name, template_name) # create engine to 
connect to db engine = create_database_engine(test_db_string, poolclass=StaticPool) @@ -97,17 +97,76 @@ def drop_database(cursor, db_name): # dispose engine engine.dispose() + drop_database(cursor, test_db_name) + + # Close the cursor and the connection + cursor.close() + conn.close() + + + +@pytest.fixture(scope='session') +def db_template(): + + db_string = get_database_string() + + user, password, host, port, _ = parse_database_string(db_string) + + # Connect to the default 'postgres' database + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname='postgres' + ) + + # Set the isolation level to AUTOCOMMIT because CREATE DATABASE + # cannot be executed in a transaction block + conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + cursor = conn.cursor() + + + test_db_name = "test_db_template_" + uuid.uuid4().hex + create_database(conn, cursor, test_db_name) + + # Install schema + execute_sql_file("tests/entire_db.sql", test_db_name, user, password, host, port) + + # ensure connections are removed cursor.execute(sql.SQL("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='{}';".format(test_db_name))) # drop temporary database - cursor.execute(sql.SQL("DROP DATABASE {};").format(sql.Identifier(test_db_name))) + + yield test_db_name + + drop_database(cursor, test_db_name) # Close the cursor and the connection cursor.close() conn.close() + +@pytest.fixture(scope='session') +def fresh_db_session(db_template): + print("Creating fresh db session from template") + yield from create_template_db(db_template) + +@pytest.fixture(scope='package') +def fresh_db_package(db_template): + print("Creating fresh package level db") + yield from create_template_db(db_template) + +@pytest.fixture(scope='module') +def fresh_db_module(db_template): + yield from create_template_db(db_template) + +@pytest.fixture(scope='function') +def fresh_db_function(db_template): + yield from create_template_db(db_template) + @pytest.fixture def test_db_engine(): From 5d160667f2f18940ffa150c728d2505ffdef3b15 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 17 May 2023 10:00:10 -0500 Subject: [PATCH 26/48] Add read only database fixture Signed-off-by: Andrew Brain --- conftest.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/conftest.py b/conftest.py index aaabb1611b..c8900dc4ee 100644 --- a/conftest.py +++ b/conftest.py @@ -167,6 +167,42 @@ def fresh_db_module(db_template): def fresh_db_function(db_template): yield from create_template_db(db_template) + +@pytest.fixture(scope='session') +def read_only_db(fresh_db_session): + + print("Creating read-only db") + print("Fresh db session type: " + str(type(fresh_db_session))) + + database_name = fresh_db_session.url.database + test_username = "testuser" + test_password = "testpass" + schemas = ["public", "augur_data", "augur_operations"] + + # create read-only user + fresh_db_session.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) + fresh_db_session.execute(s.text(f"GRANT CONNECT ON DATABASE {database_name} TO {test_username};")) + for schema in schemas: + fresh_db_session.execute(s.text(f"GRANT USAGE ON SCHEMA {schema} TO {test_username};")) + fresh_db_session.execute(s.text(f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {test_username};")) + + # create engine for read-only user + db_string = get_database_string() + _, _, host, port, _ = parse_database_string(db_string) + read_only_engine = 
s.create_engine(f'postgresql+psycopg2://{test_username}:{test_password}@{host}:{port}/{database_name}') + + yield read_only_engine + + read_only_engine.dispose() + + # remove read-only user + fresh_db_session.execute(s.text(f'REVOKE CONNECT ON DATABASE {database_name} FROM {test_username};')) + for schema in schemas: + fresh_db_session.execute(s.text(f'REVOKE USAGE ON SCHEMA {schema} FROM {test_username};')) + fresh_db_session.execute(s.text(f'REVOKE SELECT ON ALL TABLES IN SCHEMA {schema} FROM {test_username};')) + fresh_db_session.execute(s.text(f'DROP USER {test_username};')) + + @pytest.fixture def test_db_engine(): From 2c305cba2e8c8cac3f8710d7958ab41d05c7f4fa Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 17 May 2023 10:10:20 -0500 Subject: [PATCH 27/48] Increase security of password hashing and define standard method to hash passwords Signed-off-by: Andrew Brain --- augur/api/routes/application.py | 2 +- augur/api/routes/user.py | 4 ++-- augur/application/cli/user.py | 3 +-- augur/application/db/models/augur_operations.py | 8 ++++++-- tests/test_applicaton/test_repo_load_controller/helper.py | 5 ++--- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/augur/api/routes/application.py b/augur/api/routes/application.py index d758b020f3..3d2b22b8ed 100644 --- a/augur/api/routes/application.py +++ b/augur/api/routes/application.py @@ -12,7 +12,7 @@ import pandas as pd from flask import request, Response, jsonify, session from flask_login import login_user, logout_user, current_user, login_required -from werkzeug.security import generate_password_hash, check_password_hash +from werkzeug.security import check_password_hash from sqlalchemy.sql import text from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.exc import NoResultFound diff --git a/augur/api/routes/user.py b/augur/api/routes/user.py index cb2635e1dc..6e8ae680ef 100644 --- a/augur/api/routes/user.py +++ b/augur/api/routes/user.py @@ -12,7 +12,7 @@ import pandas as pd from flask import request, Response, jsonify, session from flask_login import login_user, logout_user, current_user, login_required -from werkzeug.security import generate_password_hash, check_password_hash +from werkzeug.security import check_password_hash from sqlalchemy.sql import text from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.exc import NoResultFound @@ -212,7 +212,7 @@ def update_user(): return jsonify({"status": "Email Updated"}) if new_password is not None: - current_user.login_hashword = generate_password_hash(new_password) + current_user.login_hashword = User.compute_hashsed_password(new_password) session.commit() session = Session() return jsonify({"status": "Password Updated"}) diff --git a/augur/application/cli/user.py b/augur/application/cli/user.py index e2846d5f37..54bdbcd2fd 100644 --- a/augur/application/cli/user.py +++ b/augur/application/cli/user.py @@ -8,7 +8,6 @@ import os import click import logging -from werkzeug.security import generate_password_hash from augur.application.db.models import User from augur.application.db.engine import DatabaseEngine from sqlalchemy.orm import sessionmaker @@ -48,7 +47,7 @@ def add_user(username, email, firstname, lastname, admin, phone_number, password user = session.query(User).filter(User.login_name == username).first() if not user: - password = generate_password_hash(password) + password = User.compute_hashsed_password(password) new_user = User(login_name=username, login_hashword=password, email=email, text_phone=phone_number, first_name=firstname, 
last_name=lastname, admin=admin, tool_source="User CLI", tool_version=None, data_source="CLI") session.add(new_user) session.commit() diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 6d714e6ca8..df81cbebd9 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -335,7 +335,7 @@ def create_user(username: str, password: str, email: str, first_name:str, last_n return False, {"status": "A User already exists with that email"} try: - user = User(login_name = username, login_hashword = generate_password_hash(password), email = email, first_name = first_name, last_name = last_name, tool_source="User API", tool_version=None, data_source="API", admin=admin) + user = User(login_name = username, login_hashword = User.compute_hashsed_password(password), email = email, first_name = first_name, last_name = last_name, tool_source="User API", tool_version=None, data_source="API", admin=admin) session.add(user) session.commit() @@ -373,7 +373,7 @@ def update_password(self, session, old_password, new_password): if not check_password_hash(self.login_hashword, old_password): return False, {"status": "Password did not match users password"} - self.login_hashword = generate_password_hash(new_password) + self.login_hashword = User.compute_hashsed_password(new_password) session.commit() return True, {"status": "Password updated"} @@ -585,6 +585,10 @@ def get_favorite_groups(self, session): return None, {"status": "Error when trying to get favorite groups"} return groups, {"status": "Success"} + + @staticmethod + def compute_hashsed_password(password): + return generate_password_hash(password, method='pbkdf2:sha512', salt_length=32) diff --git a/tests/test_applicaton/test_repo_load_controller/helper.py b/tests/test_applicaton/test_repo_load_controller/helper.py index 11ac16640f..29aa0dc9c7 100644 --- a/tests/test_applicaton/test_repo_load_controller/helper.py +++ b/tests/test_applicaton/test_repo_load_controller/helper.py @@ -4,10 +4,9 @@ from augur.util.repo_load_controller import ORG_REPOS_ENDPOINT from augur.application.db.session import DatabaseSession -from augur.application.db.models import Config +from augur.application.db.models import Config, User from augur.tasks.github.util.github_paginator import hit_api from augur.application.db.util import execute_session_query -from werkzeug.security import generate_password_hash logger = logging.getLogger(__name__) @@ -105,7 +104,7 @@ def get_repo_group_insert_statement(rg_id): def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"): - return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, generate_password_hash(password), email) + return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email) def get_user_group_insert_statement(user_id, group_name, group_id=None): From ba94e5a7292df53defc66dbb9c50bc12ad6bf3bb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 17 May 2023 14:51:58 -0500 Subject: [PATCH 28/48] Implement functionality to add orgs and repos asynchronously Signed-off-by: Andrew Brain --- augur/api/view/api.py | 41 
++----------------- augur/application/cli/backend.py | 4 ++ .../application/db/models/augur_operations.py | 11 +++++ augur/tasks/init/celery_app.py | 11 +++-- 4 files changed, 25 insertions(+), 42 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index e0fd36d6f0..8872709e6f 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -1,6 +1,7 @@ from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash from flask_login import current_user, login_required from augur.application.db.models import Repo +from augur.tasks.frontend import add_org_repo_list from .utils import * from ..server import app @@ -33,45 +34,9 @@ def av_add_user_repo(): if group == "None": group = current_user.login_name + "_default" + add_org_repo_list.si(current_user.user_id, group, urls).apply_async() - added_orgs = 0 - added_repos = 0 - for url in urls: - - # matches https://github.com/{org}/ or htts://github.com/{org} - if Repo.parse_github_org_url(url): - added = current_user.add_org(group, url)[0] - if added: - added_orgs += 1 - - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} - elif Repo.parse_github_repo_url(url)[0]: - print("Adding repo") - added = current_user.add_repo(group, url)[0] - if added: - print("Repo added") - added_repos += 1 - - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', url)): - org, repo = match.groups() - repo_url = f"https://github.com/{org}/{repo}/" - added = current_user.add_repo(group, repo_url)[0] - if added: - added_repos += 1 - - # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', url)): - org = match.group(1) - org_url = f"https://github.com/{org}/" - added = current_user.add_org(group, org_url)[0] - if added: - added_orgs += 1 - - if not added_orgs and not added_repos: - flash(f"Unable to add any repos or orgs") - else: - flash(f"Successfully added {added_repos} repos and {added_orgs} orgs") + flash("Adding repos and orgs in the background") return redirect(url_for("user_settings") + "?section=tracker") diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 0a58e8236a..60860a23fd 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -158,6 +158,9 @@ def determine_worker_processes(ratio,maximum): scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" max_process_estimate -= 2 + frontend_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" + max_process_estimate -= 1 + core_num_processes = determine_worker_processes(.6, 45) #60% of estimate, Maximum value of 45 core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" @@ -173,6 +176,7 @@ def determine_worker_processes(ratio,maximum): process_list = [] process_list.append(subprocess.Popen(scheduling_worker.split(" "))) + process_list.append(subprocess.Popen(frontend_worker.split(" "))) logger.info(f"Starting core worker processes with concurrency={core_num_processes}") process_list.append(subprocess.Popen(core_worker.split(" "))) diff --git a/augur/application/db/models/augur_operations.py 
b/augur/application/db/models/augur_operations.py index 6d714e6ca8..24c299962a 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -317,6 +317,17 @@ def get_user(session, username: str): return user except NoResultFound: return None + + @staticmethod + def get_by_id(session, user_id: int): + + if not isinstance(user_id, int): + return None + try: + user = session.query(User).filter(User.user_id == user_id).one() + return user + except NoResultFound: + return None @staticmethod def create_user(username: str, password: str, email: str, first_name:str, last_name:str, admin=False): diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index bb6348f619..2f2ddea684 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -62,10 +62,12 @@ class CollectionState(Enum): materialized_view_tasks = ['augur.tasks.db.refresh_materialized_views'] +frontend_tasks = ['augur.tasks.frontend'] + +tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + frontend_tasks + if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": - tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + data_analysis_tasks -else: - tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + tasks += data_analysis_tasks redis_db_number, redis_conn_string = get_redis_conn_values() @@ -132,7 +134,8 @@ def on_failure(self,exc,task_id,args, kwargs, einfo): 'augur.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'}, 'augur.tasks.git.dependency_tasks.tasks.process_ossf_scorecard_metrics': {'queue': 'secondary'}, 'augur.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'}, - 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'} + 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'}, + 'augur.tasks.frontend.*': {'queue': 'frontend'} } #Setting to be able to see more detailed states of running tasks From c11f2998256b9ba031321297e2fc2d9d04946208 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 17 May 2023 21:47:35 +0000 Subject: [PATCH 29/48] sendgrid update Signed-off-by: Sean P. 
Goggins --- .gitignore | 2 +- sendgrid.env | 1 + sendgridtest.py | 19 +++++++++++++++++++ setup.py | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 sendgrid.env create mode 100644 sendgridtest.py diff --git a/.gitignore b/.gitignore index 7ebccb6a15..a1361d8b9a 100644 --- a/.gitignore +++ b/.gitignore @@ -192,4 +192,4 @@ pgdata/ postgres-data/ # Generated files from github -.history/ \ No newline at end of file +.history/sendgrid.env diff --git a/sendgrid.env b/sendgrid.env new file mode 100644 index 0000000000..96405ac280 --- /dev/null +++ b/sendgrid.env @@ -0,0 +1 @@ +export SENDGRID_API_KEY='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' diff --git a/sendgridtest.py b/sendgridtest.py new file mode 100644 index 0000000000..0aea1e25c0 --- /dev/null +++ b/sendgridtest.py @@ -0,0 +1,19 @@ +# using SendGrid's Python Library +# https://github.com/sendgrid/sendgrid-python +import os +from sendgrid import SendGridAPIClient +from sendgrid.helpers.mail import Mail + +message = Mail( + from_email='metrix@goggins.com', + to_emails='gogginss@missouri.edu', + subject='Sending with Twilio SendGrid is Fun', + html_content='and easy to do anywhere, even with Python') +try: + sg = SendGridAPIClient(os.environ.get('SENDGRID_API_KEY')) + response = sg.send(message) + print(response.status_code) + print(response.body) + print(response.headers) +except Exception as e: + print(e.message) diff --git a/setup.py b/setup.py index bd7ef6d096..7e344e553c 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ ], install_requires=[ "wheel", + "sendgrid", "alembic==1.8.1", # 1.8.1 "coloredlogs==15.0", # 15.0.1 "Beaker==1.11.0", # 1.11.0 From ffe8f67849fd75bc083523baf58128fea5b763b8 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 17 May 2023 21:55:35 +0000 Subject: [PATCH 30/48] updating sendgrid .gitignore Signed-off-by: Sean P. Goggins --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a1361d8b9a..ee34c0ba15 100644 --- a/.gitignore +++ b/.gitignore @@ -193,3 +193,4 @@ postgres-data/ # Generated files from github .history/sendgrid.env +sendgrid.env From 650b13e2a67218ef79d7eefadd5f596d5da4f625 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 17 May 2023 21:58:51 +0000 Subject: [PATCH 31/48] sendgrid key removed Signed-off-by: Sean P. Goggins --- .gitignore | 2 ++ sendgrid.env | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ee34c0ba15..ff303c8209 100644 --- a/.gitignore +++ b/.gitignore @@ -194,3 +194,5 @@ postgres-data/ # Generated files from github .history/sendgrid.env sendgrid.env +*sendgrid*.env +./sendgrid.env diff --git a/sendgrid.env b/sendgrid.env index 96405ac280..92c411e635 100644 --- a/sendgrid.env +++ b/sendgrid.env @@ -1 +1 @@ -export SENDGRID_API_KEY='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' +export SENDGRID_API_KEY='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' From 45bad972fdf6aa60fb6276d5e717875220387008 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 17 May 2023 22:05:48 +0000 Subject: [PATCH 32/48] fixing send Signed-off-by: Sean P. 
Goggins --- .gitignore | 1 + sendgrid.env | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 sendgrid.env diff --git a/.gitignore b/.gitignore index ff303c8209..2feaa98037 100644 --- a/.gitignore +++ b/.gitignore @@ -196,3 +196,4 @@ postgres-data/ sendgrid.env *sendgrid*.env ./sendgrid.env +sendgrid.env diff --git a/sendgrid.env b/sendgrid.env deleted file mode 100644 index 92c411e635..0000000000 --- a/sendgrid.env +++ /dev/null @@ -1 +0,0 @@ -export SENDGRID_API_KEY='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' From a2325f094b7e42e136b52bf3957e2c3e28cee772 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Wed, 17 May 2023 17:20:36 -0500 Subject: [PATCH 33/48] Fix backend.py to run frontend worker when collection is disabled Signed-off-by: Andrew Brain --- augur/application/cli/backend.py | 76 +++++++++++++++----------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 60860a23fd..f06e83ba38 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -83,19 +83,18 @@ def start(disable_collection, development, port): logger.info('Gunicorn webserver started...') logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') - - celery_beat_process = None - if not disable_collection: - - if os.path.exists("celerybeat-schedule.db"): + processes = start_celery_worker_processes(disable_collection) + time.sleep(5) + if os.path.exists("celerybeat-schedule.db"): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - processes = start_celery_worker_processes() + celery_beat_process = None + celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) - time.sleep(5) + if not disable_collection: - with DatabaseSession(logger) as session: clean_collection_status(session) @@ -109,10 +108,6 @@ def start(disable_collection, development, port): augur_collection_monitor.si().apply_async() - - celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) - else: logger.info("Collection disabled") @@ -140,11 +135,13 @@ def start(disable_collection, development, port): except RedisConnectionError: pass -def start_celery_worker_processes(): +def start_celery_worker_processes(disable_collection=False): #Calculate process scaling based on how much memory is available on the system in bytes. #Each celery process takes ~500MB or 500 * 1024^2 bytes + process_list = [] + #Cap memory usage to 30% of total virtual memory available_memory_in_bytes = psutil.virtual_memory().total * .25 available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) @@ -153,39 +150,36 @@ def start_celery_worker_processes(): #Get a subset of the maximum procesess available using a ratio, not exceeding a maximum value def determine_worker_processes(ratio,maximum): return max(min(round(max_process_estimate * ratio),maximum),1) - - #2 processes are always reserved as a baseline. 
- scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - max_process_estimate -= 2 - + frontend_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" max_process_estimate -= 1 - - core_num_processes = determine_worker_processes(.6, 45) - #60% of estimate, Maximum value of 45 - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" - - secondary_num_processes = determine_worker_processes(.2, 25) - #20% of estimate, Maximum value of 25 - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" - - facade_num_processes = determine_worker_processes(.2, 20) - #15% of estimate, Maximum value of 20 - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" - - process_list = [] - - process_list.append(subprocess.Popen(scheduling_worker.split(" "))) process_list.append(subprocess.Popen(frontend_worker.split(" "))) - logger.info(f"Starting core worker processes with concurrency={core_num_processes}") - process_list.append(subprocess.Popen(core_worker.split(" "))) - - logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") - process_list.append(subprocess.Popen(secondary_worker.split(" "))) + if not disable_collection: - logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") - process_list.append(subprocess.Popen(facade_worker.split(" "))) + #2 processes are always reserved as a baseline. 
+ scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + max_process_estimate -= 2 + process_list.append(subprocess.Popen(scheduling_worker.split(" "))) + + #60% of estimate, Maximum value of 45 + core_num_processes = determine_worker_processes(.6, 45) + logger.info(f"Starting core worker processes with concurrency={core_num_processes}") + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" + process_list.append(subprocess.Popen(core_worker.split(" "))) + + #20% of estimate, Maximum value of 25 + secondary_num_processes = determine_worker_processes(.2, 25) + logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + process_list.append(subprocess.Popen(secondary_worker.split(" "))) + + #15% of estimate, Maximum value of 20 + facade_num_processes = determine_worker_processes(.2, 20) + logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") + facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" + + process_list.append(subprocess.Popen(facade_worker.split(" "))) return process_list From 1ba63be4adcbc6a02c5e7dbed7f708900d7637eb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Thu, 18 May 2023 16:17:00 -0500 Subject: [PATCH 34/48] Add missing file Signed-off-by: Andrew Brain --- augur/tasks/frontend.py | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 augur/tasks/frontend.py diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py new file mode 100644 index 0000000000..6243c59dcb --- /dev/null +++ b/augur/tasks/frontend.py @@ -0,0 +1,79 @@ +import logging +import re + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.github.util.github_task_session import GithubTaskSession +from augur.application.db.models import UserRepo, Repo, User + + +@celery.task +def add_org_repo_list(user_id, group_name, urls): + + logger = logging.getLogger(add_org_repo_list.__name__) + + with GithubTaskSession(logger) as session: + + user = User.get_by_id(session, user_id) + + invalid_urls = [] + valid_orgs = [] + valid_repos = [] + for url in urls: + + # matches https://github.com/{org}/ or htts://github.com/{org} + if Repo.parse_github_org_url(url): + added = user.add_org(group_name, url)[0] + if added: + valid_orgs.append(url) + + # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + elif Repo.parse_github_repo_url(url)[0]: + added = user.add_repo(group_name, url)[0] + if added: + valid_repos.append(url) + + # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} + elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', url)): + org, repo = match.groups() + repo_url = f"https://github.com/{org}/{repo}/" + added = user.add_repo(group_name, repo_url)[0] + if added: + valid_repos.append(url) + + # matches /{org}/ or /{org} or {org}/ or {org} + elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', url)): + org = match.group(1) + org_url = f"https://github.com/{org}/" + added = user.add_org(group_name, org_url)[0] + if added: + 
valid_orgs.append(url) + else: + invalid_urls.append(url) + + return valid_orgs, valid_repos, invalid_urls + + + + + + +@celery.task +def add_repo(user_id, group_name, repo_url): + + logger = logging.getLogger(add_org.__name__) + + with GithubTaskSession(logger) as session: + result = UserRepo.add(session, repo_url, user_id, group_name) + + print(repo_url, result) + + +@celery.task +def add_org(user_id, group_name, org_url): + + logger = logging.getLogger(add_org.__name__) + + with GithubTaskSession(logger) as session: + result = UserRepo.add_org_repos(session, org_url, user_id, group_name) + + print(org_url, result) From 9b81444db3efbc090ef5ee78b653181c1050a648 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Fri, 19 May 2023 12:23:01 -0500 Subject: [PATCH 35/48] Db fixture updates Signed-off-by: Andrew Brain --- conftest.py | 105 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/conftest.py b/conftest.py index c8900dc4ee..63fca8b9f4 100644 --- a/conftest.py +++ b/conftest.py @@ -31,6 +31,11 @@ def create_full_routes(routes): def create_connection(dbname='postgres'): + """ + Creates a connection to the postgres server specified in the database string and connects to the dbname specified. + Returns the connection and cursor objects. + """ + db_string = get_database_string() user, password, host, port, _ = parse_database_string(db_string) conn = psycopg2.connect( @@ -45,6 +50,11 @@ def create_connection(dbname='postgres'): def create_database(conn, cursor, db_name, template=None): + """ + Creates a database with the name db_name. + If template is specified, the database will be created with the template specified. + """ + if template: cursor.execute(sql.SQL("CREATE DATABASE {} WITH TEMPLATE {};").format(sql.Identifier(db_name), sql.Identifier(template))) else: @@ -52,17 +62,21 @@ def create_database(conn, cursor, db_name, template=None): conn.commit() def drop_database(cursor, db_name): + """ + Drops the database with the name db_name. + """ + # ensure connections are removed cursor.execute(sql.SQL("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='{}';".format(db_name))) # drop temporary database cursor.execute(sql.SQL("DROP DATABASE {};").format(sql.Identifier(db_name))) -def create_template_db(template_name): - - import time - - start_time = time.time() +def generate_db_from_template(template_name): + """ + Generator function that creates a new database from the template specified. + Yields the engine object for the database created. + """ db_string = get_database_string() @@ -104,9 +118,11 @@ def create_template_db(template_name): conn.close() - -@pytest.fixture(scope='session') -def db_template(): +def generate_template_db(sql_file_path): + """ + Generator function that creates a new database and install the sql file specified + Yields the name of the database created. 
+ """ db_string = get_database_string() @@ -131,7 +147,7 @@ def db_template(): create_database(conn, cursor, test_db_name) # Install schema - execute_sql_file("tests/entire_db.sql", test_db_name, user, password, host, port) + execute_sql_file(sql_file_path, test_db_name, user, password, host, port) # ensure connections are removed @@ -150,41 +166,62 @@ def db_template(): @pytest.fixture(scope='session') -def fresh_db_session(db_template): - print("Creating fresh db session from template") - yield from create_template_db(db_template) +def empty_db_template(): + """ + This fixture creates a template database with the entire schema installed. + Returns the name of the database created. + """ + + yield from generate_template_db("tests/entire_db.sql") + + +@pytest.fixture(scope='session') +def populated_db_template(): + """ + This fixture creates a template database with the entire schema installed and populated with the test repo data. + Returns the name of the database created. + """ + + yield from generate_template_db("tests/populated_db.sql") -@pytest.fixture(scope='package') -def fresh_db_package(db_template): - print("Creating fresh package level db") - yield from create_template_db(db_template) -@pytest.fixture(scope='module') -def fresh_db_module(db_template): - yield from create_template_db(db_template) +@pytest.fixture(scope='session') +def empty_db(empty_db_template): + """ + This fixture creates a database from the empty_db_template + """ -@pytest.fixture(scope='function') -def fresh_db_function(db_template): - yield from create_template_db(db_template) + yield from generate_db_from_template(empty_db_template) @pytest.fixture(scope='session') -def read_only_db(fresh_db_session): +def populated_db(populated_db_template): + """ + This fixture creates a database from the populated_db_template + Yields an engine object for the populated_db + """ + + yield from generate_db_from_template(populated_db_template) - print("Creating read-only db") - print("Fresh db session type: " + str(type(fresh_db_session))) - database_name = fresh_db_session.url.database +@pytest.fixture(scope='session') +def read_only_db(populated_db): + """ + This fixtture creates a read-only database from the populated_db_template. + Yields a read-only engine object for the populated_db. 
+ """ + + database_name = populated_db.url.database test_username = "testuser" test_password = "testpass" schemas = ["public", "augur_data", "augur_operations"] # create read-only user - fresh_db_session.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) - fresh_db_session.execute(s.text(f"GRANT CONNECT ON DATABASE {database_name} TO {test_username};")) + populated_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) + populated_db.execute(s.text(f"GRANT CONNECT ON DATABASE {database_name} TO {test_username};")) for schema in schemas: - fresh_db_session.execute(s.text(f"GRANT USAGE ON SCHEMA {schema} TO {test_username};")) - fresh_db_session.execute(s.text(f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {test_username};")) + populated_db.execute(s.text(f"GRANT USAGE ON SCHEMA {schema} TO {test_username};")) + populated_db.execute(s.text(f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {test_username};")) # create engine for read-only user db_string = get_database_string() @@ -196,11 +233,11 @@ def read_only_db(fresh_db_session): read_only_engine.dispose() # remove read-only user - fresh_db_session.execute(s.text(f'REVOKE CONNECT ON DATABASE {database_name} FROM {test_username};')) + populated_db.execute(s.text(f'REVOKE CONNECT ON DATABASE {database_name} FROM {test_username};')) for schema in schemas: - fresh_db_session.execute(s.text(f'REVOKE USAGE ON SCHEMA {schema} FROM {test_username};')) - fresh_db_session.execute(s.text(f'REVOKE SELECT ON ALL TABLES IN SCHEMA {schema} FROM {test_username};')) - fresh_db_session.execute(s.text(f'DROP USER {test_username};')) + populated_db.execute(s.text(f'REVOKE USAGE ON SCHEMA {schema} FROM {test_username};')) + populated_db.execute(s.text(f'REVOKE SELECT ON ALL TABLES IN SCHEMA {schema} FROM {test_username};')) + populated_db.execute(s.text(f'DROP USER {test_username};')) @pytest.fixture From c7d2cb3f43da6a49334d669543a2b3c89dab0a1d Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 22 May 2023 17:57:11 -0500 Subject: [PATCH 36/48] Quickly add repos to group if they already exist Signed-off-by: Andrew Brain --- augur/api/view/api.py | 81 +++++++++++++++++++++++++++++++++++++++-- augur/tasks/frontend.py | 14 ++++++- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 8872709e6f..d5452c3949 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -1,9 +1,11 @@ from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash +import re from flask_login import current_user, login_required -from augur.application.db.models import Repo -from augur.tasks.frontend import add_org_repo_list +from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo +from augur.tasks.frontend import add_org_repo_list, parse_org_and_repo_name, parse_org_name from .utils import * -from ..server import app +from ..server import app, engine +from augur.application.db.session import DatabaseSession @app.route('/cache/file/') @app.route('/cache/file/') @@ -12,6 +14,36 @@ def cache(file=None): return redirect(url_for('static', filename="cache")) return redirect(url_for('static', filename="cache/" + toCacheFilename(file, False))) + +def add_existing_repo_to_group(session, user_id, group_name, repo_id): + + logger.info("Adding existing repo to group") + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False + + result = 
UserRepo.insert(session, repo_id, group_id) + if not result: + return False + +def add_existing_org_to_group(session, user_id, group_name, rg_id): + + logger.info("Adding existing org to group") + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False + + repos = session.query(Repo).filter(Repo.repo_group_id == rg_id).all() + logger.info("Length of repos in org: " + str(len(repos))) + for repo in repos: + result = UserRepo.insert(session, repo.repo_id, group_id) + if not result: + logger.info("Failed to add repo to group") + + + @app.route('/account/repos/add', methods = ['POST']) @login_required def av_add_user_repo(): @@ -34,7 +66,48 @@ def av_add_user_repo(): if group == "None": group = current_user.login_name + "_default" - add_org_repo_list.si(current_user.user_id, group, urls).apply_async() + invalid_urls = [] + + with DatabaseSession(logger, engine) as session: + for url in urls: + + # matches https://github.com/{org}/ or htts://github.com/{org} + if (org_name := Repo.parse_github_org_url(url)): + rg_obj = RepoGroup.get_by_name(session, org_name) + if rg_obj: + # add the orgs repos to the group + add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + + # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + elif Repo.parse_github_repo_url(url)[0]: + org_name, repo_name = Repo.parse_github_repo_url(url) + repo_git = f"https://github.com/{org_name}/{repo_name}" + repo_obj = Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + + # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} + elif (match := parse_org_and_repo_name(url)): + org, repo = match.groups() + repo_git = f"https://github.com/{org}/{repo}" + repo_obj = Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + + # matches /{org}/ or /{org} or {org}/ or {org} + elif (match := parse_org_name(url)): + org_name = match.group(1) + rg_obj = RepoGroup.get_by_name(session, org_name) + logger.info(rg_obj) + if rg_obj: + # add the orgs repos to the group + add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + + else: + invalid_urls.append(url) + + if urls: + add_org_repo_list.si(current_user.user_id, group, urls).apply_async() flash("Adding repos and orgs in the background") diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index 6243c59dcb..b8eb8b203c 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -5,6 +5,16 @@ from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.application.db.models import UserRepo, Repo, User +def parse_org_name(string): + + match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', string) + return match + +def parse_org_and_repo_name(string): + + match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', string) + return match + @celery.task def add_org_repo_list(user_id, group_name, urls): @@ -33,7 +43,7 @@ def add_org_repo_list(user_id, group_name, urls): valid_repos.append(url) # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', url)): + elif (match := parse_org_and_repo_name(url)): org, repo = match.groups() repo_url = f"https://github.com/{org}/{repo}/" added = user.add_repo(group_name, repo_url)[0] @@ -41,7 
+51,7 @@ def add_org_repo_list(user_id, group_name, urls): valid_repos.append(url) # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', url)): + elif (match := parse_org_name(url)): org = match.group(1) org_url = f"https://github.com/{org}/" added = user.add_org(group_name, org_url)[0] From f85e1a348aae83726feaa403af9b7f6eff867daf Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 22 May 2023 18:20:21 -0500 Subject: [PATCH 37/48] Remove populated db fixtures until functionalg Signed-off-by: Andrew Brain --- conftest.py | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/conftest.py b/conftest.py index 63fca8b9f4..218ba31950 100644 --- a/conftest.py +++ b/conftest.py @@ -175,16 +175,6 @@ def empty_db_template(): yield from generate_template_db("tests/entire_db.sql") -@pytest.fixture(scope='session') -def populated_db_template(): - """ - This fixture creates a template database with the entire schema installed and populated with the test repo data. - Returns the name of the database created. - """ - - yield from generate_template_db("tests/populated_db.sql") - - @pytest.fixture(scope='session') def empty_db(empty_db_template): """ @@ -194,34 +184,25 @@ def empty_db(empty_db_template): yield from generate_db_from_template(empty_db_template) +# TODO: Add populated db template and populated db fixtures so this fixture is more useful @pytest.fixture(scope='session') -def populated_db(populated_db_template): - """ - This fixture creates a database from the populated_db_template - Yields an engine object for the populated_db - """ - - yield from generate_db_from_template(populated_db_template) - - -@pytest.fixture(scope='session') -def read_only_db(populated_db): +def read_only_db(empty_db): """ This fixtture creates a read-only database from the populated_db_template. Yields a read-only engine object for the populated_db. 
""" - database_name = populated_db.url.database + database_name = empty_db.url.database test_username = "testuser" test_password = "testpass" schemas = ["public", "augur_data", "augur_operations"] # create read-only user - populated_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) - populated_db.execute(s.text(f"GRANT CONNECT ON DATABASE {database_name} TO {test_username};")) + empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) + empty_db.execute(s.text(f"GRANT CONNECT ON DATABASE {database_name} TO {test_username};")) for schema in schemas: - populated_db.execute(s.text(f"GRANT USAGE ON SCHEMA {schema} TO {test_username};")) - populated_db.execute(s.text(f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {test_username};")) + empty_db.execute(s.text(f"GRANT USAGE ON SCHEMA {schema} TO {test_username};")) + empty_db.execute(s.text(f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {test_username};")) # create engine for read-only user db_string = get_database_string() @@ -233,11 +214,11 @@ def read_only_db(populated_db): read_only_engine.dispose() # remove read-only user - populated_db.execute(s.text(f'REVOKE CONNECT ON DATABASE {database_name} FROM {test_username};')) + empty_db.execute(s.text(f'REVOKE CONNECT ON DATABASE {database_name} FROM {test_username};')) for schema in schemas: - populated_db.execute(s.text(f'REVOKE USAGE ON SCHEMA {schema} FROM {test_username};')) - populated_db.execute(s.text(f'REVOKE SELECT ON ALL TABLES IN SCHEMA {schema} FROM {test_username};')) - populated_db.execute(s.text(f'DROP USER {test_username};')) + empty_db.execute(s.text(f'REVOKE USAGE ON SCHEMA {schema} FROM {test_username};')) + empty_db.execute(s.text(f'REVOKE SELECT ON ALL TABLES IN SCHEMA {schema} FROM {test_username};')) + empty_db.execute(s.text(f'DROP USER {test_username};')) @pytest.fixture From d40f576a98d713d808759f9198b4341b28156fb9 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 23 May 2023 08:23:34 -0500 Subject: [PATCH 38/48] Repo group casing fix Signed-off-by: Andrew Brain --- augur/api/view/api.py | 2 ++ augur/application/db/models/augur_operations.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 8872709e6f..c95ae78999 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -34,6 +34,8 @@ def av_add_user_repo(): if group == "None": group = current_user.login_name + "_default" + urls = [url.lower() for url in urls] + add_org_repo_list.si(current_user.user_id, group, urls).apply_async() flash("Adding repos and orgs in the background") diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index ec5aa0c7a5..0b40921a97 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -886,7 +886,7 @@ def add_org_repos(session, url: List[str], user_id: int, group_name: int): # if it doesn't exist create one if not repo_group: - repo_group = RepoGroup(rg_name=owner, rg_description="", rg_website="", rg_recache=0, rg_type="Unknown", + repo_group = RepoGroup(rg_name=owner.lower(), rg_description="", rg_website="", rg_recache=0, rg_type="Unknown", tool_source="Loaded by user", tool_version="1.0", data_source="Git") session.add(repo_group) session.commit() From c5f8b28366a51923beb20911854600214bfabe4b Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Tue, 23 May 2023 09:56:35 -0500 Subject: [PATCH 39/48] add password reset command 
Signed-off-by: Ulincsys --- augur/application/cli/user.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/augur/application/cli/user.py b/augur/application/cli/user.py index 54bdbcd2fd..9d0b822be2 100644 --- a/augur/application/cli/user.py +++ b/augur/application/cli/user.py @@ -58,4 +58,21 @@ def add_user(username, email, firstname, lastname, admin, phone_number, password session.close() engine.dispose() - return 0 \ No newline at end of file + return 0 + +@cli.command('password_reset', short_help="Reset a user's password") +@click.argument("username") +@click.password_option(help="New password") +def reset_password(username, password): + session = Session() + + user = session.query(User).filter(User.login_name == username).first() + + if not user: + return click.echo("invalid username") + + password = User.compute_hashsed_password(password) + user.login_hashword = password + session.commit() + + return click.echo("Password updated") \ No newline at end of file From 963e5f5751a454623405484e4bfc937906946b1a Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 23 May 2023 15:49:13 -0500 Subject: [PATCH 40/48] Protect queries against sql injection Signed-off-by: Andrew Brain --- augur/util/repo_load_controller.py | 73 ++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/augur/util/repo_load_controller.py b/augur/util/repo_load_controller.py index 2f8bad6155..3436167aee 100644 --- a/augur/util/repo_load_controller.py +++ b/augur/util/repo_load_controller.py @@ -131,19 +131,19 @@ def paginate_repos(self, source, page=0, page_size=25, sort="repo_id", direction order_by = sort if sort else "repo_id" order_direction = direction if direction else "ASC" - query = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, + query, query_args, result = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, page=page, page_size=page_size, **kwargs) - if not query[0]: - return None, {"status": query[1]["status"]} + if not query: + return None, {"status": result["status"]} - if query[1]["status"] == "No data": + if result["status"] == "No data": return [], {"status": "No data"} - get_page_of_repos_sql = s.sql.text(query[0]) + get_page_of_repos_sql = s.sql.text(query) with DatabaseEngine(connection_pool_size=1) as engine: - results = pd.read_sql(get_page_of_repos_sql, engine) + results = pd.read_sql(get_page_of_repos_sql, engine, params=query_args) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) @@ -170,24 +170,27 @@ def get_repo_count(self, source, **kwargs): print("Func: get_repo_count. 
Error: Invalid source") return None, {"status": "Invalid source"} - query = self.generate_repo_query(source, count=True, **kwargs) - if not query[0]: - return None, query[1] + query, query_args, result = self.generate_repo_query(source, count=True, **kwargs) + if not query: + return None, result - if query[1]["status"] == "No data": + if result["status"] == "No data": return 0, {"status": "No data"} # surround query with count query so we just get the count of the rows - final_query = f"SELECT count(*) FROM ({query[0]}) a;" + final_query = f"SELECT count(*) FROM ({query}) a;" get_page_of_repos_sql = s.sql.text(final_query) - result = self.session.fetchall_data_from_sql_text(get_page_of_repos_sql) + result = self.session.execute(get_page_of_repos_sql, query_args).fetchall() return result[0]["count"], {"status": "success"} def generate_repo_query(self, source, count, **kwargs): # TODO: need more flexible way of calculating count for variable column queries + + query_args = {} + if count: # only query for repos ids so the query is faster for getting the count select = """ DISTINCT(augur_data.repo.repo_id), @@ -195,7 +198,7 @@ def generate_repo_query(self, source, count, **kwargs): (regexp_match(augur_data.repo.repo_git, 'github\.com\/([A-Za-z0-9 \- _]+)\/[A-Za-z0-9 \- _ .]+$'))[1] as repo_owner""" else: - select = f""" DISTINCT(augur_data.repo.repo_id), + select = """ DISTINCT(augur_data.repo.repo_id), augur_data.repo.description, augur_data.repo.repo_git AS url, COALESCE(a.commits_all_time, 0) as commits_all_time, @@ -226,7 +229,9 @@ def generate_repo_query(self, source, count, **kwargs): query += "\t\t JOIN augur_operations.user_repos ON augur_data.repo.repo_id = augur_operations.user_repos.repo_id\n" query += "\t\t JOIN augur_operations.user_groups ON augur_operations.user_repos.group_id = augur_operations.user_groups.group_id\n" - query += f"\t\t WHERE augur_operations.user_groups.user_id = {user.user_id}\n" + query += "\t\t WHERE augur_operations.user_groups.user_id = :user_id\n" + + query_args["user_id"] = user.user_id elif source == "group": @@ -246,7 +251,9 @@ def generate_repo_query(self, source, count, **kwargs): return None, {"status": "Group does not exists"} query += "\t\t JOIN augur_operations.user_repos ON augur_data.repo.repo_id = augur_operations.user_repos.repo_id\n" - query += f"\t\t WHERE augur_operations.user_repos.group_id = {group_id}\n" + query += "\t\t WHERE augur_operations.user_repos.group_id = :group_id \n" + + query_args["group_id"] = group_id # implement sorting by query_key search = kwargs.get("search") @@ -264,21 +271,41 @@ def generate_repo_query(self, source, count, **kwargs): # It is only included because it is required by the SQL syntax if isinstance(qkey, list) and len(qkey) > 0: - query += f"\tWHERE {qkey.pop(0)} ilike '%{search}%'\n" - for key in qkey: - query += f"OR {key} ilike '%{search}%'\n" + query += f"\tWHERE :qkey_where ilike :search\n" + query_args["qkey_where"] = qkey.pop(0) + + for i, key in enumerate(qkey): + param_name = f"qkey_or_{i}" + query += f"OR :{param_name} ilike :search\n" + query_args[param_name] = key else: - query += f"\tWHERE {qkey} ilike '%{search}%'\n" + query += f"\tWHERE :qkey ilike :search\n" + query_args["qkey"] = qkey + + query_args["search"] = f'%{search}%' + if not count: order_by = kwargs.get("order_by") or "repo_id" - direction = kwargs.get("direction") or "ASC" page = kwargs.get("page") or 0 page_size = kwargs.get("page_size") or 25 + direction = kwargs.get("direction") or "ASC" + + if direction not in ["ASC", 
"DESC"]: + return None, None, {"status": "Invalid direction"} + + if order_by not in ["repo_id", "repo_name", "repo_owner", "commits_all_time", "issues_all_time"]: + return None, None, {"status": "Invalid order by"} + + offset = page*page_size query += f"\tORDER BY {order_by} {direction}\n" - query += f"\tLIMIT {page_size}\n" - query += f"\tOFFSET {page*page_size};\n" + query += "\tLIMIT :page_size\n" + query += "\tOFFSET :offset;\n" + + query_args["page_size"] = page_size + query_args["offset"] = offset + query_args["order_by"] = order_by - return query, {"status": "success"} + return query, query_args, {"status": "success"} From 165b6d95234b938eea5966785b1c6bc4b60d066d Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 24 May 2023 11:51:25 -0500 Subject: [PATCH 41/48] Update collecting-data docs page Signed-off-by: Isaac Milarsky --- augur/application/config.py | 1 + .../getting-started/collecting-data.rst | 122 ++++++------------ 2 files changed, 44 insertions(+), 79 deletions(-) diff --git a/augur/application/config.py b/augur/application/config.py index 2472da17e8..c9aff085b1 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -37,6 +37,7 @@ def get_development_flag(): "github": "", "gitlab": "" }, + #TODO: a lot of these are deprecated. "Facade": { "check_updates": 1, "create_xlsx_summary_files": 1, diff --git a/docs/source/getting-started/collecting-data.rst b/docs/source/getting-started/collecting-data.rst index ea7778427b..3996999737 100644 --- a/docs/source/getting-started/collecting-data.rst +++ b/docs/source/getting-started/collecting-data.rst @@ -1,7 +1,7 @@ Collecting data =============== -Now that you’ve installed Augur’s application server, it’s time to configure your data collection workers. If you just want to run Augur using the one repository in the default database, and default worker settings, all you need to do is start the redis server in one terminal, make sure rabbitmq is running, and the augur application in the other terminal. (Don't forget that the AUGUR_DB environment variable needs to be set in the terminal, or set permanently) +Now that you’ve installed Augur’s application server, it’s time to configure data collection if needed. If you just want to run Augur using the default repositories in the default database, and default celery collection settings, all you need to do is start the redis server in one terminal, make sure rabbitmq is running, and the augur application in the other terminal. (Don't forget that the AUGUR_DB environment variable needs to be set in the terminal, or set permanently) .. code-block:: bash @@ -10,13 +10,6 @@ Now that you’ve installed Augur’s application server, it’s time to configu # Starts the redis server redis-server -.. code-block:: bash - - # Terminal Window 2 - - # Start celery worker so it can accept tasks - celery -A augur.tasks.init.celery_app.celery_app worker --loglevel=info - .. code-block:: bash @@ -29,50 +22,42 @@ Now that you’ve installed Augur’s application server, it’s time to configu augur backend stop augur backend kill -Now, here's a ton of brain-splitting detail about workers, and their configuration. There are 2 pieces to data collection with Augur: the housekeeper, and the data collection workers. The housekeeper creates long-running "jobs" that specify what kind of data to collect for what set of repositories. 
The data collection workers can then accept these jobs, after which they will use the information provided in the job to find the repositories in question and collect the requested data. - -Since the default housekeeper setup will work for most use cases, we'll first cover how to configure the workers and then briefly touch on the housekeeper configuration options, after which we'll cover how to add repos and repo groups to the database. - -Configuring the Workers ------------------------- - -There are a few workers that ship ready to collect out of the box: +Now, here's a ton of brain-splitting detail about celery collection. There are 2 pieces to data collection with Augur: the celery worker processes, and the job messages passed through rabbitmq. The jobs to collect are determined by a monitor process started through the cli that starts the rest of augur. The monitor process generates the jobs messages to send to rabbitmq through the collection_status table that informs the status of jobs that have yet to be run. The celery collection workers can then accept these jobs, after which they will use the information provided in the job to find the repositories in question and collect the requested data. -- ``facade_worker`` (collects raw commit and contributor data by parsing Git logs) -- ``github_worker`` (collects issue data from the GitHub API) -- ``contributor_worker`` (collects contributor data from the GitHub API) -- ``pull_request_worker`` (collects pull request data from the GitHub API) -- ``repo_info_worker`` (collects repository statistics from the GitHub API) -- ``release_worker`` (collects release data from the GitHub API) -- ``linux_badge_worker`` (collects `CII badging `_ data from the CII API) -- ``insight_worker`` (queries Augur's metrics API to find interesting anomalies in the collected data) +Since the default setup will work for most use cases, we'll first cover how to configure some specific data collection jobs and then briefly touch on the celery configuration options, after which we'll cover how to add repos and repo groups to the database. -All worker configuration options are found in the ``Workers`` block of the ``augur.config.json`` file (which was generated for you at the end of the previous section). This file is located at ``$HOME/.augur/augur.config.json``. Each worker has its subsection with the same title as the worker's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. +Configuring Collection +---------------------- -Standard configuration options -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +There are many collection jobs that ship ready to collect out of the box: -Each worker has 3 configuration options that are standard across all workers. The worker-specific options are detailed in the sections following this one. 
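(A brief illustrative aside, not taken from these patches: the rewritten page above says collection settings now live in the ``config`` table of the ``augur_operations`` schema, so one quick way to inspect a section is to query that table directly. The column names ``section_name``, ``setting_name`` and ``value`` below are assumptions about that table's layout, so verify them against your own database before relying on this. The bound ``:section`` parameter also mirrors the parameterized-query style adopted in the SQL-injection fix earlier in this series.)

.. code-block:: python

   # Minimal sketch: list every setting in one config section.
   # Assumes AUGUR_DB holds the same connection string Augur itself uses and
   # that augur_operations.config has section_name / setting_name / value columns.
   import os
   import sqlalchemy as s

   engine = s.create_engine(os.environ["AUGUR_DB"])

   with engine.connect() as conn:
       rows = conn.execute(
           s.text("SELECT setting_name, value FROM augur_operations.config "
                  "WHERE section_name = :section"),
           {"section": "Facade"},
       ).fetchall()

   for setting_name, value in rows:
       print(setting_name, "=", value)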
+- ``augur.tasks.git.facade_taks`` (collects raw commit and contributor data by parsing Git logs) +- ``augur.tasks.github`` (parent module of all github specific collection jobs) +- ``augur.tasks.github.contributors.tasks`` (collects contributor data from the GitHub API) +- ``augur.tasks.github.pull_requests.tasks`` (collects pull request data from the GitHub API) +- ``augur.tasks.github.repo_info.tasks`` (collects repository statistics from the GitHub API) +- ``augur.tasks.github.releases.tasks`` (collects release data from the GitHub API) +- ``augur.tasks.data_analysis.insight_worker.tasks`` (queries Augur's metrics API to find interesting anomalies in the collected data) -The standard options are: - -- ``switch``, a boolean flag indicating if the worker should automatically be started with Augur. Defaults to ``0`` (false). -- ``workers``, the number of instances of this worker that Augur should spawn if ``switch`` is set to ``1``. Defaults to ``1`` for all workers except the ``value_worker`` and ``insight_worker``. -- ``port``, which is the base TCP port the worker will use the communicate with Augur's broker. The default is different for each worker, but the lowest is ``50100`` and each worker increments the default starting port by 100. If the ``workers`` parameter is > 1, then workers will bind to ``port`` + ``i`` for the ``i``'th worker spawned - -Keeping ``workers`` at 1 should be fine for small collection sets, but if you have a lot of repositories to collect data for, you can raise it. We also suggest double-checking that the default worker ports are free on your machine. +All worker configuration options are found in the config table generated when augur was installed. The config table is located in the augur_operations schema of your postgresql database. Each configurable data collection job set has its subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured. Worker-specific configuration options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Next up are the configuration options specific to each worker (but some workers require no additional configuration beyond the defaults). The most pertinent of these options is the ``facade_worker's`` ``repo_directory``, so make sure to pay attention to that one. +Next up are the configuration options specific to some collection tasks (but some tasks require no additional configuration beyond the defaults). The most pertinent of these options is the ``Facade`` section ``repo_directory``, so make sure to pay attention to that one. -``facade_worker`` +``Facade`` :::::::::::::::::: -- ``repo_directory``, which is the local directory where the ``facade_worker`` will clone the repositories it needs to analyze. You should have been prompted for this during installation, but if you need to change it, make sure that it's an absolute path (environment variables like ``$HOME`` are not supported) and that the directory already exists. Defaults to ``repos/``, but it's highly recommended you change this. +- ``repo_directory``, which is the local directory where the facade tasks will clone the repositories it needs to analyze. 
You should have been prompted for this during installation, but if you need to change it, make sure that it's an absolute path (environment variables like ``$HOME`` are not supported) and that the directory already exists. Defaults to ``repos/``, but it's highly recommended you change this. +- ``limited_run``, toggle between 0 and 1 to determine whether to run all facade tasks or not. Runs all tasks if set to 0 +- ``pull_repos``, toggle whether to pull updates from repos after cloning them. If turned off updates to repos will not be collected. +- ``run_analysis``, toggle whether to process commit data at all. If turned off will only clone repos and run tertiary tasks such as resolving contributors from any existing commits or collecting dependency relationships. Mainly used for testing. +- ``run_facade_contributors``, toggle whether to run contributor resolution tasks. This will process and parse through commit data to link emails to contributors as well as aliases, etc. +- ``force_invalidate_caches``, set every repo to reset the status of commit email affillation, which is the organization that an email is associated with. +- ``rebuild_caches``, toggle whether to enable parsing through commit data to determine affillation and web cache -``insight_worker`` +``Insight_Task`` :::::::::::::::::: We recommend leaving the defaults in place for the insight worker unless you are interested in other metrics, or anomalies for a different time period. @@ -83,63 +68,42 @@ We recommend leaving the defaults in place for the insight worker unless you are - ``contamination``, which is the "sensitivity" parameter for detecting anomalies. Acts as an estimated percentage of the training_days that are expected to be anomalous. The default is ``0.041`` for the default training days of 365: 4.1% of 365 days means that about 15 data points of the 365 days are expected to be anomalous. -- ``metrics``, which specifies which metrics the ``insight_worker`` should run the anomaly detection algorithm on. This is structured like so:: - { - 'endpoint_name_1': 'field_1_of_endpoint', - 'endpoint_name_1': 'field_2_of_endpoint', - 'endpoint_name_2': 'field_1_of_endpoint', - ... - } - - # defaults to the following - - { - "issues-new": "issues", - "code-changes": "commit_count", - "code-changes-lines": "added", - "reviews": "pull_requests", - "contributors-new": "new_contributors" - } - -``value_worker`` -:::::::::::::::::: +- ``switch``, toggles whether to run insight tasks at all. +- ``workers``, number of worker processes to use for insight tasks. -- ``scc_bin``, the command that the ``value_worker`` should use to invoke ``scc``. If installed with ``go get github.com/boyter/scc``, then the default of ``scc`` should probably work, but double check for your particular Go installation. +``Task_Routine`` +:::::::::::::::::: + +This section is for toggling sets of jobs on or off. -Housekeeper ------------- +- ``prelim_phase``, toggles whether to run preliminary tasks that check to see whether repos are valid or not. 
+- ``primary_repo_collect_phase``, toggle the standard collection jobs, mainly pull requests and issues +- ``secondary_repo_collect_phase``, toggle the secondary collection jobs, mainly jobs that take a while +- ``facade_phase``, toggle all facade jobs +- ``machine_learning_phase``, toggle all ml related jobs -**We strongly recommend leaving the default housekeeper blocks generated by the installation process, but if you would like to know more, or fine-tune them to your needs, read on.** +Celery Configuration +-------------------- -The housekeeper is responsible for generating the tasks that will tell the workers what data to collect, and how. Housekeeper configuration options are found in the ``Housekeeper`` block of the config file. The ``Housekeeper`` block has a single key, ``jobs``, which is an array of tasks the housekeeper should create. Each task has the following structure:: +**We strongly recommend leaving the default celery blocks generated by the installation process, but if you would like to know more, or fine-tune them to your needs, read on.** - { - "delay": , - "given": [ - "" - ], - "model": "", - "repo_group_id": , - ... //other task-specific parameters - } +The celery monitor is responsible for generating the tasks that will tell the other worker processes what data to collect, and how. The ``Celery`` block has 2 keys; one for memory cap and one for materialized views interval. +- ``worker_process_vmem_cap``, float between zero and one that determines the maximum percentage of total memory to use for worker processes -- The ``delay`` parameter is the amount of time the housekeeper should wait before scheduling a new update task. -- The ``given`` parameter is used in conjunction with the ``model`` parameter to determine which workers can accept a data collection task. Each worker can collect data if it is "given" data in a certain format, for example, a ``github_url`` (in the case of the ``github_worker`` and ``pull_request_worker``) or perhaps just any valid ``git_url`` (as in the case of the ``facade_worker``). -- The ``model`` parameter is the other parameter used to determine which workers can accept a given task. It represents the part of the conceptual data model that the worker can fulfill; for example, the ``facade_worker`` fills out the ``commits`` model since it primarly gathers data about commits, and the ``github_worker`` fills out both the ``issues`` and ``contributors`` model. -- The ``repo_group_id`` parameter specifies which group of repos the housekeeper should collect data for; use the default of ``0`` to specify ALL repo groups in the database. +- ``refresh_materialized_views_interval_in_days``, number of days to wait between refreshes of materialized views. Adding repos for collection ----------------------------- -If you're using the Docker container, you can use the `provided UI <../docker/usage.html>`_ to load your repositories. Otherwise, you'll need to use the `Augur CLI `_ to load your repositories. Please reference the respective sections of the documentation for detailed instructions on how to accomplish both of these steps. +If you're using the Docker container, you can use the `provided UI <../docker/usage.html>`_ to load your repositories. Otherwise, you'll need to use the `Augur CLI `_ or the augur frontend to load your repositories. Please reference the respective sections of the documentation for detailed instructions on how to accomplish both of these steps. Running collections -------------------- Congratulations! 
At this point you (hopefully) have a fully functioning and configured Augur instance. -After you've loaded your repos, you're ready for your first collection run. We recommend running only the default workers first to gather the initial data. If you're collecting data for a lot of repositories, or repositories with a lot of data, we recommend increasing the number of ``github_workers`` and ``pull_request_workers``. +After you've loaded your repos, you're ready for your first collection run. We recommend running only the default jobs first to gather the initial data. You can now run Augur and start the data collection by issuing the ``augur backend start`` command in the root ``augur`` directory. All your logs (including worker logs and error files) will be saved to a ``logs/`` subdirectory in that same folder, but this can be customized - more on that and other logging utilities `in the development guide <../development-guide/logging.html>`_. From 71b56965507f76e8c66d5321e7d8fe6c693332c9 Mon Sep 17 00:00:00 2001 From: Seltyk Date: Wed, 24 May 2023 12:55:58 -0400 Subject: [PATCH 42/48] Update new-install.md - fix PostgreSQL 15 permissions - fix AVX on VMs - start SQL SSL connection and RabbitMQ server Signed-off-by: Seltyk --- docs/new-install.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/new-install.md b/docs/new-install.md index 9f46f496ef..072dfada97 100644 --- a/docs/new-install.md +++ b/docs/new-install.md @@ -71,11 +71,22 @@ CREATE USER augur WITH ENCRYPTED PASSWORD 'password'; GRANT ALL PRIVILEGES ON DATABASE augur TO augur; ``` -Once you are successfully logged out, return to your user by exiting `psql`, then typing `exit` to exit the postgres user, and `exit` a SECOND time to exit the root user. +**If you're using PostgreSQL 15 or later**, default database permissions will prevent Augur's installer from configuring the database. Add one last line after the above to fix this: +```sql +GRANT ALL ON SCHEMA public TO augur; +``` + +After that, return to your user by exiting `psql` ``` postgres=# \quit ``` +Here we want to start an SSL connection to the `augur` database on port 5432: +```shell +psql -h localhost -U postgres -p 5432 +``` + +Now type `exit` to log off the postgres user, and `exit` a SECOND time to log off the root user. ```shell exit exit @@ -98,6 +109,11 @@ sudo rabbitmqctl set_permissions -p augur_vhost augur ".*" ".*" ".*" NOTE: it is important to have a static hostname when using rabbitmq as it uses hostname to communicate with nodes. +RabbitMQ's server can then be started from systemd: +```shell +sudo systemctl start rabbitmq-server +``` + If your setup of rabbitmq is successful your broker url should look like this: **broker_url = `amqp://augur:password123@localhost:5672/augur_vhost`** @@ -139,7 +155,7 @@ Where AugurB is the vhost. The management API at port 15672 will only exist if y ## Proxying Augur through Nginx Assumes nginx is installed. -Then you create a file for the server you want Augur to run under in the location of your `sites-enabled` directory for nginx (In this example, Augur is running on port 5038: (the long timeouts on the settings page is for when a user adds a large number of repos or orgs in a single session to prevent timeouts from nginx) +Then you create a file for the server you want Augur to run under in the location of your `sites-enabled` directory for nginx. 
In this example, Augur is running on port 5038: (the long timeouts on the settings page is for when a user adds a large number of repos or orgs in a single session to prevent timeouts from nginx) ``` server { @@ -324,6 +340,8 @@ To access command line options, use `augur --help`. To load repos from GitHub or Start a Flower Dashboard, which you can use to monitor progress, and report any failed processes as issues on the Augur GitHub site. The error rate for tasks is currently 0.04%, and most errors involve unhandled platform API timeouts. We continue to identify and add fixes to handle these errors through additional retries. Starting Flower: `(nohup celery -A augur.tasks.init.celery_app.celery_app flower --port=8400 --max-tasks=1000000 &)` NOTE: You can use any open port on your server, and access the dashboard in a browser with http://servername-or-ip:8400 in the example above (assuming you have access to that port, and its open on your network.) +If you're using a virtual machine within Windows and you get an error about missing AVX instructions, you should kill Hyper-V. Even if it doesn't *appear* to be active, it might still be affecting your VM. Follow [these instructions](https://stackoverflow.com/a/68214280) to disable Hyper-V, and afterward AVX should pass to the VM. + ## Starting your Augur Instance Start Augur: `(nohup augur backend start &)` From 52a95957be9bb7218e36c44a6e68b192415a5eea Mon Sep 17 00:00:00 2001 From: Seltyk Date: Wed, 24 May 2023 13:18:20 -0400 Subject: [PATCH 43/48] Update new-install.rst - see previous commit; they're matched Signed-off-by: Seltyk --- docs/new-install.rst | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/docs/new-install.rst b/docs/new-install.rst index 86f6f0eeb1..639cf08c8f 100644 --- a/docs/new-install.rst +++ b/docs/new-install.rst @@ -100,14 +100,29 @@ Then, from within the resulting postgresql shell: CREATE USER augur WITH ENCRYPTED PASSWORD 'password'; GRANT ALL PRIVILEGES ON DATABASE augur TO augur; -Once you are successfully logged out, return to your user by exiting -``psql``, then typing ``exit`` to exit the postgres user, and ``exit`` a -SECOND time to exit the root user. +**If you're using PostgreSQL 15 or later**, default database permissions will +prevent Augur's installer from configuring the database. Add one last line +after the above to fix this: + +.. code:: sql + + GRANT ALL ON SCHEMA public TO augur; + +After that, return to your user by exiting ``psql`` :: postgres=# \quit +Here we want to start an SSL connection to the ``augur`` database on port 5432: + +.. code:: shell + + psql -h localhost -U postgres -p 5432 + +Now type ``exit`` to log off the postgres user, and ``exit`` a SECOND time to +log off the root user. + .. code:: shell exit @@ -136,6 +151,12 @@ instance. You can accomplish this by running the below commands: NOTE: it is important to have a static hostname when using rabbitmq as it uses hostname to communicate with nodes. +RabbitMQ's server can then be started from systemd: + +.. code:: shell + + sudo systemctl start rabbitmq-server + If your setup of rabbitmq is successful your broker url should look like this: @@ -439,6 +460,12 @@ NOTE: You can use any open port on your server, and access the dashboard in a browser with http://servername-or-ip:8400 in the example above (assuming you have access to that port, and its open on your network.) 
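(A small optional check, not part of this patch: before reaching for the Flower dashboard, you can confirm that the broker and at least one worker are reachable by pinging them through the Celery control API. The import path below is the one used throughout this series; the five-second timeout is an arbitrary choice.)

.. code-block:: python

   # Sketch: ping every Celery worker connected to the RabbitMQ broker.
   # An empty reply list usually means the workers are not running or the
   # broker_url / vhost configured earlier is wrong.
   from augur.tasks.init.celery_app import celery_app

   replies = celery_app.control.ping(timeout=5)
   if replies:
       for reply in replies:
           print(reply)  # e.g. {'core:...@hostname': {'ok': 'pong'}}
   else:
       print("no workers responded -- check RabbitMQ and the broker_url")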
+If you're using a virtual machine within Windows and you get an error about +missing AVX instructions, you should kill Hyper-V. Even if it doesn't *appear* +to be active, it might still be affecting your VM. Follow +`these instructions `_ to disable +Hyper-V, and afterward AVX should pass to the VM. + Starting your Augur Instance ---------------------------- From 85100ce5989410ad16dc86620ca098cd3395e4f3 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 24 May 2023 17:33:06 -0500 Subject: [PATCH 44/48] Update a bunch of the worker docs to reflect present tasks Signed-off-by: Isaac Milarsky --- .../create-a-metric/metrics-steps.rst | 2 +- docs/source/development-guide/logging.rst | 4 +- .../workers/clustering_worker.rst | 34 ++--- docs/source/schema/overview.rst | 39 +++--- docs/source/schema/regularly_used_data.rst | 123 ++++++++---------- docs/source/schema/working_tables.rst | 10 +- 6 files changed, 89 insertions(+), 123 deletions(-) diff --git a/docs/source/development-guide/create-a-metric/metrics-steps.rst b/docs/source/development-guide/create-a-metric/metrics-steps.rst index c9225f0729..fe0138871c 100644 --- a/docs/source/development-guide/create-a-metric/metrics-steps.rst +++ b/docs/source/development-guide/create-a-metric/metrics-steps.rst @@ -79,4 +79,4 @@ If we look at the Augur Schema, we can see that effort and cost are contained in .. note:: - Augur uses https://github.com/boyter/scc to calculate information contained in the ``labor_value`` table, which is populated by the ``value_worker``. + Augur uses https://github.com/boyter/scc to calculate information contained in the ``labor_value`` table, which is populated by the ``value_worker`` tasks. diff --git a/docs/source/development-guide/logging.rst b/docs/source/development-guide/logging.rst index a37d1abf16..cdff38c10d 100644 --- a/docs/source/development-guide/logging.rst +++ b/docs/source/development-guide/logging.rst @@ -3,8 +3,8 @@ Logging Augur's log output can be configured with some basic verbosity and log levels. If you are contributing to Augur, we recommend you set the ``debug`` flag in the ``Logging`` section of your config file to ``1``. This will -turn the verbosity up, capture **all** logs of every level, and it will allow the workers to print their output to the screen -if they are being run manually in a separate terminal (as is often the case when one is developing a worker). +turn the verbosity up, capture **all** logs of every level, and it will allow the data collection tasks to print their output to the screen +if they are being run manually in a separate terminal. The verbosity and minimum log level can be controlled with the ``verbose`` (boolean flag) and ``log_level`` (one of ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR``, or ``CRITICAL``) options respectively. There is also diff --git a/docs/source/development-guide/workers/clustering_worker.rst b/docs/source/development-guide/workers/clustering_worker.rst index 8ea363564e..a89bff3501 100644 --- a/docs/source/development-guide/workers/clustering_worker.rst +++ b/docs/source/development-guide/workers/clustering_worker.rst @@ -1,8 +1,8 @@ -Clustering Worker +Clustering Task ========================== -The worker analyzes the comments in issues and pull requests, and clusters the repositories based on contents of those messages. -The worker also performs topic modeling using Latent Dirichlet allocation +The task analyzes the comments in issues and pull requests, and clusters the repositories based on contents of those messages. 
+The task also performs topic modeling using Latent Dirichlet allocation Clustering of text documents @@ -11,9 +11,9 @@ Clustering of text documents Clustering is a type of unsupervised machine learning technique that involves grouping together similar data points. In case of textual data, it involves grouping together semantically similar documents. The document is a collection of sentences. In our case, document represents the collection of comments across issues and pull requests across a particular repository. Since, clustering algorithm works with numerical features, we need to first convert documents into vector representation. -Worker Implementation ---------------------- -The worker performs two tasks — clustering of the repositories represented as documents (collection of all messages from issues and pull requests within the repository) and topic modeling. If the pre-trained model doesn’t exist in the worker folder, the data from all the repository in the connected database are used to train the model. After the training, the following model files are dumped in the worker folder +Implementation +-------------- +The task performs two tasks — clustering of the repositories represented as documents (collection of all messages from issues and pull requests within the repository) and topic modeling. If the pre-trained model doesn’t exist in the clustering task's folder, the data from all the repository in the connected database are used to train the model. After the training, the following model files are dumped in the clustering task's folder - vocabulary : the set of features obtained from TF-IDF vectorization on text data (required in prediction phase) - kmeans_repo_messages : trained kmeans clustering model on tfidf features @@ -25,28 +25,14 @@ In addition, the training phase populates the ‘topic words’ database table w **Prediction** -If the trained model exists in the worker directory, the prediction is made on the documents corresponding to the repositories in the repo groups specified in the configuration. The worker populates the following tables +If the trained model exists in the task directory, the prediction is made on the documents corresponding to the repositories in the repo groups specified in the configuration. The task populates the following tables repo_topic : stores probability distribution over the topics for a particular repository repo_cluster_messages : stores clustering label assigned to a repository -Worker Configuration +Task Configuration -------------------- -Like standard worker configuration, we need to define delay, given, model and repo_group_id in housekeeper configuration block. - -{ - - "delay": 10000, - - "given":["git_url"], - - "model" : "clustering", - - "repo_group_id" : 60003 - -} - -Further, in workers configuration block, we need to define port, switch and number of workers. +For this task's configuration, in workers configuration block, we need to define port, switch and number of workers. .. 
code-block:: json @@ -60,7 +46,7 @@ Further, in workers configuration block, we need to define port, switch and numb "num_clusters" : 4 } -Additional Worker Parameters in `augur.config.json`: +Additional Worker Parameters: ------------------------------------------------------ In addition to standard worker parameters, clustering worker requires some worker-specific parameters which are described below: diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst index 718e25a870..32d8900f51 100644 --- a/docs/source/schema/overview.rst +++ b/docs/source/schema/overview.rst @@ -36,42 +36,43 @@ Augur Data ------------------------------------------------------- The ``augur_data`` schema contains *most* of the information analyzed -and constructed by Augur. The origin’s of the data inside of augur are: +and constructed by Augur. The origin’s of the data inside of augur are +from data collection tasks and populate this schema.: -1. ``workers/augur_github_worker``: Pulls data from the GitHub API. -Presently this is focused on issues, including issue_comments, -issue_events, issue_labels and contributors. Note that all messages are -stored in Augur in the ``messages`` table. This is to facilitate easy -analysis of the tone and characteristics of text communication in a -project from one place. +1. ``augur.tasks.github.*``: Tasks that pull data from the GitHub API. +Primarily, pull requests and issues are collected before more complicated +data. Note that all messages are stored in Augur in the ``messages`` table. +This is to facilitate easy analysis of the tone and characteristics of text +communication in a project from one place. -2. ``workers/facade_worker``: Based on +2. ``augur.tasks.git.facade_tasks``: Based on http://www.github.com/brianwarner/facade, but substantially modified in the fork located at http://github.com/sgoggins/facade. The modifications include modularization of code, connections to Postgresql data instead -of MySQL and other changes noted in the commit logs. +of MySQL and other changes noted in the commit logs. Further modifications +have been made to work with augur as well as seemlessly integrate it into +data collection. -3. ``workers/insight_worker``: Generates summarizations from raw data +3. ``augur.tasks.data_analysis.insight_worker.tasks``: Generates summarizations from raw data gathered from commits, issues, and other info. -4. ``workers/linux_badge_worker``: Pulls data from the Linux Foundation’s -badging program. - -5. ``workers/value_worker``: Populates the table -``repo_labor`` using the “SCC” tool provided the -https://github.com/boyter/scc project. “SCC” required Go to be installed on your system. Visit `this resource `__ for instructions on Go installation. - -6. ``workers/pull_request_worker``: Collects Pull Request related data such as commits, contributors,assignees, etc. from the Github API and stores it in the Augur database. +4. ``augur.tasks.github.pull_requests.tasks``: Collects Pull Request related data such as commits, contributors,assignees, etc. from the Github API and stores it in the Augur database. Augur Operations ------------------------------------------------------- The ``augur_operations`` tables are where most of the operations tables -are going to exist. There are a few, like ``settings`` that remain in +exist. There are a few, like ``settings`` that remain in ``augur_data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. 
They also store information including API keys. +Some key tables in this schema include: + +- ``config``, which contains the config options for the application. Key options include the facade repo_directory as well as primary api key. + +- ``collection_status``, contains the status of each aspect of data collection for each repo added to Augur. For example, it shows the status of the facade jobs for every repository. + SPDX ------------------------------------------------------- diff --git a/docs/source/schema/regularly_used_data.rst b/docs/source/schema/regularly_used_data.rst index 7e6f504b53..826b0c8200 100644 --- a/docs/source/schema/regularly_used_data.rst +++ b/docs/source/schema/regularly_used_data.rst @@ -1,14 +1,14 @@ List of Regularly Used Data Tables In Augur =========================================== -**This is a list of data tables in augur that are regularly used and the various workers attached to them.** +**This is a list of data tables in augur that are regularly used and the various tasks attached to them.** Commits ------- This is where a record for every file in every commit in every repository in an Augur instance is kept. - * Worker: Facade worker collects, and also stores platform user information in the commits table. + * Task: Facade tasks collect, and also stores platform user information in the commits table. .. image:: images/commits.png :width: 200 @@ -30,7 +30,7 @@ Contributor_repo Storage of a snowball sample of all the repositories anyone in your schema has accessed on GitHub. So, for example, if you wanted to know all the repositories that people on your project contributed to, this would be the table. - * Contributor_breadth_worker populates this table + * contributor_breadth_model populates this table * Population of this table happens last, and can take a long time. .. image:: images/contributor_repo.png @@ -41,13 +41,13 @@ Contributors These are all the contributors to a project/repo. In Augur, all types of contributions create a contributor record. This includes issue comments, pull request comments, label addition, etc. This is different than how GitHub counts contributors; they only include committers. - * Workers Adding Contributors: + * Tasks Adding Contributors: - * Github Issue Worker - * Pull Request Worker - * GitLab Issue Worker - * GitLab Merge Request Worker - * Facade Worker + * Github Issue Tasks + * Pull Request Tasks + * GitLab Issue Tasks + * GitLab Merge Request Tasks + * Facade Tasks .. image:: images/contributors.png :width: 200 @@ -57,9 +57,9 @@ Contributors_aliases These are all the alternate emails that the same contributor might use. These records arise almost entirely from the commit log. For example, if I have two different emails on two different computers that I use when I make a commit, then an alias is created for whatever the 2nd to nth email Augur runs across. If a user’s email cannot be resolved, it is placed in the unresolved_commit_emails table. Coverage is greater than 98% since Augur 1.2.4. - * Worker: + * Tasks: - * Facade Worker + * Facade Tasks .. image:: images/contributors_aliases.png :width: 200 @@ -67,7 +67,7 @@ Contributors_aliases Discourse_insights ------------------ -There are nine specific discourse act types identified by the computational linguistic algorithm that underlies the discourse insights worker. This worker analyzes each comment on each issue or pull request sequentially so that context is applied when determining the discourse act type. 
These types are: +There are nine specific discourse act types identified by the computational linguistic algorithm that underlies the discourse insights task. This task analyzes each comment on each issue or pull request sequentially so that context is applied when determining the discourse act type. These types are: * negative-reaction * answer @@ -79,18 +79,18 @@ There are nine specific discourse act types identified by the computational ling * announcement * appreciation - * Worker: + * Tasks: - * Discourse Insights Worker + * Discourse Insights Task .. image:: images/discourse_insights.png :width: 200 issue_assignees || issue_events || issue_labels ---------------------------------------------- - * Worker: + * Task: - * Github or Gitlab Issues Worker + * Github or Gitlab Issues Task .. image:: images/issue_assignees.png :width: 200 @@ -100,9 +100,9 @@ issue_message_ref A link between the issue and each message stored in the message table. - * Worker: + * Task: - * Github or Gitlab Issues Worker + * Github or Gitlab Issues Task .. image:: images/issue_message_ref.png :width: 200 @@ -112,9 +112,9 @@ issues Is all the data related to a GitHub Issue. - * Worker: + * Task: - * Github or Gitlab Issues Worker + * Github or Gitlab Issues Task .. image:: images/issues.png :width: 200 @@ -132,9 +132,9 @@ Message_analysis Two factors evaluated for every pull request on issues message: What is the sentiment of the message (positive or negative), and what is the novelty of the message in the context of other messages in that repository. - * Worker: + * Task: - * Message Insights Worker + * Message Insights Task .. image:: images/message_analysis.png :width: 200 @@ -144,9 +144,9 @@ Message_analysis_summary A summary level representation of the granular data in message_analysis. - * Worker: + * Task: - * Message Insights Worker + * Message Insights Task .. image:: images/message_analysis_summary.png :width: 200 @@ -156,21 +156,15 @@ Platform Reference data with two rows: one for GitHub, one for GitLab. - * Worker: - * Platform_worker - - .. image:: images/platform.png - :width: 200 - Pull_request_analysis --------------------- - A representation of the probability of a pull request being merged into a repository, based on analysis of the properties of previously merged pull requests in a repository. (Machine learning worker) + A representation of the probability of a pull request being merged into a repository, based on analysis of the properties of previously merged pull requests in a repository. (Machine learning tasks) - * Worker: + * Task: - * Pull request analysis worker + * Pull request analysis task .. image:: images/pull_request_analysis.png :width: 200 @@ -228,9 +222,9 @@ Releases Github declared software releases or release tags. For example: https://github.com/chaoss/augur/releases - * Worker: + * Task: - * Release Worker. + * Release Task. .. image:: images/releases.png :width: 200 @@ -248,21 +242,15 @@ Repo_badging A list of CNCF badging information for a project. Reads this api endpoint: https://bestpractices.coreinfrastructure.org/projects.json - * Worker: - - * linux_badge_worker - - .. image:: images/repo_badging.png - :width: 200 Repo_cluster_messages --------------------- Identifying which messages and repositories are clustered together. Identifies project similarity based on communication patterns. - * Worker: + * Task: - * Clustering Worker + * Clustering task .. 
image:: images/repo_cluster_messages.png :width: 200 @@ -272,9 +260,9 @@ Repo_dependencies Enumerates every dependency, including dependencies that are not package managed. - * Worker: + * Task: - * deps_worker + * process_dependency_metrics .. image:: images/repo_dependencies.png :width: 200 @@ -282,15 +270,15 @@ Repo_dependencies Repo_deps_libyear ----------------- - (enumerates every package managed dependency) Looks up the latest release of any library that is imported into a project. Then it compares that release date, the release version of the library version in your project (and its release date), and calculates how old your version is, compared to the latest version. The resulting statistic is “libyear”. This worker runs at least once a month, so over time, you will see if your libraries are being kept up to date, or not. + (enumerates every package managed dependency) Looks up the latest release of any library that is imported into a project. Then it compares that release date, the release version of the library version in your project (and its release date), and calculates how old your version is, compared to the latest version. The resulting statistic is “libyear”. This task runs with the facade tasks, so over time, you will see if your libraries are being kept up to date, or not. * Scenarios: * If a library is updated, but you didn’t change your version, the libyear statistic gets larger * If you updated a library and it didn’t get older, the libyear statistic gets smaller. - * Worker: + * Task: - * deps_libyear_worker + * process_libyear_dependency_metrics .. image:: images/repo_deps_libyear.png :width: 200 @@ -300,9 +288,9 @@ Repo_deps_scorecard Runs the OSSF Scorecard over every repository ( https://github.com/ossf/scorecard ) : There are 16 factors that are explained at that repository location. - * Worker: + * Task: - * deps_worker + * process_ossf_scorecard_metrics .. image:: images/repo_deps_scorecard.png :width: 200 @@ -318,11 +306,11 @@ Repo_groups Repo_info --------- - This worker gathers metadata from the platform API that includes things like “number of stars”, “number of forks”, etc. AND it also gives us : Number of issues, number of pull requests, etc. .. THAT information we use to determine if we have collected all of the PRs and Issues associated with a repository. + This task gathers metadata from the platform API that includes things like “number of stars”, “number of forks”, etc. AND it also gives us : Number of issues, number of pull requests, etc. .. THAT information we use to determine if we have collected all of the PRs and Issues associated with a repository. - * Worker: + * Task: - * repo info worker + * repo info task .. image:: images/repo_info.png :width: 200 @@ -330,9 +318,9 @@ Repo_info Repo_insights ----------- - * Worker: + * Task: - * Insight worker + * Insight task .. image:: images/repo_insights.png :width: 200 @@ -340,22 +328,13 @@ Repo_insights Repo_insights_records ---------- - * Worker: + * Task: - * Insight worker + * Insight task .. image:: images/repo_insights_records.png :width: 200 -Repo_labor --------- - - * Worker: - - * Value worker - - .. image:: images/repo_labor.png - :width: 200 Repo_meta --------- @@ -386,9 +365,9 @@ Repo_topic Identifies probable topics of conversation in discussion threads around issues and pull requests. - * Worker: + * Task: - * Clustering Worker + * Clustering task .. 
image:: images/repo_topic.png :width: 200 @@ -398,9 +377,9 @@ Topic_words Unigrams, bigrams, and trigrams associated with topics in the repo_topic table. - * Worker: + * Task: - * Clustering Worker + * Clustering task .. image:: images/topic_words.png :width: 200 @@ -410,9 +389,9 @@ Unresolved_commit_emails Emails from commits that were not initially able to be resolved using automated mechanisms. - * Worker: + * Task: - * Facade Worker. + * Facade Tasks. .. image:: images/unresolved_commit_emails.png :width: 200 diff --git a/docs/source/schema/working_tables.rst b/docs/source/schema/working_tables.rst index d26bca88c0..ffd3b2c08d 100644 --- a/docs/source/schema/working_tables.rst +++ b/docs/source/schema/working_tables.rst @@ -1,20 +1,20 @@ List of Working Data Tables In Augur =================================== -**This Is A List of Working Tables In Augur and The Workers Attached to Them.** +**This Is A List of Working Tables In Augur and The Tasks Attached to Them.** They are in lowercase to represent exactly how they look like on the actual table. - * analysis_log - this table is a record of the analysis steps the facade worker has taken on an augur instance. A listing of all the analysis steps taken for every repository is recorded as they are completed. + * analysis_log - this table is a record of the analysis steps the facade tasks have taken on an augur instance. A listing of all the analysis steps taken for every repository is recorded as they are completed. - * Worker Associated With It? + * Tasks Associated With It? - * Facade Worker + * Facade Tasks .. image:: images/analysis_log.png :width: 200 - * commit_parents - this table keeps a record of parent commits that are squashed during Facade Worker execution. + * commit_parents - this table keeps a record of parent commits that are squashed during Facade collection. .. 
image:: images/commit_parents.png :width: 200 From 7600b52e86fe4c2428841f85be7bd0338963ab9a Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 26 May 2023 14:37:39 -0500 Subject: [PATCH 45/48] Add linux badge worker functionality Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_data.py | 15 +++++++++++ augur/tasks/git/dependency_tasks/tasks.py | 4 +-- augur/tasks/github/repo_info/core.py | 33 ++++++++++++++++++++++- augur/tasks/github/repo_info/tasks.py | 16 +++++++++++ augur/tasks/init/celery_app.py | 2 +- augur/tasks/start_tasks.py | 4 +-- 6 files changed, 68 insertions(+), 6 deletions(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 70caa0230d..a49ae90249 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -1691,6 +1691,21 @@ class RepoBadging(Base): repo = relationship("Repo") + @staticmethod + def insert(session, repo_id: int, data: dict) -> dict: + + insert_statement = text("""INSERT INTO repo_badging (repo_id,tool_source,tool_version,data_source,data) + VALUES (:repo_id,:t_source,:t_version,:d_source,:data) + """).bindparams( + repo_id=repo_id, + t_source="collect_linux_badge_info", + t_version="0.50.3", + d_source="OSSF CII", + data=data + ) + + session.execute_sql(insert_statement) + class RepoClusterMessage(Base): __tablename__ = "repo_cluster_messages" diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 898de37cb1..0cdd333b25 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -25,10 +25,10 @@ def process_dependency_metrics(repo_git): @celery.task(base=AugurCoreRepoCollectionTask) -def process_ossf_scorecard_metrics(repo_git): +def process_ossf_dependency_metrics(repo_git): from augur.tasks.init.celery_app import engine - logger = logging.getLogger(process_ossf_scorecard_metrics.__name__) + logger = logging.getLogger(process_ossf_dependency_metrics.__name__) with DatabaseSession(logger, engine) as session: logger.info(f"repo_git: {repo_git}") diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 287f7368fd..ac2ccc5db4 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -12,7 +12,8 @@ from augur.tasks.github.util.gh_graphql_entities import hit_api_graphql, request_graphql_dict from augur.application.db.models import * from augur.tasks.github.util.github_task_session import * - +from augur.application.db.models.augur_data import RepoBadging +from urllib.parse import quote def query_committers_count(key_auth, logger, owner, repo): @@ -292,3 +293,33 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): logger.info(f"Inserted info for {owner}/{repo}\n") +def badges_model(logger,repo_git,repo_id,db): + """ Data collection and storage method + Query the CII API and store the result in the DB for the badges model + + This is a github task because it only covers github repos, this is not + part of the regular repo info model because it uses a differant api + github. + """ + cii_endpoint = "https://bestpractices.coreinfrastructure.org/projects.json?pq=" + + logger.info(f"Collecting badge data for {repo_git}") + git_url_extension = quote(repo_git[0:-4]) + + url = cii_endpoint + git_url_extension + logger.debug(f"Hitting CII endpoint: {url}") + + #Hit cii api with no api key. 
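+    # The CII/OSSF Best Practices endpoint is public, so no API key is passed;
+    # it returns a JSON array of projects matching the encoded repo URL, and an
+    # empty array simply means the repo has no badge entry (handled below).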
+ response = hit_api(None, url, logger) + + try: + response_data = response.json() + except: + response_data = json.loads(json.dumps(response.text)) + + #Insert any data that was returned + if len(response_data) > 0: + RepoBadging.insert(db, repo_id, data) + else: + logger.info(f"Could not find CII data for {repo_git}") + + diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index fe31e5800f..d35c5dbdf8 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -6,6 +6,8 @@ from augur.application.db.util import execute_session_query import traceback + +#Task to get regular misc github info @celery.task(base=AugurCoreRepoCollectionTask) def collect_repo_info(repo_git: str): @@ -17,3 +19,17 @@ def collect_repo_info(repo_git: str): repo = execute_session_query(query, 'one') repo_info_model(augur_db, manifest.key_auth, repo, logger) + + +#Task to get CII api data for linux badge info using github data. +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_linux_badge_info(repo_git: str): + + logger = logging.getLogger(collect_linux_badge_info.__name__) + + with GithubTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + badges_model(logger, repo_git, repo.repo_id, augur_db) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index b7c05986e7..48c5db32e4 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -132,7 +132,7 @@ def on_failure(self,exc,task_id,args, kwargs, einfo): 'augur.tasks.github.pull_requests.files_model.tasks.*': {'queue': 'secondary'}, 'augur.tasks.github.pull_requests.tasks.collect_pull_request_reviews': {'queue': 'secondary'}, 'augur.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'}, - 'augur.tasks.git.dependency_tasks.tasks.process_ossf_scorecard_metrics': {'queue': 'secondary'}, + 'augur.tasks.git.dependency_tasks.tasks.process_ossf_dependency_metrics': {'queue': 'secondary'}, 'augur.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'}, 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'}, 'augur.tasks.frontend.*': {'queue': 'frontend'} diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 32cb4e886d..c7b40943fd 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -22,7 +22,7 @@ from augur.tasks.github.repo_info.tasks import collect_repo_info from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits -from augur.tasks.git.dependency_tasks.tasks import process_ossf_scorecard_metrics +from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * @@ -107,7 +107,7 @@ def secondary_repo_collect_phase(repo_git): repo_task_group = group( process_pull_request_files.si(repo_git), process_pull_request_commits.si(repo_git), - process_ossf_scorecard_metrics.si(repo_git), + process_ossf_dependency_metrics.si(repo_git), chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)) ) From 
835997751407b65031e25694a993067e1f13f320 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 26 May 2023 14:43:51 -0500 Subject: [PATCH 46/48] add linux badge functionality to primary jobs Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index c7b40943fd..797a2903a1 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -19,7 +19,7 @@ from augur.tasks.data_analysis import * from augur.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary from augur.tasks.github.releases.tasks import collect_releases -from augur.tasks.github.repo_info.tasks import collect_repo_info +from augur.tasks.github.repo_info.tasks import collect_repo_info, collect_linux_badge_info from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics @@ -63,35 +63,30 @@ def prelim_phase_secondary(repo_git): #This is the phase that defines the message for core augur collection +#A chain is needed for each repo. def primary_repo_collect_phase(repo_git): logger = logging.getLogger(primary_repo_collect_phase.__name__) - #Here the term issues also includes prs. This list is a bunch of chains that run in parallel to process issue data. - issue_dependent_tasks = [] - #repo_info should run in a group - repo_info_tasks = [] - - np_clustered_array = [] - - #A chain is needed for each repo. - repo_info_task = collect_repo_info.si(repo_git)#collection_task_wrapper(self) + #Define primary group of jobs for the primary collect phase: issues and pull requests. primary_repo_jobs = group( collect_issues.si(repo_git), collect_pull_requests.si(repo_git) ) + #Define secondary group that can't run until after primary jobs have finished. secondary_repo_jobs = group( collect_events.si(repo_git),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, collect_github_messages.si(repo_git), #*create_grouped_task_load(dataList=first_pass,task=collect_github_messages).tasks, collect_github_repo_clones_data.si(repo_git), ) + #Other tasks that don't need other tasks to run before they do just put in final group. 
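+    # In Celery canvas terms: group() members run in parallel, while the chain()
+    # below enforces ordering -- issues and PRs are collected before events,
+    # messages, and clone data, with contributor processing last.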
repo_task_group = group( - repo_info_task, + collect_repo_info.si(repo_git), chain(primary_repo_jobs | issue_pr_task_update_weight_util.s(repo_git=repo_git),secondary_repo_jobs,process_contributors.si()), #facade_phase(logger,repo_git), - + collect_linux_badge_info.si(repo_git), collect_releases.si(repo_git), grab_comitters.si(repo_git) ) From 3dcd132e2b041d5e3010dff320e5a8eb40d055ad Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 26 May 2023 15:06:43 -0500 Subject: [PATCH 47/48] deal with insertion issue Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_data.py | 3 ++- augur/tasks/github/repo_info/core.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index a49ae90249..676a71deec 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -25,6 +25,7 @@ import logging import re from typing import List, Any, Dict +import json from augur.application.db.models.base import Base @@ -1701,7 +1702,7 @@ def insert(session, repo_id: int, data: dict) -> dict: t_source="collect_linux_badge_info", t_version="0.50.3", d_source="OSSF CII", - data=data + data=json.dumps(data,indent=4) ) session.execute_sql(insert_statement) diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index ac2ccc5db4..50fa88068e 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -302,6 +302,8 @@ def badges_model(logger,repo_git,repo_id,db): """ cii_endpoint = "https://bestpractices.coreinfrastructure.org/projects.json?pq=" + + #https://github.com/chaoss/grimoirelab-hatstall logger.info(f"Collecting badge data for {repo_git}") git_url_extension = quote(repo_git[0:-4]) @@ -318,7 +320,7 @@ def badges_model(logger,repo_git,repo_id,db): #Insert any data that was returned if len(response_data) > 0: - RepoBadging.insert(db, repo_id, data) + RepoBadging.insert(db, repo_id, response_data) else: logger.info(f"Could not find CII data for {repo_git}") From 170570da93bdd22e8de6bba94332167e48cd6c2f Mon Sep 17 00:00:00 2001 From: Seltyk Date: Tue, 30 May 2023 11:46:13 -0400 Subject: [PATCH 48/48] [docs] Clarify prompts during `make install` Signed-off-by: Seltyk --- docs/new-install.md | 14 +++++++++++--- docs/new-install.rst | 20 ++++++++++++++------ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/docs/new-install.md b/docs/new-install.md index 072dfada97..1adc1c6d9a 100644 --- a/docs/new-install.md +++ b/docs/new-install.md @@ -250,9 +250,17 @@ Create a Python Virtual Environment `python3 -m venv ~/virtual-env-directory` Activate your Python Virtual Environment `source ~/virtual-env-directory/bin/activate` -From the root of the Augur Directory, type `make install` - -You will be prompted to provide your GitHub username and password, your GitLab username and password, and the postgresql database where you want to have the Augur Schema built. You will also be prompted to provide a directory where repositories will be clone into. +From the root of the Augur Directory, type `make install`. You will be prompted to provide: + +- "User" is the PSQL database user, which is `augur` if you followed instructions exactly +- "Password" is the above user's password +- "Host" is the domain used with nginx, e.g. 
`ai.chaoss.io` +- "Port" is 5432 unless you reconfigured something +- "Database" is the name of the Augur database, which is `augur` if you followed instructions exactly +- The GitHub token created earlier +- Then the username associated with it +- Then the same for GitLab +- and finally a directory to clone repositories to ## Post Installation of Augur diff --git a/docs/new-install.rst b/docs/new-install.rst index 639cf08c8f..6b5a0ca9c8 100644 --- a/docs/new-install.rst +++ b/docs/new-install.rst @@ -317,12 +317,20 @@ Create a Python Virtual Environment Activate your Python Virtual Environment ``source ~/virtual-env-directory/bin/activate`` -From the root of the Augur Directory, type ``make install`` - -You will be prompted to provide your GitHub username and password, your -GitLab username and password, and the postgresql database where you want -to have the Augur Schema built. You will also be prompted to provide a -directory where repositories will be clone into. +From the root of the Augur Directory, type ``make install``. You will be +prompted to provide: + +- "User" is the PSQL database user, which is ``augur`` if you followed + instructions exactly +- "Password" is the above user's password +- "Host" is the domain used with nginx, e.g. ``ai.chaoss.io`` +- "Port" is 5432 unless you reconfigured something +- "Database" is the name of the Augur database, which is ``augur`` if you + followed instructions exactly +- The GitHub token created earlier +- Then the username associated with it +- Then the same for GitLab +- and finally a directory to clone repositories to Post Installation of Augur --------------------------