From c010c2c722eeea18289adde9fd7531eddef5de55 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 1 Apr 2024 18:24:58 -0500 Subject: [PATCH 001/122] Remove facade dependency on GithubTaskSession Signed-off-by: Andrew Brain --- augur/tasks/git/util/facade_worker/facade_worker/config.py | 4 ++-- augur/tasks/util/collection_util.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index 19539d79de..b8ac66101a 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -34,7 +34,7 @@ from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected -from augur.tasks.github.util.github_task_session import * +from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig from logging import Logger @@ -77,7 +77,7 @@ def get_database_args_from_env(): #print(credentials) return credentials -class FacadeSession(GithubTaskSession): +class FacadeSession(DatabaseSession): """ORM session used in facade tasks. This class adds the various attributes needed for legacy facade as well as a modified version of the legacy FacadeConfig class. diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 3561b19b40..b4ff09ecb9 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -655,7 +655,7 @@ def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook=" valid_repos = session.execute_sql(repo_query).fetchall() valid_repo_git_list = [repo[1] for repo in valid_repos] - session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") + session.logger.info(f"valid {hook} repo git list: {tuple(valid_repo_git_list)}") #start repos for new primary collection hook #collection_size = start_block_of_repos( From 003ed255840c34e828291aa7146532812699e77a Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 1 Apr 2024 18:30:21 -0500 Subject: [PATCH 002/122] remove facade session dependence on AugurConfig Signed-off-by: Andrew Brain --- augur/tasks/git/util/facade_worker/facade_worker/config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index b8ac66101a..c9a1ee021d 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -35,7 +35,7 @@ from psycopg2.errors import DeadlockDetected from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_section from logging import Logger logger = logging.getLogger(__name__) @@ -104,12 +104,10 @@ def __init__(self,logger: Logger): from augur.application.db import get_engine engine = get_engine() - #self.cfg = FacadeConfig(logger) self.repos_processed = 0 super().__init__(logger=logger, engine=engine) - # Figure out what we need to do - worker_options = AugurConfig(logger, self).get_section("Facade") + worker_options = get_section("Facade") self.limited_run = worker_options["limited_run"] self.delete_marked_repos = worker_options["delete_marked_repos"] From cc423538981d74455cd5782aa177cc6ab1412f96 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 1 Apr 2024 18:42:27 -0500 Subject: [PATCH 
003/122] Define database lib methods and use in commit analysis Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 22 ++++++++++++++++++- .../facade_worker/analyzecommit.py | 6 +++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index c1da707dbf..5c277c086b 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -2,7 +2,7 @@ import logging from typing import List, Any, Optional from augur.application.db.models import Config -from augur.application.db import get_session +from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query logger = logging.getLogger("db_lib") @@ -95,3 +95,23 @@ def get_value(section_name: str, setting_name: str) -> Optional[Any]: setting_dict = convert_type_of_value(setting_dict, logger) return setting_dict["value"] + + +def execute_sql(sql_text): + + engine = get_engine() + + with engine.begin() as connection: + + return_data = connection.execute(sql_text) + + return return_data + +def fetchall_data_from_sql_text(sql_text): + + engine = get_engine() + + with engine.begin() as connection: + + result = connection.execute(sql_text) + return [dict(row) for row in result.mappings()] diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index a0ca29701a..56073c9836 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -29,6 +29,8 @@ import os import sqlalchemy as s +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text + def analyze_commit(session, repo_id, repo_loc, commit): # This function analyzes a given commit, counting the additions, removals, and @@ -84,7 +86,7 @@ def discover_alias(email): WHERE alias_email=:alias_email AND cntrb_active = 1""").bindparams(alias_email=email) - canonical = session.fetchall_data_from_sql_text(fetch_canonical)#list(cursor_people_local) + canonical = fetchall_data_from_sql_text(fetch_canonical)#list(cursor_people_local) if canonical: for email in canonical: @@ -173,7 +175,7 @@ def generate_commit_record(repos_id,commit,filename, #cursor_local.execute(store_working_commit, (repo_id,commit)) #db_local.commit() - session.execute_sql(store_working_commit) + execute_sql(store_working_commit) #session.log_activity('Debug',f"Stored working commit and analyzing : {commit}") From a070e71fd4c8754948ee2bbccf216e5de2f47fb9 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 1 Apr 2024 18:50:05 -0500 Subject: [PATCH 004/122] Update trim commits to use db lib Signed-off-by: Andrew Brain --- augur/tasks/git/facade_tasks.py | 5 +++-- .../git/util/facade_worker/facade_worker/utilitymethods.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index b96c596bc7..d50127d80e 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -4,6 +4,7 @@ from celery import group, chain import sqlalchemy as s +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set @@ -85,7 +86,7 @@ def 
update_analysis_log(repos_id,status): VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) try: - session.execute_sql(log_message) + execute_sql(log_message) except: pass @@ -98,7 +99,7 @@ def update_analysis_log(repos_id,status): """).bindparams(repo_id=repo_id) try: - working_commits = session.fetchall_data_from_sql_text(get_status) + working_commits = fetchall_data_from_sql_text(get_status) except: working_commits = [] diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 848cb38917..acfa7e2e0e 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -33,6 +33,7 @@ from augur.application.db.models import * from .config import FacadeSession as FacadeSession from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from augur.application.db.lib import execute_sql #from augur.tasks.git.util.facade_worker.facade def update_repo_log(session, repos_id,status): @@ -61,14 +62,14 @@ def trim_commits(session, repo_id,commits): AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - session.execute_sql(remove_commit) + execute_sql(remove_commit) # Remove the working commit. remove_commit = s.sql.text("""DELETE FROM working_commits WHERE repos_id = :repo_id AND working_commit IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - session.execute_sql(remove_commit) + execute_sql(remove_commit) for commit in commits: session.log_activity('Debug',f"Trimmed commit: {commit}") From 32af57e1b870a3d5a1475f3d1da4bf757a805256 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 1 Apr 2024 19:04:52 -0500 Subject: [PATCH 005/122] Reduce dependency on session logger Signed-off-by: Andrew Brain --- .../git/dependency_libyear_tasks/core.py | 16 +++++----- .../git/dependency_libyear_tasks/tasks.py | 2 +- augur/tasks/git/dependency_tasks/core.py | 30 +++++++++---------- augur/tasks/git/dependency_tasks/tasks.py | 6 ++-- augur/tasks/git/facade_tasks.py | 18 +++++------ augur/tasks/git/scc_value_tasks/core.py | 16 +++++----- augur/tasks/git/scc_value_tasks/tasks.py | 2 +- .../facade_worker/analyzecommit.py | 8 ++--- .../facade_worker/postanalysiscleanup.py | 6 ++-- .../facade_worker/utilitymethods.py | 16 +++++----- 10 files changed, 60 insertions(+), 60 deletions(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index 9e48757d61..21e47409d6 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -5,10 +5,10 @@ from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def deps_libyear_model( session, repo_id,repo_git,repo_group_id): +def deps_libyear_model(logger, session, repo_id,repo_git,repo_group_id): """ Data collection and storage method """ - session.logger.info(f"This is the libyear deps model repo: {repo_git}") + logger.info(f"This is the libyear deps model repo: {repo_git}") #result = re.search(r"https:\/\/(github\.com\/[A-Za-z0-9 \- _]+\/)([A-Za-z0-9 \- _ .]+)$", repo_git).groups() @@ -21,23 +21,23 @@ def deps_libyear_model( session, repo_id,repo_git,repo_group_id): absolute_repo_path = get_absolute_repo_path(get_value("Facade", 
"repo_directory"),repo_id,result.repo_path,result.repo_name) #config.get_section("Facade")['repo_directory'] + relative_repo_path#self.config['repo_directory'] + relative_repo_path - generate_deps_libyear_data(session,repo_id, absolute_repo_path) + generate_deps_libyear_data(logger, session,repo_id, absolute_repo_path) -def generate_deps_libyear_data(session, repo_id, path): +def generate_deps_libyear_data(logger, session, repo_id, path): """Scans for package files and calculates libyear :param session: Task manifest and database session. :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ date_scanned = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - session.logger.info('Searching for deps in repo') - session.logger.info(f'Repo ID: {repo_id}, Path: {path}') + logger.info('Searching for deps in repo') + logger.info(f'Repo ID: {repo_id}, Path: {path}') - deps = get_deps_libyear_data(path,session.logger) + deps = get_deps_libyear_data(path,logger) if not deps: - session.logger.info(f"No deps found for repo {repo_id} on path {path}") + logger.info(f"No deps found for repo {repo_id} on path {path}") return to_insert = [] diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py index ff15c61d91..ec062e4853 100644 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ b/augur/tasks/git/dependency_libyear_tasks/tasks.py @@ -19,4 +19,4 @@ def process_libyear_dependency_metrics(self, repo_git): repo = execute_session_query(query,'one') - deps_libyear_model(session, repo.repo_id,repo_git,repo.repo_group_id) \ No newline at end of file + deps_libyear_model(logger, session, repo.repo_id,repo_git,repo.repo_group_id) \ No newline at end of file diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 296e69075e..19a5b84a4d 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -5,19 +5,19 @@ from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call -def generate_deps_data(session, repo_id, path): +def generate_deps_data(logger, session, repo_id, path): """Run dependency logic on repo and stores data in database :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - session.logger.info('Searching for deps in repo') - session.logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') + logger.info('Searching for deps in repo') + logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') - deps = dep_calc.get_deps(path,session.logger) + deps = dep_calc.get_deps(path,logger) to_insert = [] for dep in deps: @@ -36,26 +36,26 @@ def generate_deps_data(session, repo_id, path): session.insert_data(to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) - session.logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") + logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") """ def deps_model(session, repo_id,repo_git,repo_path,repo_name): # Data collection and storage method - session.logger.info(f"This is the deps model repo: {repo_git}.") + logger.info(f"This is the deps model repo: {repo_git}.") generate_deps_data(session,repo_id, absolute_repo_path) """ -def generate_scorecard(session,repo_id,path): +def generate_scorecard(logger, session,repo_id,path): """Runs 
scorecard on repo and stores data in database :param repo_id: Repository ID :param path: URL path of the Repostiory """ - session.logger.info('Generating scorecard data for repo') - session.logger.info(f"Repo ID: {repo_id}, Path: {path}") + logger.info('Generating scorecard data for repo') + logger.info(f"Repo ID: {repo_id}, Path: {path}") # we convert relative path in the format required by scorecard like github.com/chaoss/augur # raw_path,_ = path.split('-') @@ -69,16 +69,16 @@ def generate_scorecard(session,repo_id,path): path_to_scorecard = os.environ['HOME'] + '/scorecard' #setting the environmental variable which is required by scorecard - key_handler = GithubApiKeyHandler(session, session.logger) + key_handler = GithubApiKeyHandler(session, logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) - session.logger.info('adding to database...') - session.logger.debug(f"output: {required_output}") + logger.info('adding to database...') + logger.debug(f"output: {required_output}") if not required_output['checks']: - session.logger.info('No scorecard checks found!') + logger.info('No scorecard checks found!') return #Store the overall score first @@ -112,6 +112,6 @@ def generate_scorecard(session,repo_id,path): session.insert_data(to_insert, RepoDepsScorecard, ["repo_id","name"]) - session.logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") + logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 152c053080..455e9b1faf 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -26,9 +26,9 @@ def process_dependency_metrics(self, repo_git): absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - session.logger.debug(f"This is the deps model repo: {repo_git}.") + logger.debug(f"This is the deps model repo: {repo_git}.") - generate_deps_data(session,repo.repo_id,absolute_repo_path) + generate_deps_data(logger, session,repo.repo_id,absolute_repo_path) @celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) @@ -44,4 +44,4 @@ def process_ossf_dependency_metrics(self, repo_git): query = session.query(Repo).filter(Repo.repo_git == repo_git) repo = execute_session_query(query,'one') - generate_scorecard(session, repo.repo_id, repo_git) \ No newline at end of file + generate_scorecard(logger, session, repo.repo_id, repo_git) \ No newline at end of file diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index d50127d80e..aa36e9576e 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -134,7 +134,7 @@ def update_analysis_log(repos_id,status): session.execute_sql(log_message) - session.logger.info(f"Generating sequence for repo {repo_id}") + logger.info(f"Generating sequence for repo {repo_id}") query = session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') @@ -210,7 +210,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: start_date = session.get_setting('start_date') - session.logger.info(f"Generating sequence for repo {repo_id}") + 
logger.info(f"Generating sequence for repo {repo_id}") query = session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') @@ -255,16 +255,16 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: #logger.info(f"Got to analysis!") - commitRecords = analyze_commit(session, repo_id, repo_loc, commitTuple) + commitRecords = analyze_commit(logger, repo_id, repo_loc, commitTuple) #logger.debug(commitRecord) if len(commitRecords): pendingCommitRecordsToInsert.extend(commitRecords) if len(pendingCommitRecordsToInsert) >= 1000: - facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) + facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) pendingCommitRecordsToInsert = [] - facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) + facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) @@ -343,7 +343,7 @@ def clone_repos(): session.commit() # get the commit count - commit_count = get_repo_commit_count(session, repo_git) + commit_count = get_repo_commit_count(logger, session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) @@ -385,7 +385,7 @@ def git_update_commit_count_weight(self, repo_git): # Change facade session to take in engine with FacadeSession(logger) as session: - commit_count = get_repo_commit_count(session, repo_git) + commit_count = get_repo_commit_count(logger, session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) @@ -452,7 +452,7 @@ def generate_contributor_sequence(logger,repo_git, session): WHERE repo_git=:value""").bindparams(value=repo_git) repo = session.execute_sql(query).fetchone() - session.logger.info(f"repo: {repo}") + logger.info(f"repo: {repo}") repo_id = repo[0] #pdb.set_trace() #breakpoint() @@ -557,7 +557,7 @@ def generate_non_repo_domain_facade_tasks(logger): # from queries and materialized views in the current version of Augur. # This method is also a major performance bottleneck with little value. 
- #session.logger.info(session.cfg) + #logger.info(session.cfg) if not limited_run or (limited_run and fix_affiliations): #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) logger.info("Fill empty affiliations is deprecated.") diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index 71993ebcd1..8e5854136f 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -3,22 +3,22 @@ from augur.application.db.models import * from augur.tasks.util.worker_util import parse_json_from_subprocess_call -def value_model(session,repo_git,repo_id, path): +def value_model(logger, session,repo_git,repo_id, path): """Runs scc on repo and stores data in database :param repo_id: Repository ID :param path: absolute file path of the Repostiory """ - session.logger.info('Generating value data for repo') - session.logger.info(f"Repo ID: {repo_id}, Path: {path}") - session.logger.info('Running scc...') + logger.info('Generating value data for repo') + logger.info(f"Repo ID: {repo_id}, Path: {path}") + logger.info('Running scc...') path_to_scc = os.environ['HOME'] + '/scc' - required_output = parse_json_from_subprocess_call(session.logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) + required_output = parse_json_from_subprocess_call(logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) - session.logger.info('adding scc data to database... ') - session.logger.debug(f"output: {required_output}") + logger.info('adding scc data to database... ') + logger.debug(f"output: {required_output}") to_insert = [] for record in required_output: @@ -44,4 +44,4 @@ def value_model(session,repo_git,repo_id, path): session.insert_data(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) - session.logger.info(f"Done generating scc data for repo {repo_id} from path {path}") + logger.info(f"Done generating scc data for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py index 37ff4ac4b1..22e049fdc9 100644 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -23,4 +23,4 @@ def process_scc_value_metrics(self, repo_git): absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - value_model(session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file + value_model(logger, session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index 56073c9836..18a436abb4 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -31,7 +31,7 @@ from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text -def analyze_commit(session, repo_id, repo_loc, commit): +def analyze_commit(logger, repo_id, repo_loc, commit): # This function analyzes a given commit, counting the additions, removals, and # whitespace changes. 
It collects all of the metadata about the commit, and @@ -62,7 +62,7 @@ def check_swapped_emails(name,email): # Sometimes people mix up their name and email in their git settings if name.find('@') >= 0 and email.find('@') == -1: - session.logger.debug(f"Found swapped email/name: {email}/{name}") + logger.debug(f"Found swapped email/name: {email}/{name}") return email,name else: return name,email @@ -73,7 +73,7 @@ def strip_extra_amp(email): # matching. This extra info is not used, so we discard it. if email.count('@') > 1: - session.logger.debug(f"Found extra @: {email}") + logger.debug(f"Found extra @: {email}") return email[:email.find('@',email.find('@')+1)] else: return email @@ -113,7 +113,7 @@ def generate_commit_record(repos_id,commit,filename, #2021-10-11 11:57:46 -0500 placeholder_date = "1970-01-01 00:00:15 -0500" - #session.logger.info(f"Timestamp: {author_timestamp}") + #logger.info(f"Timestamp: {author_timestamp}") commit_record = { 'repo_id' : repos_id, 'cmt_commit_hash' : str(commit), diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py index 3ec2013274..6d41eb7c3f 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py @@ -37,7 +37,7 @@ def git_repo_cleanup(session,repo_git): # Clean up any git repos that are pending deletion session.update_status('Purging deleted repos') - #session.logger.info("Processing deletions") + #logger.info("Processing deletions") session.log_activity('Info','Processing deletions') @@ -101,7 +101,7 @@ def git_repo_cleanup(session,repo_git): session.execute_sql(query) #log_activity('Verbose','Deleted repo %s' % row[0]) - #session.logger.debug(f"Deleted repo {row.repo_id}") + #logger.debug(f"Deleted repo {row.repo_id}") session.log_activity('Verbose',f"Deleted repo {row.repo_id}") cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) @@ -129,7 +129,7 @@ def git_repo_cleanup(session,repo_git): cmd = "rmdir %s%s" % (session.repo_base_directory,cleanup) subprocess.Popen([cmd],shell=True).wait() #log_activity('Verbose','Attempted %s' % cmd) - #session.logger.debug(f"Attempted {cmd}") + #logger.debug(f"Attempted {cmd}") session.log_activity('Verbose',f"Attempted {cmd}") #update_repo_log(row[0],'Deleted') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index acfa7e2e0e..34ed93a6f7 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -149,15 +149,15 @@ def count_branches(git_dir): branches_dir = os.path.join(git_dir, 'refs', 'heads') return sum(1 for _ in os.scandir(branches_dir)) -def get_repo_commit_count(session, repo_git): +def get_repo_commit_count(logger, session, repo_git): repo = Repo.get_by_repo_git(session, repo_git) absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absolute_path}/.git") - session.logger.debug(f"loc: {repo_loc}") - session.logger.debug(f"path: {repo.repo_path}") + logger.debug(f"loc: {repo_loc}") + logger.debug(f"path: {repo.repo_path}") # Check if the .git directory exists if not os.path.exists(repo_loc): @@ -192,7 +192,7 @@ def get_facade_weight_with_commit_count(session, repo_git, commit_count): def 
get_repo_weight_by_commit(logger,repo_git): with FacadeSession(logger) as session: - return get_repo_commit_count(session, repo_git) - get_facade_weight_time_factor(session, repo_git) + return get_repo_commit_count(logger, session, repo_git) - get_facade_weight_time_factor(session, repo_git) def update_facade_scheduling_fields(session, repo_git, weight, commit_count): @@ -207,7 +207,7 @@ def update_facade_scheduling_fields(session, repo_git, weight, commit_count): session.execute(update_query) session.commit() -def facade_bulk_insert_commits(session,records): +def facade_bulk_insert_commits(logger, session,records): try: session.execute( @@ -218,14 +218,14 @@ def facade_bulk_insert_commits(session,records): except Exception as e: if len(records) > 1: - session.logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") + logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") #split list into halves and retry insert until we isolate offending record firsthalfRecords = records[:len(records)//2] secondhalfRecords = records[len(records)//2:] - facade_bulk_insert_commits(session,firsthalfRecords) - facade_bulk_insert_commits(session,secondhalfRecords) + facade_bulk_insert_commits(logger, session,firsthalfRecords) + facade_bulk_insert_commits(logger, session,secondhalfRecords) elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": commit_record = records[0] #replace incomprehensible dates with epoch. From 812e63588bdf73f7ccae9ce4e7946d88c2466885 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 1 Apr 2024 21:17:52 -0500 Subject: [PATCH 006/122] Update facade session to not depend on database session Signed-off-by: Andrew Brain --- augur/tasks/git/facade_tasks.py | 378 +++++++++--------- .../facade_worker/facade_worker/config.py | 19 +- .../facade_worker/postanalysiscleanup.py | 67 ++-- .../facade_worker/rebuildcache.py | 175 ++++---- .../facade_worker/facade_worker/repofetch.py | 97 ++--- .../facade_worker/utilitymethods.py | 46 +-- augur/tasks/github/facade_github/tasks.py | 86 ++-- 7 files changed, 440 insertions(+), 428 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index aa36e9576e..a6fa0f0d93 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -4,7 +4,7 @@ from celery import group, chain import sqlalchemy as s -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set @@ -59,13 +59,10 @@ def facade_error_handler(request,exc,traceback): def facade_analysis_init_facade_task(repo_git): logger = logging.getLogger(facade_analysis_init_facade_task.__name__) - with FacadeSession(logger) as session: + session = FacadeSession(logger) - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id - - session.update_status('Running analysis') - session.log_activity('Info',f"Beginning analysis.") + session.update_status('Running analysis') + session.log_activity('Info',f"Beginning analysis.") @celery.task(base=AugurFacadeRepoCollectionTask) @@ -73,25 +70,27 @@ def trim_commits_facade_task(repo_git): logger = logging.getLogger(trim_commits_facade_task.__name__) - with 
FacadeSession(logger) as session: + facade_session = FacadeSession(logger) + + with get_session() as session: repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + repo_id = repo.repo_id - def update_analysis_log(repos_id,status): + def update_analysis_log(repos_id,status): - # Log a repo's analysis status + # Log a repo's analysis status - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) + log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) + VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - try: - execute_sql(log_message) - except: - pass + try: + execute_sql(log_message) + except: + pass - session.inc_repos_processed() + facade_session.inc_repos_processed() update_analysis_log(repo_id,"Beginning analysis.") # First we check to see if the previous analysis didn't complete @@ -107,7 +106,7 @@ def update_analysis_log(repos_id,status): # the commit data may be incomplete. It should be trimmed, just in case. commits_to_trim = [commit['working_commit'] for commit in working_commits] - trim_commits(session,repo_id,commits_to_trim) + trim_commits(facade_session,repo_id,commits_to_trim) # Start the main analysis update_analysis_log(repo_id,'Collecting data') @@ -118,62 +117,63 @@ def trim_commits_post_analysis_facade_task(repo_git): logger = logging.getLogger(trim_commits_post_analysis_facade_task.__name__) + facade_session = FacadeSession(logger) - with FacadeSession(logger) as session: + with get_session() as session: repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id - start_date = session.get_setting('start_date') - def update_analysis_log(repos_id,status): + repo_id = repo.repo_id - # Log a repo's analysis status + start_date = facade_session.get_setting('start_date') + def update_analysis_log(repos_id,status): - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) + # Log a repo's analysis status + + log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) + VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - session.execute_sql(log_message) - logger.info(f"Generating sequence for repo {repo_id}") + execute_sql(log_message) + + logger.info(f"Generating sequence for repo {repo_id}") + with get_session() as session: query = session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') - #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") - # Grab the parents of HEAD + #Get the huge list of commits to process. 
+ absoulte_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absoulte_path}/.git") + # Grab the parents of HEAD - parent_commits = get_parent_commits_set(repo_loc, start_date) + parent_commits = get_parent_commits_set(repo_loc, start_date) - # Grab the existing commits from the database - existing_commits = get_existing_commits_set(session, repo_id) + # Grab the existing commits from the database + existing_commits = get_existing_commits_set(repo_id) - # Find missing commits and add them + # Find missing commits and add them - missing_commits = parent_commits - existing_commits - - session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - - # Find commits which are out of the analysis range + missing_commits = parent_commits - existing_commits - trimmed_commits = existing_commits - parent_commits - - update_analysis_log(repo_id,'Data collection complete') + facade_session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + + # Find commits which are out of the analysis range - update_analysis_log(repo_id,'Beginning to trim commits') + trimmed_commits = existing_commits - parent_commits - session.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") + update_analysis_log(repo_id,'Data collection complete') + update_analysis_log(repo_id,'Beginning to trim commits') + session.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") - #for commit in trimmed_commits: - trim_commits(session,repo_id,trimmed_commits) - + #for commit in trimmed_commits: + trim_commits(facade_session,repo_id,trimmed_commits) + - update_analysis_log(repo_id,'Commit trimming complete') + update_analysis_log(repo_id,'Commit trimming complete') - update_analysis_log(repo_id,'Complete') + update_analysis_log(repo_id,'Complete') @@ -181,8 +181,8 @@ def update_analysis_log(repos_id,status): def facade_analysis_end_facade_task(): logger = logging.getLogger(facade_analysis_end_facade_task.__name__) - with FacadeSession(logger) as session: - session.log_activity('Info','Running analysis (complete)') + session = FacadeSession(logger) + session.log_activity('Info','Running analysis (complete)') @@ -190,9 +190,9 @@ def facade_analysis_end_facade_task(): def facade_start_contrib_analysis_task(): logger = logging.getLogger(facade_start_contrib_analysis_task.__name__) - with FacadeSession(logger) as session: - session.update_status('Updating Contributors') - session.log_activity('Info', 'Updating Contributors with commits') + session = FacadeSession(logger) + session.update_status('Updating Contributors') + session.log_activity('Info', 'Updating Contributors with commits') #enable celery multithreading @@ -203,12 +203,14 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: #create new session for celery thread. 
logger = logging.getLogger(analyze_commits_in_parallel.__name__) - with FacadeSession(logger) as session: + facade_session = FacadeSession(logger) + + with get_session() as session: repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() repo_id = repo.repo_id - start_date = session.get_setting('start_date') + start_date = facade_session.get_setting('start_date') logger.info(f"Generating sequence for repo {repo_id}") @@ -216,19 +218,19 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: repo = execute_session_query(query, 'one') #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) + absoulte_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) repo_loc = (f"{absoulte_path}/.git") # Grab the parents of HEAD parent_commits = get_parent_commits_set(repo_loc, start_date) # Grab the existing commits from the database - existing_commits = get_existing_commits_set(session, repo_id) + existing_commits = get_existing_commits_set(repo_id) # Find missing commits and add them missing_commits = parent_commits - existing_commits - session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + facade_session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") if not len(missing_commits) or repo_id is None: @@ -238,7 +240,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: queue = list(missing_commits) logger.info(f"Got to analysis!") - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + absoulte_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absoulte_path}/.git") pendingCommitRecordsToInsert = [] @@ -266,48 +268,45 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) - - - # Remove the working commit. 
remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND working_commit IN :hashes - """).bindparams(repo_id=repo_id,hashes=tuple(queue)) - session.execute_sql(remove_commit) - - logger.info("Analysis complete") - return + WHERE repos_id = :repo_id AND working_commit IN :hashes + """).bindparams(repo_id=repo_id,hashes=tuple(queue)) + execute_sql(remove_commit) + + logger.info("Analysis complete") + return @celery.task def nuke_affiliations_facade_task(): logger = logging.getLogger(nuke_affiliations_facade_task.__name__) - with FacadeSession(logger) as session: - nuke_affiliations(session) + session = FacadeSession(logger) + nuke_affiliations(session) @celery.task def fill_empty_affiliations_facade_task(): logger = logging.getLogger(fill_empty_affiliations_facade_task.__name__) - with FacadeSession(logger) as session: - fill_empty_affiliations(session) + facade_session = FacadeSession(logger) + fill_empty_affiliations(facade_session) @celery.task def invalidate_caches_facade_task(): logger = logging.getLogger(invalidate_caches_facade_task.__name__) - with FacadeSession(logger) as session: - invalidate_caches(session) + session = FacadeSession(logger) + invalidate_caches(session) @celery.task def rebuild_unknown_affiliation_and_web_caches_facade_task(): logger = logging.getLogger(rebuild_unknown_affiliation_and_web_caches_facade_task.__name__) - with FacadeSession(logger) as session: - rebuild_unknown_affiliation_and_web_caches(session) + session = FacadeSession(logger) + rebuild_unknown_affiliation_and_web_caches(session) @celery.task @@ -315,8 +314,9 @@ def git_repo_cleanup_facade_task(repo_git): logger = logging.getLogger(git_repo_cleanup_facade_task.__name__) - with FacadeSession(logger) as session: - git_repo_cleanup(session, repo_git) + facade_session = FacadeSession(logger) + with get_session() as session: + git_repo_cleanup(facade_session, session, repo_git) # retry this task indefinitely every 5 minutes if it errors. 
Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) @@ -326,7 +326,9 @@ def clone_repos(): is_pending = CollectionStatus.facade_status == CollectionState.PENDING.value - with FacadeSession(logger) as session: + facade_session = FacadeSession(logger) + + with get_session() as session: # process up to 1000 repos at a time repo_git_identifiers = get_collection_status_repo_git_from_filter(session, is_pending, 999999) @@ -339,11 +341,11 @@ def clone_repos(): # clone repo try: - git_repo_initialize(session, repo_git) + git_repo_initialize(facade_session, session, repo_git) session.commit() # get the commit count - commit_count = get_repo_commit_count(logger, session, repo_git) + commit_count = get_repo_commit_count(logger, facade_session, session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) @@ -362,9 +364,7 @@ def clone_repos(): setattr(repoStatus,"facade_status", CollectionState.ERROR.value) session.commit() - clone_repos.si().apply_async(countdown=60*5) - - + clone_repos.si().apply_async(countdown=60*5) #@celery.task(bind=True) @@ -374,7 +374,7 @@ def clone_repos(): # # logger = logging.getLogger(check_for_repo_updates_facade_task.__name__) # -# with FacadeSession(logger) as session: +# session = FacadeSession(logger) # check_for_repo_updates(session, repo_git) @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) @@ -384,8 +384,11 @@ def git_update_commit_count_weight(self, repo_git): logger = logging.getLogger(git_update_commit_count_weight.__name__) # Change facade session to take in engine - with FacadeSession(logger) as session: - commit_count = get_repo_commit_count(logger, session, repo_git) + facade_session = FacadeSession(logger) + + with get_session() as session: + + commit_count = get_repo_commit_count(logger, facade_session, session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) @@ -396,11 +399,14 @@ def git_repo_updates_facade_task(repo_git): logger = logging.getLogger(git_repo_updates_facade_task.__name__) - with FacadeSession(logger) as session: - git_repo_updates(session, repo_git) + facade_session = FacadeSession(logger) + + with get_session() as session: + git_repo_updates(facade_session, session, repo_git) -def generate_analysis_sequence(logger,repo_git, session): + +def generate_analysis_sequence(logger,repo_git, facade_session): """Run the analysis by looping over all active repos. For each repo, we retrieve the list of commits which lead to HEAD. If any are missing from the database, they are filled in. 
Then we check to see if any commits in the database are @@ -417,9 +423,9 @@ def generate_analysis_sequence(logger,repo_git, session): repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) - repos = session.fetchall_data_from_sql_text(repo_list) + repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = facade_session.get_setting('start_date') repo_ids = [repo['repo_id'] for repo in repos] @@ -451,7 +457,7 @@ def generate_contributor_sequence(logger,repo_git, session): query = s.sql.text("""SELECT repo_id FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) - repo = session.execute_sql(query).fetchone() + repo = execute_sql(query).fetchone() logger.info(f"repo: {repo}") repo_id = repo[0] #pdb.set_trace() @@ -469,107 +475,107 @@ def generate_contributor_sequence(logger,repo_git, session): def facade_phase(repo_git): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") - with FacadeSession(logger) as session: - #Get the repo_id - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = session.fetchall_data_from_sql_text(repo_list) + session = FacadeSession(logger) + #Get the repo_id + repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo + WHERE repo_git=:value""").bindparams(value=repo_git) + repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = session.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + repo_id = repo_ids.pop(0) - #Get the collectionStatus - query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) + #Get the collectionStatus + #query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) - status = execute_session_query(query,'one') - - # Figure out what we need to do - limited_run = session.limited_run - run_analysis = session.run_analysis - pull_repos = session.pull_repos - #force_analysis = session.force_analysis - run_facade_contributors = session.run_facade_contributors - - facade_sequence = [] - facade_core_collection = [] - - if not limited_run or (limited_run and pull_repos): - facade_core_collection.append(git_repo_updates_facade_task.si(repo_git)) - - facade_core_collection.append(git_update_commit_count_weight.si(repo_git)) + #status = execute_session_query(query,'one') + + # Figure out what we need to do + limited_run = session.limited_run + run_analysis = session.run_analysis + pull_repos = session.pull_repos + #force_analysis = session.force_analysis + run_facade_contributors = session.run_facade_contributors + + facade_sequence = [] + facade_core_collection = [] + + if not limited_run or (limited_run and pull_repos): + facade_core_collection.append(git_repo_updates_facade_task.si(repo_git)) + + facade_core_collection.append(git_update_commit_count_weight.si(repo_git)) - #Generate commit analysis task order. - if not limited_run or (limited_run and run_analysis): - facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,session)) + #Generate commit analysis task order. 
+ if not limited_run or (limited_run and run_analysis): + facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,session)) - #Generate contributor analysis task group. - if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(generate_contributor_sequence(logger,repo_git,session)) + #Generate contributor analysis task group. + if not limited_run or (limited_run and run_facade_contributors): + facade_core_collection.append(generate_contributor_sequence(logger,repo_git,session)) - #These tasks need repos to be cloned by facade before they can work. - facade_sequence.append( - group( - chain(*facade_core_collection), - process_dependency_metrics.si(repo_git), - process_libyear_dependency_metrics.si(repo_git), - process_scc_value_metrics.si(repo_git) - ) + #These tasks need repos to be cloned by facade before they can work. + facade_sequence.append( + group( + chain(*facade_core_collection), + process_dependency_metrics.si(repo_git), + process_libyear_dependency_metrics.si(repo_git), + process_scc_value_metrics.si(repo_git) ) + ) - logger.info(f"Facade sequence: {facade_sequence}") - return chain(*facade_sequence) + logger.info(f"Facade sequence: {facade_sequence}") + return chain(*facade_sequence) def generate_non_repo_domain_facade_tasks(logger): logger.info("Generating facade sequence") - with FacadeSession(logger) as session: - - # Figure out what we need to do - limited_run = session.limited_run - delete_marked_repos = session.delete_marked_repos - pull_repos = session.pull_repos - # clone_repos = session.clone_repos - check_updates = session.check_updates - # force_updates = session.force_updates - run_analysis = session.run_analysis - # force_analysis = session.force_analysis - nuke_stored_affiliations = session.nuke_stored_affiliations - fix_affiliations = session.fix_affiliations - force_invalidate_caches = session.force_invalidate_caches - rebuild_caches = session.rebuild_caches - #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], - # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( - # 'update_frequency')) else 0 - force_invalidate_caches = session.force_invalidate_caches - create_xlsx_summary_files = session.create_xlsx_summary_files - multithreaded = session.multithreaded - - facade_sequence = [] - - if nuke_stored_affiliations: - #facade_sequence.append(nuke_affiliations_facade_task.si().on_error(facade_error_handler.s()))#nuke_affiliations(session.cfg) - logger.info("Nuke stored affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would be - # nuked upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. - - #logger.info(session.cfg) - if not limited_run or (limited_run and fix_affiliations): - #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) - logger.info("Fill empty affiliations is deprecated.") - # deprecated because the UI component of facade where affiliations would need - # to be fixed upon change no longer exists, and this information can easily be derived - # from queries and materialized views in the current version of Augur. - # This method is also a major performance bottleneck with little value. 
- - if force_invalidate_caches: - facade_sequence.append(invalidate_caches_facade_task.si().on_error(facade_error_handler.s()))#invalidate_caches(session.cfg) - - if not limited_run or (limited_run and rebuild_caches): - facade_sequence.append(rebuild_unknown_affiliation_and_web_caches_facade_task.si().on_error(facade_error_handler.s()))#rebuild_unknown_affiliation_and_web_caches(session.cfg) + session = FacadeSession(logger) - return facade_sequence + # Figure out what we need to do + limited_run = session.limited_run + delete_marked_repos = session.delete_marked_repos + pull_repos = session.pull_repos + # clone_repos = session.clone_repos + check_updates = session.check_updates + # force_updates = session.force_updates + run_analysis = session.run_analysis + # force_analysis = session.force_analysis + nuke_stored_affiliations = session.nuke_stored_affiliations + fix_affiliations = session.fix_affiliations + force_invalidate_caches = session.force_invalidate_caches + rebuild_caches = session.rebuild_caches + #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], + # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( + # 'update_frequency')) else 0 + force_invalidate_caches = session.force_invalidate_caches + create_xlsx_summary_files = session.create_xlsx_summary_files + multithreaded = session.multithreaded + + facade_sequence = [] + + if nuke_stored_affiliations: + #facade_sequence.append(nuke_affiliations_facade_task.si().on_error(facade_error_handler.s()))#nuke_affiliations(session.cfg) + logger.info("Nuke stored affiliations is deprecated.") + # deprecated because the UI component of facade where affiliations would be + # nuked upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. + + #logger.info(session.cfg) + if not limited_run or (limited_run and fix_affiliations): + #facade_sequence.append(fill_empty_affiliations_facade_task.si().on_error(facade_error_handler.s()))#fill_empty_affiliations(session) + logger.info("Fill empty affiliations is deprecated.") + # deprecated because the UI component of facade where affiliations would need + # to be fixed upon change no longer exists, and this information can easily be derived + # from queries and materialized views in the current version of Augur. + # This method is also a major performance bottleneck with little value. 
+ + if force_invalidate_caches: + facade_sequence.append(invalidate_caches_facade_task.si().on_error(facade_error_handler.s()))#invalidate_caches(session.cfg) + + if not limited_run or (limited_run and rebuild_caches): + facade_sequence.append(rebuild_unknown_affiliation_and_web_caches_facade_task.si().on_error(facade_error_handler.s()))#rebuild_unknown_affiliation_and_web_caches(session.cfg) + + return facade_sequence diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index c9a1ee021d..feb297fbf4 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -35,6 +35,7 @@ from psycopg2.errors import DeadlockDetected from augur.application.db.session import DatabaseSession +from augur.application.db.lib import execute_sql from augur.application.db.lib import get_section from logging import Logger @@ -77,7 +78,7 @@ def get_database_args_from_env(): #print(credentials) return credentials -class FacadeSession(DatabaseSession): +class FacadeSession(): """ORM session used in facade tasks. This class adds the various attributes needed for legacy facade as well as a modified version of the legacy FacadeConfig class. @@ -105,7 +106,9 @@ def __init__(self,logger: Logger): from augur.application.db import get_engine engine = get_engine() self.repos_processed = 0 - super().__init__(logger=logger, engine=engine) + # super().__init__(logger=logger, engine=engine) + + self.logger = logger worker_options = get_section("Facade") @@ -148,7 +151,7 @@ def get_setting(self,setting): query = s.sql.text("""SELECT value FROM settings WHERE setting=:settingParam ORDER BY last_modified DESC LIMIT 1""").bindparams(settingParam=setting) - result = self.execute_sql(query).fetchone() + result = execute_sql(query).fetchone() print(result) return result[0] @@ -157,7 +160,7 @@ def update_status(self, status): query = s.sql.text("""UPDATE settings SET value=:statusParam WHERE setting='utility_status' """).bindparams(statusParam=status) - self.execute_sql(query) + execute_sql(query) def log_activity(self, level, status): # Log an activity based upon urgency and user's preference. 
If the log level is @@ -174,7 +177,7 @@ def log_activity(self, level, status): """).bindparams(levelParam=level,statusParam=status) try: - self.execute_sql(query) + execute_sql(query) except Exception as e: self.logger.error(f"Error encountered: {e}") raise e @@ -185,7 +188,7 @@ def update_repo_log(self,repos_id,status): VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) try: - self.execute_sql(log_message) + execute_sql(log_message) except: pass def insert_or_update_data(self, query, **bind_args)-> None: @@ -204,9 +207,9 @@ def insert_or_update_data(self, query, **bind_args)-> None: try: if bind_args: #self.cfg.cursor.execute(query, params) - self.execute_sql(query.bindparams(**bind_args)) + execute_sql(query.bindparams(**bind_args)) else: - self.execute_sql(query) + execute_sql(query) break except OperationalError as e: # print(str(e).split("Process")[1].split(";")[0]) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py index 6d41eb7c3f..6f6c55ae49 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py @@ -28,29 +28,30 @@ import subprocess import sqlalchemy as s from augur.application.db.util import execute_session_query +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text from .utilitymethods import get_absolute_repo_path from augur.application.db.models import * #Will delete repos passed and cleanup associated commit data. -def git_repo_cleanup(session,repo_git): +def git_repo_cleanup(facade_session, session,repo_git): # Clean up any git repos that are pending deletion - session.update_status('Purging deleted repos') + facade_session.update_status('Purging deleted repos') #logger.info("Processing deletions") - session.log_activity('Info','Processing deletions') + facade_session.log_activity('Info','Processing deletions') query = session.query(Repo).filter( Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") - delete_repos = execute_session_query(query,'all')#session.fetchall_data_from_sql_text(query) + delete_repos = execute_session_query(query,'all')#fetchall_data_from_sql_text(query) for row in delete_repos: # Remove the files on disk - absolute_path = get_absolute_repo_path(session.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) + absolute_path = get_absolute_repo_path(facade_session.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) cmd = ("rm -rf %s" % (absolute_path)) @@ -61,85 +62,85 @@ def git_repo_cleanup(session,repo_git): remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_commits) + execute_sql(remove_commits) optimize_table = s.sql.text("""OPTIMIZE TABLE commits""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) # Remove cached repo data remove_dm_repo_weekly = s.sql.text("""DELETE FROM dm_repo_weekly WHERE repo_id=:repo_id """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_weekly) + execute_sql(remove_dm_repo_weekly) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_weekly""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) remove_dm_repo_monthly = s.sql.text("""DELETE FROM dm_repo_monthly WHERE repo_id=:repo_id 
""").bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_monthly) + execute_sql(remove_dm_repo_monthly) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_monthly""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) remove_dm_repo_annual = s.sql.text("""DELETE FROM dm_repo_annual WHERE repo_id=:repo_id """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_dm_repo_annual) + execute_sql(remove_dm_repo_annual) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_annual""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) # Set project to be recached if just removing a repo set_project_recache = s.sql.text("""UPDATE projects SET recache=TRUE WHERE id=:repo_group_id""").bindparams(repo_group_id=row.repo_group_id) - session.execute_sql(set_project_recache) + execute_sql(set_project_recache) # Remove the entry from the repos table query = s.sql.text("""DELETE FROM repo WHERE repo_id=:repo_id """).bindparams(repo_id=row.repo_id) - session.execute_sql(query) + execute_sql(query) #log_activity('Verbose','Deleted repo %s' % row[0]) #logger.debug(f"Deleted repo {row.repo_id}") - session.log_activity('Verbose',f"Deleted repo {row.repo_id}") + facade_session.log_activity('Verbose',f"Deleted repo {row.repo_id}") cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) # Remove any working commits remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_working_commits) + execute_sql(remove_working_commits) # Remove the repo from the logs remove_logs = s.sql.text("""DELETE FROM repos_fetch_log WHERE repos_id =:repo_id """).bindparams(repo_id=row.repo_id) - session.execute_sql(remove_logs) + execute_sql(remove_logs) optimize_table = s.sql.text("""OPTIMIZE TABLE repos_fetch_log""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) # Attempt to cleanup any empty parent directories while (cleanup.find('/',0) > 0): cleanup = cleanup[:cleanup.rfind('/',0)] - cmd = "rmdir %s%s" % (session.repo_base_directory,cleanup) + cmd = "rmdir %s%s" % (facade_session.repo_base_directory,cleanup) subprocess.Popen([cmd],shell=True).wait() #log_activity('Verbose','Attempted %s' % cmd) #logger.debug(f"Attempted {cmd}") - session.log_activity('Verbose',f"Attempted {cmd}") + facade_session.log_activity('Verbose',f"Attempted {cmd}") #update_repo_log(row[0],'Deleted') - session.update_repo_log(row.repo_id,'Deleted') + facade_session.update_repo_log(row.repo_id,'Deleted') # Clean up deleted projects get_deleted_projects = s.sql.text("""SELECT repo_group_id FROM repo_groups WHERE rg_name='(Queued for removal)'""") - deleted_projects = session.fetchall_data_from_sql_text(get_deleted_projects) + deleted_projects = fetchall_data_from_sql_text(get_deleted_projects) for project in deleted_projects: @@ -147,37 +148,37 @@ def git_repo_cleanup(session,repo_git): clear_annual_cache = s.sql.text("""DELETE FROM dm_repo_group_annual WHERE repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_annual_cache) + execute_sql(clear_annual_cache) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_annual""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) clear_monthly_cache = s.sql.text("""DELETE FROM dm_repo_group_monthly WHERE repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_monthly_cache) + 
execute_sql(clear_monthly_cache) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_monthly""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) clear_weekly_cache = s.sql.text("""DELETE FROM dm_repo_group_weekly WHERE repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_weekly_cache) + execute_sql(clear_weekly_cache) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) clear_unknown_cache = s.sql.text("""DELETE FROM unknown_cache WHERE projects_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(clear_unknown_cache) + execute_sql(clear_unknown_cache) optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - session.execute_sql(optimize_table) + execute_sql(optimize_table) # Remove any projects which were also marked for deletion remove_project = s.sql.text("""DELETE FROM repo_groups WHERE repo_group_id=:repo_group_id """).bindparams(repo_group_id=project['repo_group_id']) - session.execute_sql(remove_project) + execute_sql(remove_project) - session.log_activity('Info', 'Processing deletions (complete)') + facade_session.log_activity('Info', 'Processing deletions (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 5668739767..e7b4d2d8d2 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -26,13 +26,14 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. import sqlalchemy as s +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text from .utilitymethods import store_working_author, trim_author # if platform.python_implementation() == 'PyPy': # import pymysql # else: # import MySQLdb -def nuke_affiliations(session): +def nuke_affiliations(facade_session): # Delete all stored affiliations in the database. Normally when you # add/remove/change affiliation data via the web UI, any potentially affected @@ -42,16 +43,16 @@ def nuke_affiliations(session): # this is the scorched earth way: remove them all to force a total rebuild. # Brutal but effective. - session.log_activity('Info','Nuking affiliations') + facade_session.log_activity('Info','Nuking affiliations') nuke = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL, cmt_committer_affiliation = NULL""") - session.execute_sql(nuke) + execute_sql(nuke) - session.log_activity('Info','Nuking affiliations (complete)') + facade_session.log_activity('Info','Nuking affiliations (complete)') -def fill_empty_affiliations(session): +def fill_empty_affiliations(facade_session): @@ -79,13 +80,13 @@ def discover_null_affiliations(attribution,email): - matches = session.fetchall_data_from_sql_text(find_exact_match)#list(cfg.cursor) + matches = fetchall_data_from_sql_text(find_exact_match)#list(cfg.cursor) if not matches and email.find('@') < 0: # It's not a properly formatted email, leave it NULL and log it. 
- session.log_activity('Info',f"Unmatchable email: {email}") + facade_session.log_activity('Info',f"Unmatchable email: {email}") return @@ -104,7 +105,7 @@ def discover_null_affiliations(attribution,email): - matches = session.fetchall_data_from_sql_text(find_exact_domain) + matches = fetchall_data_from_sql_text(find_exact_domain) if not matches: @@ -117,7 +118,7 @@ def discover_null_affiliations(attribution,email): ORDER BY ca_start_date DESC""").bindparams(strippedDomain=domain[domain.rfind('.',0,domain.rfind('.',0))+1:]) - matches = session.fetchall_data_from_sql_text(find_domain)#list(cfg.cursor) + matches = fetchall_data_from_sql_text(find_domain)#list(cfg.cursor) if not matches: @@ -130,7 +131,7 @@ def discover_null_affiliations(attribution,email): if matches: - session.log_activity('Debug',f"Found domain match for {email}") + facade_session.log_activity('Debug',f"Found domain match for {email}") for match in matches: update = s.sql.text(("UPDATE commits " @@ -140,14 +141,14 @@ def discover_null_affiliations(attribution,email): f"AND cmt_{attribution}_date::date >= \'{match['ca_start_date']}\'::date") ).bindparams(affiliation=match['ca_affiliation'],email=email) - session.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") + facade_session.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") try: - session.execute_sql(update) + execute_sql(update) except Exception as e: - session.log_activity('Info', f"Error encountered: {e}") - session.log_activity('Info', f"Affiliation insertion failed for {email} ") - session.log_activity('Info', f"Offending query: {update} ") + facade_session.log_activity('Info', f"Error encountered: {e}") + facade_session.log_activity('Info', f"Affiliation insertion failed for {email} ") + facade_session.log_activity('Info', f"Offending query: {update} ") def discover_alias(email): @@ -158,7 +159,7 @@ def discover_alias(email): WHERE alias_email=:email AND cntrb_active = 1""").bindparams(email=email) - canonical = session.fetchall_data_from_sql_text(fetch_canonical)#list(cfg.cursor) + canonical = fetchall_data_from_sql_text(fetch_canonical)#list(cfg.cursor) if canonical: for email in canonical: @@ -168,8 +169,8 @@ def discover_alias(email): ### The real function starts here ### - session.update_status('Filling empty affiliations') - session.log_activity('Info','Filling empty affiliations') + facade_session.update_status('Filling empty affiliations') + facade_session.log_activity('Info','Filling empty affiliations') # Process any changes to the affiliations or aliases, and set any existing # entries in commits to NULL so they are filled properly. 
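
Note: the hunks above all apply the same substitution: raw SQL that previously went through methods on the facade session now goes through the module-level helpers imported from augur.application.db.lib, while the session object is kept only for logging and settings. A minimal sketch of the resulting call pattern, with an illustrative function name and simplified queries (not part of the patch):

    import sqlalchemy as s
    from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text

    def reset_affiliations_for_domain(facade_session, domain):
        # Read side: fetchall_data_from_sql_text returns a list of dicts.
        find = s.sql.text("""SELECT ca_domain FROM contributor_affiliations
            WHERE ca_domain = :domain""").bindparams(domain=domain)
        matches = fetchall_data_from_sql_text(find)

        # Write side: execute_sql manages its own engine connection per call.
        update = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL
            WHERE cmt_author_email LIKE CONCAT('%%', :domain)""").bindparams(domain=domain)
        execute_sql(update)

        # Logging and settings still go through the facade session object.
        facade_session.log_activity('Debug', f"Reset affiliations for {domain} ({len(matches)} match(es))")
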
@@ -178,41 +179,41 @@ def discover_alias(email): timefetch = s.sql.text("""SELECT current_timestamp(6) as fetched""") - affiliations_fetched = session.execute_sql(timefetch).fetchone()[0] + affiliations_fetched = execute_sql(timefetch).fetchone()[0] print(affiliations_fetched) # Now find the last time we worked on affiliations, to figure out what's new - affiliations_processed = session.get_setting('affiliations_processed') + affiliations_processed = facade_session.get_setting('affiliations_processed') get_changed_affiliations = s.sql.text("""SELECT ca_domain FROM contributor_affiliations""")# WHERE " #"ca_last_used >= timestamptz %s") - changed_affiliations = session.fetchall_data_from_sql_text(get_changed_affiliations)#list(cfg.cursor) + changed_affiliations = fetchall_data_from_sql_text(get_changed_affiliations)#list(cfg.cursor) # Process any affiliations which changed since we last checked for changed_affiliation in changed_affiliations: - session.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") + facade_session.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) - session.execute_sql(set_author_to_null) + execute_sql(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) - session.execute_sql(set_committer_to_null) + execute_sql(set_committer_to_null) # Update the last fetched date, so we know where to start next time. 
update_affiliations_date = s.sql.text("""UPDATE settings SET value=:affiliations WHERE setting = 'affiliations_processed'""").bindparams(affiliations=affiliations_fetched) - session.execute_sql(update_affiliations_date) + execute_sql(update_affiliations_date) # On to the aliases, now @@ -220,61 +221,61 @@ def discover_alias(email): get_time = s.sql.text("""SELECT current_timestamp(6) as fetched""") - aliases_fetched = session.execute_sql(get_time).fetchone()[0]#['fetched'] + aliases_fetched = execute_sql(get_time).fetchone()[0]#['fetched'] # Now find the last time we worked on aliases, to figure out what's new - aliases_processed = session.get_setting('aliases_processed') + aliases_processed = facade_session.get_setting('aliases_processed') get_changed_aliases = s.sql.text("""SELECT alias_email FROM contributors_aliases WHERE cntrb_last_modified >= :aliases""").bindparams(aliases=aliases_processed) - changed_aliases = session.fetchall_data_from_sql_text(get_changed_aliases)#list(cfg.cursor) + changed_aliases = fetchall_data_from_sql_text(get_changed_aliases)#list(cfg.cursor) # Process any aliases which changed since we last checked for changed_alias in changed_aliases: - session.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") + facade_session.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_raw_email LIKE CONCAT('%%',:alias)""").bindparams(alias=changed_alias['alias_email']) - session.insert_or_update_data(set_author_to_null) + facade_session.insert_or_update_data(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_raw_email LIKE CONCAT('%%',:alias_email)""").bindparams(alias_email=changed_alias['alias_email']) - session.insert_or_update_data(set_committer_to_null) + facade_session.insert_or_update_data(set_committer_to_null) reset_author = s.sql.text("""UPDATE commits SET cmt_author_email = :author_email WHERE cmt_author_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']),raw_author_email=changed_alias['alias_email']) - session.insert_or_update_data(reset_author) + facade_session.insert_or_update_data(reset_author) reset_committer = s.sql.text("""UPDATE commits SET cmt_committer_email = :author_email WHERE cmt_committer_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']), raw_author_email=changed_alias['alias_email']) - session.insert_or_update_data(reset_committer) + facade_session.insert_or_update_data(reset_committer) # Update the last fetched date, so we know where to start next time. 
update_aliases_date = s.sql.text("""UPDATE settings SET value=:aliases WHERE setting = 'aliases_processed'""").bindparams(aliases=aliases_fetched) - session.execute_sql(update_aliases_date) + execute_sql(update_aliases_date) # Now rebuild the affiliation data - working_author = session.get_setting('working_author') + working_author = facade_session.get_setting('working_author') if working_author != 'done': - session.log_activity('Error',f"Trimming author data in affiliations: {working_author}") - trim_author(session, working_author) + facade_session.log_activity('Error',f"Trimming author data in affiliations: {working_author}") + trim_author(facade_session, working_author) # Figure out which projects have NULL affiliations so they can be recached @@ -294,7 +295,7 @@ def discover_alias(email): # "SET rg_recache=TRUE WHERE " # "author_affiliation IS NULL OR " # "committer_affiliation IS NULL") - session.execute_sql(set_recache) + execute_sql(set_recache) # Find any authors with NULL affiliations and fill them @@ -304,19 +305,19 @@ def discover_alias(email): WHERE cmt_author_affiliation IS NULL GROUP BY cmt_author_email""") - null_authors = session.fetchall_data_from_sql_text(find_null_authors) + null_authors = fetchall_data_from_sql_text(find_null_authors) - session.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") + facade_session.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") for null_author in null_authors: email = null_author['email'] - store_working_author(session, email) + store_working_author(facade_session, email) discover_null_affiliations('author',email) - store_working_author(session, 'done') + store_working_author(facade_session, 'done') # Find any committers with NULL affiliations and fill them @@ -326,15 +327,15 @@ def discover_alias(email): WHERE cmt_committer_affiliation IS NULL GROUP BY cmt_committer_email""") - null_committers = session.fetchall_data_from_sql_text(find_null_committers) + null_committers = fetchall_data_from_sql_text(find_null_committers) - session.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") + facade_session.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") for null_committer in null_committers: email = null_committer['email'] - store_working_author(session, email) + store_working_author(facade_session, email) discover_null_affiliations('committer',email) @@ -344,43 +345,43 @@ def discover_alias(email): SET cmt_author_affiliation = '(Unknown)' WHERE cmt_author_affiliation IS NULL""") - session.execute_sql(fill_unknown_author) + execute_sql(fill_unknown_author) fill_unknown_committer = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = '(Unknown)' WHERE cmt_committer_affiliation IS NULL""") - session.execute_sql(fill_unknown_committer) + execute_sql(fill_unknown_committer) - store_working_author(session, 'done') + store_working_author(facade_session, 'done') - session.log_activity('Info','Filling empty affiliations (complete)') + facade_session.log_activity('Info','Filling empty affiliations (complete)') -def invalidate_caches(session): +def invalidate_caches(facade_session): # Invalidate all caches - session.update_status('Invalidating caches') - session.log_activity('Info','Invalidating caches') + facade_session.update_status('Invalidating caches') + facade_session.log_activity('Info','Invalidating caches') invalidate_cache = s.sql.text("""UPDATE repo_groups SET rg_recache = 1""") - 
session.execute_sql(invalidate_cache) + execute_sql(invalidate_cache) - session.log_activity('Info','Invalidating caches (complete)') + facade_session.log_activity('Info','Invalidating caches (complete)') -def rebuild_unknown_affiliation_and_web_caches(session): +def rebuild_unknown_affiliation_and_web_caches(facade_session): # When there's a lot of analysis data, calculating display data on the fly gets # pretty expensive. Instead, we crunch the data based upon the user's preferred # statistics (author or committer) and store them. We also store all records # with an (Unknown) affiliation for display to the user. - session.update_status('Caching data for display') - session.log_activity('Info','Caching unknown affiliations and web data for display') + facade_session.update_status('Caching data for display') + facade_session.log_activity('Info','Caching unknown affiliations and web data for display') - report_date = session.get_setting('report_date') - report_attribution = session.get_setting('report_attribution') + report_date = facade_session.get_setting('report_date') + report_attribution = facade_session.get_setting('report_attribution') # Clear stale caches @@ -396,7 +397,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_weekly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_weekly) + execute_sql(clear_dm_repo_group_weekly) clear_dm_repo_group_monthly = s.sql.text(""" DELETE @@ -410,7 +411,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_monthly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_monthly) + execute_sql(clear_dm_repo_group_monthly) clear_dm_repo_group_annual = s.sql.text(""" DELETE @@ -424,7 +425,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_annual c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_annual) + execute_sql(clear_dm_repo_group_annual) clear_dm_repo_weekly = s.sql.text(""" DELETE @@ -441,7 +442,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_weekly) + execute_sql(clear_dm_repo_weekly) clear_dm_repo_monthly = s.sql.text(""" DELETE @@ -458,7 +459,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_monthly) + execute_sql(clear_dm_repo_monthly) clear_dm_repo_annual = s.sql.text(""" DELETE @@ -475,7 +476,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_annual) + execute_sql(clear_dm_repo_annual) clear_unknown_cache = s.sql.text(""" DELETE @@ -489,9 +490,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM unknown_cache c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_unknown_cache) + execute_sql(clear_unknown_cache) - 
session.log_activity('Verbose','Caching unknown authors and committers') + facade_session.log_activity('Verbose','Caching unknown authors and committers') # Cache the unknown authors @@ -511,9 +512,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_author_email, info.a, info.b, info.c - """).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + """).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(unknown_authors) + execute_sql(unknown_authors) # Cache the unknown committers @@ -531,13 +532,13 @@ def rebuild_unknown_affiliation_and_web_caches(session): WHERE a.cmt_committer_affiliation = '(Unknown)' AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_committer_email, info.a, info.b, info.c - """).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + """).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(unknown_committers) + execute_sql(unknown_committers) # Start caching by project - session.log_activity('Verbose','Caching projects') + facade_session.log_activity('Verbose','Caching projects') cache_projects_by_week = s.sql.text(( "INSERT INTO dm_repo_group_weekly (repo_group_id, email, affiliation, week, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -571,9 +572,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email, " "r.repo_group_id, info.a, info.b, info.c") - ).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + ).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(cache_projects_by_week) + execute_sql(cache_projects_by_week) cache_projects_by_month = s.sql.text( ("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -607,9 +608,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "r.repo_group_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(cache_projects_by_month) + execute_sql(cache_projects_by_month) cache_projects_by_year = s.sql.text(( "INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -644,15 +645,15 @@ def rebuild_unknown_affiliation_and_web_caches(session): - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(cache_projects_by_year) + execute_sql(cache_projects_by_year) # Start caching by repo - session.log_activity('Verbose','Caching repos') + 
facade_session.log_activity('Verbose','Caching repos') cache_repos_by_week = s.sql.text( ( @@ -687,9 +688,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(cache_repos_by_week) + execute_sql(cache_repos_by_week) cache_repos_by_month = s.sql.text(( "INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -723,9 +724,9 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(cache_repos_by_month) + execute_sql(cache_repos_by_month) cache_repos_by_year = s.sql.text(( "INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -757,14 +758,14 @@ def rebuild_unknown_affiliation_and_web_caches(session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) + )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) - session.execute_sql(cache_repos_by_year) + execute_sql(cache_repos_by_year) # Reset cache flags reset_recache = s.sql.text("UPDATE repo_groups SET rg_recache = 0") - session.execute_sql(reset_recache) + execute_sql(reset_recache) - session.log_activity('Info','Caching unknown affiliations and web data for display (complete)') + facade_session.log_activity('Info','Caching unknown affiliations and web data for display (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index 64571bdd9b..eb790c16df 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -34,19 +34,20 @@ from augur.application.db.models.augur_data import * from augur.application.db.models.augur_operations import CollectionStatus from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list +from augur.application.db.lib import execute_sql class GitCloneError(Exception): pass -def git_repo_initialize(session, repo_git): +def git_repo_initialize(facade_session, session, repo_git): # Select any new git repos so we can set up their locations and git clone - session.update_status('Fetching non-cloned repos') - session.log_activity('Info', 'Fetching non-cloned repos') + facade_session.update_status('Fetching non-cloned repos') + facade_session.log_activity('Info', 'Fetching non-cloned repos') # Get data as a list of dicts - # new_repos = session.fetchall_data_from_sql_text(query)#list(cfg.cursor) + # new_repos = fetchall_data_from_sql_text(query)#list(cfg.cursor) row = 
Repo.get_by_repo_git(session, repo_git) if row: @@ -54,7 +55,7 @@ def git_repo_initialize(session, repo_git): session.log_activity( 'Info', f"Fetching repo with repo id: {row.repo_id}") - update_repo_log(session, row.repo_id, 'Cloning') + update_repo_log(logger, facade_session, row.repo_id, 'Cloning') git = html.unescape(row.repo_git) @@ -62,28 +63,28 @@ def git_repo_initialize(session, repo_git): if git.find('://', 0) > 0: platform_org_git_url_section = git[git.find( '://', 0)+3:][:git[git.find('://', 0)+3:].rfind('/', 0)+1] - session.log_activity( + facade_session.log_activity( 'Info', f"Repo Relative Path from facade05, from for row in new_repos, line 79: {platform_org_git_url_section}") - session.log_activity('Info', f"The git path used : {git}") + facade_session.log_activity('Info', f"The git path used : {git}") else: platform_org_git_url_section = git[:git.rfind('/', 0)+1] - session.log_activity( + facade_session.log_activity( 'Info', f"Repo Relative Path from facade05, line 80, reset at 86: {platform_org_git_url_section}") # Get the name of repo repo_name = git[git.rfind('/', 0)+1:] if repo_name.endswith('.git'): repo_name = repo_name[:repo_name.find('.git', 0)] - session.log_activity( + facade_session.log_activity( 'Info', f"Repo Name from facade05, line 93: {repo_name}") path_identifier = f"{platform_org_git_url_section}{repo_name}".replace('/','-') # Get the full path to the directory where we'll clone the repo repo_path = ( - f"{session.repo_base_directory}{row.repo_id}-{path_identifier}") - session.log_activity( + f"{facade_session.repo_base_directory}{row.repo_id}-{path_identifier}") + facade_session.log_activity( 'Info', f"Repo Path from facade05, line 86: {repo_path}") @@ -91,21 +92,21 @@ def git_repo_initialize(session, repo_git): # query = s.sql.text("""SELECT NULL FROM repo WHERE CONCAT(repo_group_id,'/',repo_path,repo_name) = :repo_group_id # """).bindparams(repo_group_id=f"{row.repo_group_id}/{platform_org_git_url_section}{repo_name}") # - # result = session.fetchall_data_from_sql_text(query) + # result = fetchall_data_from_sql_text(query) query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) # Check if there will be a storage path collision # If there is a collision, throw an error so that it updates the existing repo instead of trying # to reclone. if os.path.isdir(repo_path): # len(result): - session.log_activity( + facade_session.log_activity( 'Verbose', f"Identical repo detected, storing {git} in {repo_name}") - session.logger.warning( + logger.warning( f"Identical repo found in facade directory! 
Repo git: {git}") statusQuery = session.query(CollectionStatus).filter( CollectionStatus.repo_id == row.repo_id) @@ -119,7 +120,7 @@ def git_repo_initialize(session, repo_git): repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) return # Create the prerequisite directories @@ -128,21 +129,21 @@ def git_repo_initialize(session, repo_git): except Exception as e: print("COULD NOT CREATE REPO DIRECTORY") - update_repo_log(session, row.repo_id, 'Failed (mkdir)') + update_repo_log(logger, facade_session, row.repo_id, 'Failed (mkdir)') session.update_status(f"Failed (mkdir {repo_path})") session.log_activity( 'Error', f"Could not create repo directory: {repo_path}") raise e - update_repo_log(session, row.repo_id, 'New (cloning)') + update_repo_log(logger, facade_session, row.repo_id, 'New (cloning)') #Make sure newly cloned repo path is recorded in repo table query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, repo_name=:nameParam WHERE repo_id=:idParam """).bindparams(pathParam=path_identifier, nameParam=repo_name, idParam=row.repo_id) - session.execute_sql(query) + execute_sql(query) session.log_activity('Verbose', f"Cloning: {git}") @@ -153,12 +154,12 @@ def git_repo_initialize(session, repo_git): # If cloning succeeded, repo is ready for analysis # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. - update_repo_log(session, row.repo_id, 'Up-to-date') + update_repo_log(logger, facade_session, row.repo_id, 'Up-to-date') session.log_activity('Info', f"Cloned {git}") else: # If cloning failed, log it and set the status back to new - update_repo_log(session, row.repo_id, f"Failed ({return_code})") + update_repo_log(logger, facade_session, row.repo_id, f"Failed ({return_code})") session.log_activity('Error', f"Could not clone {git}") @@ -185,8 +186,8 @@ def check_for_repo_updates(session, repo_git): AND repo_status != 'Analyze' AND repo_status != 'Empty' AND repo_git = :value""").bindparams(value=repo_git) - # repos = session.fetchall_data_from_sql_text(get_initialized_repos)#list(cfg.cursor) - repo = session.execute_sql(get_initialized_repos).fetchone() + # repos = fetchall_data_from_sql_text(get_initialized_repos)#list(cfg.cursor) + repo = execute_sql(get_initialized_repos).fetchone() if repo: @@ -196,7 +197,7 @@ def check_for_repo_updates(session, repo_git): repos_id=:repo_id AND status='Up-to-date' AND date >= CURRENT_TIMESTAMP(6) - INTERVAL :update_freq HOUR """).bindparams(repo_id=repo['repo_id'], update_freq=update_frequency[0]) - result = session.fetchall_data_from_sql_text(get_last_update) + result = fetchall_data_from_sql_text(get_last_update) # If the repo has not been updated within the waiting period, mark it. # Also mark any other repos in the project, so we only recache the # project once per waiting period. @@ -213,7 +214,7 @@ def check_for_repo_updates(session, repo_git): # "SET status='Update' WHERE " # "r.id=%s and r.status != 'Empty'") - session.execute_sql(mark_repo) + execute_sql(mark_repo) # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. 
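
Note: the calls above show the other half of the refactor: helpers that used to pull everything off one session object now receive the logger and the facade session explicitly, and the ORM session is only passed where model queries actually need it. A rough sketch of the new calling convention for update_repo_log, using an illustrative wrapper that is not part of the module:

    import logging
    from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import update_repo_log

    def mark_clone_started(facade_session, repo_id):
        # The logger and the facade session are now explicit arguments instead of
        # attributes looked up on a single session object.
        logger = logging.getLogger(__name__)
        # update_repo_log records the fetch status row itself and logs, rather
        # than raising, if the insert fails.
        update_repo_log(logger, facade_session, repo_id, 'Cloning')
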
@@ -250,7 +251,7 @@ def force_repo_updates(session, repo_git): get_repo_ids = s.sql.text("""UPDATE repo SET repo_status='Update' WHERE repo_status NOT LIKE 'New%' AND repo_status!='Delete' AND repo_status !='Empty' AND repo_git=:value""").bindparams(value=repo_git) - session.execute_sql(get_repo_ids) + execute_sql(get_repo_ids) session.log_activity('Info', 'Forcing repos to update (complete)') @@ -268,17 +269,17 @@ def force_repo_analysis(session, repo_git): NOT LIKE 'New%' AND repo_status!='Delete' AND repo_status != 'Empty' AND repo_git=:repo_git_ident""").bindparams(repo_git_ident=repo_git) - session.execute_sql(set_to_analyze) + execute_sql(set_to_analyze) session.log_activity('Info', 'Forcing repos to be analyzed (complete)') -def git_repo_updates(session, repo_git): +def git_repo_updates(facade_session, session, repo_git): # Update existing repos - session.update_status('Updating repos') - session.log_activity('Info', 'Updating existing repos') + facade_session.update_status('Updating repos') + facade_session.log_activity('Info', 'Updating existing repos') # query = s.sql.text("""SELECT repo_id,repo_group_id,repo_git,repo_name,repo_path FROM repo WHERE # repo_status='Update'""") @@ -287,7 +288,7 @@ def git_repo_updates(session, repo_git): result = execute_session_query(query, 'all') try: - # session.fetchall_data_from_sql_text(query)#list(cfg.cursor) + # fetchall_data_from_sql_text(query)#list(cfg.cursor) row = convert_orm_list_to_dict_list(result)[0] except IndexError: raise Exception( @@ -297,9 +298,9 @@ def git_repo_updates(session, repo_git): raise Exception( f"The repo path or repo name is NULL for repo_id: {row['repo_id']}") - session.log_activity( + facade_session.log_activity( 'Verbose', f"Attempting to update {row['repo_git']}") # ['git']) - update_repo_log(session, row['repo_id'], 'Updating') # ['id'],'Updating') + update_repo_log(logger, facade_session, row['repo_id'], 'Updating') # ['id'],'Updating') attempt = 0 @@ -310,7 +311,7 @@ def git_repo_updates(session, repo_git): # default_branch = '' absolute_path = get_absolute_repo_path( - session.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) + facade_session.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) while attempt < 2: @@ -321,7 +322,7 @@ def git_repo_updates(session, repo_git): return_code_remote = subprocess.Popen( [firstpull], shell=True).wait() - session.log_activity('Verbose', 'Got to here. 1.') + facade_session.log_activity('Verbose', 'Got to here. 
1.') if return_code_remote == 0: @@ -343,26 +344,26 @@ def git_repo_updates(session, repo_git): remotedefault = remotedefault.decode() - session.log_activity( + facade_session.log_activity( 'Verbose', f'remote default getting checked out is: {remotedefault}.') getremotedefault = ( f"git -C {absolute_path} checkout {remotedefault}") - session.log_activity( + facade_session.log_activity( 'Verbose', f"get remote default command is: \n \n {getremotedefault} \n \n ") return_code_remote_default_again = subprocess.Popen( [getremotedefault], shell=True).wait() if return_code_remote_default_again == 0: - session.log_activity('Verbose', "local checkout worked.") + facade_session.log_activity('Verbose', "local checkout worked.") cmd = (f"git -C {absolute_path} pull") return_code = subprocess.Popen([cmd], shell=True).wait() except Exception as e: - session.log_activity( + facade_session.log_activity( 'Verbose', f'Error code on branch change is {e}.') pass @@ -378,7 +379,7 @@ def git_repo_updates(session, repo_git): break elif attempt == 0: - session.log_activity( + facade_session.log_activity( 'Verbose', f"git pull failed, attempting reset and clean for {row['repo_git']}") # remotedefault = 'main' @@ -412,7 +413,7 @@ def git_repo_updates(session, repo_git): return_message_getremotedefault = subprocess.Popen( [getremotedefault], stdout=subprocess.PIPE, shell=True).communicate()[0] - session.log_activity( + facade_session.log_activity( 'Verbose', f'get remote default result: {return_message_getremotedefault}') getcurrentbranch = (f"git -C {absolute_path} branch") @@ -425,7 +426,7 @@ def git_repo_updates(session, repo_git): localdefault = localdefault.decode() - session.log_activity( + facade_session.log_activity( 'Verbose', f'remote default is: {remotedefault}, and localdefault is {localdefault}.') cmd_checkout_default = ( @@ -448,7 +449,7 @@ def git_repo_updates(session, repo_git): except Exception as e: - session.log_activity('Verbose', f'Second pass failed: {e}.') + facade_session.log_activity('Verbose', f'Second pass failed: {e}.') pass cmdpull2 = (f"git -C {absolute_path} pull") @@ -462,12 +463,12 @@ def git_repo_updates(session, repo_git): if return_code == 0: - update_repo_log(session, row['repo_id'], 'Up-to-date') - session.log_activity('Verbose', f"Updated {row['repo_git']}") + update_repo_log(logger, facade_session, row['repo_id'], 'Up-to-date') + facade_session.log_activity('Verbose', f"Updated {row['repo_git']}") else: - update_repo_log(session, row['repo_id'], f"Failed ({return_code})") - session.log_activity('Error', f"Could not update {row['repo_git']}") + update_repo_log(logger, facade_session, row['repo_id'], f"Failed ({return_code})") + facade_session.log_activity('Error', f"Could not update {row['repo_git']}") - session.log_activity('Info', 'Updating existing repos (complete)') + facade_session.log_activity('Info', 'Updating existing repos (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 34ed93a6f7..89641d2014 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -33,13 +33,13 @@ from augur.application.db.models import * from .config import FacadeSession as FacadeSession from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps -from augur.application.db.lib import execute_sql +from augur.application.db.lib import execute_sql, 
fetchall_data_from_sql_text #from augur.tasks.git.util.facade_worker.facade -def update_repo_log(session, repos_id,status): +def update_repo_log(logger, facade_session, repos_id,status): # Log a repo's fetch status - session.log_activity("Info",f"{status} {repos_id}") + facade_session.log_activity("Info",f"{status} {repos_id}") #log_message = ("INSERT INTO repos_fetch_log (repos_id,status) " # "VALUES (%s,%s)") try: @@ -47,12 +47,12 @@ def update_repo_log(session, repos_id,status): VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) #session.insert_data(data,t_repos_fetch_log,['repos_id','status']) - session.execute_sql(log_message) + execute_sql(log_message) except Exception as e: - session.logger.error(f"Ran into error in update_repo_log: {e}") + logger.error(f"Ran into error in update_repo_log: {e}") pass -def trim_commits(session, repo_id,commits): +def trim_commits(facade_session, repo_id,commits): # Quickly remove a given commit @@ -72,10 +72,10 @@ def trim_commits(session, repo_id,commits): execute_sql(remove_commit) for commit in commits: - session.log_activity('Debug',f"Trimmed commit: {commit}") - session.log_activity('Debug',f"Removed working commit: {commit}") + facade_session.log_activity('Debug',f"Trimmed commit: {commit}") + facade_session.log_activity('Debug',f"Removed working commit: {commit}") -def store_working_author(session, email): +def store_working_author(facade_session, email): # Store the working author during affiliation discovery, in case it is # interrupted and needs to be trimmed. @@ -85,11 +85,11 @@ def store_working_author(session, email): WHERE setting = 'working_author' """).bindparams(email=email) - session.execute_sql(store) + execute_sql(store) - session.log_activity('Debug',f"Stored working author: {email}") + facade_session.log_activity('Debug',f"Stored working author: {email}") -def trim_author(session, email): +def trim_author(facade_session, email): # Remove the affiliations associated with an email. Used when an analysis is # interrupted during affiliation layering, and the data will be corrupt. 
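
Note: with the SQL delegated to the lib helpers, these utility functions keep the facade session only for get_setting and log_activity. A short sketch of how the trim_author/store_working_author recovery path can be driven under that interface (the wrapper name is illustrative):

    from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_author

    def resume_interrupted_affiliation_run(facade_session):
        # If a previous run died mid-discovery, the stored working author marks
        # whose partially written affiliations need to be rolled back.
        working_author = facade_session.get_setting('working_author')
        if working_author != 'done':
            trim_author(facade_session, working_author)  # also resets the marker back to 'done'
            facade_session.log_activity('Info', f"Rolled back affiliations for {working_author}")
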
@@ -101,18 +101,18 @@ def trim_author(session, email): - session.execute_sql(trim) + execute_sql(trim) trim = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email = :email """).bindparams(email=email) - session.execute_sql(trim) + execute_sql(trim) - store_working_author(session, 'done') + store_working_author(facade_session, 'done') - session.log_activity('Debug',f"Trimmed working author: {email}") + facade_session.log_activity('Debug',f"Trimmed working author: {email}") def get_absolute_repo_path(repo_base_dir, repo_id, repo_path,repo_name): @@ -135,12 +135,12 @@ def get_parent_commits_set(absolute_repo_path, start_date): return parent_commits -def get_existing_commits_set(session, repo_id): +def get_existing_commits_set(repo_id): find_existing = s.sql.text("""SELECT DISTINCT cmt_commit_hash FROM commits WHERE repo_id=:repo_id """).bindparams(repo_id=repo_id) - existing_commits = [commit['cmt_commit_hash'] for commit in session.fetchall_data_from_sql_text(find_existing)] + existing_commits = [commit['cmt_commit_hash'] for commit in fetchall_data_from_sql_text(find_existing)] return set(existing_commits) @@ -149,11 +149,11 @@ def count_branches(git_dir): branches_dir = os.path.join(git_dir, 'refs', 'heads') return sum(1 for _ in os.scandir(branches_dir)) -def get_repo_commit_count(logger, session, repo_git): +def get_repo_commit_count(logger, facade_session, session, repo_git): repo = Repo.get_by_repo_git(session, repo_git) - absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + absolute_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absolute_path}/.git") logger.debug(f"loc: {repo_loc}") @@ -190,9 +190,9 @@ def get_facade_weight_with_commit_count(session, repo_git, commit_count): return commit_count - get_facade_weight_time_factor(session, repo_git) -def get_repo_weight_by_commit(logger,repo_git): - with FacadeSession(logger) as session: - return get_repo_commit_count(logger, session, repo_git) - get_facade_weight_time_factor(session, repo_git) +def get_repo_weight_by_commit(logger, session, repo_git): + facade_session = FacadeSession(logger) + return get_repo_commit_count(logger, facade_session, session, repo_git) - get_facade_weight_time_factor(session, repo_git) def update_facade_scheduling_fields(session, repo_git, weight, commit_count): diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 64ce4f7409..93e87fff29 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -261,48 +261,48 @@ def insert_facade_contributors(self, repo_id): manifest.logger.debug("DEBUG: Got through the new_contribs") - with FacadeSession(logger) as session: - # sql query used to find corresponding cntrb_id's of emails found in the contributor's table - # i.e., if a contributor already exists, we use it! 
- resolve_email_to_cntrb_id_sql = s.sql.text(""" - SELECT DISTINCT - cntrb_id, - contributors.cntrb_login AS login, - contributors.cntrb_canonical AS email, - commits.cmt_author_raw_email - FROM - contributors, - commits - WHERE - contributors.cntrb_canonical = commits.cmt_author_raw_email - AND commits.repo_id = :repo_id - UNION - SELECT DISTINCT - contributors_aliases.cntrb_id, - contributors.cntrb_login as login, - contributors_aliases.alias_email AS email, - commits.cmt_author_raw_email - FROM - contributors, - contributors_aliases, - commits - WHERE - contributors_aliases.alias_email = commits.cmt_author_raw_email - AND contributors.cntrb_id = contributors_aliases.cntrb_id - AND commits.repo_id = :repo_id - """).bindparams(repo_id=repo_id) - - #self.logger.info("DEBUG: got passed the sql statement declaration") - # Get a list of dicts that contain the emails and cntrb_id's of commits that appear in the contributor's table. - #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) - - result = session.execute_sql(resolve_email_to_cntrb_id_sql) - existing_cntrb_emails = [dict(row) for row in result.mappings()] - - print(existing_cntrb_emails) - link_commits_to_contributor(session,list(existing_cntrb_emails)) - - session.logger.info("Done with inserting and updating facade contributors") + session = FacadeSession(logger) + # sql query used to find corresponding cntrb_id's of emails found in the contributor's table + # i.e., if a contributor already exists, we use it! + resolve_email_to_cntrb_id_sql = s.sql.text(""" + SELECT DISTINCT + cntrb_id, + contributors.cntrb_login AS login, + contributors.cntrb_canonical AS email, + commits.cmt_author_raw_email + FROM + contributors, + commits + WHERE + contributors.cntrb_canonical = commits.cmt_author_raw_email + AND commits.repo_id = :repo_id + UNION + SELECT DISTINCT + contributors_aliases.cntrb_id, + contributors.cntrb_login as login, + contributors_aliases.alias_email AS email, + commits.cmt_author_raw_email + FROM + contributors, + contributors_aliases, + commits + WHERE + contributors_aliases.alias_email = commits.cmt_author_raw_email + AND contributors.cntrb_id = contributors_aliases.cntrb_id + AND commits.repo_id = :repo_id + """).bindparams(repo_id=repo_id) + + #self.logger.info("DEBUG: got passed the sql statement declaration") + # Get a list of dicts that contain the emails and cntrb_id's of commits that appear in the contributor's table. 
+ #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ + # 'repo_id': repo_id}).to_json(orient="records")) + + result = session.execute_sql(resolve_email_to_cntrb_id_sql) + existing_cntrb_emails = [dict(row) for row in result.mappings()] + + print(existing_cntrb_emails) + link_commits_to_contributor(session,list(existing_cntrb_emails)) + + logger.info("Done with inserting and updating facade contributors") return From 15d788bdd2e102e3598f9edb9e9119826e0766bb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 18:04:09 -0500 Subject: [PATCH 007/122] Rename FacadeSession to FacadeHelper Signed-off-by: Andrew Brain --- augur/tasks/git/facade_tasks.py | 131 +++++++++--------- .../facade_worker/facade_worker/config.py | 2 +- .../facade_worker/facade00mainprogram.py | 2 +- .../facade_worker/postanalysiscleanup.py | 18 +-- .../facade_worker/rebuildcache.py | 102 +++++++------- .../facade_worker/facade_worker/repofetch.py | 70 +++++----- .../facade_worker/utilitymethods.py | 30 ++-- 7 files changed, 178 insertions(+), 177 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index a6fa0f0d93..6b66242f21 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -15,6 +15,7 @@ from augur.tasks.github.facade_github.tasks import * +from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize, git_repo_updates @@ -59,10 +60,10 @@ def facade_error_handler(request,exc,traceback): def facade_analysis_init_facade_task(repo_git): logger = logging.getLogger(facade_analysis_init_facade_task.__name__) - session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) - session.update_status('Running analysis') - session.log_activity('Info',f"Beginning analysis.") + facade_helper.update_status('Running analysis') + facade_helper.log_activity('Info',f"Beginning analysis.") @celery.task(base=AugurFacadeRepoCollectionTask) @@ -70,7 +71,7 @@ def trim_commits_facade_task(repo_git): logger = logging.getLogger(trim_commits_facade_task.__name__) - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: @@ -90,7 +91,7 @@ def update_analysis_log(repos_id,status): pass - facade_session.inc_repos_processed() + facade_helper.inc_repos_processed() update_analysis_log(repo_id,"Beginning analysis.") # First we check to see if the previous analysis didn't complete @@ -106,7 +107,7 @@ def update_analysis_log(repos_id,status): # the commit data may be incomplete. It should be trimmed, just in case. 
commits_to_trim = [commit['working_commit'] for commit in working_commits] - trim_commits(facade_session,repo_id,commits_to_trim) + trim_commits(facade_helper,repo_id,commits_to_trim) # Start the main analysis update_analysis_log(repo_id,'Collecting data') @@ -117,14 +118,14 @@ def trim_commits_post_analysis_facade_task(repo_git): logger = logging.getLogger(trim_commits_post_analysis_facade_task.__name__) - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() repo_id = repo.repo_id - start_date = facade_session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') def update_analysis_log(repos_id,status): # Log a repo's analysis status @@ -142,7 +143,7 @@ def update_analysis_log(repos_id,status): repo = execute_session_query(query, 'one') #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absoulte_path}/.git") # Grab the parents of HEAD @@ -155,7 +156,7 @@ def update_analysis_log(repos_id,status): missing_commits = parent_commits - existing_commits - facade_session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") # Find commits which are out of the analysis range @@ -168,7 +169,7 @@ def update_analysis_log(repos_id,status): session.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") #for commit in trimmed_commits: - trim_commits(facade_session,repo_id,trimmed_commits) + trim_commits(facade_helper,repo_id,trimmed_commits) update_analysis_log(repo_id,'Commit trimming complete') @@ -181,8 +182,8 @@ def update_analysis_log(repos_id,status): def facade_analysis_end_facade_task(): logger = logging.getLogger(facade_analysis_end_facade_task.__name__) - session = FacadeSession(logger) - session.log_activity('Info','Running analysis (complete)') + facade_helper = FacadeHelper(logger) + facade_helper.log_activity('Info','Running analysis (complete)') @@ -190,9 +191,9 @@ def facade_analysis_end_facade_task(): def facade_start_contrib_analysis_task(): logger = logging.getLogger(facade_start_contrib_analysis_task.__name__) - session = FacadeSession(logger) - session.update_status('Updating Contributors') - session.log_activity('Info', 'Updating Contributors with commits') + facade_helper = FacadeHelper(logger) + facade_helper.update_status('Updating Contributors') + facade_helper.log_activity('Info', 'Updating Contributors with commits') #enable celery multithreading @@ -203,14 +204,14 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: #create new session for celery thread. 
logger = logging.getLogger(analyze_commits_in_parallel.__name__) - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() repo_id = repo.repo_id - start_date = facade_session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') logger.info(f"Generating sequence for repo {repo_id}") @@ -218,7 +219,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: repo = execute_session_query(query, 'one') #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) + absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) repo_loc = (f"{absoulte_path}/.git") # Grab the parents of HEAD @@ -230,7 +231,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: # Find missing commits and add them missing_commits = parent_commits - existing_commits - facade_session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") if not len(missing_commits) or repo_id is None: @@ -240,7 +241,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: queue = list(missing_commits) logger.info(f"Got to analysis!") - absoulte_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absoulte_path}/.git") pendingCommitRecordsToInsert = [] @@ -282,31 +283,31 @@ def nuke_affiliations_facade_task(): logger = logging.getLogger(nuke_affiliations_facade_task.__name__) - session = FacadeSession(logger) - nuke_affiliations(session) + facade_helper = FacadeHelper(logger) + nuke_affiliations(facade_helper) @celery.task def fill_empty_affiliations_facade_task(): logger = logging.getLogger(fill_empty_affiliations_facade_task.__name__) - facade_session = FacadeSession(logger) - fill_empty_affiliations(facade_session) + facade_helper = FacadeHelper(logger) + fill_empty_affiliations(facade_helper) @celery.task def invalidate_caches_facade_task(): logger = logging.getLogger(invalidate_caches_facade_task.__name__) - session = FacadeSession(logger) - invalidate_caches(session) + facade_helper = FacadeHelper(logger) + invalidate_caches(facade_helper) @celery.task def rebuild_unknown_affiliation_and_web_caches_facade_task(): logger = logging.getLogger(rebuild_unknown_affiliation_and_web_caches_facade_task.__name__) - session = FacadeSession(logger) - rebuild_unknown_affiliation_and_web_caches(session) + facade_helper = FacadeHelper(logger) + rebuild_unknown_affiliation_and_web_caches(facade_helper) @celery.task @@ -314,9 +315,9 @@ def git_repo_cleanup_facade_task(repo_git): logger = logging.getLogger(git_repo_cleanup_facade_task.__name__) - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: - git_repo_cleanup(facade_session, session, repo_git) + git_repo_cleanup(facade_helper, session, repo_git) # retry this task indefinitely every 5 minutes if it errors. 
Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) @@ -326,7 +327,7 @@ def clone_repos(): is_pending = CollectionStatus.facade_status == CollectionState.PENDING.value - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: @@ -341,11 +342,11 @@ def clone_repos(): # clone repo try: - git_repo_initialize(facade_session, session, repo_git) + git_repo_initialize(facade_helper, session, repo_git) session.commit() # get the commit count - commit_count = get_repo_commit_count(logger, facade_session, session, repo_git) + commit_count = get_repo_commit_count(logger, facade_helper, session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) @@ -374,7 +375,7 @@ def clone_repos(): # # logger = logging.getLogger(check_for_repo_updates_facade_task.__name__) # -# session = FacadeSession(logger) +# facade_helper = FacadeHelper(logger) # check_for_repo_updates(session, repo_git) @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) @@ -384,11 +385,11 @@ def git_update_commit_count_weight(self, repo_git): logger = logging.getLogger(git_update_commit_count_weight.__name__) # Change facade session to take in engine - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: - commit_count = get_repo_commit_count(logger, facade_session, session, repo_git) + commit_count = get_repo_commit_count(logger, facade_helper, session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) @@ -399,14 +400,14 @@ def git_repo_updates_facade_task(repo_git): logger = logging.getLogger(git_repo_updates_facade_task.__name__) - facade_session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) with get_session() as session: - git_repo_updates(facade_session, session, repo_git) + git_repo_updates(facade_helper, session, repo_git) -def generate_analysis_sequence(logger,repo_git, facade_session): +def generate_analysis_sequence(logger,repo_git, facade_helper): """Run the analysis by looping over all active repos. For each repo, we retrieve the list of commits which lead to HEAD. If any are missing from the database, they are filled in. 
Then we check to see if any commits in the database are @@ -425,7 +426,7 @@ def generate_analysis_sequence(logger,repo_git, facade_session): WHERE repo_git=:value""").bindparams(value=repo_git) repos = fetchall_data_from_sql_text(repo_list) - start_date = facade_session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') repo_ids = [repo['repo_id'] for repo in repos] @@ -475,13 +476,13 @@ def generate_contributor_sequence(logger,repo_git, session): def facade_phase(repo_git): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") - session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) #Get the repo_id repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) repos = fetchall_data_from_sql_text(repo_list) - start_date = session.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') repo_ids = [repo['repo_id'] for repo in repos] @@ -493,11 +494,11 @@ def facade_phase(repo_git): #status = execute_session_query(query,'one') # Figure out what we need to do - limited_run = session.limited_run - run_analysis = session.run_analysis - pull_repos = session.pull_repos + limited_run = facade_helper.limited_run + run_analysis = facade_helper.run_analysis + pull_repos = facade_helper.pull_repos #force_analysis = session.force_analysis - run_facade_contributors = session.run_facade_contributors + run_facade_contributors = facade_helper.run_facade_contributors facade_sequence = [] facade_core_collection = [] @@ -509,11 +510,11 @@ def facade_phase(repo_git): #Generate commit analysis task order. if not limited_run or (limited_run and run_analysis): - facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,session)) + facade_core_collection.extend(generate_analysis_sequence(logger,repo_git,facade_helper)) #Generate contributor analysis task group. if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(generate_contributor_sequence(logger,repo_git,session)) + facade_core_collection.append(generate_contributor_sequence(logger,repo_git,facade_helper)) #These tasks need repos to be cloned by facade before they can work. 
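The flag-gated assembly in facade_phase above keeps the same shape after the refactor, only reading limited_run, run_analysis and run_facade_contributors from FacadeHelper instead of the session. A minimal, self-contained sketch of that assembly pattern with stand-in Celery tasks (collect_commits and collect_contributors are illustrative placeholders, not the real Augur task names):

    from celery import chain, shared_task

    @shared_task
    def collect_commits(repo_git):
        # placeholder for the commit-analysis sequence
        return f"analyzed {repo_git}"

    @shared_task
    def collect_contributors(repo_git):
        # placeholder for the contributor sequence
        return f"contributors for {repo_git}"

    def build_facade_collection(repo_git, limited_run, run_analysis, run_facade_contributors):
        # Mirror the gating above: a stage is appended only when the flags allow it,
        # then everything selected is chained into one runnable signature.
        stages = []
        if not limited_run or (limited_run and run_analysis):
            stages.append(collect_commits.si(repo_git))
        if not limited_run or (limited_run and run_facade_contributors):
            stages.append(collect_contributors.si(repo_git))
        return chain(*stages)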
@@ -531,27 +532,27 @@ def facade_phase(repo_git): def generate_non_repo_domain_facade_tasks(logger): logger.info("Generating facade sequence") - session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) # Figure out what we need to do - limited_run = session.limited_run - delete_marked_repos = session.delete_marked_repos - pull_repos = session.pull_repos - # clone_repos = session.clone_repos - check_updates = session.check_updates - # force_updates = session.force_updates - run_analysis = session.run_analysis - # force_analysis = session.force_analysis - nuke_stored_affiliations = session.nuke_stored_affiliations - fix_affiliations = session.fix_affiliations - force_invalidate_caches = session.force_invalidate_caches - rebuild_caches = session.rebuild_caches + limited_run = facade_helper.limited_run + delete_marked_repos = facade_helper.delete_marked_repos + pull_repos = facade_helper.pull_repos + # clone_repos = facade_helper.clone_repos + check_updates = facade_helper.check_updates + # force_updates = facade_helper.force_updates + run_analysis = facade_helper.run_analysis + # force_analysis = facade_helper.force_analysis + nuke_stored_affiliations = facade_helper.nuke_stored_affiliations + fix_affiliations = facade_helper.fix_affiliations + force_invalidate_caches = facade_helper.force_invalidate_caches + rebuild_caches = facade_helper.rebuild_caches #if abs((datetime.datetime.strptime(session.cfg.get_setting('aliases_processed')[:-3], # '%Y-%m-%d %I:%M:%S.%f') - datetime.datetime.now()).total_seconds()) // 3600 > int(session.cfg.get_setting( # 'update_frequency')) else 0 - force_invalidate_caches = session.force_invalidate_caches - create_xlsx_summary_files = session.create_xlsx_summary_files - multithreaded = session.multithreaded + force_invalidate_caches = facade_helper.force_invalidate_caches + create_xlsx_summary_files = facade_helper.create_xlsx_summary_files + multithreaded = facade_helper.multithreaded facade_sequence = [] diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index feb297fbf4..0be34fa7cb 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -78,7 +78,7 @@ def get_database_args_from_env(): #print(credentials) return credentials -class FacadeSession(): +class FacadeHelper(): """ORM session used in facade tasks. This class adds the various attributes needed for legacy facade as well as a modified version of the legacy FacadeConfig class. diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py index b41c6f14da..1811c734f6 100755 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py @@ -27,7 +27,7 @@ # aliases, and caches data for display. 
from __future__ import annotations import html.parser -from .config import FacadeSession as FacadeSession +from .config import FacadeHelper as FacadeHelper #.facade06analyze analysis moved to facade_tasks.py - IM 10/12/22 #from contributor_interfaceable.facade08contributorinterfaceable import ContributorInterfaceable diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py index 6f6c55ae49..77ca2f3c6a 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py @@ -33,13 +33,13 @@ from augur.application.db.models import * #Will delete repos passed and cleanup associated commit data. -def git_repo_cleanup(facade_session, session,repo_git): +def git_repo_cleanup(facade_helper, session,repo_git): # Clean up any git repos that are pending deletion - facade_session.update_status('Purging deleted repos') + facade_helper.update_status('Purging deleted repos') #logger.info("Processing deletions") - facade_session.log_activity('Info','Processing deletions') + facade_helper.log_activity('Info','Processing deletions') query = session.query(Repo).filter( @@ -51,7 +51,7 @@ def git_repo_cleanup(facade_session, session,repo_git): # Remove the files on disk - absolute_path = get_absolute_repo_path(facade_session.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) cmd = ("rm -rf %s" % (absolute_path)) @@ -103,7 +103,7 @@ def git_repo_cleanup(facade_session, session,repo_git): #log_activity('Verbose','Deleted repo %s' % row[0]) #logger.debug(f"Deleted repo {row.repo_id}") - facade_session.log_activity('Verbose',f"Deleted repo {row.repo_id}") + facade_helper.log_activity('Verbose',f"Deleted repo {row.repo_id}") cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) # Remove any working commits @@ -127,14 +127,14 @@ def git_repo_cleanup(facade_session, session,repo_git): while (cleanup.find('/',0) > 0): cleanup = cleanup[:cleanup.rfind('/',0)] - cmd = "rmdir %s%s" % (facade_session.repo_base_directory,cleanup) + cmd = "rmdir %s%s" % (facade_helper.repo_base_directory,cleanup) subprocess.Popen([cmd],shell=True).wait() #log_activity('Verbose','Attempted %s' % cmd) #logger.debug(f"Attempted {cmd}") - facade_session.log_activity('Verbose',f"Attempted {cmd}") + facade_helper.log_activity('Verbose',f"Attempted {cmd}") #update_repo_log(row[0],'Deleted') - facade_session.update_repo_log(row.repo_id,'Deleted') + facade_helper.update_repo_log(row.repo_id,'Deleted') # Clean up deleted projects @@ -181,4 +181,4 @@ def git_repo_cleanup(facade_session, session,repo_git): execute_sql(remove_project) - facade_session.log_activity('Info', 'Processing deletions (complete)') + facade_helper.log_activity('Info', 'Processing deletions (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index e7b4d2d8d2..8e7bb1a6e7 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -33,7 +33,7 @@ # else: # import MySQLdb -def nuke_affiliations(facade_session): +def nuke_affiliations(facade_helper): # Delete all stored affiliations in the database. 
Normally when you # add/remove/change affiliation data via the web UI, any potentially affected @@ -43,16 +43,16 @@ def nuke_affiliations(facade_session): # this is the scorched earth way: remove them all to force a total rebuild. # Brutal but effective. - facade_session.log_activity('Info','Nuking affiliations') + facade_helper.log_activity('Info','Nuking affiliations') nuke = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL, cmt_committer_affiliation = NULL""") execute_sql(nuke) - facade_session.log_activity('Info','Nuking affiliations (complete)') + facade_helper.log_activity('Info','Nuking affiliations (complete)') -def fill_empty_affiliations(facade_session): +def fill_empty_affiliations(facade_helper): @@ -86,7 +86,7 @@ def discover_null_affiliations(attribution,email): # It's not a properly formatted email, leave it NULL and log it. - facade_session.log_activity('Info',f"Unmatchable email: {email}") + facade_helper.log_activity('Info',f"Unmatchable email: {email}") return @@ -131,7 +131,7 @@ def discover_null_affiliations(attribution,email): if matches: - facade_session.log_activity('Debug',f"Found domain match for {email}") + facade_helper.log_activity('Debug',f"Found domain match for {email}") for match in matches: update = s.sql.text(("UPDATE commits " @@ -141,14 +141,14 @@ def discover_null_affiliations(attribution,email): f"AND cmt_{attribution}_date::date >= \'{match['ca_start_date']}\'::date") ).bindparams(affiliation=match['ca_affiliation'],email=email) - facade_session.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") + facade_helper.log_activity('Info', f"attr: {attribution} \nmatch:{match}\nsql: {update}") try: execute_sql(update) except Exception as e: - facade_session.log_activity('Info', f"Error encountered: {e}") - facade_session.log_activity('Info', f"Affiliation insertion failed for {email} ") - facade_session.log_activity('Info', f"Offending query: {update} ") + facade_helper.log_activity('Info', f"Error encountered: {e}") + facade_helper.log_activity('Info', f"Affiliation insertion failed for {email} ") + facade_helper.log_activity('Info', f"Offending query: {update} ") def discover_alias(email): @@ -169,8 +169,8 @@ def discover_alias(email): ### The real function starts here ### - facade_session.update_status('Filling empty affiliations') - facade_session.log_activity('Info','Filling empty affiliations') + facade_helper.update_status('Filling empty affiliations') + facade_helper.log_activity('Info','Filling empty affiliations') # Process any changes to the affiliations or aliases, and set any existing # entries in commits to NULL so they are filled properly. 
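The affiliation fills above are built as parameterized UPDATEs handed to the execute_sql helper. A small standalone sketch of that SQLAlchemy pattern, assuming only a reachable Postgres database; the connection URL and bound values are placeholders, and the date filter from discover_null_affiliations is omitted:

    import sqlalchemy as s

    # Placeholder connection string; Augur builds its engine from configuration instead.
    engine = s.create_engine("postgresql+psycopg2://user:pass@localhost:5432/augur")

    update = s.sql.text(
        """UPDATE commits
           SET cmt_author_affiliation = :affiliation
           WHERE cmt_author_email = :email
           AND cmt_author_affiliation IS NULL"""
    ).bindparams(affiliation="Example Corp", email="jdoe@example.com")

    # engine.begin() opens a transaction that commits on success and rolls back on error,
    # which is the behavior the execute_sql() helper wraps.
    with engine.begin() as connection:
        connection.execute(update)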
@@ -183,7 +183,7 @@ def discover_alias(email): print(affiliations_fetched) # Now find the last time we worked on affiliations, to figure out what's new - affiliations_processed = facade_session.get_setting('affiliations_processed') + affiliations_processed = facade_helper.get_setting('affiliations_processed') get_changed_affiliations = s.sql.text("""SELECT ca_domain FROM contributor_affiliations""")# WHERE " #"ca_last_used >= timestamptz %s") @@ -196,7 +196,7 @@ def discover_alias(email): for changed_affiliation in changed_affiliations: - facade_session.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") + facade_helper.log_activity('Debug',f"Resetting affiliation for {changed_affiliation['ca_domain']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_email LIKE CONCAT('%%',:affiliation)""").bindparams(affiliation=changed_affiliation['ca_domain']) @@ -225,7 +225,7 @@ def discover_alias(email): # Now find the last time we worked on aliases, to figure out what's new - aliases_processed = facade_session.get_setting('aliases_processed') + aliases_processed = facade_helper.get_setting('aliases_processed') get_changed_aliases = s.sql.text("""SELECT alias_email FROM contributors_aliases WHERE cntrb_last_modified >= :aliases""").bindparams(aliases=aliases_processed) @@ -236,31 +236,31 @@ def discover_alias(email): for changed_alias in changed_aliases: - facade_session.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") + facade_helper.log_activity('Debug',f"Resetting affiliation for {changed_alias['alias_email']}") set_author_to_null = s.sql.text("""UPDATE commits SET cmt_author_affiliation = NULL WHERE cmt_author_raw_email LIKE CONCAT('%%',:alias)""").bindparams(alias=changed_alias['alias_email']) - facade_session.insert_or_update_data(set_author_to_null) + facade_helper.insert_or_update_data(set_author_to_null) set_committer_to_null = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_raw_email LIKE CONCAT('%%',:alias_email)""").bindparams(alias_email=changed_alias['alias_email']) - facade_session.insert_or_update_data(set_committer_to_null) + facade_helper.insert_or_update_data(set_committer_to_null) reset_author = s.sql.text("""UPDATE commits SET cmt_author_email = :author_email WHERE cmt_author_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']),raw_author_email=changed_alias['alias_email']) - facade_session.insert_or_update_data(reset_author) + facade_helper.insert_or_update_data(reset_author) reset_committer = s.sql.text("""UPDATE commits SET cmt_committer_email = :author_email WHERE cmt_committer_raw_email = :raw_author_email """).bindparams(author_email=discover_alias(changed_alias['alias_email']), raw_author_email=changed_alias['alias_email']) - facade_session.insert_or_update_data(reset_committer) + facade_helper.insert_or_update_data(reset_committer) # Update the last fetched date, so we know where to start next time. 
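Conceptually, the alias pass above resets any commit rows whose raw email matches a changed alias and then rewrites them to the canonical address returned by discover_alias. A toy, database-free version of that remapping (the alias table here is a plain dict standing in for contributors_aliases, and the sample data is invented):

    def resolve_alias(raw_email, alias_to_canonical):
        # discover_alias() falls back to the address itself when no alias row matches.
        return alias_to_canonical.get(raw_email, raw_email)

    alias_to_canonical = {"j.doe@old-corp.com": "jdoe@corp.com"}  # made-up sample data

    commits = [
        {"cmt_author_raw_email": "j.doe@old-corp.com", "cmt_author_email": None},
        {"cmt_author_raw_email": "alice@example.org", "cmt_author_email": None},
    ]

    for commit in commits:
        commit["cmt_author_email"] = resolve_alias(commit["cmt_author_raw_email"], alias_to_canonical)

    print(commits)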
@@ -271,11 +271,11 @@ def discover_alias(email): # Now rebuild the affiliation data - working_author = facade_session.get_setting('working_author') + working_author = facade_helper.get_setting('working_author') if working_author != 'done': - facade_session.log_activity('Error',f"Trimming author data in affiliations: {working_author}") - trim_author(facade_session, working_author) + facade_helper.log_activity('Error',f"Trimming author data in affiliations: {working_author}") + trim_author(facade_helper, working_author) # Figure out which projects have NULL affiliations so they can be recached @@ -307,17 +307,17 @@ def discover_alias(email): null_authors = fetchall_data_from_sql_text(find_null_authors) - facade_session.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") + facade_helper.log_activity('Debug',f"Found {len(null_authors)} authors with NULL affiliation") for null_author in null_authors: email = null_author['email'] - store_working_author(facade_session, email) + store_working_author(facade_helper, email) discover_null_affiliations('author',email) - store_working_author(facade_session, 'done') + store_working_author(facade_helper, 'done') # Find any committers with NULL affiliations and fill them @@ -329,13 +329,13 @@ def discover_alias(email): null_committers = fetchall_data_from_sql_text(find_null_committers) - facade_session.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") + facade_helper.log_activity('Debug',f"Found {len(null_committers)} committers with NULL affiliation") for null_committer in null_committers: email = null_committer['email'] - store_working_author(facade_session, email) + store_working_author(facade_helper, email) discover_null_affiliations('committer',email) @@ -354,34 +354,34 @@ def discover_alias(email): execute_sql(fill_unknown_committer) - store_working_author(facade_session, 'done') + store_working_author(facade_helper, 'done') - facade_session.log_activity('Info','Filling empty affiliations (complete)') + facade_helper.log_activity('Info','Filling empty affiliations (complete)') -def invalidate_caches(facade_session): +def invalidate_caches(facade_helper): # Invalidate all caches - facade_session.update_status('Invalidating caches') - facade_session.log_activity('Info','Invalidating caches') + facade_helper.update_status('Invalidating caches') + facade_helper.log_activity('Info','Invalidating caches') invalidate_cache = s.sql.text("""UPDATE repo_groups SET rg_recache = 1""") execute_sql(invalidate_cache) - facade_session.log_activity('Info','Invalidating caches (complete)') + facade_helper.log_activity('Info','Invalidating caches (complete)') -def rebuild_unknown_affiliation_and_web_caches(facade_session): +def rebuild_unknown_affiliation_and_web_caches(facade_helper): # When there's a lot of analysis data, calculating display data on the fly gets # pretty expensive. Instead, we crunch the data based upon the user's preferred # statistics (author or committer) and store them. We also store all records # with an (Unknown) affiliation for display to the user. 
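The cache rebuild described above exists to pre-aggregate per-contributor totals so the web UI never scans raw commits at request time. A toy aggregation in plain Python showing the shape of what a table like dm_repo_group_weekly stores (the sample records are invented):

    from collections import defaultdict

    commits = [
        {"repo_group_id": 1, "email": "jdoe@corp.com", "year": 2024, "week": 14, "added": 10, "removed": 2},
        {"repo_group_id": 1, "email": "jdoe@corp.com", "year": 2024, "week": 14, "added": 5, "removed": 1},
        {"repo_group_id": 1, "email": "alice@example.org", "year": 2024, "week": 14, "added": 7, "removed": 0},
    ]

    weekly_cache = defaultdict(lambda: {"added": 0, "removed": 0})
    for c in commits:
        key = (c["repo_group_id"], c["email"], c["year"], c["week"])
        weekly_cache[key]["added"] += c["added"]
        weekly_cache[key]["removed"] += c["removed"]

    # Each key now holds the totals a cache row would serve to the web UI.
    print(dict(weekly_cache))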
- facade_session.update_status('Caching data for display') - facade_session.log_activity('Info','Caching unknown affiliations and web data for display') + facade_helper.update_status('Caching data for display') + facade_helper.log_activity('Info','Caching unknown affiliations and web data for display') - report_date = facade_session.get_setting('report_date') - report_attribution = facade_session.get_setting('report_attribution') + report_date = facade_helper.get_setting('report_date') + report_attribution = facade_helper.get_setting('report_attribution') # Clear stale caches @@ -492,7 +492,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): # "p.rg_recache=TRUE") execute_sql(clear_unknown_cache) - facade_session.log_activity('Verbose','Caching unknown authors and committers') + facade_helper.log_activity('Verbose','Caching unknown authors and committers') # Cache the unknown authors @@ -512,7 +512,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_author_email, info.a, info.b, info.c - """).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + """).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(unknown_authors) @@ -532,13 +532,13 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): WHERE a.cmt_committer_affiliation = '(Unknown)' AND p.rg_recache = 1 GROUP BY r.repo_group_id,a.cmt_committer_email, info.a, info.b, info.c - """).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + """).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(unknown_committers) # Start caching by project - facade_session.log_activity('Verbose','Caching projects') + facade_helper.log_activity('Verbose','Caching projects') cache_projects_by_week = s.sql.text(( "INSERT INTO dm_repo_group_weekly (repo_group_id, email, affiliation, week, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -572,7 +572,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): "affiliation, " f"a.cmt_{report_attribution}_email, " "r.repo_group_id, info.a, info.b, info.c") - ).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + ).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(cache_projects_by_week) @@ -608,7 +608,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): "affiliation, " f"a.cmt_{report_attribution}_email," "r.repo_group_id, info.a, info.b, info.c" - )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(cache_projects_by_month) @@ -645,7 +645,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): - )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + 
)).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) @@ -653,7 +653,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): execute_sql(cache_projects_by_year) # Start caching by repo - facade_session.log_activity('Verbose','Caching repos') + facade_helper.log_activity('Verbose','Caching repos') cache_repos_by_week = s.sql.text( ( @@ -688,7 +688,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(cache_repos_by_week) @@ -724,7 +724,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(cache_repos_by_month) @@ -758,7 +758,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): "affiliation, " f"a.cmt_{report_attribution}_email," "a.repo_id, info.a, info.b, info.c" - )).bindparams(tool_source=facade_session.tool_source,tool_version=facade_session.tool_version,data_source=facade_session.data_source) + )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) execute_sql(cache_repos_by_year) @@ -767,5 +767,5 @@ def rebuild_unknown_affiliation_and_web_caches(facade_session): reset_recache = s.sql.text("UPDATE repo_groups SET rg_recache = 0") execute_sql(reset_recache) - facade_session.log_activity('Info','Caching unknown affiliations and web data for display (complete)') + facade_helper.log_activity('Info','Caching unknown affiliations and web data for display (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index eb790c16df..df37dca457 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -39,12 +39,12 @@ class GitCloneError(Exception): pass -def git_repo_initialize(facade_session, session, repo_git): +def git_repo_initialize(facade_helper, session, repo_git): # Select any new git repos so we can set up their locations and git clone - facade_session.update_status('Fetching non-cloned repos') - facade_session.log_activity('Info', 'Fetching non-cloned repos') + facade_helper.update_status('Fetching non-cloned repos') + facade_helper.log_activity('Info', 'Fetching non-cloned repos') # Get data as a list of dicts # new_repos = fetchall_data_from_sql_text(query)#list(cfg.cursor) @@ -55,7 +55,7 @@ def git_repo_initialize(facade_session, session, repo_git): session.log_activity( 'Info', f"Fetching repo with repo id: {row.repo_id}") - update_repo_log(logger, facade_session, row.repo_id, 'Cloning') + update_repo_log(logger, facade_helper, row.repo_id, 'Cloning') git = html.unescape(row.repo_git) @@ -63,28 +63,28 @@ def git_repo_initialize(facade_session, session, repo_git): if 
git.find('://', 0) > 0: platform_org_git_url_section = git[git.find( '://', 0)+3:][:git[git.find('://', 0)+3:].rfind('/', 0)+1] - facade_session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Relative Path from facade05, from for row in new_repos, line 79: {platform_org_git_url_section}") - facade_session.log_activity('Info', f"The git path used : {git}") + facade_helper.log_activity('Info', f"The git path used : {git}") else: platform_org_git_url_section = git[:git.rfind('/', 0)+1] - facade_session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Relative Path from facade05, line 80, reset at 86: {platform_org_git_url_section}") # Get the name of repo repo_name = git[git.rfind('/', 0)+1:] if repo_name.endswith('.git'): repo_name = repo_name[:repo_name.find('.git', 0)] - facade_session.log_activity( + facade_helper.log_activity( 'Info', f"Repo Name from facade05, line 93: {repo_name}") path_identifier = f"{platform_org_git_url_section}{repo_name}".replace('/','-') # Get the full path to the directory where we'll clone the repo repo_path = ( - f"{facade_session.repo_base_directory}{row.repo_id}-{path_identifier}") - facade_session.log_activity( + f"{facade_helper.repo_base_directory}{row.repo_id}-{path_identifier}") + facade_helper.log_activity( 'Info', f"Repo Path from facade05, line 86: {repo_path}") @@ -104,7 +104,7 @@ def git_repo_initialize(facade_session, session, repo_git): # to reclone. if os.path.isdir(repo_path): # len(result): - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f"Identical repo detected, storing {git} in {repo_name}") logger.warning( f"Identical repo found in facade directory! Repo git: {git}") @@ -129,14 +129,14 @@ def git_repo_initialize(facade_session, session, repo_git): except Exception as e: print("COULD NOT CREATE REPO DIRECTORY") - update_repo_log(logger, facade_session, row.repo_id, 'Failed (mkdir)') + update_repo_log(logger, facade_helper, row.repo_id, 'Failed (mkdir)') session.update_status(f"Failed (mkdir {repo_path})") session.log_activity( 'Error', f"Could not create repo directory: {repo_path}") raise e - update_repo_log(logger, facade_session, row.repo_id, 'New (cloning)') + update_repo_log(logger, facade_helper, row.repo_id, 'New (cloning)') #Make sure newly cloned repo path is recorded in repo table query = s.sql.text("""UPDATE repo SET repo_path=:pathParam, @@ -154,12 +154,12 @@ def git_repo_initialize(facade_session, session, repo_git): # If cloning succeeded, repo is ready for analysis # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. 
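The clone-path construction above flattens the host and org portion of the clone URL into a single directory name under repo_base_directory. A standalone re-statement of that derivation for reference; the inputs in the example call are made up, and the real code also html-unescapes repo_git first:

    def derive_repo_path(repo_base_directory, repo_id, git_url):
        # Portion after the scheme, up to and including the last '/',
        # e.g. "github.com/chaoss/" for "https://github.com/chaoss/augur.git".
        after_scheme = git_url[git_url.find('://') + 3:] if '://' in git_url else git_url
        org_section = after_scheme[:after_scheme.rfind('/') + 1]

        repo_name = git_url[git_url.rfind('/') + 1:]
        if repo_name.endswith('.git'):
            repo_name = repo_name[:-len('.git')]

        path_identifier = f"{org_section}{repo_name}".replace('/', '-')
        return f"{repo_base_directory}{repo_id}-{path_identifier}"

    # -> "/facade/repos/25430-github.com-chaoss-augur" (all inputs assumed)
    print(derive_repo_path("/facade/repos/", 25430, "https://github.com/chaoss/augur.git"))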
- update_repo_log(logger, facade_session, row.repo_id, 'Up-to-date') + update_repo_log(logger, facade_helper, row.repo_id, 'Up-to-date') session.log_activity('Info', f"Cloned {git}") else: # If cloning failed, log it and set the status back to new - update_repo_log(logger, facade_session, row.repo_id, f"Failed ({return_code})") + update_repo_log(logger, facade_helper, row.repo_id, f"Failed ({return_code})") session.log_activity('Error', f"Could not clone {git}") @@ -274,12 +274,12 @@ def force_repo_analysis(session, repo_git): session.log_activity('Info', 'Forcing repos to be analyzed (complete)') -def git_repo_updates(facade_session, session, repo_git): +def git_repo_updates(facade_helper, session, repo_git): # Update existing repos - facade_session.update_status('Updating repos') - facade_session.log_activity('Info', 'Updating existing repos') + facade_helper.update_status('Updating repos') + facade_helper.log_activity('Info', 'Updating existing repos') # query = s.sql.text("""SELECT repo_id,repo_group_id,repo_git,repo_name,repo_path FROM repo WHERE # repo_status='Update'""") @@ -298,9 +298,9 @@ def git_repo_updates(facade_session, session, repo_git): raise Exception( f"The repo path or repo name is NULL for repo_id: {row['repo_id']}") - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f"Attempting to update {row['repo_git']}") # ['git']) - update_repo_log(logger, facade_session, row['repo_id'], 'Updating') # ['id'],'Updating') + update_repo_log(logger, facade_helper, row['repo_id'], 'Updating') # ['id'],'Updating') attempt = 0 @@ -311,7 +311,7 @@ def git_repo_updates(facade_session, session, repo_git): # default_branch = '' absolute_path = get_absolute_repo_path( - facade_session.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) + facade_helper.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) while attempt < 2: @@ -322,7 +322,7 @@ def git_repo_updates(facade_session, session, repo_git): return_code_remote = subprocess.Popen( [firstpull], shell=True).wait() - facade_session.log_activity('Verbose', 'Got to here. 1.') + facade_helper.log_activity('Verbose', 'Got to here. 
1.') if return_code_remote == 0: @@ -344,26 +344,26 @@ def git_repo_updates(facade_session, session, repo_git): remotedefault = remotedefault.decode() - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f'remote default getting checked out is: {remotedefault}.') getremotedefault = ( f"git -C {absolute_path} checkout {remotedefault}") - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f"get remote default command is: \n \n {getremotedefault} \n \n ") return_code_remote_default_again = subprocess.Popen( [getremotedefault], shell=True).wait() if return_code_remote_default_again == 0: - facade_session.log_activity('Verbose', "local checkout worked.") + facade_helper.log_activity('Verbose', "local checkout worked.") cmd = (f"git -C {absolute_path} pull") return_code = subprocess.Popen([cmd], shell=True).wait() except Exception as e: - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f'Error code on branch change is {e}.') pass @@ -379,7 +379,7 @@ def git_repo_updates(facade_session, session, repo_git): break elif attempt == 0: - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f"git pull failed, attempting reset and clean for {row['repo_git']}") # remotedefault = 'main' @@ -413,7 +413,7 @@ def git_repo_updates(facade_session, session, repo_git): return_message_getremotedefault = subprocess.Popen( [getremotedefault], stdout=subprocess.PIPE, shell=True).communicate()[0] - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f'get remote default result: {return_message_getremotedefault}') getcurrentbranch = (f"git -C {absolute_path} branch") @@ -426,7 +426,7 @@ def git_repo_updates(facade_session, session, repo_git): localdefault = localdefault.decode() - facade_session.log_activity( + facade_helper.log_activity( 'Verbose', f'remote default is: {remotedefault}, and localdefault is {localdefault}.') cmd_checkout_default = ( @@ -449,7 +449,7 @@ def git_repo_updates(facade_session, session, repo_git): except Exception as e: - facade_session.log_activity('Verbose', f'Second pass failed: {e}.') + facade_helper.log_activity('Verbose', f'Second pass failed: {e}.') pass cmdpull2 = (f"git -C {absolute_path} pull") @@ -463,12 +463,12 @@ def git_repo_updates(facade_session, session, repo_git): if return_code == 0: - update_repo_log(logger, facade_session, row['repo_id'], 'Up-to-date') - facade_session.log_activity('Verbose', f"Updated {row['repo_git']}") + update_repo_log(logger, facade_helper, row['repo_id'], 'Up-to-date') + facade_helper.log_activity('Verbose', f"Updated {row['repo_git']}") else: - update_repo_log(logger, facade_session, row['repo_id'], f"Failed ({return_code})") - facade_session.log_activity('Error', f"Could not update {row['repo_git']}") + update_repo_log(logger, facade_helper, row['repo_id'], f"Failed ({return_code})") + facade_helper.log_activity('Error', f"Could not update {row['repo_git']}") - facade_session.log_activity('Info', 'Updating existing repos (complete)') + facade_helper.log_activity('Info', 'Updating existing repos (complete)') diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 89641d2014..da30bfd827 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -31,15 +31,15 @@ import sqlalchemy as s from sqlalchemy.exc import DataError from augur.application.db.models import * 
-from .config import FacadeSession as FacadeSession +from .config import FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text #from augur.tasks.git.util.facade_worker.facade -def update_repo_log(logger, facade_session, repos_id,status): +def update_repo_log(logger, facade_helper, repos_id,status): # Log a repo's fetch status - facade_session.log_activity("Info",f"{status} {repos_id}") + facade_helper.log_activity("Info",f"{status} {repos_id}") #log_message = ("INSERT INTO repos_fetch_log (repos_id,status) " # "VALUES (%s,%s)") try: @@ -52,7 +52,7 @@ def update_repo_log(logger, facade_session, repos_id,status): logger.error(f"Ran into error in update_repo_log: {e}") pass -def trim_commits(facade_session, repo_id,commits): +def trim_commits(facade_helper, repo_id,commits): # Quickly remove a given commit @@ -72,10 +72,10 @@ def trim_commits(facade_session, repo_id,commits): execute_sql(remove_commit) for commit in commits: - facade_session.log_activity('Debug',f"Trimmed commit: {commit}") - facade_session.log_activity('Debug',f"Removed working commit: {commit}") + facade_helper.log_activity('Debug',f"Trimmed commit: {commit}") + facade_helper.log_activity('Debug',f"Removed working commit: {commit}") -def store_working_author(facade_session, email): +def store_working_author(facade_helper, email): # Store the working author during affiliation discovery, in case it is # interrupted and needs to be trimmed. @@ -87,9 +87,9 @@ def store_working_author(facade_session, email): execute_sql(store) - facade_session.log_activity('Debug',f"Stored working author: {email}") + facade_helper.log_activity('Debug',f"Stored working author: {email}") -def trim_author(facade_session, email): +def trim_author(facade_helper, email): # Remove the affiliations associated with an email. Used when an analysis is # interrupted during affiliation layering, and the data will be corrupt. 
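store_working_author and trim_author above form a small checkpoint-and-rollback scheme: record which email is being layered, and if a later run finds anything other than 'done' there, discard that email's partial affiliation data before continuing. A file-based analogue of the same idea, entirely illustrative (none of these names are Augur APIs):

    import os

    CHECKPOINT = "/tmp/working_author"   # stands in for the 'working_author' settings row

    def layer_affiliations(email):
        print(f"layering affiliations for {email}")

    def trim_partial_work(email):
        print(f"trimming partial affiliation data for {email}")

    def run(emails):
        # Recover from a previously interrupted run before doing new work.
        if os.path.exists(CHECKPOINT):
            with open(CHECKPOINT) as f:
                leftover = f.read().strip()
            if leftover and leftover != "done":
                trim_partial_work(leftover)

        for email in emails:
            with open(CHECKPOINT, "w") as f:
                f.write(email)            # checkpoint before the risky step
            layer_affiliations(email)

        with open(CHECKPOINT, "w") as f:
            f.write("done")               # mark the pass complete

    run(["jdoe@corp.com", "alice@example.org"])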
@@ -110,9 +110,9 @@ def trim_author(facade_session, email): execute_sql(trim) - store_working_author(facade_session, 'done') + store_working_author(facade_helper, 'done') - facade_session.log_activity('Debug',f"Trimmed working author: {email}") + facade_helper.log_activity('Debug',f"Trimmed working author: {email}") def get_absolute_repo_path(repo_base_dir, repo_id, repo_path,repo_name): @@ -149,11 +149,11 @@ def count_branches(git_dir): branches_dir = os.path.join(git_dir, 'refs', 'heads') return sum(1 for _ in os.scandir(branches_dir)) -def get_repo_commit_count(logger, facade_session, session, repo_git): +def get_repo_commit_count(logger, facade_helper, session, repo_git): repo = Repo.get_by_repo_git(session, repo_git) - absolute_path = get_absolute_repo_path(facade_session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absolute_path}/.git") logger.debug(f"loc: {repo_loc}") @@ -191,8 +191,8 @@ def get_facade_weight_with_commit_count(session, repo_git, commit_count): def get_repo_weight_by_commit(logger, session, repo_git): - facade_session = FacadeSession(logger) - return get_repo_commit_count(logger, facade_session, session, repo_git) - get_facade_weight_time_factor(session, repo_git) + facade_helper = FacadeHelper(logger) + return get_repo_commit_count(logger, facade_helper, session, repo_git) - get_facade_weight_time_factor(session, repo_git) def update_facade_scheduling_fields(session, repo_git, weight, commit_count): From 32585687d4407ec4d2980ddc19749e4ab53a00d6 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 18:44:05 -0500 Subject: [PATCH 008/122] Add repo methods to lib Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 20 +++++++- augur/tasks/git/facade_tasks.py | 89 +++++++++++++++------------------ 2 files changed, 60 insertions(+), 49 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 5c277c086b..34e67fd2e6 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -1,7 +1,7 @@ import sqlalchemy as s import logging from typing import List, Any, Optional -from augur.application.db.models import Config +from augur.application.db.models import Config, Repo from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query @@ -115,3 +115,21 @@ def fetchall_data_from_sql_text(sql_text): result = connection.execute(sql_text) return [dict(row) for row in result.mappings()] + +def get_repo_by_repo_git(repo_git: str): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + return repo + +def get_repo_by_repo_id(repo_id): + + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_id == repo_id) + repo = execute_session_query(query, 'one') + + return repo diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 6b66242f21..40df676ddb 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -4,7 +4,7 @@ from celery import group, chain import sqlalchemy as s -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session, get_repo_by_repo_git, get_repo_by_repo_id from 
augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set @@ -73,9 +73,8 @@ def trim_commits_facade_task(repo_git): facade_helper = FacadeHelper(logger) - with get_session() as session: + repo = get_repo_by_repo_git(repo_git) - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() repo_id = repo.repo_id def update_analysis_log(repos_id,status): @@ -120,9 +119,7 @@ def trim_commits_post_analysis_facade_task(repo_git): facade_helper = FacadeHelper(logger) - with get_session() as session: - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - + repo = repo = get_repo_by_repo_git(repo_git) repo_id = repo.repo_id start_date = facade_helper.get_setting('start_date') @@ -138,9 +135,7 @@ def update_analysis_log(repos_id,status): logger.info(f"Generating sequence for repo {repo_id}") - with get_session() as session: - query = session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') + repo = get_repo_by_repo_git(repo_git) #Get the huge list of commits to process. absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) @@ -166,7 +161,7 @@ def update_analysis_log(repos_id,status): update_analysis_log(repo_id,'Beginning to trim commits') - session.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") + facade_helper.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") #for commit in trimmed_commits: trim_commits(facade_helper,repo_id,trimmed_commits) @@ -206,45 +201,44 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: logger = logging.getLogger(analyze_commits_in_parallel.__name__) facade_helper = FacadeHelper(logger) - with get_session() as session: - - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - start_date = facade_helper.get_setting('start_date') + start_date = facade_helper.get_setting('start_date') - logger.info(f"Generating sequence for repo {repo_id}") - - query = session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') + logger.info(f"Generating sequence for repo {repo_id}") + + repo = get_repo_by_repo_id(repo_id) - #Get the huge list of commits to process. - absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") - # Grab the parents of HEAD + #Get the huge list of commits to process. 
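A minimal sketch of how the new lookup helpers from augur.application.db.lib are meant to be consumed by tasks, assuming a configured Augur database; the repo URL below is a placeholder, and the helpers raise when no matching row exists:

    import logging
    from augur.application.db.lib import get_repo_by_repo_git, get_repo_by_repo_id

    logger = logging.getLogger(__name__)

    # Placeholder URL; it must already exist in the repo table for the lookup to succeed.
    repo = get_repo_by_repo_git("https://github.com/chaoss/augur")
    logger.info(f"repo_id={repo.repo_id} path={repo.repo_path}{repo.repo_name}")

    same_repo = get_repo_by_repo_id(repo.repo_id)
    assert same_repo.repo_git == repo.repo_git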
+ absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) + repo_loc = (f"{absoulte_path}/.git") + # Grab the parents of HEAD - parent_commits = get_parent_commits_set(repo_loc, start_date) + parent_commits = get_parent_commits_set(repo_loc, start_date) - # Grab the existing commits from the database - existing_commits = get_existing_commits_set(repo_id) + # Grab the existing commits from the database + existing_commits = get_existing_commits_set(repo_id) - # Find missing commits and add them - missing_commits = parent_commits - existing_commits + # Find missing commits and add them + missing_commits = parent_commits - existing_commits - facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + facade_helper.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - - if not len(missing_commits) or repo_id is None: - #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) - return - - queue = list(missing_commits) + + if not len(missing_commits) or repo_id is None: + #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) + return + + queue = list(missing_commits) - logger.info(f"Got to analysis!") - absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") + logger.info(f"Got to analysis!") + absoulte_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absoulte_path}/.git") - pendingCommitRecordsToInsert = [] + pendingCommitRecordsToInsert = [] + + with get_session() as session: for count, commitTuple in enumerate(queue): quarterQueue = int(len(queue) / 4) @@ -256,7 +250,6 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: if (count + 1) % quarterQueue == 0: logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") - #logger.info(f"Got to analysis!") commitRecords = analyze_commit(logger, repo_id, repo_loc, commitTuple) #logger.debug(commitRecord) @@ -269,14 +262,14 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) - # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND working_commit IN :hashes - """).bindparams(repo_id=repo_id,hashes=tuple(queue)) - execute_sql(remove_commit) + # Remove the working commit. 
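The queue construction above reduces to a set difference between the hashes git reports as reachable from HEAD and the hashes already recorded in the commits table. A self-contained illustration with literal hashes standing in for both sides:

    parent_commits = {"a1b2c3d", "d4e5f6a", "0f9e8d7"}   # from get_parent_commits_set (git log)
    existing_commits = {"a1b2c3d"}                       # from get_existing_commits_set (database)

    missing_commits = parent_commits - existing_commits  # only these still need analysis
    queue = list(missing_commits)

    print(f"{len(queue)} commits queued for analysis")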
+ remove_commit = s.sql.text("""DELETE FROM working_commits + WHERE repos_id = :repo_id AND working_commit IN :hashes + """).bindparams(repo_id=repo_id,hashes=tuple(queue)) + execute_sql(remove_commit) - logger.info("Analysis complete") - return + logger.info("Analysis complete") + return @celery.task def nuke_affiliations_facade_task(): @@ -335,7 +328,7 @@ def clone_repos(): repo_git_identifiers = get_collection_status_repo_git_from_filter(session, is_pending, 999999) for repo_git in repo_git_identifiers: # set repo to intializing - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() + repo = get_repo_by_repo_git(repo_git) repoStatus = repo.collection_status[0] setattr(repoStatus,"facade_status", CollectionState.INITIALIZING.value) session.commit() From b31baf3d0702451428b8a7847e2b7502f185ff16 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 19:11:56 -0500 Subject: [PATCH 009/122] Use get repo git Signed-off-by: Andrew Brain --- augur/tasks/git/facade_tasks.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 40df676ddb..30ff43bf31 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -411,19 +411,16 @@ def generate_analysis_sequence(logger,repo_git, facade_helper): commit being analyzed at the time) we can recover. """ - - analysis_sequence = [] - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = fetchall_data_from_sql_text(repo_list) + #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) + #repos = fetchall_data_from_sql_text(repo_list) start_date = facade_helper.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + #repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + #repo_id = repo_ids.pop(0) analysis_sequence.append(facade_analysis_init_facade_task.si(repo_git)) @@ -448,12 +445,9 @@ def generate_contributor_sequence(logger,repo_git, session): repo_id = None #contributor_sequence.append(facade_start_contrib_analysis_task.si()) - query = s.sql.text("""SELECT repo_id FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id - repo = execute_sql(query).fetchone() - logger.info(f"repo: {repo}") - repo_id = repo[0] #pdb.set_trace() #breakpoint() #for repo in all_repos: @@ -471,15 +465,14 @@ def facade_phase(repo_git): logger.info("Generating facade sequence") facade_helper = FacadeHelper(logger) #Get the repo_id - repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - repos = fetchall_data_from_sql_text(repo_list) + #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) + #repos = fetchall_data_from_sql_text(repo_list) start_date = facade_helper.get_setting('start_date') - repo_ids = [repo['repo_id'] for repo in repos] + #repo_ids = [repo['repo_id'] for repo in repos] - repo_id = repo_ids.pop(0) + #repo_id = repo_ids.pop(0) #Get the collectionStatus #query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id) From 5781e90c6ad199d376b3edf4017c7027912bdaf7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 
19:35:04 -0500 Subject: [PATCH 010/122] Add more facade lib methods Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 41 +++++++++++++++++++ augur/tasks/git/facade_tasks.py | 15 ++----- .../facade_worker/postanalysiscleanup.py | 14 ++----- .../facade_worker/utilitymethods.py | 15 ++----- 4 files changed, 51 insertions(+), 34 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 34e67fd2e6..853e7e6518 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -133,3 +133,44 @@ def get_repo_by_repo_id(repo_id): repo = execute_session_query(query, 'one') return repo + +def remove_working_commits_by_repo_id_and_hashes(repo_id, commit_hashes): + + remove_working_commits = s.sql.text("""DELETE FROM working_commits + WHERE repos_id = :repo_id AND working_commit IN :hashes + """).bindparams(repo_id=repo_id,hashes=tuple(commit_hashes)) + + execute_sql(remove_working_commits) + +def remove_working_commits_by_repo_id(repo_id): + + remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id""").bindparams(repo_id=repo_id) + execute_sql(remove_working_commits) + +def remove_commits_by_repo_id_and_hashes(repo_id, commit_hashes): + + remove_commit = s.sql.text("""DELETE FROM commits + WHERE repo_id=:repo_id + AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commit_hashes)) + execute_sql(remove_commit) + + +def remove_commits_by_repo_id(repo_id): + + remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id""").bindparams(repo_id=repo_id) + execute_sql(remove_commits) + +def get_working_commits_by_repo_id(repo_id): + + query = s.sql.text("""SELECT working_commit FROM working_commits WHERE repos_id=:repo_id + """).bindparams(repo_id=repo_id) + + try: + working_commits = fetchall_data_from_sql_text(query) + except: + working_commits = [] + + return working_commits + + + diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 30ff43bf31..4deaa08b35 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -4,7 +4,7 @@ from celery import group, chain import sqlalchemy as s -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session, get_repo_by_repo_git, get_repo_by_repo_id +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set @@ -94,13 +94,7 @@ def update_analysis_log(repos_id,status): update_analysis_log(repo_id,"Beginning analysis.") # First we check to see if the previous analysis didn't complete - get_status = s.sql.text("""SELECT working_commit FROM working_commits WHERE repos_id=:repo_id - """).bindparams(repo_id=repo_id) - - try: - working_commits = fetchall_data_from_sql_text(get_status) - except: - working_commits = [] + working_commits = get_working_commits_by_repo_id(repo_id) # If there's a commit still there, the previous run was interrupted and # the commit data may be incomplete. It should be trimmed, just in case. 
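The new remove_*_by_repo_id_and_hashes helpers bind a Python tuple straight into an IN clause. A standalone sketch of that binding, assuming psycopg2 as the driver (which adapts the tuple into the parenthesized list); the connection URL, repo id, and hashes are placeholders:

    import sqlalchemy as s

    engine = s.create_engine("postgresql+psycopg2://user:pass@localhost:5432/augur")  # assumed URL

    stale_hashes = ("a1b2c3d", "d4e5f6a")   # placeholder working-commit hashes

    remove = s.sql.text(
        """DELETE FROM working_commits
           WHERE repos_id = :repo_id AND working_commit IN :hashes"""
    ).bindparams(repo_id=1, hashes=stale_hashes)

    with engine.begin() as connection:
        connection.execute(remove)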
@@ -263,10 +257,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND working_commit IN :hashes - """).bindparams(repo_id=repo_id,hashes=tuple(queue)) - execute_sql(remove_commit) + remove_working_commits_by_repo_id_and_hashes(repo_id, queue) logger.info("Analysis complete") return diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py index 77ca2f3c6a..177a100d8f 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py @@ -28,7 +28,7 @@ import subprocess import sqlalchemy as s from augur.application.db.util import execute_session_query -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_commits_by_repo_id, remove_working_commits_by_repo_id from .utilitymethods import get_absolute_repo_path from augur.application.db.models import * @@ -41,7 +41,7 @@ def git_repo_cleanup(facade_helper, session,repo_git): #logger.info("Processing deletions") facade_helper.log_activity('Info','Processing deletions') - + # TODO: We can convert this to use get_repo_by_repo_git. We just need to know how to handle the NoResultFoundException query = session.query(Repo).filter( Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") @@ -59,10 +59,7 @@ def git_repo_cleanup(facade_helper, session,repo_git): return_code = subprocess.Popen([cmd],shell=True).wait() # Remove the analysis data - - remove_commits = s.sql.text("""DELETE FROM commits WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - execute_sql(remove_commits) + remove_commits_by_repo_id(row.repo_id) optimize_table = s.sql.text("""OPTIMIZE TABLE commits""") execute_sql(optimize_table) @@ -107,10 +104,7 @@ def git_repo_cleanup(facade_helper, session,repo_git): cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) # Remove any working commits - - remove_working_commits = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id - """).bindparams(repo_id=row.repo_id) - execute_sql(remove_working_commits) + remove_working_commits_by_repo_id(row.repo_id) # Remove the repo from the logs diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index da30bfd827..6a5fbb928a 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -33,7 +33,7 @@ from augur.application.db.models import * from .config import FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes #from augur.tasks.git.util.facade_worker.facade def update_repo_log(logger, facade_helper, repos_id,status): @@ -57,19 +57,10 @@ def trim_commits(facade_helper, repo_id,commits): # Quickly 
remove a given commit if len(commits): - remove_commit = s.sql.text("""DELETE FROM commits - WHERE repo_id=:repo_id - AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - - - execute_sql(remove_commit) + remove_commits_by_repo_id_and_hashes(repo_id, commits) # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND - working_commit IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) - - execute_sql(remove_commit) + remove_working_commits_by_repo_id_and_hashes(repo_id, commits) for commit in commits: facade_helper.log_activity('Debug',f"Trimmed commit: {commit}") From 0dcbb91a8bd7165ebac279df08967781e90b2077 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 19:40:55 -0500 Subject: [PATCH 011/122] Move facade insert commits method to lib Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 45 ++++++++++++++++++- augur/tasks/git/facade_tasks.py | 41 ++++++++--------- .../facade_worker/utilitymethods.py | 38 ---------------- 3 files changed, 63 insertions(+), 61 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 853e7e6518..768a6f75af 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -1,7 +1,8 @@ import sqlalchemy as s import logging +from sqlalchemy.exc import DataError from typing import List, Any, Optional -from augur.application.db.models import Config, Repo +from augur.application.db.models import Config, Repo, Commit from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query @@ -173,4 +174,46 @@ def get_working_commits_by_repo_id(repo_id): return working_commits +def facade_bulk_insert_commits(logger, records): + + with get_session() as session: + + try: + session.execute( + s.insert(Commit), + records, + ) + session.commit() + except Exception as e: + + if len(records) > 1: + logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") + + #split list into halves and retry insert until we isolate offending record + firsthalfRecords = records[:len(records)//2] + secondhalfRecords = records[len(records)//2:] + + facade_bulk_insert_commits(logger, session,firsthalfRecords) + facade_bulk_insert_commits(logger, session,secondhalfRecords) + elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": + commit_record = records[0] + #replace incomprehensible dates with epoch. 
+ #2021-10-11 11:57:46 -0500 + placeholder_date = "1970-01-01 00:00:15 -0500" + + #Check for improper utc timezone offset + #UTC timezone offset should be betwen -14:00 and +14:00 + + commit_record['author_timestamp'] = placeholder_date + commit_record['committer_timestamp'] = placeholder_date + + session.execute( + s.insert(Commit), + [commit_record], + ) + session.commit() + else: + raise e + + diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 4deaa08b35..a3cc8a5aa5 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -4,12 +4,12 @@ from celery import group, chain import sqlalchemy as s -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, get_session, get_repo_by_repo_git, get_repo_by_repo_id, remove_working_commits_by_repo_id_and_hashes, get_working_commits_by_repo_id, facade_bulk_insert_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import fill_empty_affiliations, invalidate_caches, nuke_affiliations, rebuild_unknown_affiliation_and_web_caches from augur.tasks.git.util.facade_worker.facade_worker.postanalysiscleanup import git_repo_cleanup @@ -232,29 +232,26 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: pendingCommitRecordsToInsert = [] - with get_session() as session: - - for count, commitTuple in enumerate(queue): - quarterQueue = int(len(queue) / 4) - - if quarterQueue == 0: - quarterQueue = 1 # prevent division by zero with integer math + for count, commitTuple in enumerate(queue): + quarterQueue = int(len(queue) / 4) - #Log progress when another quarter of the queue has been processed - if (count + 1) % quarterQueue == 0: - logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") + if quarterQueue == 0: + quarterQueue = 1 # prevent division by zero with integer math - #logger.info(f"Got to analysis!") - commitRecords = analyze_commit(logger, repo_id, repo_loc, commitTuple) - #logger.debug(commitRecord) - if len(commitRecords): - pendingCommitRecordsToInsert.extend(commitRecords) - if len(pendingCommitRecordsToInsert) >= 1000: - facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) - pendingCommitRecordsToInsert = [] + #Log progress when another quarter of the queue has been processed + if (count + 1) % quarterQueue == 0: + logger.info(f"Progress through current analysis queue is {(count / len(queue)) * 100}%") - - facade_bulk_insert_commits(logger, session,pendingCommitRecordsToInsert) + #logger.info(f"Got to analysis!") + commitRecords = analyze_commit(logger, repo_id, repo_loc, commitTuple) 
+ #logger.debug(commitRecord) + if len(commitRecords): + pendingCommitRecordsToInsert.extend(commitRecords) + if len(pendingCommitRecordsToInsert) >= 1000: + facade_bulk_insert_commits(logger,pendingCommitRecordsToInsert) + pendingCommitRecordsToInsert = [] + + facade_bulk_insert_commits(logger,pendingCommitRecordsToInsert) # Remove the working commit. remove_working_commits_by_repo_id_and_hashes(repo_id, queue) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 6a5fbb928a..681f8e94d9 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -29,7 +29,6 @@ from subprocess import check_output import os import sqlalchemy as s -from sqlalchemy.exc import DataError from augur.application.db.models import * from .config import FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -198,42 +197,5 @@ def update_facade_scheduling_fields(session, repo_git, weight, commit_count): session.execute(update_query) session.commit() -def facade_bulk_insert_commits(logger, session,records): - try: - session.execute( - s.insert(Commit), - records, - ) - session.commit() - except Exception as e: - - if len(records) > 1: - logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") - - #split list into halves and retry insert until we isolate offending record - firsthalfRecords = records[:len(records)//2] - secondhalfRecords = records[len(records)//2:] - - facade_bulk_insert_commits(logger, session,firsthalfRecords) - facade_bulk_insert_commits(logger, session,secondhalfRecords) - elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": - commit_record = records[0] - #replace incomprehensible dates with epoch. 
- #2021-10-11 11:57:46 -0500 - placeholder_date = "1970-01-01 00:00:15 -0500" - - #Check for improper utc timezone offset - #UTC timezone offset should be betwen -14:00 and +14:00 - - commit_record['author_timestamp'] = placeholder_date - commit_record['committer_timestamp'] = placeholder_date - - session.execute( - s.insert(Commit), - [commit_record], - ) - session.commit() - else: - raise e From 25638bd04f499b575382ddb0554a5b4a6fcc4d05 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 20:06:38 -0500 Subject: [PATCH 012/122] Define bulk insert dicts to remove database session dependence Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 198 +++++++++++++++++- .../git/dependency_libyear_tasks/core.py | 10 +- .../git/dependency_libyear_tasks/tasks.py | 6 +- augur/tasks/git/dependency_tasks/core.py | 11 +- augur/tasks/git/dependency_tasks/tasks.py | 9 +- augur/tasks/git/scc_value_tasks/core.py | 5 +- augur/tasks/git/scc_value_tasks/tasks.py | 6 +- .../facade_worker/utilitymethods.py | 4 - 8 files changed, 219 insertions(+), 30 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 768a6f75af..e0680903ed 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -1,10 +1,17 @@ -import sqlalchemy as s +import time +import random import logging +import sqlalchemy as s from sqlalchemy.exc import DataError -from typing import List, Any, Optional +from sqlalchemy.dialects import postgresql +from sqlalchemy.exc import OperationalError +from psycopg2.errors import DeadlockDetected +from typing import List, Any, Optional, Union + from augur.application.db.models import Config, Repo, Commit from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query +from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") @@ -214,6 +221,193 @@ def facade_bulk_insert_commits(logger, records): session.commit() else: raise e + + +def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: + + if isinstance(data, list) is False: + + # if a dict is passed to data then + # convert it to a list with one value + if isinstance(data, dict) is True: + data = [data] + + else: + self.logger.info("Data must be a list or a dict") + return None + + if len(data) == 0: + # self.logger.info("Gave no data to insert, returning...") + return None + + if isinstance(data[0], dict) is False: + self.logger.info("Must be list of dicts") + return None + + # remove any duplicate data + # this only counts something as a duplicate if every field is the same + data = remove_duplicates_by_uniques(data, natural_keys) + + # remove null data from string fields + if string_fields and isinstance(string_fields, list): + data = remove_null_characters_from_list_of_dicts(data, string_fields) + + # creates list of arguments to tell sqlalchemy what columns to return after the data is inserted + returning_args = [] + if return_columns: + for column in return_columns: + argument = getattr(table, column) + returning_args.append(argument) + + # creates insert on table + # that returns cols specificed in returning_args + # and inserts the data specified in data + # NOTE: if return_columns does not have an values this still works + stmnt = 
postgresql.insert(table).returning(*returning_args).values(data) + + + if on_conflict_update: + + # create a dict that the on_conflict_do_update method requires to be able to map updates whenever there is a conflict. See sqlalchemy docs for more explanation and examples: https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#updating-using-the-excluded-insert-values + setDict = {} + for key in data[0].keys(): + setDict[key] = getattr(stmnt.excluded, key) + + stmnt = stmnt.on_conflict_do_update( + #This might need to change + index_elements=natural_keys, + + #Columns to be updated + set_ = setDict + ) + + else: + stmnt = stmnt.on_conflict_do_nothing( + index_elements=natural_keys + ) + + + # print(str(stmnt.compile(dialect=postgresql.dialect()))) + attempts = 0 + # creates list from 1 to 10 + sleep_time_list = list(range(1,11)) + deadlock_detected = False + + + # if there is no data to return then it executes the insert then returns nothing + if not return_columns: + + while attempts < 10: + try: + #begin keyword is needed for sqlalchemy 2.x + #this is because autocommit support was removed in 2.0 + with self.engine.begin() as connection: + connection.execute(stmnt) + break + except OperationalError as e: + # print(str(e).split("Process")[1].split(";")[0]) + if isinstance(e.orig, DeadlockDetected): + deadlock_detected = True + sleep_time = random.choice(sleep_time_list) + self.logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + time.sleep(sleep_time) + + attempts += 1 + continue + + raise e + + except Exception as e: + #self.logger.info(e) + if len(data) == 1: + raise e + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] + + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) + + else: + self.logger.error("Unable to insert data in 10 attempts") + return None + + if deadlock_detected is True: + self.logger.error("Made it through even though Deadlock was detected") + + return "success" + + + # othewise it gets the requested return columns and returns them as a list of dicts + while attempts < 10: + try: + with self.engine.begin() as connection: + return_data_tuples = connection.execute(stmnt) + break + except OperationalError as e: + if isinstance(e.orig, DeadlockDetected): + sleep_time = random.choice(sleep_time_list) + self.logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + time.sleep(sleep_time) + + attempts += 1 + continue + + raise e + + except Exception as e: + if len(data) == 1: + raise e + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] + + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + + else: + self.logger.error("Unable to insert and return data in 10 attempts") + return None + + if deadlock_detected is True: + self.logger.error("Made it through even though Deadlock was detected") + + return_data = [dict(row) for row in return_data_tuples.mappings()] + + #no longer working in sqlalchemy 2.x + #for data_tuple in return_data_tuples: + # return_data.append(dict(data_tuple)) + + # using on confilict do nothing does not return 
the + # present values so this does gets the return values + if not on_conflict_update: + + conditions = [] + for column in natural_keys: + + column_values = [value[column] for value in data] + + column = getattr(table, column) + + conditions.append(column.in_(tuple(column_values))) + + result = ( + self.query(table).filter(*conditions).all() + ) + + for row in result: + + return_dict = {} + for field in return_columns: + + return_dict[field] = getattr(row, field) + + return_data.append(return_dict) + + + return return_data diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index 21e47409d6..2d25ff9887 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -1,6 +1,6 @@ from datetime import datetime from augur.application.db.models import * -from augur.application.db.lib import get_value +from augur.application.db.lib import get_value, bulk_insert_dicts from augur.application.db.util import execute_session_query from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path @@ -21,12 +21,11 @@ def deps_libyear_model(logger, session, repo_id,repo_git,repo_group_id): absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,result.repo_path,result.repo_name) #config.get_section("Facade")['repo_directory'] + relative_repo_path#self.config['repo_directory'] + relative_repo_path - generate_deps_libyear_data(logger, session,repo_id, absolute_repo_path) + generate_deps_libyear_data(logger, repo_id, absolute_repo_path) -def generate_deps_libyear_data(logger, session, repo_id, path): +def generate_deps_libyear_data(logger, repo_id, path): """Scans for package files and calculates libyear - :param session: Task manifest and database session. 
:param repo_id: Repository ID :param path: Absolute path of the Repostiory """ @@ -68,4 +67,5 @@ def generate_deps_libyear_data(logger, session, repo_id, path): # #session.execute_sql(insert_statement) to_insert.append(repo_deps) - session.insert_data(to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) + + bulk_insert_dicts(to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py index ec062e4853..4255bfc8ae 100644 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ b/augur/tasks/git/dependency_libyear_tasks/tasks.py @@ -1,5 +1,5 @@ import logging -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session from augur.tasks.git.dependency_libyear_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask @@ -13,10 +13,10 @@ def process_libyear_dependency_metrics(self, repo_git): logger = logging.getLogger(process_libyear_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: + with get_session() as session: + logger.info(f"repo_git: {repo_git}") query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query,'one') deps_libyear_model(logger, session, repo.repo_id,repo_git,repo.repo_group_id) \ No newline at end of file diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 19a5b84a4d..467211b51e 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -1,11 +1,12 @@ from datetime import datetime import os from augur.application.db.models import * +from augur.application.db.lib import bulk_insert_dicts from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call -def generate_deps_data(logger, session, repo_id, path): +def generate_deps_data(logger, repo_id, path): """Run dependency logic on repo and stores data in database :param repo_id: Repository ID :param path: Absolute path of the Repostiory @@ -14,9 +15,7 @@ def generate_deps_data(logger, session, repo_id, path): scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') logger.info('Searching for deps in repo') logger.info(f'Repo ID: {repo_id}, Path: {path}, Scan date: {scan_date}') - - deps = dep_calc.get_deps(path,logger) to_insert = [] @@ -33,8 +32,8 @@ def generate_deps_data(logger, session, repo_id, path): } to_insert.append(repo_deps) - - session.insert_data(to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) + + bulk_insert_dicts(to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") @@ -110,7 +109,7 @@ def generate_scorecard(logger, session,repo_id,path): } to_insert.append(repo_deps_scorecard) - session.insert_data(to_insert, RepoDepsScorecard, ["repo_id","name"]) + bulk_insert_dicts(to_insert, RepoDepsScorecard, ["repo_id","name"]) logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 455e9b1faf..ad64c08b7e 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ 
b/augur/tasks/git/dependency_tasks/tasks.py @@ -1,6 +1,6 @@ import logging import traceback -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session from augur.tasks.git.dependency_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurSecondaryRepoCollectionTask @@ -17,18 +17,17 @@ def process_dependency_metrics(self, repo_git): logger = logging.getLogger(process_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: + with get_session() as session: logger.info(f"repo_git: {repo_git}") query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query,'one') absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) logger.debug(f"This is the deps model repo: {repo_git}.") - generate_deps_data(logger, session,repo.repo_id,absolute_repo_path) + generate_deps_data(logger,repo.repo_id,absolute_repo_path) @celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) @@ -38,7 +37,7 @@ def process_ossf_dependency_metrics(self, repo_git): logger = logging.getLogger(process_ossf_dependency_metrics.__name__) - with DatabaseSession(logger, engine) as session: + with get_session() as session: logger.info(f"repo_git: {repo_git}") query = session.query(Repo).filter(Repo.repo_git == repo_git) diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index 8e5854136f..d705927791 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -1,9 +1,10 @@ from datetime import datetime import os from augur.application.db.models import * +from augur.application.db.lib import bulk_insert_dicts from augur.tasks.util.worker_util import parse_json_from_subprocess_call -def value_model(logger, session,repo_git,repo_id, path): +def value_model(logger,repo_git,repo_id, path): """Runs scc on repo and stores data in database :param repo_id: Repository ID :param path: absolute file path of the Repostiory @@ -42,6 +43,6 @@ def value_model(logger, session,repo_git,repo_id, path): to_insert.append(repo_labor) - session.insert_data(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) + bulk_insert_dicts(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) logger.info(f"Done generating scc data for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py index 22e049fdc9..54b45cd76b 100644 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -1,5 +1,5 @@ import logging -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session from augur.tasks.git.scc_value_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask @@ -15,7 +15,7 @@ def process_scc_value_metrics(self, repo_git): logger = logging.getLogger(process_scc_value_metrics.__name__) - with DatabaseSession(logger,engine) as session: + with get_session() as session: logger.info(f"repo_git: {repo_git}") query = session.query(Repo).filter(Repo.repo_git == repo_git) @@ -23,4 +23,4 @@ def process_scc_value_metrics(self, repo_git): absolute_repo_path = get_absolute_repo_path(get_value("Facade", 
"repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - value_model(logger, session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file + value_model(logger,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 681f8e94d9..f0e506c832 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -88,16 +88,12 @@ def trim_author(facade_helper, email): SET cmt_author_affiliation = NULL WHERE cmt_author_email = :email """).bindparams(email=email) - - - execute_sql(trim) trim = s.sql.text("""UPDATE commits SET cmt_committer_affiliation = NULL WHERE cmt_committer_email = :email """).bindparams(email=email) - execute_sql(trim) store_working_author(facade_helper, 'done') From b1ee4bde3891e0bb3501fda2f29b9be1d75bfeb2 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 20:23:22 -0500 Subject: [PATCH 013/122] Reduce dependence on database session using lib methods Signed-off-by: Andrew Brain --- .../data_analysis/clustering_worker/tasks.py | 5 +- .../data_analysis/discourse_analysis/tasks.py | 6 +- .../data_analysis/insight_worker/tasks.py | 19 ++- .../data_analysis/message_insights/tasks.py | 114 +++++++++--------- .../pull_request_analysis_worker/tasks.py | 7 +- augur/tasks/db/refresh_materialized_views.py | 41 +++---- augur/tasks/github/contributors/tasks.py | 8 +- augur/tasks/github/util/util.py | 28 ++--- 8 files changed, 104 insertions(+), 124 deletions(-) diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index e59951ab0e..81d89e523c 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -20,8 +20,7 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value +from augur.application.db.lib import get_value, get_session from augur.application.db.models import Repo, RepoClusterMessage, RepoTopic, TopicWord from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -37,7 +36,7 @@ def clustering_task(self, repo_git): logger = logging.getLogger(clustering_model.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: + with get_session() as session: clustering_model(repo_git, logger, engine, session) def clustering_model(repo_git: str,logger,engine, session) -> None: diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 450ec15a29..1a2bed485b 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -8,7 +8,7 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_session from augur.application.db.models import Repo, DiscourseInsight from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -47,7 +47,7 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: 
tool_version = '0.1.0' data_source = 'Analysis of Issue/PR Messages' - with DatabaseSession(logger, engine) as session: + with get_session() as session: query = session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id @@ -96,7 +96,7 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: logger.debug(f"y_pred_git_flat len: {len(y_pred_git_flat)}") msg_df_cur_repo['discourse_act'] = y_pred_git_flat - with DatabaseSession(logger, engine) as session: + with get_session() as session: for index, row in msg_df_cur_repo.iterrows(): record = { 'msg_id': row['msg_id'], diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/augur/tasks/data_analysis/insight_worker/tasks.py index 5bf159d2fa..97a6580d6f 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/augur/tasks/data_analysis/insight_worker/tasks.py @@ -10,10 +10,8 @@ import warnings from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value -from augur.application.db.models import Repo, ChaossMetricStatus, RepoInsight, RepoInsightsRecord -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session +from augur.application.db.models import ChaossMetricStatus, RepoInsight, RepoInsightsRecord from augur.tasks.init.celery_app import AugurMlRepoCollectionTask warnings.filterwarnings('ignore') @@ -25,11 +23,10 @@ def insight_task(self, repo_git): logger = logging.getLogger(insight_task.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - insight_model(repo_git, logger, engine, session) + insight_model(repo_git, logger, engine) -def insight_model(repo_git: str,logger,engine,session) -> None: +def insight_model(repo_git: str,logger,engine) -> None: refresh = True send_insights = True @@ -40,8 +37,8 @@ def insight_model(repo_git: str,logger,engine,session) -> None: metrics = {"issues-new": "issues", "code-changes": "commit_count", "code-changes-lines": "added", "reviews": "pull_requests", "contributors-new": "new_contributors"} - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id anomaly_days = get_value('Insight_Task', 'anomaly_days') training_days = get_value('Insight_Task', 'training_days') @@ -247,7 +244,7 @@ def classify_anomalies(df, metric): "data_source": data_source } - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo_insight_record_obj = RepoInsightsRecord(**record) session.add(repo_insight_record_obj) session.commit() @@ -292,7 +289,7 @@ def classify_anomalies(df, metric): "data_source": data_source } - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo_insight_obj = RepoInsight(**data_point) session.add(repo_insight_obj) session.commit() diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/augur/tasks/data_analysis/message_insights/tasks.py index 6cc0446ab8..fe12bb9606 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/augur/tasks/data_analysis/message_insights/tasks.py @@ -12,10 +12,8 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession 
-from augur.application.db.lib import get_value -from augur.application.db.models import Repo, MessageAnalysis, MessageAnalysisSummary -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_repo_by_repo_git, get_session +from augur.application.db.models import MessageAnalysis, MessageAnalysisSummary from augur.tasks.init.celery_app import AugurMlRepoCollectionTask #SPDX-License-Identifier: MIT @@ -28,12 +26,11 @@ def message_insight_task(self, repo_git): logger = logging.getLogger(message_insight_task.__name__) engine = self.app.engine - with DatabaseSession(logger, engine) as session: - message_insight_model(repo_git, logger, engine, session) + message_insight_model(repo_git, logger, engine) -def message_insight_model(repo_git: str,logger,engine, session) -> None: +def message_insight_model(repo_git: str,logger,engine) -> None: full_train = True begin_date = '' @@ -45,8 +42,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: now = datetime.datetime.utcnow() run_id = int(now.timestamp())+5 - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) insight_days = get_value("Message_Insights", 'insight_days') @@ -193,32 +190,34 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: logger.info('Begin message_analysis data insertion...') logger.info(f'{df_message.shape[0]} data records to be inserted') - for row in df_message.itertuples(index=False): - try: - msg = { - "msg_id": row.msg_id, - "worker_run_id": run_id, - "sentiment_score": row.sentiment_score, - "reconstruction_error": row.rec_err, - "novelty_flag": row.novel_label, - "feedback_flag": None, - "tool_source": tool_source, - "tool_version": tool_version, - "data_source": data_source, - } - - message_analysis_object = MessageAnalysis(**msg) - session.add(message_analysis_object) - session.commit() - - # result = create_database_engine().execute(message_analysis_table.insert().values(msg)) - logger.info( - f'Primary key inserted into the message_analysis table: {message_analysis_object.msg_analysis_id}') - # logger.info( - # f'Inserted data point {results_counter} with msg_id {row.msg_id} and timestamp {row.msg_timestamp}') - except Exception as e: - logger.error(f'Error occurred while storing datapoint {repr(e)}') - break + with get_session() as session: + + for row in df_message.itertuples(index=False): + try: + msg = { + "msg_id": row.msg_id, + "worker_run_id": run_id, + "sentiment_score": row.sentiment_score, + "reconstruction_error": row.rec_err, + "novelty_flag": row.novel_label, + "feedback_flag": None, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + } + + message_analysis_object = MessageAnalysis(**msg) + session.add(message_analysis_object) + session.commit() + + # result = create_database_engine().execute(message_analysis_table.insert().values(msg)) + logger.info( + f'Primary key inserted into the message_analysis table: {message_analysis_object.msg_analysis_id}') + # logger.info( + # f'Inserted data point {results_counter} with msg_id {row.msg_id} and timestamp {row.msg_timestamp}') + except Exception as e: + logger.error(f'Error occurred while storing datapoint {repr(e)}') + break logger.info('Data insertion 
completed\n') @@ -318,27 +317,30 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: # Insertion of sentiment ratios & novel counts to repo level table logger.info('Begin repo wise insights insertion...') logger.info(f'{df_senti.shape[0]} data records to be inserted\n') - for row in df_trend.itertuples(): - msg = { - "repo_id": repo_id, - "worker_run_id": run_id, - "positive_ratio": row.PosR, - "negative_ratio": row.NegR, - "novel_count": row.Novel, - "period": row.Index, - "tool_source": tool_source, - "tool_version": tool_version, - "data_source": data_source - } - - message_analysis_summary_object = MessageAnalysisSummary(**msg) - session.add(message_analysis_summary_object) - session.commit() - - # result = create_database_engine().execute(message_analysis_summary_table.insert().values(msg)) - logger.info( - f'Primary key inserted into the message_analysis_summary table: {message_analysis_summary_object.msg_summary_id}') - # logger.info(f'Inserted data point {results_counter} for insight_period {row.Index}') + + with get_session() as session: + + for row in df_trend.itertuples(): + msg = { + "repo_id": repo_id, + "worker_run_id": run_id, + "positive_ratio": row.PosR, + "negative_ratio": row.NegR, + "novel_count": row.Novel, + "period": row.Index, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + message_analysis_summary_object = MessageAnalysisSummary(**msg) + session.add(message_analysis_summary_object) + session.commit() + + # result = create_database_engine().execute(message_analysis_summary_table.insert().values(msg)) + logger.info( + f'Primary key inserted into the message_analysis_summary table: {message_analysis_summary_object.msg_summary_id}') + # logger.info(f'Inserted data point {results_counter} for insight_period {row.Index}') logger.info('Data insertion completed\n') diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index af806bcdd1..24dd634bd4 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -9,8 +9,7 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value +from augur.application.db.lib import get_value, get_session from augur.application.db.models import Repo, PullRequestAnalysis from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -40,7 +39,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: insight_days = 200 - with DatabaseSession(logger, engine) as session: + with get_session() as session: query = session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id @@ -211,7 +210,7 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: logger.info('Begin PR_analysis data insertion...') logger.info(f'{df.shape[0]} data records to be inserted') - with DatabaseSession(logger, engine) as session: + with get_session() as session: for row in df.itertuples(index=False): try: msg = { diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index c191b56039..09faffe0cb 100644 --- 
a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -3,7 +3,7 @@ import sqlalchemy as s from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import execute_sql @celery.task(bind=True) def refresh_materialized_views(self): @@ -86,92 +86,79 @@ def refresh_materialized_views(self): """) try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv1_refresh) + execute_sql(mv1_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv2_refresh) + execute_sql(mv2_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv3_refresh) + execute_sql(mv3_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv4_refresh) + execute_sql(mv4_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv5_refresh) + execute_sql(mv5_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv6_refresh) + execute_sql(mv6_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv7_refresh) + execute_sql(mv7_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv8_refresh) + execute_sql(mv8_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv9_refresh) + execute_sql(mv9_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv10_refresh) + execute_sql(mv10_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv11_refresh) + execute_sql(mv11_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv12_refresh) + execute_sql(mv12_refresh) except Exception as e: logger.info(f"error is {e}") pass try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv13_refresh) + execute_sql(mv13_refresh) except Exception as e: logger.info(f"error is {e}") pass diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 882725d205..a581b09f71 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -6,8 +6,9 @@ from augur.tasks.github.util.github_paginator import hit_api from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.facade_github.tasks import * -from augur.application.db.models import Contributor, Repo +from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git @celery.task @@ -109,10 +110,9 @@ def grab_comitters(self, repo_git,platform="github"): engine = self.app.engine logger = logging.getLogger(grab_comitters.__name__) - with 
DatabaseSession(logger,engine) as session: - repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id try: with GithubTaskManifest(logger) as manifest: diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 5dfe100977..432f674512 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -4,8 +4,7 @@ import json import httpx from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession -from augur.application.db.models import Repo +from augur.application.db.lib import get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -81,20 +80,17 @@ def get_repo_weight_by_issue(logger,repo_git): #Get the weight for each repo for the core collection hook def get_repo_weight_core(logger,repo_git): - from augur.application.db import get_engine - engine = get_engine() - - with DatabaseSession(logger,engine) as session: - repo = Repo.get_by_repo_git(session, repo_git) - if not repo: - raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - - #try to get the collection status if it exists at this point - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) + repo = get_repo_by_repo_git(repo_git) + + if not repo: + raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") + + #try to get the collection status if it exists at this point + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) #Don't go below zero. 
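The same refactoring pattern repeats across the workers touched by this patch: open-coded DatabaseSession blocks are replaced by the module-level helpers in augur.application.db.lib. A hedged sketch of the resulting task shape follows; the function name example_repo_task and its body are illustrative only, not code from the patch.

import logging
import sqlalchemy as s

from augur.application.db.lib import execute_sql, get_repo_by_repo_git

logger = logging.getLogger(__name__)

def example_repo_task(repo_git):
    # Hypothetical task body: resolve the repo through the lib helper instead
    # of opening a DatabaseSession just to run a single Repo query.
    repo = get_repo_by_repo_git(repo_git)
    logger.info(f"Working on repo_id {repo.repo_id} for {repo_git}")

    # One-off statements go straight through execute_sql, which checks out an
    # engine connection itself (same shape as remove_working_commits_by_repo_id).
    stmt = s.sql.text("""DELETE FROM working_commits WHERE repos_id=:repo_id""").bindparams(repo_id=repo.repo_id)
    execute_sql(stmt)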
From 3772c237c87c3fd99f01dd0435bd29f0606291f7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 2 Apr 2024 21:14:34 -0500 Subject: [PATCH 014/122] Fix bulk upsert syntax issues Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 39 ++++++++++--------- .../git/dependency_libyear_tasks/core.py | 2 +- augur/tasks/git/dependency_tasks/core.py | 4 +- augur/tasks/git/scc_value_tasks/core.py | 2 +- augur/tasks/github/facade_github/tasks.py | 13 ++++--- 5 files changed, 32 insertions(+), 28 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index e0680903ed..818c574076 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -223,7 +223,7 @@ def facade_bulk_insert_commits(logger, records): raise e -def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: +def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: if isinstance(data, list) is False: @@ -233,7 +233,7 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: data = [data] else: - self.logger.info("Data must be a list or a dict") + logger.info("Data must be a list or a dict") return None if len(data) == 0: @@ -241,7 +241,7 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: return None if isinstance(data[0], dict) is False: - self.logger.info("Must be list of dicts") + logger.info("Must be list of dicts") return None # remove any duplicate data @@ -293,6 +293,7 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: sleep_time_list = list(range(1,11)) deadlock_detected = False + engine = get_engine() # if there is no data to return then it executes the insert then returns nothing if not return_columns: @@ -301,7 +302,7 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: try: #begin keyword is needed for sqlalchemy 2.x #this is because autocommit support was removed in 2.0 - with self.engine.begin() as connection: + with engine.begin() as connection: connection.execute(stmnt) break except OperationalError as e: @@ -309,7 +310,7 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: if isinstance(e.orig, DeadlockDetected): deadlock_detected = True sleep_time = random.choice(sleep_time_list) - self.logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") time.sleep(sleep_time) attempts += 1 @@ -326,15 +327,15 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: first_half = data[:len(data)//2] second_half = data[len(data)//2:] - self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, second_half,table, natural_keys, return_columns, string_fields, 
on_conflict_update) else: - self.logger.error("Unable to insert data in 10 attempts") + logger.error("Unable to insert data in 10 attempts") return None if deadlock_detected is True: - self.logger.error("Made it through even though Deadlock was detected") + logger.error("Made it through even though Deadlock was detected") return "success" @@ -342,13 +343,13 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: # othewise it gets the requested return columns and returns them as a list of dicts while attempts < 10: try: - with self.engine.begin() as connection: + with engine.begin() as connection: return_data_tuples = connection.execute(stmnt) break except OperationalError as e: if isinstance(e.orig, DeadlockDetected): sleep_time = random.choice(sleep_time_list) - self.logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") + logger.debug(f"Deadlock detected on {table.__table__} table...trying again in {round(sleep_time)} seconds: transaction size: {len(data)}") time.sleep(sleep_time) attempts += 1 @@ -364,15 +365,15 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: first_half = data[:len(data)//2] second_half = data[len(data)//2:] - self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + bulk_insert_dicts(logger, second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) else: - self.logger.error("Unable to insert and return data in 10 attempts") + logger.error("Unable to insert and return data in 10 attempts") return None if deadlock_detected is True: - self.logger.error("Made it through even though Deadlock was detected") + logger.error("Made it through even though Deadlock was detected") return_data = [dict(row) for row in return_data_tuples.mappings()] @@ -393,9 +394,11 @@ def bulk_insert_dicts(self, data: Union[List[dict], dict], table, natural_keys: conditions.append(column.in_(tuple(column_values))) - result = ( - self.query(table).filter(*conditions).all() - ) + with get_session() as session: + + result = ( + session.query(table).filter(*conditions).all() + ) for row in result: diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index 2d25ff9887..0241a9a41c 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -68,4 +68,4 @@ def generate_deps_libyear_data(logger, repo_id, path): #session.execute_sql(insert_statement) to_insert.append(repo_deps) - bulk_insert_dicts(to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) + bulk_insert_dicts(logger, to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 467211b51e..dcb9b82234 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -33,7 +33,7 @@ def generate_deps_data(logger, repo_id, path): to_insert.append(repo_deps) - bulk_insert_dicts(to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) + bulk_insert_dicts(logger, to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) 
logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") @@ -109,7 +109,7 @@ def generate_scorecard(logger, session,repo_id,path): } to_insert.append(repo_deps_scorecard) - bulk_insert_dicts(to_insert, RepoDepsScorecard, ["repo_id","name"]) + bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"]) logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index d705927791..7afe369e94 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -43,6 +43,6 @@ def value_model(logger,repo_git,repo_id, path): to_insert.append(repo_labor) - bulk_insert_dicts(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) + bulk_insert_dicts(logger, to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) logger.info(f"Done generating scc data for repo {repo_id} from path {path}") diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 93e87fff29..46ae367171 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -7,6 +7,7 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * +from augur.application.db.lib import execute_sql from augur.application.db.util import execute_session_query from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * @@ -169,11 +170,11 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) return -def link_commits_to_contributor(session,contributorQueue): +def link_commits_to_contributor(logger, facade_helper, contributorQueue): # # iterate through all the commits with emails that appear in contributors and give them the relevant cntrb_id. for cntrb in contributorQueue: - session.logger.debug( + logger.debug( f"These are the emails and cntrb_id's returned: {cntrb}") query = s.sql.text(""" @@ -186,7 +187,7 @@ def link_commits_to_contributor(session,contributorQueue): """).bindparams(cntrb_id=cntrb["cntrb_id"],cntrb_email=cntrb["email"]) #engine.execute(query, **data) - session.insert_or_update_data(query) + facade_helper.insert_or_update_data(query) return @@ -261,7 +262,7 @@ def insert_facade_contributors(self, repo_id): manifest.logger.debug("DEBUG: Got through the new_contribs") - session = FacadeSession(logger) + facade_helper = FacadeHelper(logger) # sql query used to find corresponding cntrb_id's of emails found in the contributor's table # i.e., if a contributor already exists, we use it! 
resolve_email_to_cntrb_id_sql = s.sql.text(""" @@ -297,11 +298,11 @@ def insert_facade_contributors(self, repo_id): #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ # 'repo_id': repo_id}).to_json(orient="records")) - result = session.execute_sql(resolve_email_to_cntrb_id_sql) + result = execute_sql(resolve_email_to_cntrb_id_sql) existing_cntrb_emails = [dict(row) for row in result.mappings()] print(existing_cntrb_emails) - link_commits_to_contributor(session,list(existing_cntrb_emails)) + link_commits_to_contributor(logger, facade_helper,list(existing_cntrb_emails)) logger.info("Done with inserting and updating facade contributors") return From 995fa6e33e8e3b108cabfe5570ef44715bc69d68 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 10:47:12 -0500 Subject: [PATCH 015/122] Clean up lib year task session management Signed-off-by: Andrew Brain --- augur/tasks/git/dependency_libyear_tasks/core.py | 15 ++++++--------- augur/tasks/git/dependency_libyear_tasks/tasks.py | 12 +----------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index 0241a9a41c..e87e0c684b 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -1,11 +1,10 @@ from datetime import datetime from augur.application.db.models import * -from augur.application.db.lib import get_value, bulk_insert_dicts -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, bulk_insert_dicts, get_repo_by_repo_git from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def deps_libyear_model(logger, session, repo_id,repo_git,repo_group_id): +def deps_libyear_model(logger,repo_git): """ Data collection and storage method """ logger.info(f"This is the libyear deps model repo: {repo_git}") @@ -13,15 +12,13 @@ def deps_libyear_model(logger, session, repo_id,repo_git,repo_group_id): #result = re.search(r"https:\/\/(github\.com\/[A-Za-z0-9 \- _]+\/)([A-Za-z0-9 \- _ .]+)$", repo_git).groups() #relative_repo_path = f"{repo_group_id}/{result[0]}{result[1]}" - query = session.query(Repo).filter( - Repo.repo_git == repo_git) - - result = execute_session_query(query, 'one') + + repo = get_repo_by_repo_git(repo_git) - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,result.repo_path,result.repo_name) + absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) #config.get_section("Facade")['repo_directory'] + relative_repo_path#self.config['repo_directory'] + relative_repo_path - generate_deps_libyear_data(logger, repo_id, absolute_repo_path) + generate_deps_libyear_data(logger, repo.repo_id, absolute_repo_path) def generate_deps_libyear_data(logger, repo_id, path): diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py index 4255bfc8ae..fbf121b2ac 100644 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ b/augur/tasks/git/dependency_libyear_tasks/tasks.py @@ -1,22 +1,12 @@ import logging -from augur.application.db.lib import get_session from augur.tasks.git.dependency_libyear_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from 
augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.util import execute_session_query @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) def process_libyear_dependency_metrics(self, repo_git): #raise NotImplementedError - engine = self.app.engine - logger = logging.getLogger(process_libyear_dependency_metrics.__name__) - with get_session() as session: - - logger.info(f"repo_git: {repo_git}") - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - repo = execute_session_query(query,'one') - deps_libyear_model(logger, session, repo.repo_id,repo_git,repo.repo_group_id) \ No newline at end of file + deps_libyear_model(logger, repo_git) \ No newline at end of file From a4602c59e7e1793053d66269091e09668259cd21 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 10:56:16 -0500 Subject: [PATCH 016/122] Clean up db usage in dependency tasks Signed-off-by: Andrew Brain --- augur/tasks/git/dependency_tasks/core.py | 37 ++++++++++++++++------- augur/tasks/git/dependency_tasks/tasks.py | 27 +++-------------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index dcb9b82234..75c96ad5de 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -1,16 +1,27 @@ from datetime import datetime import os from augur.application.db.models import * -from augur.application.db.lib import bulk_insert_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value, get_session from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def generate_deps_data(logger, repo_id, path): + +def generate_deps_data(logger, repo_git): """Run dependency logic on repo and stores data in database :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ + + logger.info(f"repo_git: {repo_git}") + + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + + path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) + + logger.debug(f"This is the deps model repo: {repo_git}.") scan_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') logger.info('Searching for deps in repo') @@ -48,28 +59,32 @@ def deps_model(session, repo_id,repo_git,repo_path,repo_name): generate_deps_data(session,repo_id, absolute_repo_path) """ -def generate_scorecard(logger, session,repo_id,path): +def generate_scorecard(logger, repo_git): """Runs scorecard on repo and stores data in database :param repo_id: Repository ID - :param path: URL path of the Repostiory - """ - logger.info('Generating scorecard data for repo') - logger.info(f"Repo ID: {repo_id}, Path: {path}") + :param repo_git: URL path of the Repostiory + """ + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + logger.info('Generating scorecard data for repo') # we convert relative path in the format required by scorecard like github.com/chaoss/augur # raw_path,_ = path.split('-') # scorecard_repo_path = raw_path[2:] - path = path[8:] + path = repo_git[8:] if path[-4:] == '.git': path = path.replace(".git", "") - command = '--repo='+ path + command = '--repo=' + path #this is 
path where our scorecard project is located path_to_scorecard = os.environ['HOME'] + '/scorecard' #setting the environmental variable which is required by scorecard - key_handler = GithubApiKeyHandler(session, logger) - os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + + with get_session() as session: + + key_handler = GithubApiKeyHandler(session, logger) + os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index ad64c08b7e..68f5cc31af 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -9,25 +9,12 @@ from augur.application.db.lib import get_value -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_dependency_metrics(self, repo_git): - #raise NotImplementedError - - engine = self.app.engine +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_dependency_metrics(repo_git): logger = logging.getLogger(process_dependency_metrics.__name__) - with get_session() as session: - logger.info(f"repo_git: {repo_git}") - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - repo = execute_session_query(query,'one') - - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - - logger.debug(f"This is the deps model repo: {repo_git}.") - - generate_deps_data(logger,repo.repo_id,absolute_repo_path) + generate_deps_data(logger, repo_git) @celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) @@ -37,10 +24,4 @@ def process_ossf_dependency_metrics(self, repo_git): logger = logging.getLogger(process_ossf_dependency_metrics.__name__) - with get_session() as session: - logger.info(f"repo_git: {repo_git}") - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - - repo = execute_session_query(query,'one') - generate_scorecard(logger, session, repo.repo_id, repo_git) \ No newline at end of file + generate_scorecard(logger, repo_git) \ No newline at end of file From 688cee23323159a70decb482a8c1ca786bf85fe5 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 11:01:40 -0500 Subject: [PATCH 017/122] Clean up value model db usage Signed-off-by: Andrew Brain --- augur/tasks/git/dependency_tasks/tasks.py | 4 ---- augur/tasks/git/scc_value_tasks/core.py | 12 +++++++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 68f5cc31af..ddfe11ff4c 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -1,12 +1,8 @@ import logging import traceback -from augur.application.db.lib import get_session from augur.tasks.git.dependency_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -from augur.application.db.lib import get_value @celery.task(base=AugurFacadeRepoCollectionTask) diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index 7afe369e94..38ad34c566 100644 --- 
a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -1,14 +1,20 @@ from datetime import datetime import os from augur.application.db.models import * -from augur.application.db.lib import bulk_insert_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_value from augur.tasks.util.worker_util import parse_json_from_subprocess_call +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -def value_model(logger,repo_git,repo_id, path): +def value_model(logger,repo_git): """Runs scc on repo and stores data in database :param repo_id: Repository ID - :param path: absolute file path of the Repostiory """ + logger.info(f"repo_git: {repo_git}") + + repo = get_repo_by_repo_git(repo_git) + repo_id = repo.repo_id + + path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,repo.repo_path,repo.repo_name) logger.info('Generating value data for repo') logger.info(f"Repo ID: {repo_id}, Path: {path}") From 9a6283262987745b5275f8a04126245b91f07ce2 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 11:04:49 -0500 Subject: [PATCH 018/122] Update value task with new db stuff Signed-off-by: Andrew Brain --- augur/tasks/git/scc_value_tasks/tasks.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py index 54b45cd76b..dc0cd94724 100644 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -3,24 +3,11 @@ from augur.tasks.git.scc_value_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_value -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def process_scc_value_metrics(self, repo_git): - - engine = self.app.engine +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_scc_value_metrics(repo_git): logger = logging.getLogger(process_scc_value_metrics.__name__) - with get_session() as session: - logger.info(f"repo_git: {repo_git}") - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - - absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) - - value_model(logger,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file + value_model(logger,repo_git,) \ No newline at end of file From d982c4459eef93ae784a3e1f69fccc0b0874a73f Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 11:12:21 -0500 Subject: [PATCH 019/122] Update repofetch to use new method Signed-off-by: Andrew Brain --- .../facade_worker/facade_worker/repofetch.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index df37dca457..f22de37dd5 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -31,10 +31,11 @@ import pathlib import sqlalchemy as s from .utilitymethods import update_repo_log, get_absolute_repo_path +from 
sqlalchemy.orm.exc import NoResultFound from augur.application.db.models.augur_data import * from augur.application.db.models.augur_operations import CollectionStatus from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list -from augur.application.db.lib import execute_sql +from augur.application.db.lib import execute_sql, get_repo_by_repo_git class GitCloneError(Exception): pass @@ -283,24 +284,21 @@ def git_repo_updates(facade_helper, session, repo_git): # query = s.sql.text("""SELECT repo_id,repo_group_id,repo_git,repo_name,repo_path FROM repo WHERE # repo_status='Update'""") - query = session.query(Repo).filter( - Repo.repo_git == repo_git) - result = execute_session_query(query, 'all') try: - # fetchall_data_from_sql_text(query)#list(cfg.cursor) - row = convert_orm_list_to_dict_list(result)[0] - except IndexError: + repo = get_repo_by_repo_git(repo_git) + except NoResultFound: raise Exception( f"Repo git: {repo_git} does not exist or the status is not 'Update'") - if row["repo_path"] is None or row["repo_name"] is None: + + if repo.repo_path is None or repo.repo_name is None: raise Exception( - f"The repo path or repo name is NULL for repo_id: {row['repo_id']}") + f"The repo path or repo name is NULL for repo_id: {repo.repo_id}") facade_helper.log_activity( - 'Verbose', f"Attempting to update {row['repo_git']}") # ['git']) - update_repo_log(logger, facade_helper, row['repo_id'], 'Updating') # ['id'],'Updating') + 'Verbose', f"Attempting to update {repo.repo_git}") # ['git']) + update_repo_log(logger, facade_helper, repo.repo_id, 'Updating') # ['id'],'Updating') attempt = 0 @@ -311,7 +309,7 @@ def git_repo_updates(facade_helper, session, repo_git): # default_branch = '' absolute_path = get_absolute_repo_path( - facade_helper.repo_base_directory, row["repo_id"], row['repo_path'],row['repo_name']) + facade_helper.repo_base_directory, repo.repo_id, repo.repo_path, repo.repo_name) while attempt < 2: @@ -380,7 +378,7 @@ def git_repo_updates(facade_helper, session, repo_git): elif attempt == 0: facade_helper.log_activity( - 'Verbose', f"git pull failed, attempting reset and clean for {row['repo_git']}") + 'Verbose', f"git pull failed, attempting reset and clean for {repo.repo_git}") # remotedefault = 'main' @@ -463,12 +461,12 @@ def git_repo_updates(facade_helper, session, repo_git): if return_code == 0: - update_repo_log(logger, facade_helper, row['repo_id'], 'Up-to-date') - facade_helper.log_activity('Verbose', f"Updated {row['repo_git']}") + update_repo_log(logger, facade_helper, repo.repo_id, 'Up-to-date') + facade_helper.log_activity('Verbose', f"Updated {repo.repo_git}") else: - update_repo_log(logger, facade_helper, row['repo_id'], f"Failed ({return_code})") - facade_helper.log_activity('Error', f"Could not update {row['repo_git']}") + update_repo_log(logger, facade_helper, repo.repo_id, f"Failed ({return_code})") + facade_helper.log_activity('Error', f"Could not update {repo.repo_git}") facade_helper.log_activity('Info', 'Updating existing repos (complete)') From ee833a10942aaa186c13a4c3f80db78a7d9df826 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 11:38:48 -0500 Subject: [PATCH 020/122] Facade db improvements Signed-off-by: Andrew Brain --- augur/tasks/git/facade_tasks.py | 78 ++++++------------- .../facade_worker/facade_worker/config.py | 10 +++ .../facade_worker/postanalysiscleanup.py | 10 ++- .../facade_worker/facade_worker/repofetch.py | 2 +- .../facade_worker/utilitymethods.py | 40 +++++----- 5 files changed, 64 
insertions(+), 76 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index a3cc8a5aa5..2612ca7ae9 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -77,34 +77,21 @@ def trim_commits_facade_task(repo_git): repo_id = repo.repo_id - def update_analysis_log(repos_id,status): + facade_helper.inc_repos_processed() + facade_helper.update_analysis_log(repo_id,"Beginning analysis.") + # First we check to see if the previous analysis didn't complete - # Log a repo's analysis status + working_commits = get_working_commits_by_repo_id(repo_id) - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - try: - execute_sql(log_message) - except: - pass - - - facade_helper.inc_repos_processed() - update_analysis_log(repo_id,"Beginning analysis.") - # First we check to see if the previous analysis didn't complete - - working_commits = get_working_commits_by_repo_id(repo_id) - - # If there's a commit still there, the previous run was interrupted and - # the commit data may be incomplete. It should be trimmed, just in case. - commits_to_trim = [commit['working_commit'] for commit in working_commits] - - trim_commits(facade_helper,repo_id,commits_to_trim) - # Start the main analysis + # If there's a commit still there, the previous run was interrupted and + # the commit data may be incomplete. It should be trimmed, just in case. + commits_to_trim = [commit['working_commit'] for commit in working_commits] + + trim_commits(facade_helper,repo_id,commits_to_trim) + # Start the main analysis - update_analysis_log(repo_id,'Collecting data') - logger.info(f"Got past repo {repo_id}") + facade_helper.update_analysis_log(repo_id,'Collecting data') + logger.info(f"Got past repo {repo_id}") @celery.task(base=AugurFacadeRepoCollectionTask) def trim_commits_post_analysis_facade_task(repo_git): @@ -117,15 +104,6 @@ def trim_commits_post_analysis_facade_task(repo_git): repo_id = repo.repo_id start_date = facade_helper.get_setting('start_date') - def update_analysis_log(repos_id,status): - - # Log a repo's analysis status - - log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) - VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) - - - execute_sql(log_message) logger.info(f"Generating sequence for repo {repo_id}") @@ -151,19 +129,18 @@ def update_analysis_log(repos_id,status): trimmed_commits = existing_commits - parent_commits - update_analysis_log(repo_id,'Data collection complete') + facade_helper.update_analysis_log(repo_id,'Data collection complete') - update_analysis_log(repo_id,'Beginning to trim commits') + facade_helper.update_analysis_log(repo_id,'Beginning to trim commits') facade_helper.log_activity('Debug',f"Commits to be trimmed from repo {repo_id}: {len(trimmed_commits)}") #for commit in trimmed_commits: trim_commits(facade_helper,repo_id,trimmed_commits) + facade_helper.update_analysis_log(repo_id,'Commit trimming complete') - update_analysis_log(repo_id,'Commit trimming complete') - - update_analysis_log(repo_id,'Complete') + facade_helper.update_analysis_log(repo_id,'Complete') @@ -297,8 +274,7 @@ def git_repo_cleanup_facade_task(repo_git): logger = logging.getLogger(git_repo_cleanup_facade_task.__name__) facade_helper = FacadeHelper(logger) - with get_session() as session: - git_repo_cleanup(facade_helper, session, repo_git) + git_repo_cleanup(facade_helper, repo_git) # retry this task indefinitely 
every 5 minutes if it errors. Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) @@ -327,10 +303,10 @@ def clone_repos(): session.commit() # get the commit count - commit_count = get_repo_commit_count(logger, facade_helper, session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + commit_count = get_repo_commit_count(logger, facade_helper, repo_git) + facade_weight = get_facade_weight_with_commit_count(repo_git, commit_count) - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + update_facade_scheduling_fields(repo_git, facade_weight, commit_count) # set repo to update setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) @@ -368,12 +344,10 @@ def git_update_commit_count_weight(self, repo_git): # Change facade session to take in engine facade_helper = FacadeHelper(logger) - with get_session() as session: - - commit_count = get_repo_commit_count(logger, facade_helper, session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) - - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + commit_count = get_repo_commit_count(logger, facade_helper, repo_git) + facade_weight = get_facade_weight_with_commit_count(repo_git, commit_count) + + update_facade_scheduling_fields(repo_git, facade_weight, commit_count) @celery.task(base=AugurFacadeRepoCollectionTask) @@ -383,9 +357,7 @@ def git_repo_updates_facade_task(repo_git): facade_helper = FacadeHelper(logger) - with get_session() as session: - - git_repo_updates(facade_helper, session, repo_git) + git_repo_updates(facade_helper, repo_git) def generate_analysis_sequence(logger,repo_git, facade_helper): diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index 0be34fa7cb..c62034a94e 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -191,6 +191,16 @@ def update_repo_log(self,repos_id,status): execute_sql(log_message) except: pass + + def update_analysis_log(self, repos_id,status): + + # Log a repo's analysis status + + log_message = s.sql.text("""INSERT INTO analysis_log (repos_id,status) + VALUES (:repo_id,:status)""").bindparams(repo_id=repos_id,status=status) + + execute_sql(log_message) + def insert_or_update_data(self, query, **bind_args)-> None: """Provide deadlock detection for postgres updates, inserts, and deletions for facade. 
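
The config.py hunk above introduces update_analysis_log() as a FacadeHelper method that routes its INSERT INTO analysis_log statement through the module-level execute_sql() helper, which is why the nested update_analysis_log functions could be dropped from facade_tasks.py. A minimal usage sketch, for illustration only and not part of the patch (the repo_id value and the import path for FacadeHelper are assumptions based on the hunks above):

    import logging
    from augur.tasks.git.util.facade_worker.facade_worker.config import FacadeHelper

    logger = logging.getLogger(__name__)
    facade_helper = FacadeHelper(logger)   # FacadeHelper only needs a logger now

    repo_id = 1  # placeholder repo id, purely for illustration

    # Log a repo's analysis status; the helper builds the INSERT INTO
    # analysis_log statement and executes it without an explicit session.
    facade_helper.update_analysis_log(repo_id, "Beginning analysis.")
    facade_helper.update_analysis_log(repo_id, "Complete")

This keeps task code free of session bookkeeping: the same FacadeHelper instance can also be reused for inc_repos_processed() and the other logging helpers within a task, matching the pattern used in trim_commits_facade_task above.
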
diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py index 177a100d8f..03e4f98acd 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py @@ -28,7 +28,7 @@ import subprocess import sqlalchemy as s from augur.application.db.util import execute_session_query -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_commits_by_repo_id, remove_working_commits_by_repo_id +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_commits_by_repo_id, remove_working_commits_by_repo_id, get_session from .utilitymethods import get_absolute_repo_path from augur.application.db.models import * @@ -42,10 +42,12 @@ def git_repo_cleanup(facade_helper, session,repo_git): facade_helper.log_activity('Info','Processing deletions') # TODO: We can convert this to use get_repo_by_repo_git. We just need to know how to handle the NoResultFoundException - query = session.query(Repo).filter( - Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") + with get_session() as session: - delete_repos = execute_session_query(query,'all')#fetchall_data_from_sql_text(query) + query = session.query(Repo).filter( + Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") + + delete_repos = execute_session_query(query,'all')#fetchall_data_from_sql_text(query) for row in delete_repos: diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index f22de37dd5..ce1b3ae2da 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -275,7 +275,7 @@ def force_repo_analysis(session, repo_git): session.log_activity('Info', 'Forcing repos to be analyzed (complete)') -def git_repo_updates(facade_helper, session, repo_git): +def git_repo_updates(facade_helper, repo_git): # Update existing repos diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index f0e506c832..1c22a565d8 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -32,7 +32,7 @@ from augur.application.db.models import * from .config import FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes +from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session #from augur.tasks.git.util.facade_worker.facade def update_repo_log(logger, facade_helper, repos_id,status): @@ -135,10 +135,10 @@ def count_branches(git_dir): branches_dir = os.path.join(git_dir, 'refs', 'heads') return sum(1 for _ in os.scandir(branches_dir)) -def get_repo_commit_count(logger, facade_helper, session, repo_git): - - repo = Repo.get_by_repo_git(session, repo_git) +def 
get_repo_commit_count(logger, facade_helper, repo_git): + repo = get_repo_by_repo_git(repo_git) + absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) repo_loc = (f"{absolute_path}/.git") @@ -158,8 +158,9 @@ def get_repo_commit_count(logger, facade_helper, session, repo_git): return commit_count -def get_facade_weight_time_factor(session,repo_git): - repo = Repo.get_by_repo_git(session, repo_git) +def get_facade_weight_time_factor(repo_git): + + repo = get_repo_by_repo_git(repo_git) try: status = repo.collection_status[0] @@ -172,26 +173,29 @@ def get_facade_weight_time_factor(session,repo_git): return time_factor -def get_facade_weight_with_commit_count(session, repo_git, commit_count): - return commit_count - get_facade_weight_time_factor(session, repo_git) +def get_facade_weight_with_commit_count(repo_git, commit_count): + return commit_count - get_facade_weight_time_factor(repo_git) -def get_repo_weight_by_commit(logger, session, repo_git): +def get_repo_weight_by_commit(logger, repo_git): facade_helper = FacadeHelper(logger) - return get_repo_commit_count(logger, facade_helper, session, repo_git) - get_facade_weight_time_factor(session, repo_git) + return get_repo_commit_count(logger, facade_helper, repo_git) - get_facade_weight_time_factor(repo_git) def update_facade_scheduling_fields(session, repo_git, weight, commit_count): - repo = Repo.get_by_repo_git(session, repo_git) - update_query = ( - s.update(CollectionStatus) - .where(CollectionStatus.repo_id == repo.repo_id) - .values(facade_weight=weight,commit_sum=commit_count) - ) + repo = get_repo_by_repo_git(repo_git) + + with get_session() as session: + + update_query = ( + s.update(CollectionStatus) + .where(CollectionStatus.repo_id == repo.repo_id) + .values(facade_weight=weight,commit_sum=commit_count) + ) - session.execute(update_query) - session.commit() + session.execute(update_query) + session.commit() From 7f358009f406d884b2d1c48a55c5e407e7536a84 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 6 Apr 2024 12:44:01 -0500 Subject: [PATCH 021/122] Imporve database handling in ml workers Signed-off-by: Andrew Brain --- .../data_analysis/clustering_worker/tasks.py | 132 +++++++++--------- .../data_analysis/discourse_analysis/tasks.py | 7 +- .../pull_request_analysis_worker/tasks.py | 14 +- augur/tasks/git/facade_tasks.py | 2 +- 4 files changed, 76 insertions(+), 79 deletions(-) diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index 81d89e523c..d548ecf108 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -20,9 +20,8 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_value, get_session -from augur.application.db.models import Repo, RepoClusterMessage, RepoTopic, TopicWord -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git +from augur.application.db.models import RepoClusterMessage, RepoTopic, TopicWord from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -36,10 +35,9 @@ def clustering_task(self, repo_git): logger = logging.getLogger(clustering_model.__name__) engine = self.app.engine - with get_session() as session: - clustering_model(repo_git, logger, engine, session) + clustering_model(repo_git, logger, engine) -def 
clustering_model(repo_git: str,logger,engine, session) -> None: +def clustering_model(repo_git: str,logger,engine) -> None: logger.info(f"Starting clustering analysis for {repo_git}") @@ -55,8 +53,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: tool_version = '0.2.0' data_source = 'Augur Collected Messages' - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id num_clusters = get_value("Clustering_Task", 'num_clusters') max_df = get_value("Clustering_Task", 'max_df') @@ -122,7 +119,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: # check if dumped pickle file exists, if exists no need to train the model if not os.path.exists(MODEL_FILE_NAME): logger.info("clustering model not trained. Training the model.........") - train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) + train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) else: model_stats = os.stat(MODEL_FILE_NAME) model_age = (time.time() - model_stats.st_mtime) @@ -130,7 +127,7 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: logger.debug(f'model age is: {model_age}') if model_age > 2000000: logger.info("clustering model to old. Retraining the model.........") - train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) + train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source) else: logger.info("using pre-trained clustering model....") @@ -161,18 +158,20 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: prediction = kmeans_model.predict(feature_matrix_cur_repo) logger.info("prediction: " + str(prediction[0])) - # inserting data - record = { - 'repo_id': int(repo_id), - 'cluster_content': int(prediction[0]), - 'cluster_mechanism': -1, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - repo_cluster_messages_obj = RepoClusterMessage(**record) - session.add(repo_cluster_messages_obj) - session.commit() + with get_session() as session: + + # inserting data + record = { + 'repo_id': int(repo_id), + 'cluster_content': int(prediction[0]), + 'cluster_mechanism': -1, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + repo_cluster_messages_obj = RepoClusterMessage(**record) + session.add(repo_cluster_messages_obj) + session.commit() # result = db.execute(repo_cluster_messages_table.insert().values(record)) logging.info( @@ -196,22 +195,24 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: logger.debug('prediction vocab') prediction = lda_model.transform(count_matrix_cur_repo) - logger.debug('for loop for vocab') - for i, prob_vector in enumerate(prediction): - # repo_id = msg_df.loc[i]['repo_id'] - for i, prob in enumerate(prob_vector): - record = { - 'repo_id': int(repo_id), - 'topic_id': i + 1, - 'topic_prob': prob, - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - - repo_topic_object = RepoTopic(**record) - session.add(repo_topic_object) - session.commit() + with 
get_session() as session: + + logger.debug('for loop for vocab') + for i, prob_vector in enumerate(prediction): + # repo_id = msg_df.loc[i]['repo_id'] + for i, prob in enumerate(prob_vector): + record = { + 'repo_id': int(repo_id), + 'topic_id': i + 1, + 'topic_prob': prob, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + repo_topic_object = RepoTopic(**record) + session.add(repo_topic_object) + session.commit() # result = db.execute(repo_topic_table.insert().values(record)) except Exception as e: @@ -259,7 +260,7 @@ def preprocess_and_tokenize(text): stems = [stemmer.stem(t) for t in tokens] return stems -def train_model(logger, engine, session, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source): +def train_model(logger, engine, max_df, min_df, max_features, ngram_range, num_clusters, num_topics, num_words_per_topic, tool_source, tool_version, data_source): def visualize_labels_PCA(features, labels, annotations, num_components, title): labels_color_map = {-1: "red"} for label in labels: @@ -371,32 +372,35 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): # twid = self.db.execute(key_sequence_words_sql) # logger.info("twid variable is: {}".format(twid)) # insert topic list into database - topic_id = 1 - for topic in topic_list: - # twid = self.get_max_id('topic_words', 'topic_words_id') + 1 - # logger.info("twid variable is: {}".format(twid)) - for i in topic.argsort()[:-num_words_per_topic - 1:-1]: - # twid+=1 - # logger.info("in loop incremented twid variable is: {}".format(twid)) + + with get_session() as session: + + topic_id = 1 + for topic in topic_list: + # twid = self.get_max_id('topic_words', 'topic_words_id') + 1 # logger.info("twid variable is: {}".format(twid)) - record = { - # 'topic_words_id': twid, - # 'word_prob': word_prob[i], - 'topic_id': int(topic_id), - 'word': feature_names[i], - 'tool_source': tool_source, - 'tool_version': tool_version, - 'data_source': data_source - } - - topic_word_obj = TopicWord(**record) - session.add(topic_word_obj) - session.commit() - - # result = db.execute(topic_words_table.insert().values(record)) - logger.info( - "Primary key inserted into the topic_words table: {}".format(topic_word_obj.topic_words_id)) - topic_id += 1 + for i in topic.argsort()[:-num_words_per_topic - 1:-1]: + # twid+=1 + # logger.info("in loop incremented twid variable is: {}".format(twid)) + # logger.info("twid variable is: {}".format(twid)) + record = { + # 'topic_words_id': twid, + # 'word_prob': word_prob[i], + 'topic_id': int(topic_id), + 'word': feature_names[i], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + topic_word_obj = TopicWord(**record) + session.add(topic_word_obj) + session.commit() + + # result = db.execute(topic_words_table.insert().values(record)) + logger.info( + "Primary key inserted into the topic_words table: {}".format(topic_word_obj.topic_words_id)) + topic_id += 1 # insert topic list into database diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 1a2bed485b..e78e030e66 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -8,7 +8,7 @@ from collections import Counter from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_session 
+from augur.application.db.lib import get_session, get_repo_by_repo_git from augur.application.db.models import Repo, DiscourseInsight from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -47,10 +47,7 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: tool_version = '0.1.0' data_source = 'Analysis of Issue/PR Messages' - with get_session() as session: - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id get_messages_for_repo_sql = s.sql.text(""" (SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index 24dd634bd4..2347eb109c 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -9,9 +9,8 @@ from augur.tasks.data_analysis.message_insights.message_sentiment import get_senti_score from augur.tasks.init.celery_app import celery_app as celery -from augur.application.db.lib import get_value, get_session -from augur.application.db.models import Repo, PullRequestAnalysis -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_value, get_session, get_repo_by_repo_git +from augur.application.db.models import PullRequestAnalysis from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -39,14 +38,11 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: insight_days = 200 - with get_session() as session: - - query = session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id + repo_id = get_repo_by_repo_git(repo_git).repo_id - senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) + senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) - logger.info(f'Sentiment model dir located - {senti_models_dir}') + logger.info(f'Sentiment model dir located - {senti_models_dir}') # Any initial database instructions, like finding the last tuple inserted or generate the next ID value diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 2612ca7ae9..139b23897a 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -292,7 +292,7 @@ def clone_repos(): repo_git_identifiers = get_collection_status_repo_git_from_filter(session, is_pending, 999999) for repo_git in repo_git_identifiers: # set repo to intializing - repo = get_repo_by_repo_git(repo_git) + repo = Repo.get_by_repo_git(session) repoStatus = repo.collection_status[0] setattr(repoStatus,"facade_status", CollectionState.INITIALIZING.value) session.commit() From e5b001c8df4af7a8604bf76126ae257fdd0f3d89 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sun, 7 Apr 2024 15:30:33 -0500 Subject: [PATCH 022/122] Remove github key handling dependence from a session Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 11 ++++++++++- .../tasks/github/util/github_api_key_handler.py | 17 +++++------------ .../tasks/github/util/github_random_key_auth.py | 4 ++-- 
augur/tasks/github/util/github_task_session.py | 4 ++-- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 818c574076..07679ac64b 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -2,13 +2,14 @@ import random import logging import sqlalchemy as s +from sqlalchemy import func from sqlalchemy.exc import DataError from sqlalchemy.dialects import postgresql from sqlalchemy.exc import OperationalError from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit +from augur.application.db.models import Config, Repo, Commit, WorkerOauth from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts @@ -180,6 +181,14 @@ def get_working_commits_by_repo_id(repo_id): return working_commits +def get_worker_oauth_keys(platform: str): + + with get_session() as session: + + results = session.query(WorkerOauth).filter(WorkerOauth.platform == platform).order_by(func.random()).all() + + return [row.access_token for row in results] + def facade_bulk_insert_commits(logger, records): diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index d87d7495eb..4f8178e7c2 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -6,8 +6,7 @@ from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.session import DatabaseSession -from augur.application.db.lib import get_value +from augur.application.db.lib import get_value, get_worker_oauth_keys from sqlalchemy import func @@ -19,7 +18,6 @@ class GithubApiKeyHandler(): """Handles Github API key retrieval from the database and redis Attributes: - session (DatabaseSession): Database connection logger (logging.Logger): Handles all logs oauth_redis_key (str): The key where the github api keys are cached in redis redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache @@ -27,9 +25,8 @@ class GithubApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: Session, logger): + def __init__(self, logger): - self.session = session self.logger = logger self.oauth_redis_key = "github_oauth_keys_list" @@ -69,16 +66,12 @@ def get_api_keys_from_database(self) -> List[str]: Returns: Github api keys that are in the database """ - from augur.application.db.models import WorkerOauth - select = WorkerOauth.access_token - # randomizing the order at db time - #select.order_by(func.random()) - where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'github'] + keys = get_worker_oauth_keys('github') - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] - #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + filtered_keys = [item for item in keys if item != self.config_key] + return filtered_keys def get_api_keys(self) -> List[str]: """Retrieves all valid Github API Keys diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index ed539430d8..95788da1cc 100644 --- 
a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -9,12 +9,12 @@ class GithubRandomKeyAuth(RandomKeyAuth): github collections can have a class randomly selects an api key for each request """ - def __init__(self, session: Session, logger): + def __init__(self, logger): """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the github api keys from the database via the GithubApiKeyHandler - github_api_keys = GithubApiKeyHandler(session, logger).keys + github_api_keys = GithubApiKeyHandler(logger).keys #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) if not github_api_keys: diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index 0acbbf64cd..4699fb7ef6 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -11,7 +11,7 @@ def __init__(self, logger): engine = get_engine() self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + self.key_auth = GithubRandomKeyAuth(logger) self.logger = logger self.platform_id = 1 @@ -39,6 +39,6 @@ def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - self.oauths = GithubRandomKeyAuth(self, logger) + self.oauths = GithubRandomKeyAuth(logger) self.platform_id = 1 From 6c8d59028b53523fcb72318bf54c12b687c8361b Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 9 Apr 2024 07:15:38 -0500 Subject: [PATCH 023/122] Remove git repo clean up Signed-off-by: Andrew Brain --- augur/tasks/git/facade_tasks.py | 9 - .../facade_worker/postanalysiscleanup.py | 180 ------------------ 2 files changed, 189 deletions(-) delete mode 100644 augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 139b23897a..c74f8ec6ce 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -267,15 +267,6 @@ def rebuild_unknown_affiliation_and_web_caches_facade_task(): facade_helper = FacadeHelper(logger) rebuild_unknown_affiliation_and_web_caches(facade_helper) - -@celery.task -def git_repo_cleanup_facade_task(repo_git): - - logger = logging.getLogger(git_repo_cleanup_facade_task.__name__) - - facade_helper = FacadeHelper(logger) - git_repo_cleanup(facade_helper, repo_git) - # retry this task indefinitely every 5 minutes if it errors. Since the only way it gets scheduled is by itself, so if it stops running no more clones will happen till the instance is restarted @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) def clone_repos(): diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py deleted file mode 100644 index 03e4f98acd..0000000000 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2016-2018 Brian Warner -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 - -# Git repo maintenance -# -# This script is responsible for cloning new repos and keeping existing repos up -# to date. It can be run as often as you want (and will detect when it's -# already running, so as not to spawn parallel processes), but once or twice per -# day should be more than sufficient. Each time it runs, it updates the repo -# and checks for any parents of HEAD that aren't already accounted for in the -# repos. It also rebuilds analysis data, checks any changed affiliations and -# aliases, and caches data for display. -import subprocess -import sqlalchemy as s -from augur.application.db.util import execute_session_query -from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_commits_by_repo_id, remove_working_commits_by_repo_id, get_session -from .utilitymethods import get_absolute_repo_path -from augur.application.db.models import * - -#Will delete repos passed and cleanup associated commit data. -def git_repo_cleanup(facade_helper, session,repo_git): - -# Clean up any git repos that are pending deletion - - facade_helper.update_status('Purging deleted repos') - #logger.info("Processing deletions") - facade_helper.log_activity('Info','Processing deletions') - - # TODO: We can convert this to use get_repo_by_repo_git. We just need to know how to handle the NoResultFoundException - with get_session() as session: - - query = session.query(Repo).filter( - Repo.repo_git == repo_git)#s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_status='Delete'""") - - delete_repos = execute_session_query(query,'all')#fetchall_data_from_sql_text(query) - - for row in delete_repos: - - # Remove the files on disk - - absolute_path = get_absolute_repo_path(facade_helper.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) - - cmd = ("rm -rf %s" - % (absolute_path)) - - return_code = subprocess.Popen([cmd],shell=True).wait() - - # Remove the analysis data - remove_commits_by_repo_id(row.repo_id) - - optimize_table = s.sql.text("""OPTIMIZE TABLE commits""") - execute_sql(optimize_table) - - # Remove cached repo data - - remove_dm_repo_weekly = s.sql.text("""DELETE FROM dm_repo_weekly WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - execute_sql(remove_dm_repo_weekly) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_weekly""") - execute_sql(optimize_table) - - remove_dm_repo_monthly = s.sql.text("""DELETE FROM dm_repo_monthly WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - execute_sql(remove_dm_repo_monthly) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_monthly""") - execute_sql(optimize_table) - - remove_dm_repo_annual = s.sql.text("""DELETE FROM dm_repo_annual WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - execute_sql(remove_dm_repo_annual) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_annual""") - execute_sql(optimize_table) - - # Set project to be recached if just removing a repo - - set_project_recache = s.sql.text("""UPDATE projects SET recache=TRUE - WHERE 
id=:repo_group_id""").bindparams(repo_group_id=row.repo_group_id) - execute_sql(set_project_recache) - # Remove the entry from the repos table - - query = s.sql.text("""DELETE FROM repo WHERE repo_id=:repo_id - """).bindparams(repo_id=row.repo_id) - execute_sql(query) - - #log_activity('Verbose','Deleted repo %s' % row[0]) - #logger.debug(f"Deleted repo {row.repo_id}") - facade_helper.log_activity('Verbose',f"Deleted repo {row.repo_id}") - cleanup = '%s/%s%s' % (row.repo_group_id,row.repo_path,row.repo_name) - - # Remove any working commits - remove_working_commits_by_repo_id(row.repo_id) - - # Remove the repo from the logs - - remove_logs = s.sql.text("""DELETE FROM repos_fetch_log WHERE repos_id =:repo_id - """).bindparams(repo_id=row.repo_id) - - execute_sql(remove_logs) - - optimize_table = s.sql.text("""OPTIMIZE TABLE repos_fetch_log""") - execute_sql(optimize_table) - - # Attempt to cleanup any empty parent directories - - while (cleanup.find('/',0) > 0): - cleanup = cleanup[:cleanup.rfind('/',0)] - - cmd = "rmdir %s%s" % (facade_helper.repo_base_directory,cleanup) - subprocess.Popen([cmd],shell=True).wait() - #log_activity('Verbose','Attempted %s' % cmd) - #logger.debug(f"Attempted {cmd}") - facade_helper.log_activity('Verbose',f"Attempted {cmd}") - - #update_repo_log(row[0],'Deleted') - facade_helper.update_repo_log(row.repo_id,'Deleted') - - # Clean up deleted projects - - get_deleted_projects = s.sql.text("""SELECT repo_group_id FROM repo_groups WHERE rg_name='(Queued for removal)'""") - - deleted_projects = fetchall_data_from_sql_text(get_deleted_projects) - - for project in deleted_projects: - - # Remove cached data for projects which were marked for deletion - - clear_annual_cache = s.sql.text("""DELETE FROM dm_repo_group_annual WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - execute_sql(clear_annual_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_annual""") - execute_sql(optimize_table) - - clear_monthly_cache = s.sql.text("""DELETE FROM dm_repo_group_monthly WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - execute_sql(clear_monthly_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_monthly""") - execute_sql(optimize_table) - - clear_weekly_cache = s.sql.text("""DELETE FROM dm_repo_group_weekly WHERE - repo_group_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - execute_sql(clear_weekly_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - execute_sql(optimize_table) - - clear_unknown_cache = s.sql.text("""DELETE FROM unknown_cache WHERE - projects_id=:repo_group_id""").bindparams(repo_group_id=project['repo_group_id']) - execute_sql(clear_unknown_cache) - - optimize_table = s.sql.text("""OPTIMIZE TABLE dm_repo_group_weekly""") - execute_sql(optimize_table) - - # Remove any projects which were also marked for deletion - - remove_project = s.sql.text("""DELETE FROM repo_groups WHERE repo_group_id=:repo_group_id - """).bindparams(repo_group_id=project['repo_group_id']) - execute_sql(remove_project) - - - facade_helper.log_activity('Info', 'Processing deletions (complete)') From 2ff53b1a615ea9dca9618c911a3e7d2d2e29bafd Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 9 Apr 2024 07:31:56 -0500 Subject: [PATCH 024/122] Decouple gitlab api key handler from session Signed-off-by: Andrew Brain --- augur/tasks/gitlab/gitlab_api_key_handler.py | 18 +++++------------- 
augur/tasks/gitlab/gitlab_random_key_auth.py | 9 +++------ augur/tasks/gitlab/gitlab_task_session.py | 4 ++-- 3 files changed, 10 insertions(+), 21 deletions(-) diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py index c3a76f6ddc..40b37d62c6 100644 --- a/augur/tasks/gitlab/gitlab_api_key_handler.py +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -8,11 +8,9 @@ import random from typing import List -from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.lib import get_value -from sqlalchemy import func +from augur.application.db.lib import get_value, get_worker_oauth_keys class NoValidKeysError(Exception): @@ -23,7 +21,6 @@ class GitlabApiKeyHandler(): """Handles Gitlab API key retrieval from the database and redis Attributes: - session (DatabaseSession): Database connection logger (logging.Logger): Handles all logs oauth_redis_key (str): The key where the gitlab api keys are cached in redis redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache @@ -31,9 +28,8 @@ class GitlabApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: Session, logger): + def __init__(self, logger): - self.session = session self.logger = logger self.oauth_redis_key = "gitlab_oauth_keys_list" @@ -72,15 +68,11 @@ def get_api_keys_from_database(self) -> List[str]: Returns: Github api keys that are in the database """ - from augur.application.db.models import WorkerOauth + keys = get_worker_oauth_keys('gitlab') - select = WorkerOauth.access_token - # randomizing the order at db time - #select.order_by(func.random()) - where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'gitlab'] + filtered_keys = [item for item in keys if item != self.config_key] - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] - #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + return filtered_keys def get_api_keys(self) -> List[str]: diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py index b2afded3ae..3269d1ec3e 100644 --- a/augur/tasks/gitlab/gitlab_random_key_auth.py +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -1,7 +1,4 @@ """Defines the GitlabRandomKeyAuth class""" - -from sqlalchemy.orm import Session - from augur.tasks.util.random_key_auth import RandomKeyAuth from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler @@ -11,12 +8,12 @@ class GitlabRandomKeyAuth(RandomKeyAuth): gitlab collections can have a class randomly selects an api key for each request """ - def __init__(self, session: Session, logger): + def __init__(self, logger): """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the gitlab api keys from the database via the GitlabApiKeyHandler - gitlab_api_keys = GitlabApiKeyHandler(session, logger).keys + gitlab_api_keys = GitlabApiKeyHandler(logger).keys if not gitlab_api_keys: print("Failed to find github api keys. 
This is usually because your key has expired") @@ -24,4 +21,4 @@ def __init__(self, session: Session, logger): header_name = "Authorization" key_format = "Bearer {0}" - super().__init__(gitlab_api_keys, header_name, session.logger, key_format) \ No newline at end of file + super().__init__(gitlab_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py index 0892087d22..52b1cf879b 100644 --- a/augur/tasks/gitlab/gitlab_task_session.py +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -24,7 +24,7 @@ def __init__(self, logger): engine = get_engine() self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) + self.key_auth = GitlabRandomKeyAuth(logger) self.logger = logger self.platform_id = 2 @@ -51,6 +51,6 @@ def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - self.oauths = GitlabRandomKeyAuth(self, logger) + self.oauths = GitlabRandomKeyAuth(logger) self.platform_id = 2 From 62ac0be111c7d8bad07f768304b1f56e7a3b6ad2 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 9 Apr 2024 15:05:25 -0500 Subject: [PATCH 025/122] Fix issues in branch Signed-off-by: Andrew Brain --- augur/tasks/git/dependency_tasks/core.py | 2 +- augur/tasks/git/facade_tasks.py | 3 +- .../facade_worker/facade_worker/repofetch.py | 14 +++++----- .../facade_worker/utilitymethods.py | 28 +++++++++++-------- augur/tasks/gitlab/merge_request_task.py | 2 +- augur/tasks/start_tasks.py | 2 +- .../test_github_api_key_handler.py | 2 +- 7 files changed, 28 insertions(+), 25 deletions(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 75c96ad5de..bc3444551c 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -83,7 +83,7 @@ def generate_scorecard(logger, repo_git): with get_session() as session: - key_handler = GithubApiKeyHandler(session, logger) + key_handler = GithubApiKeyHandler(logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index c74f8ec6ce..e955fbad10 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -11,7 +11,6 @@ from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import fill_empty_affiliations, invalidate_caches, nuke_affiliations, rebuild_unknown_affiliation_and_web_caches -from augur.tasks.git.util.facade_worker.facade_worker.postanalysiscleanup import git_repo_cleanup from augur.tasks.github.facade_github.tasks import * @@ -283,7 +282,7 @@ def clone_repos(): repo_git_identifiers = get_collection_status_repo_git_from_filter(session, is_pending, 999999) for repo_git in repo_git_identifiers: # set repo to intializing - repo = Repo.get_by_repo_git(session) + repo = Repo.get_by_repo_git(session, repo_git) repoStatus = repo.collection_status[0] setattr(repoStatus,"facade_status", CollectionState.INITIALIZING.value) session.commit() diff --git 
a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index ce1b3ae2da..874f338902 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -53,7 +53,7 @@ def git_repo_initialize(facade_helper, session, repo_git): if row: - session.log_activity( + facade_helper.log_activity( 'Info', f"Fetching repo with repo id: {row.repo_id}") update_repo_log(logger, facade_helper, row.repo_id, 'Cloning') @@ -131,8 +131,8 @@ def git_repo_initialize(facade_helper, session, repo_git): print("COULD NOT CREATE REPO DIRECTORY") update_repo_log(logger, facade_helper, row.repo_id, 'Failed (mkdir)') - session.update_status(f"Failed (mkdir {repo_path})") - session.log_activity( + facade_helper.update_status(f"Failed (mkdir {repo_path})") + facade_helper.log_activity( 'Error', f"Could not create repo directory: {repo_path}") raise e @@ -146,7 +146,7 @@ def git_repo_initialize(facade_helper, session, repo_git): execute_sql(query) - session.log_activity('Verbose', f"Cloning: {git}") + facade_helper.log_activity('Verbose', f"Cloning: {git}") cmd = f"git -C {repo_path} clone '{git}' {repo_name}" return_code = subprocess.Popen([cmd], shell=True).wait() @@ -156,17 +156,17 @@ def git_repo_initialize(facade_helper, session, repo_git): # Mark the entire project for an update, so that under normal # circumstances caches are rebuilt only once per waiting period. update_repo_log(logger, facade_helper, row.repo_id, 'Up-to-date') - session.log_activity('Info', f"Cloned {git}") + facade_helper.log_activity('Info', f"Cloned {git}") else: # If cloning failed, log it and set the status back to new update_repo_log(logger, facade_helper, row.repo_id, f"Failed ({return_code})") - session.log_activity('Error', f"Could not clone {git}") + facade_helper.log_activity('Error', f"Could not clone {git}") raise GitCloneError(f"Could not clone {git}") - session.log_activity('Info', f"Fetching new repos (complete)") + facade_helper.log_activity('Info', f"Fetching new repos (complete)") # Deprecated functionality. No longer used diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index 1c22a565d8..fe3325754b 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -33,6 +33,7 @@ from .config import FacadeHelper as FacadeHelper from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.application.db.lib import execute_sql, fetchall_data_from_sql_text, remove_working_commits_by_repo_id_and_hashes, remove_commits_by_repo_id_and_hashes, get_repo_by_repo_git, get_session +from augur.application.db.util import execute_session_query #from augur.tasks.git.util.facade_worker.facade def update_repo_log(logger, facade_helper, repos_id,status): @@ -160,18 +161,21 @@ def get_repo_commit_count(logger, facade_helper, repo_git): def get_facade_weight_time_factor(repo_git): - repo = get_repo_by_repo_git(repo_git) - - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) - - #Adjust for commits. 
- time_factor *= 1.2 + with get_session() as session: + + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) + + #Adjust for commits. + time_factor *= 1.2 - return time_factor + return time_factor def get_facade_weight_with_commit_count(repo_git, commit_count): return commit_count - get_facade_weight_time_factor(repo_git) @@ -182,7 +186,7 @@ def get_repo_weight_by_commit(logger, repo_git): return get_repo_commit_count(logger, facade_helper, repo_git) - get_facade_weight_time_factor(repo_git) -def update_facade_scheduling_fields(session, repo_git, weight, commit_count): +def update_facade_scheduling_fields(repo_git, weight, commit_count): repo = get_repo_by_repo_git(repo_git) diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index d5212a52d4..437909e78e 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -6,7 +6,7 @@ from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor +from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee from augur.application.db.util import execute_session_query from augur.tasks.util.worker_util import remove_duplicate_dicts diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 2a458649c3..17a4ba273e 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -301,7 +301,7 @@ def augur_collection_update_weights(self): repo = Repo.get_by_id(session, status.repo_id) commit_count = status.commit_sum - date_factor = get_facade_weight_time_factor(session, repo.repo_git) + date_factor = get_facade_weight_time_factor(repo.repo_git) weight = commit_count - date_factor update_query = ( diff --git a/tests/test_tasks/test_task_utlities/test_key_handler/test_github_api_key_handler.py b/tests/test_tasks/test_task_utlities/test_key_handler/test_github_api_key_handler.py index 8b9adb222c..5443b3a733 100644 --- a/tests/test_tasks/test_task_utlities/test_key_handler/test_github_api_key_handler.py +++ b/tests/test_tasks/test_task_utlities/test_key_handler/test_github_api_key_handler.py @@ -23,7 +23,7 @@ def key_handler(test_db_session): redis.flushdb() - yield GithubApiKeyHandler(test_db_session) + yield GithubApiKeyHandler(logger) def test_get_config_key(key_handler, test_db_engine): From effdd41783fc2511ed443354934776d51937ab7f Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 08:37:15 -0500 Subject: [PATCH 026/122] Start 
reducing query usage in github tasks Signed-off-by: Andrew Brain --- augur/tasks/github/detect_move/tasks.py | 24 +++++++++++--------- augur/tasks/github/events/tasks.py | 30 ++++++++++--------------- augur/tasks/github/repo_info/tasks.py | 16 ++++++------- augur/tasks/github/traffic/tasks.py | 20 ++++++++--------- 4 files changed, 42 insertions(+), 48 deletions(-) diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index c9da0d3ca2..708173629f 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -4,7 +4,7 @@ from augur.tasks.github.detect_move.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git @@ -14,14 +14,15 @@ def detect_github_repo_move_core(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_core.__name__) logger.info(f"Starting repo_move operation with {repo_git}") + + repo = get_repo_by_repo_git(repo_git) + + logger.info(f"Pinging repo: {repo_git}") + with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db #Ping each repo with the given repo_git to make sure #that they are still in place. - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - logger.info(f"Pinging repo: {repo_git}") - ping_github_for_repo_move(augur_db, manifest.key_auth, repo, logger) + ping_github_for_repo_move(manifest.augur_db, manifest.key_auth, repo, logger) @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -30,11 +31,12 @@ def detect_github_repo_move_secondary(repo_git : str) -> None: logger = logging.getLogger(detect_github_repo_move_secondary.__name__) logger.info(f"Starting repo_move operation with {repo_git}") + + repo = get_repo_by_repo_git(repo_git) + + logger.info(f"Pinging repo: {repo_git}") + with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db #Ping each repo with the given repo_git to make sure #that they are still in place. 
- query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - logger.info(f"Pinging repo: {repo_git}") - ping_github_for_repo_move(augur_db, manifest.key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file + ping_github_for_repo_move(manifest.augur_db, manifest.key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index 442af9922f..dc76138f2f 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -9,8 +9,8 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor, Repo -from augur.application.db.util import execute_session_query +from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor +from augur.application.db.lib import get_repo_by_repo_git platform_id = 1 @@ -19,32 +19,26 @@ def collect_events(repo_git: str): logger = logging.getLogger(collect_events.__name__) - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + try: + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - owner, repo = get_owner_repo(repo_git) + owner, repo = get_owner_repo(repo_git) - logger.info(f"Collecting Github events for {owner}/{repo}") + logger.info(f"Collecting Github events for {owner}/{repo}") - url = f"https://api.github.com/repos/{owner}/{repo}/issues/events" + with GithubTaskManifest(logger) as manifest: event_data = retrieve_all_event_data(repo_git, logger, manifest.key_auth) if event_data: - process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger, manifest.augur_db) - else: logger.info(f"{owner}/{repo} has no events") - except Exception as e: - logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + + except Exception as e: + logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") def retrieve_all_event_data(repo_git: str, logger, key_auth): diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index b31bc7bf62..3494f5c5e5 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -4,7 +4,7 @@ from augur.tasks.github.repo_info.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git #Task to get regular misc github info @@ -13,12 +13,11 @@ def collect_repo_info(repo_git: str): logger = logging.getLogger(collect_repo_info.__name__) + repo = get_repo_by_repo_git(repo_git) + with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - - repo_info_model(augur_db, 
manifest.key_auth, repo, logger) + + repo_info_model(manifest.augur_db, manifest.key_auth, repo, logger) #Task to get CII api data for linux badge info using github data. @@ -27,9 +26,8 @@ def collect_linux_badge_info(repo_git: str): logger = logging.getLogger(collect_linux_badge_info.__name__) + repo = get_repo_by_repo_git(repo_git) + with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') badges_model(logger, repo_git, repo.repo_id, augur_db) diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 068c9616b7..e374c312fc 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -6,25 +6,25 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import RepoClone, Repo -from augur.application.db.util import execute_session_query +from augur.application.db.models import RepoClone +from augur.application.db.lib import get_repo_by_repo_git + + @celery.task def collect_github_repo_clones_data(repo_git: str) -> None: logger = logging.getLogger(collect_github_repo_clones_data.__name__) + + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id - # using GithubTaskSession to get our repo_obj for which we will store data of clones - with GithubTaskManifest(logger) as manifest: + owner, repo = get_owner_repo(repo_git) - query = manifest.augur_db.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + logger.info(f"Collecting Github repository clone data for {owner}/{repo}") - owner, repo = get_owner_repo(repo_git) + with GithubTaskManifest(logger) as manifest: - logger.info(f"Collecting Github repository clone data for {owner}/{repo}") - clones_data = retrieve_all_clones_data(repo_git, logger, manifest.key_auth) if clones_data: From 78368a5009294ceae79eb9cf6e8f1c280a55552b Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 08:56:42 -0500 Subject: [PATCH 027/122] Continue improving database handling in github tasks Signed-off-by: Andrew Brain --- .../contributor_interface.py | 8 ++--- augur/tasks/github/issues/tasks.py | 34 +++++-------------- augur/tasks/github/messages/tasks.py | 17 ++++------ .../pull_requests/commits_model/core.py | 11 ++---- .../pull_requests/commits_model/tasks.py | 12 +++---- .../github/pull_requests/files_model/core.py | 10 ++---- .../github/pull_requests/files_model/tasks.py | 10 +++--- augur/tasks/github/pull_requests/tasks.py | 23 +++++-------- augur/tasks/github/releases/tasks.py | 13 ++++--- 9 files changed, 49 insertions(+), 89 deletions(-) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 44b6c706f8..3af210366f 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -8,6 +8,7 @@ import traceback from augur.tasks.github.util.github_paginator import GithubApiResult from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_id ##TODO: maybe have a TaskSession class that 
holds information about the database, logger, config, etc. @@ -113,9 +114,7 @@ def create_endpoint_from_commit_sha(logger,db,commit_sha, repo_id): #stmnt = s.select(Repo.repo_path, Repo.repo_name).where(Repo.repo_id == repo_id) - - query = db.query(Repo).filter_by(repo_id=repo_id) - result = execute_session_query(query, 'one') + result = get_repo_by_repo_id(repo_id) if result.repo_path is None or result.repo_name is None: raise KeyError @@ -402,8 +401,7 @@ def create_endpoint_from_repo_id(logger,db, repo_id): WHERE repo_id = :repo_id_bind """ #ORM syntax of above statement - query = db.session.query(Repo).filter_by(repo_id=repo_id) - result = execute_session_query(query, 'one') + result = get_repo_by_repo_id(repo_id) url = result.repo_git logger.info(f"Url: {url}") diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index baccfdc60e..565a55e35d 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -13,46 +13,30 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor, Repo from augur.application.config import get_development_flag -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git + development = get_development_flag() @celery.task(base=AugurCoreRepoCollectionTask) def collect_issues(repo_git : str) -> int: - logger = logging.getLogger(collect_issues.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id + + owner, repo = get_owner_repo(repo_git) + + with GithubTaskManifest(logger) as manifest: logger.info(f'this is the manifest.key_auth value: {str(manifest.key_auth)}') - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id - - #try this - # the_key = manifest.key_auth - # try: - # randomon = GithubApiKeyHandler(augur_db.session) - # the_key = randomon.get_random_key() - # logger.info(f'The Random Key {the_key}') - # except Exception as e: - # logger.info(f'error: {e}') - # the_key = manifest.key_auth - # pass - - owner, repo = get_owner_repo(repo_git) - + try: issue_data = retrieve_all_issue_data(repo_git, logger, manifest.key_auth) - #issue_data = retrieve_all_issue_data(repo_git, logger, the_key) if issue_data: total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger, augur_db) + process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger, manifest.augur_db) return total_issues else: diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 54a4c41e0c..35ab57828c 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -9,32 +9,27 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo - +from augur.application.db.lib import get_repo_by_repo_git platform_id = 1 - @celery.task(base=AugurCoreRepoCollectionTask) def collect_github_messages(repo_git: str) -> None: logger = logging.getLogger(collect_github_messages.__name__) - with GithubTaskManifest(logger) as manifest: + repo_id = get_repo_by_repo_git(repo_git).repo_id - augur_db = 
manifest.augur_db - - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id + owner, repo = get_owner_repo(repo_git) - owner, repo = get_owner_repo(repo_git) + with GithubTaskManifest(logger) as manifest: + task_name = f"{owner}/{repo}: Message Task" message_data = retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) if message_data: - - process_messages(message_data, task_name, repo_id, logger, augur_db) - + process_messages(message_data, task_name, repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no messages") diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index ea91a597da..e1d2d1fee7 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -2,24 +2,19 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.util import execute_session_query - -def pull_request_commits_model(repo_id,logger, augur_db, key_auth): +def pull_request_commits_model(repo,logger, augur_db, key_auth): # query existing PRs and the respective url we will append the commits url to pr_url_sql = s.sql.text(""" SELECT DISTINCT pr_url, pull_requests.pull_request_id FROM pull_requests--, pull_request_meta WHERE repo_id = :repo_id - """).bindparams(repo_id=repo_id) + """).bindparams(repo_id=repo.repo_id) pr_urls = [] #pd.read_sql(pr_number_sql, self.db, params={}) pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() - - query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') owner, name = get_owner_repo(repo.repo_git) @@ -49,7 +44,7 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth): 'tool_source': 'pull_request_commits_model', 'tool_version': '0.41', 'data_source': 'GitHub API', - 'repo_id': repo_id, + 'repo_id': repo.repo_id, } all_data.append(pr_commit_row) diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index f0a065bdd1..d681b4fd33 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -2,8 +2,9 @@ from augur.tasks.github.pull_requests.commits_model.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.lib import get_repo_by_repo_git + @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -11,11 +12,8 @@ def process_pull_request_commits(repo_git: str) -> None: logger = logging.getLogger(process_pull_request_commits.__name__) - with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db + repo = get_repo_by_repo_git(repo_git) - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') + with GithubTaskManifest(logger) as manifest: - pull_request_commits_model(repo.repo_id, logger, augur_db, manifest.key_auth) + pull_request_commits_model(repo, logger, manifest.augur_db, manifest.key_auth) diff --git 
a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 138aa61cb3..3dfb7aec39 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -2,25 +2,21 @@ from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.util import execute_session_query -def pull_request_files_model(repo_id,logger, augur_db, key_auth): +def pull_request_files_model(repo,logger, augur_db, key_auth): # query existing PRs and the respective url we will append the commits url to pr_number_sql = s.sql.text(""" SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id FROM pull_requests--, pull_request_meta WHERE repo_id = :repo_id - """).bindparams(repo_id=repo_id) + """).bindparams(repo_id=repo.repo_id) pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) result = augur_db.execute_sql(pr_number_sql)#.fetchall() pr_numbers = [dict(row) for row in result.mappings()] - query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) - repo = execute_session_query(query, 'one') - owner, name = get_owner_repo(repo.repo_git) pr_file_rows = [] @@ -71,7 +67,7 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, 'pr_file_path': pr_file['path'], 'data_source': 'GitHub API', - 'repo_id': repo_id, + 'repo_id': repo.repo_id, } for pr_file in file_collection if pr_file and 'path' in pr_file] diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 988261f6c8..48be47e284 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -3,16 +3,16 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git + @celery.task(base=AugurSecondaryRepoCollectionTask) def process_pull_request_files(repo_git: str) -> None: logger = logging.getLogger(process_pull_request_files.__name__) + repo = get_repo_by_repo_git(repo_git) + with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo = execute_session_query(query, 'one') - pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) \ No newline at end of file + pull_request_files_model(repo, logger, manifest.augur_db, manifest.key_auth) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 69e40f6818..4ef737105d 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -9,30 +9,28 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo +from augur.application.db.lib 
import get_repo_by_repo_git from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors platform_id = 1 - @celery.task(base=AugurCoreRepoCollectionTask) def collect_pull_requests(repo_git: str) -> int: logger = logging.getLogger(collect_pull_requests.__name__) - with GithubTaskManifest(logger) as manifest: + repo_id = get_repo_by_repo_git(repo_git).repo_id - augur_db = manifest.augur_db + owner, repo = get_owner_repo(repo_git) - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id + with GithubTaskManifest(logger) as manifest: - owner, repo = get_owner_repo(repo_git) pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, manifest.augur_db) return len(pr_data) else: @@ -206,15 +204,13 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger = logging.getLogger(collect_pull_request_review_comments.__name__) logger.info(f"Collecting pull request review comments for {owner}/{repo}") + repo_id = get_repo_by_repo_git(repo_git).repo_id + # define GithubTaskSession to handle insertions, and store oauth keys with GithubTaskManifest(logger) as manifest: augur_db = manifest.augur_db - # get repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id - query = augur_db.session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id) pr_reviews = execute_session_query(query, 'all') @@ -327,13 +323,12 @@ def collect_pull_request_reviews(repo_git: str) -> None: tool_source = "pull_request_reviews" data_source = "Github API" + repo_id = get_repo_by_repo_git(repo_git).repo_id + with GithubTaskManifest(logger) as manifest: augur_db = manifest.augur_db - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_id = execute_session_query(query, 'one').repo_id - query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) prs = execute_session_query(query, 'all') diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index 310da90d74..ab947ed3a9 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -5,18 +5,17 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.util import execute_session_query +from augur.application.db.lib import get_repo_by_repo_git + @celery.task(base=AugurCoreRepoCollectionTask) def collect_releases(repo_git): logger = logging.getLogger(collect_releases.__name__) - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + repo_obj = get_repo_by_repo_git(repo_git) + repo_id = repo_obj.repo_id + with GithubTaskManifest(logger) as manifest: - releases_model(augur_db, manifest.key_auth, logger, repo_git, repo_id) \ No newline at end of file + releases_model(manifest.augur_db, manifest.key_auth, logger, repo_git, repo_id) \ No newline at end of file From 93d5a297d20a74511b2669a19e5823a552e27caf Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 09:05:53 -0500 
Subject: [PATCH 028/122] Simplify facade github logic Signed-off-by: Andrew Brain --- augur/tasks/github/contributors/tasks.py | 2 +- augur/tasks/github/facade_github/core.py | 33 ++++++++++++------------ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index a581b09f71..cd62fa643e 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -116,7 +116,7 @@ def grab_comitters(self, repo_git,platform="github"): try: with GithubTaskManifest(logger) as manifest: - grab_committer_list(manifest, repo_id,platform) + grab_committer_list(logger, manifest.key_auth, repo_id,platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index 10f4affc6a..962efb766b 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -4,20 +4,24 @@ from augur.tasks.github.util.github_paginator import * from augur.application.db.models import * from augur.tasks.util.AugurUUID import GithubUUID +from augur.application.db.lib import bulk_insert_dicts -def query_github_contributors(manifest, github_url): +def query_github_contributors(logger, key_auth, github_url): """ Data collection function Query the GitHub API for contributors """ + # Set platform id to 1 since it is a github method + platform_id = 1 + # Extract owner/repo from the url for the endpoint try: owner, name = get_owner_repo(github_url) except IndexError as e: - manifest.logger.error(f"Encountered bad url: {github_url}") + logger.error(f"Encountered bad url: {github_url}") raise e # Set the base of the url and place to hold contributors to insert @@ -35,11 +39,11 @@ def query_github_contributors(manifest, github_url): duplicate_col_map = {'cntrb_login': 'login'} #list to hold contributors needing insertion or update - contributor_list = GithubPaginator(contributors_url, manifest.key_auth,manifest.logger)#paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) + contributor_list = GithubPaginator(contributors_url, key_auth, logger)#paginate(contributors_url, duplicate_col_map, update_col_map, table, table_pkey) len_contributor_list = len(contributor_list) - manifest.logger.info("Count of contributors needing insertion: " + str(len_contributor_list) + "\n") + logger.info("Count of contributors needing insertion: " + str(len_contributor_list) + "\n") if len_contributor_list == 0: return @@ -52,13 +56,13 @@ def query_github_contributors(manifest, github_url): cntrb_url = ("https://api.github.com/users/" + repo_contributor['login']) - manifest.logger.info("Hitting endpoint: " + cntrb_url + " ...\n") + logger.info("Hitting endpoint: " + cntrb_url + " ...\n") #r = hit_api(session.oauths, cntrb_url, session.logger) #contributor = r.json() - contributor, result = retrieve_dict_from_endpoint(manifest.logger,manifest.key_auth, cntrb_url) + contributor, result = retrieve_dict_from_endpoint(logger, key_auth, cntrb_url) - #manifest.logger.info(f"Contributor: {contributor} \n") + #logger.info(f"Contributor: {contributor} \n") company = None location = None email = None @@ -76,7 +80,7 @@ def query_github_contributors(manifest, github_url): #cntrb_id = AugurUUID(session.platform_id,contributor['id']).to_UUID() cntrb_id = GithubUUID() 
cntrb_id["user"] = int(contributor['id']) - cntrb_id["platform"] = manifest.platform_id + cntrb_id["platform"] = platform_id cntrb = { "cntrb_id" : cntrb_id.to_UUID(), @@ -115,20 +119,17 @@ def query_github_contributors(manifest, github_url): cntrb_natural_keys = ['cntrb_id'] #insert cntrb to table. #session.logger.info(f"Contributor: {cntrb} \n") - manifest.augur_db.insert_data(cntrb,Contributor,cntrb_natural_keys) + bulk_insert_dicts(cntrb,Contributor,cntrb_natural_keys) except Exception as e: - manifest.logger.error("Caught exception: {}".format(e)) - manifest.logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) + logger.error("Caught exception: {}".format(e)) + logger.error("Cascading Contributor Anomalie from missing repo contributor data: {} ...\n".format(cntrb_url)) raise e # Get all the committer data for a repo. # Used by facade in facade03analyzecommit -def grab_committer_list(manifest, repo_id, platform="github"): +def grab_committer_list(logger, key_auth, repo_git, platform="github"): # Create API endpoint from repo_id - - endpoint = create_endpoint_from_repo_id(manifest.logger,manifest.augur_db, repo_id) - - query_github_contributors(manifest,endpoint) + query_github_contributors(logger, key_auth, repo_git) \ No newline at end of file From 7a3a50dbcdc44598fd0d791820b9a1a0a6c63967 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 09:25:56 -0500 Subject: [PATCH 029/122] Reduce dependence on task manifest db Signed-off-by: Andrew Brain --- .../contributor_interface.py | 19 --------- augur/tasks/github/issues/tasks.py | 16 +++---- augur/tasks/github/messages/tasks.py | 10 ++--- .../pull_requests/commits_model/core.py | 7 ++-- .../pull_requests/commits_model/tasks.py | 2 +- .../github/pull_requests/files_model/core.py | 7 ++-- .../github/pull_requests/files_model/tasks.py | 2 +- augur/tasks/github/pull_requests/tasks.py | 42 +++++++++---------- augur/tasks/github/releases/core.py | 3 +- augur/tasks/github/repo_info/core.py | 7 ++-- augur/tasks/github/repo_info/tasks.py | 4 +- augur/tasks/github/traffic/tasks.py | 8 ++-- 12 files changed, 54 insertions(+), 73 deletions(-) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 3af210366f..e6687b767b 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -391,22 +391,3 @@ def get_login_with_commit_hash(logger,db,auth, commit_data, repo_id): match = None return match - - - -def create_endpoint_from_repo_id(logger,db, repo_id): - - """ - SELECT repo_git from repo - WHERE repo_id = :repo_id_bind - """ - #ORM syntax of above statement - result = get_repo_by_repo_id(repo_id) - - url = result.repo_git - logger.info(f"Url: {url}") - - return url - - - diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 565a55e35d..85ecbd315a 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -11,9 +11,9 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor, Repo +from 
augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor from augur.application.config import get_development_flag -from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts development = get_development_flag() @@ -36,7 +36,7 @@ def collect_issues(repo_git : str) -> int: if issue_data: total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger, manifest.augur_db) + process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) return total_issues else: @@ -83,7 +83,7 @@ def retrieve_all_issue_data(repo_git, logger, key_auth) -> None: return all_data -def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: +def process_issues(issues, task_name, repo_id, logger) -> None: # get repo_id or have it passed tool_source = "Issue Task" @@ -137,7 +137,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) # insert the issues into the issues table. @@ -148,7 +148,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_return_columns = ["issue_url", "issue_id"] issue_string_columns = ["issue_title", "issue_body"] try: - issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) except IntegrityError as e: logger.error(f"Ran into integrity error:{e} \n Offending data: \n{issue_dicts}") @@ -181,13 +181,13 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - augur_db.insert_data(issue_label_dicts, IssueLabel, + bulk_insert_dicts(issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. 
issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 35ab57828c..80e3e195a5 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -9,7 +9,7 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo -from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts platform_id = 1 @@ -170,13 +170,13 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -199,11 +199,11 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(pr_message_ref_dicts)} pr messages ref rows") pr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - augur_db.insert_data(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) + bulk_insert_dicts(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) logger.info(f"{task_name}: Inserted {len(message_dicts)} messages. 
{len(issue_message_ref_dicts)} from issues and {len(pr_message_ref_dicts)} from prs") diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index e1d2d1fee7..af710b5d50 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -2,8 +2,9 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo +from augur.application.db.lib import bulk_insert_dicts, fetchall_data_from_sql_text -def pull_request_commits_model(repo,logger, augur_db, key_auth): +def pull_request_commits_model(repo,logger, key_auth): # query existing PRs and the respective url we will append the commits url to pr_url_sql = s.sql.text(""" @@ -14,7 +15,7 @@ def pull_request_commits_model(repo,logger, augur_db, key_auth): pr_urls = [] #pd.read_sql(pr_number_sql, self.db, params={}) - pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() + pr_urls = fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() owner, name = get_owner_repo(repo.repo_git) @@ -52,7 +53,7 @@ def pull_request_commits_model(repo,logger, augur_db, key_auth): if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(all_data,PullRequestCommit,pr_commits_natural_keys) diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index d681b4fd33..f3184f8b74 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -16,4 +16,4 @@ def process_pull_request_commits(repo_git: str) -> None: with GithubTaskManifest(logger) as manifest: - pull_request_commits_model(repo, logger, manifest.augur_db, manifest.key_auth) + pull_request_commits_model(repo, logger, manifest.key_auth) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 3dfb7aec39..98af66220a 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -2,8 +2,9 @@ from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo +from augur.application.db.lib import bulk_insert_dicts, execute_sql -def pull_request_files_model(repo,logger, augur_db, key_auth): +def pull_request_files_model(repo,logger, key_auth): # query existing PRs and the respective url we will append the commits url to pr_number_sql = s.sql.text(""" @@ -14,7 +15,7 @@ def pull_request_files_model(repo,logger, augur_db, key_auth): pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) - result = augur_db.execute_sql(pr_number_sql)#.fetchall() + result = execute_sql(pr_number_sql)#.fetchall() pr_numbers = [dict(row) for row in result.mappings()] owner, name = get_owner_repo(repo.repo_git) @@ -74,4 +75,4 @@ def pull_request_files_model(repo,logger, augur_db, key_auth): if len(pr_file_rows) > 0: #Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - 
augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 48be47e284..762a8c24f8 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -15,4 +15,4 @@ def process_pull_request_files(repo_git: str) -> None: with GithubTaskManifest(logger) as manifest: - pull_request_files_model(repo, logger, manifest.augur_db, manifest.key_auth) \ No newline at end of file + pull_request_files_model(repo, logger, manifest.key_auth) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 4ef737105d..6e1326e9f4 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -8,8 +8,8 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo -from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors @@ -30,7 +30,7 @@ def collect_pull_requests(repo_git: str) -> int: pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, manifest.augur_db) + process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger) return len(pr_data) else: @@ -70,7 +70,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: return all_data -def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): +def process_pull_requests(pull_requests, task_name, repo_id, logger): """ Parse and insert all retrieved PR data. @@ -92,7 +92,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) # insert the prs into the pull_requests table. 
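# --- For reference (not part of this patch series) -----------------------
# The hunks in this series replace augur_db.insert_data(...) with
# bulk_insert_dicts(...) imported from augur.application.db.lib, but that
# helper's definition is not shown here. The following is a minimal sketch of
# what it is assumed to look like, inferred only from the call sites above
# (the signature and the upsert-on-natural-keys behaviour are assumptions).

from typing import List, Optional

from sqlalchemy.dialects.postgresql import insert as pg_insert

from augur.application.db import get_engine


def bulk_insert_dicts(data: List[dict], table, natural_keys: List[str],
                      return_columns: Optional[List[str]] = None,
                      string_fields: Optional[List[str]] = None):
    """Upsert a list of dicts into `table`, keyed on `natural_keys`."""

    if not data:
        return [] if return_columns else None

    # Assumed purpose of string_fields: strip NUL bytes that PostgreSQL
    # text columns reject before attempting the insert.
    if string_fields:
        for row in data:
            for field in string_fields:
                if isinstance(row.get(field), str):
                    row[field] = row[field].replace("\x00", "")

    stmt = pg_insert(table).values(data)

    # On conflict with the natural keys, update every other non-primary-key
    # column with the incoming value.
    update_cols = {c.name: getattr(stmt.excluded, c.name)
                   for c in table.__table__.columns
                   if c.name not in natural_keys and not c.primary_key}
    stmt = stmt.on_conflict_do_update(index_elements=natural_keys, set_=update_cols)

    if return_columns:
        stmt = stmt.returning(*[getattr(table, col) for col in return_columns])

    engine = get_engine()
    with engine.begin() as connection:
        result = connection.execute(stmt)
        return [dict(row) for row in result.mappings()] if return_columns else None
# --------------------------------------------------------------------------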
@@ -102,7 +102,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): pr_natural_keys = ["repo_id", "pr_src_id"] pr_return_columns = ["pull_request_id", "pr_url"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = augur_db.insert_data(pr_dicts, PullRequest, pr_natural_keys, + pr_return_data = bulk_insert_dicts(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) if pr_return_data is None: @@ -141,24 +141,24 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - augur_db.insert_data(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) # inserting pr assignees # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - augur_db.insert_data(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) # inserting pr requested reviewers # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - augur_db.insert_data(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) + bulk_insert_dicts(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] pr_metadata_string_fields = ["pr_src_meta_label"] - augur_db.insert_data(pr_metadata_dicts, PullRequestMeta, + bulk_insert_dicts(pr_metadata_dicts, PullRequestMeta, pr_metadata_natural_keys, string_fields=pr_metadata_string_fields) @@ -208,10 +208,8 @@ def collect_pull_request_review_comments(repo_git: str) -> None: # define GithubTaskSession to handle insertions, and store oauth keys with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - query = augur_db.session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id) + + query = manifest.augur_db.session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id) pr_reviews = execute_session_query(query, 'all') # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id @@ -250,7 +248,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: contributors.append(contributor) logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) pr_review_comment_dicts = [] @@ -277,7 +275,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = augur_db.insert_data(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) + message_return_data = bulk_insert_dicts(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) if message_return_data is None: return @@ -307,7 +305,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - augur_db.insert_data(pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + bulk_insert_dicts(pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) @@ -326,10 +324,8 @@ def collect_pull_request_reviews(repo_git: str) -> None: repo_id = get_repo_by_repo_git(repo_git).repo_id with GithubTaskManifest(logger) as manifest: - - augur_db = manifest.augur_db - - query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) + + query = manifest.augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) prs = execute_session_query(query, 'all') pr_count = len(prs) @@ -373,7 +369,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: contributors.append(contributor) logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) pr_reviews = [] @@ -387,7 +383,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") pr_review_natural_keys = ["pr_review_src_id",] - augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) + bulk_insert_dicts(pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/releases/core.py 
b/augur/tasks/github/releases/core.py index b7f953c618..9a15539bd4 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -4,6 +4,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts def get_release_inf(repo_id, release, tag_only): @@ -77,7 +78,7 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): #Do an upsert string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] - augur_db.insert_data(release_inf,Release,['release_id'], string_fields=string_fields) + bulk_insert_dicts(release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 2a9f21af72..d25decae0f 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -6,6 +6,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.models import * +from augur.application.db.lib import execute_sql from augur.tasks.github.util.github_task_session import * from augur.application.db.models.augur_data import RepoBadging from urllib.parse import quote @@ -92,7 +93,7 @@ def grab_repo_info_from_graphql_endpoint(key_auth, logger, query): return data -def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): +def repo_info_model(key_auth, repo_orm_obj, logger): logger.info("Beginning filling the repo_info model for repo: " + repo_orm_obj.repo_git + "\n") owner, repo = get_owner_repo(repo_orm_obj.repo_git) @@ -270,7 +271,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): :tool_source, :tool_version, :data_source) """).bindparams(**rep_inf) - augur_db.execute_sql(insert_statement) + execute_sql(insert_statement) # Note that the addition of information about where a repository may be forked from, and whether a repository is archived, updates the `repo` table, not the `repo_info` table. forked = is_forked(key_auth, logger, owner, repo) @@ -283,7 +284,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): archived = 0 update_repo_data = s.sql.text("""UPDATE repo SET forked_from=:forked, repo_archived=:archived, repo_archived_date_collected=:archived_date_collected WHERE repo_id=:repo_id""").bindparams(forked=forked, archived=archived, archived_date_collected=archived_date_collected, repo_id=repo_orm_obj.repo_id) - augur_db.execute_sql(update_repo_data) + execute_sql(update_repo_data) logger.info(f"Inserted info for {owner}/{repo}\n") diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index 3494f5c5e5..2d07368f39 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -17,7 +17,7 @@ def collect_repo_info(repo_git: str): with GithubTaskManifest(logger) as manifest: - repo_info_model(manifest.augur_db, manifest.key_auth, repo, logger) + repo_info_model(manifest.key_auth, repo, logger) #Task to get CII api data for linux badge info using github data. 
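# --- For reference (not part of this patch series) -----------------------
# collect_repo_info and collect_linux_badge_info above now look the Repo row
# up through get_repo_by_repo_git from augur.application.db.lib instead of
# querying through the task manifest's session. The helper itself is not
# defined in these diffs; a minimal sketch of the assumed implementation:

from augur.application.db import get_session
from augur.application.db.models import Repo
from augur.application.db.util import execute_session_query


def get_repo_by_repo_git(repo_git: str) -> Repo:
    """Return the Repo ORM row whose repo_git matches, via a short-lived session."""

    with get_session() as session:
        query = session.query(Repo).filter(Repo.repo_git == repo_git)
        return execute_session_query(query, 'one')
# --------------------------------------------------------------------------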
@@ -30,4 +30,4 @@ def collect_linux_badge_info(repo_git: str): with GithubTaskManifest(logger) as manifest: - badges_model(logger, repo_git, repo.repo_id, augur_db) + badges_model(logger, repo_git, repo.repo_id, manifest.augur_db) diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index e374c312fc..a7c1fca998 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -7,7 +7,7 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import RepoClone -from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts @@ -28,7 +28,7 @@ def collect_github_repo_clones_data(repo_git: str) -> None: clones_data = retrieve_all_clones_data(repo_git, logger, manifest.key_auth) if clones_data: - process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id, manifest.augur_db) + process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id) else: logger.info(f"{owner}/{repo} has no clones") @@ -58,7 +58,7 @@ def retrieve_all_clones_data(repo_git: str, logger, key_auth): return all_data -def process_clones_data(clones_data, task_name, repo_id, logger, augur_db) -> None: +def process_clones_data(clones_data, task_name, repo_id, logger) -> None: clone_history_data = clones_data[0]['clones'] clone_history_data_dicts = extract_needed_clone_history_data(clone_history_data, repo_id) @@ -66,4 +66,4 @@ def process_clones_data(clones_data, task_name, repo_id, logger, augur_db) -> No clone_history_data = remove_duplicate_dicts(clone_history_data_dicts, 'clone_data_timestamp') logger.info(f"{task_name}: Inserting {len(clone_history_data_dicts)} clone history records") - augur_db.insert_data(clone_history_data_dicts, RepoClone, ['repo_id']) + bulk_insert_dicts(clone_history_data_dicts, RepoClone, ['repo_id']) From 9406688cf03b8c9b553b746f84d2181aae1196e5 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 09:55:08 -0500 Subject: [PATCH 030/122] Remove references to execute_sql and insert_data Signed-off-by: Andrew Brain --- .../contributor_breadth_worker.py | 3 +- .../git/dependency_libyear_tasks/core.py | 1 - augur/tasks/git/dependency_tasks/core.py | 2 +- .../facade_worker/utilitymethods.py | 2 +- augur/tasks/github/contributors/tasks.py | 4 +- augur/tasks/github/detect_move/core.py | 9 +-- augur/tasks/github/events/tasks.py | 8 +-- .../contributor_interface.py | 14 ++-- augur/tasks/github/facade_github/tasks.py | 15 ++-- .../pull_requests/commits_model/core.py | 3 +- augur/tasks/github/pull_requests/core.py | 33 ++++----- augur/tasks/github/repo_info/core.py | 2 +- augur/tasks/gitlab/events_task.py | 5 +- augur/tasks/gitlab/issues_task.py | 19 +++--- augur/tasks/gitlab/merge_request_task.py | 23 ++++--- augur/tasks/start_tasks.py | 68 +++++++++---------- augur/tasks/util/collection_util.py | 27 ++++---- .../test_github_tasks/test_pull_requests.py | 4 +- 18 files changed, 123 insertions(+), 119 deletions(-) diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 15660e763b..c8f65c7902 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -7,6 +7,7 @@ from 
augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import ContributorRepo +from augur.application.db.lib import bulk_insert_dicts ### This worker scans all the platform users in Augur, and pulls their platform activity ### logs. Those are then used to analyze what repos each is working in (which will include repos not @@ -117,7 +118,7 @@ def contributor_breadth_model(self) -> None: logger.info(f"Inserting {len(events)} events") natural_keys = ["event_id", "tool_version"] - manifest.augur_db.insert_data(events, ContributorRepo, natural_keys) + bulk_insert_dicts(events, ContributorRepo, natural_keys) def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source): diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index e87e0c684b..b892570ad2 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -62,7 +62,6 @@ def generate_deps_libyear_data(logger, repo_id, path): # VALUES (:repo_id, :name,:requirement,:type,:package_manager,:current_verion,:latest_version,:current_release_date,:latest_release_date,:libyear,:tool_source,:tool_version,:data_source, :data_collection_date) #""").bindparams(**repo_deps) # - #session.execute_sql(insert_statement) to_insert.append(repo_deps) bulk_insert_dicts(logger, to_insert, RepoDepsLibyear, ["repo_id","name","data_collection_date"]) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index bc3444551c..cbdd3ff7a5 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -108,7 +108,7 @@ def generate_scorecard(logger, repo_git): 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') } to_insert.append(overall_deps_scorecard) - # session.insert_data(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) + # bulk_insert_dicts(overall_deps_scorecard, RepoDepsScorecard, ["repo_id","name"]) #Store misc data from scorecard in json field. 
for check in required_output['checks']: diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index fe3325754b..40f3a29e0e 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -46,7 +46,7 @@ def update_repo_log(logger, facade_helper, repos_id,status): log_message = s.sql.text("""INSERT INTO repos_fetch_log (repos_id,status) VALUES (:repo_id,:repo_status)""").bindparams(repo_id=repos_id,repo_status=status) - #session.insert_data(data,t_repos_fetch_log,['repos_id','status']) + #bulk_insert_dicts(data,t_repos_fetch_log,['repos_id','status']) execute_sql(log_message) except Exception as e: logger.error(f"Ran into error in update_repo_log: {e}") diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index cd62fa643e..080647eace 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -8,7 +8,7 @@ from augur.tasks.github.facade_github.tasks import * from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts @celery.task @@ -61,7 +61,7 @@ def process_contributors(): enriched_contributors.append(contributor_dict) logger.info(f"Enriching {len(enriched_contributors)} contributors") - augur_db.insert_data(enriched_contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(enriched_contributors, Contributor, ["cntrb_id"]) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 2bf96ffa1f..da8449f760 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,10 +6,11 @@ from datetime import datetime from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts -def update_repo_with_dict(repo,new_dict,logger,db): +def update_repo_with_dict(repo,new_dict,logger): """ Update a repository record in the database using a dictionary tagged with the appropriate table fields @@ -25,7 +26,7 @@ def update_repo_with_dict(repo,new_dict,logger,db): del to_insert['_sa_instance_state'] to_insert.update(new_dict) - result = db.insert_data(to_insert, Repo, ['repo_id']) + result = bulk_insert_dicts(to_insert, Repo, ['repo_id']) url = to_insert['repo_git'] logger.info(f"Updated repo for {url}\n") @@ -76,7 +77,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(repo, repo_update_dict, logger,augur_db) + update_repo_with_dict(repo, repo_update_dict, logger) raise Exception("ERROR: Repo has moved! 
Resetting Collection!") @@ -90,7 +91,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') } - update_repo_with_dict(repo, repo_update_dict, logger, augur_db) + update_repo_with_dict(repo, repo_update_dict, logger) statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index dc76138f2f..ae7e466178 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -10,7 +10,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor -from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts platform_id = 1 @@ -154,7 +154,7 @@ def process_events(events, task_name, repo_id, logger, augur_db): # remove contributors that were found in the data more than once contributors = remove_duplicate_dicts(contributors) - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) issue_events_len = len(issue_event_dicts) pr_events_len = len(pr_event_dicts) @@ -168,10 +168,10 @@ def process_events(events, task_name, repo_id, logger, augur_db): # TODO: Could replace this with "id" but it isn't stored on the table for some reason pr_event_natural_keys = ["node_id"] - augur_db.insert_data(pr_event_dicts, PullRequestEvent, pr_event_natural_keys) + bulk_insert_dicts(pr_event_dicts, PullRequestEvent, pr_event_natural_keys) issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(issue_event_dicts, IssueEvent, issue_event_natural_keys) update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index e6687b767b..8da45315df 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -8,7 +8,7 @@ import traceback from augur.tasks.github.util.github_paginator import GithubApiResult from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_repo_by_repo_id +from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -191,7 +191,7 @@ def insert_alias(logger,db, contributor, email): # Insert new alias - db.insert_data(alias, ContributorsAlias, ['alias_email']) + bulk_insert_dicts(alias, ContributorsAlias, ['alias_email']) return @@ -199,7 +199,7 @@ def insert_alias(logger,db, contributor, email): # Takes the user data from the endpoint as arg # Updates the alias table if the login is already in the contributor's table with the new email. 
# Returns whether the login was found in the contributors table -def resolve_if_login_existing(session, contributor): +def resolve_if_login_existing(logger, contributor): # check if login exists in contributors table select_cntrbs_query = s.sql.text(""" SELECT cntrb_id from contributors @@ -209,7 +209,7 @@ def resolve_if_login_existing(session, contributor): # Bind parameter select_cntrbs_query = select_cntrbs_query.bindparams( gh_login_value=contributor['cntrb_login']) - result = session.execute_sql(select_cntrbs_query) + result = execute_sql(select_cntrbs_query) # if yes if len(result.fetchall()) >= 1: @@ -217,7 +217,7 @@ def resolve_if_login_existing(session, contributor): return True # If not found, return false - session.logger.info( + logger.info( f"Contributor not found in contributors table but can be added. Adding...") return False """ @@ -310,7 +310,7 @@ def fetch_username_from_email(logger, auth, commit): # Method to return the login given commit data using the supplemental data in the commit # -email # -name -def get_login_with_supplemental_data(logger,db,auth, commit_data): +def get_login_with_supplemental_data(logger, auth, commit_data): # Try to get login from all possible emails # Is None upon failure. @@ -328,7 +328,7 @@ def get_login_with_supplemental_data(logger,db,auth, commit_data): try: unresolved_natural_keys = ['email'] - db.insert_data(unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + bulk_insert_dicts(unresolved, UnresolvedCommitEmail, unresolved_natural_keys) except Exception as e: logger.error( f"Could not create new unresolved email {unresolved['email']}. Error: {e}") diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 46ae367171..d363ab4307 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -63,7 +63,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) if login == None or login == "": logger.info("Failed to get login from commit hash") # Try to get the login from supplemental data if not found with the commit hash - login = get_login_with_supplemental_data(logger, db, auth,contributor) + login = get_login_with_supplemental_data(logger, auth,contributor) if login == None or login == "": logger.error("Failed to get login from supplemental data!") @@ -131,8 +131,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Executes an upsert with sqlalchemy cntrb_natural_keys = ['cntrb_id'] - db.insert_data(cntrb,Contributor,cntrb_natural_keys) - + bulk_insert_dicts(cntrb,Contributor,cntrb_natural_keys) try: # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey @@ -160,7 +159,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) logger.info(f"Updating now resolved email {email}") try: - db.execute_sql(query) + execute_sql(query) except Exception as e: logger.info( f"Deleting now resolved email failed with error: {e}") @@ -207,7 +206,7 @@ def insert_facade_contributors(self, repo_id): # Get all of the commit data's emails and names from the commit table that do not appear # in the contributors table or the contributors_aliases table. 
- manifest.logger.info( + logger.info( "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) new_contrib_sql = s.sql.text(""" SELECT DISTINCT @@ -247,7 +246,7 @@ def insert_facade_contributors(self, repo_id): """).bindparams(repo_id=repo_id) #Execute statement with session. - result = manifest.augur_db.execute_sql(new_contrib_sql) + result = execute_sql(new_contrib_sql) new_contribs = [dict(row) for row in result.mappings()] #print(new_contribs) @@ -257,9 +256,9 @@ def insert_facade_contributors(self, repo_id): - process_commit_metadata(manifest.logger,manifest.augur_db,manifest.key_auth,list(new_contribs),repo_id,manifest.platform_id) + process_commit_metadata(logger,manifest.augur_db,manifest.key_auth,list(new_contribs),repo_id,manifest.platform_id) - manifest.logger.debug("DEBUG: Got through the new_contribs") + logger.debug("DEBUG: Got through the new_contribs") facade_helper = FacadeHelper(logger) diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index af710b5d50..8635c9b4e0 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -15,8 +15,7 @@ def pull_request_commits_model(repo,logger, key_auth): pr_urls = [] #pd.read_sql(pr_number_sql, self.db, params={}) - pr_urls = fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() - + pr_urls = fetchall_data_from_sql_text(pr_url_sql) owner, name = get_owner_repo(repo.repo_git) task_name = f"{owner}/{name} Pr commits" diff --git a/augur/tasks/github/pull_requests/core.py b/augur/tasks/github/pull_requests/core.py index 5bc86cd676..02837c6f72 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/augur/tasks/github/pull_requests/core.py @@ -4,6 +4,7 @@ from augur.application.db.data_parse import * from augur.application.db.session import DatabaseSession +from augur.application.db.lib import bulk_insert_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, Contributor @@ -129,12 +130,12 @@ def extract_data_from_pr_list(pull_requests: List[dict], return pr_dicts, pr_mapping_data, pr_numbers, contributors -def insert_pr_contributors(contributors: List[dict], session: DatabaseSession, task_name: str) -> None: +def insert_pr_contributors(contributors: List[dict], logger, task_name: str) -> None: """Insert pr contributors Args: contributors: the contributor data that is being inserted - session: database session to insert the data with + logger task_name: to differiante between log statements since there are multiple tasks of the same type """ @@ -142,16 +143,16 @@ def insert_pr_contributors(contributors: List[dict], session: DatabaseSession, t contributors = remove_duplicate_dicts(contributors) # insert contributors from these prs - session.logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - session.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) -def insert_prs(pr_dicts: List[dict], session: DatabaseSession, task_name: str) -> Optional[List[dict]]: +def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[dict]]: 
"""Insert pull requests Args: pr_dicts: the pull request data that is being inserted - session: database session to insert the data with + logger task_name: to differiante between log statements since there are multiple tasks of the same type Returns: @@ -159,10 +160,10 @@ def insert_prs(pr_dicts: List[dict], session: DatabaseSession, task_name: str) - So we can determine what labels, assigness, and other data belong to each pr """ - session.logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") + logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") pr_natural_keys = ["pr_url"] pr_return_columns = ["pull_request_id", "pr_url"] - pr_return_data = session.insert_data(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) + pr_return_data = bulk_insert_dicts(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) return pr_return_data @@ -211,7 +212,7 @@ def map_other_pr_data_to_pr( return pr_label_dicts, pr_assignee_dicts, pr_reviewer_dicts, pr_metadata_dicts -def insert_pr_labels(labels: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_labels(labels: List[dict], logger: logging.Logger) -> None: """Insert pull request labels Note: @@ -223,10 +224,10 @@ def insert_pr_labels(labels: List[dict], logger: logging.Logger, session) -> Non """ # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] - session.insert_data(labels, PullRequestLabel, pr_label_natural_keys) + bulk_insert_dicts(labels, PullRequestLabel, pr_label_natural_keys) -def insert_pr_assignees(assignees: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_assignees(assignees: List[dict], logger: logging.Logger) -> None: """Insert pull request assignees Note: @@ -238,10 +239,10 @@ def insert_pr_assignees(assignees: List[dict], logger: logging.Logger, session) """ # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - session.insert_data(assignees, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(assignees, PullRequestAssignee, pr_assignee_natural_keys) -def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger) -> None: """Insert pull request reviewers Note: @@ -253,10 +254,10 @@ def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger, session) """ # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - session.insert_data(reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + bulk_insert_dicts(reviewers, PullRequestReviewer, pr_reviewer_natural_keys) -def insert_pr_metadata(metadata: List[dict], logger: logging.Logger, session) -> None: +def insert_pr_metadata(metadata: List[dict], logger: logging.Logger) -> None: """Insert pull request metadata Note: @@ -269,7 +270,7 @@ def insert_pr_metadata(metadata: List[dict], logger: logging.Logger, session) -> # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - session.insert_data(metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(metadata, PullRequestMeta, pr_metadata_natural_keys) diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index d25decae0f..0cf6705fca 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -256,7 +256,7 @@ def repo_info_model(key_auth, repo_orm_obj, logger): 'data_source': "Github" } - #result = session.insert_data(rep_inf,RepoInfo,['repo_info_id']) #result = self.db.execute(self.repo_info_table.insert().values(rep_inf)) + #result = bulk_insert_dicts(rep_inf,RepoInfo,['repo_info_id']) #result = self.db.execute(self.repo_info_table.insert().values(rep_inf)) insert_statement = s.sql.text("""INSERT INTO repo_info (repo_id,last_updated,issues_enabled, open_issues,pull_requests_enabled,wiki_enabled,pages_enabled,fork_count, default_branch,watchers_count,license,stars_count, diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index a7b886da2d..186560d4e2 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -11,6 +11,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts platform_id = 2 @@ -153,7 +154,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(issue_event_dicts, IssueEvent, issue_event_natural_keys) def process_mr_events(events, task_name, repo_id, logger, augur_db): @@ -203,6 +204,6 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") mr_event_natural_keys = ["platform_id", "node_id"] - augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + bulk_insert_dicts(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index b96650c9a1..9b50b20b39 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -13,6 +13,7 @@ from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo, Contributor from augur.application.db.util import execute_session_query from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.lib import bulk_insert_dicts platform_id = 2 @@ -41,7 +42,7 @@ def collect_gitlab_issues(repo_git : str) -> int: issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) if issue_data: - issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger) return issue_ids else: @@ -87,7 +88,7 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: return all_data -def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: +def process_issues(issues, task_name, repo_id, logger) -> None: """ Retrieve only 
the needed data for issues from the api response @@ -142,14 +143,14 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = ["repo_id", "gh_issue_id"] issue_string_columns = ["issue_title", "issue_body"] issue_return_columns = ["gh_issue_id", "issue_id"] - issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) issue_label_dicts = [] issue_assignee_dicts = [] @@ -176,12 +177,12 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - augur_db.insert_data(issue_label_dicts, IssueLabel, + bulk_insert_dicts(issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids @@ -326,13 +327,13 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) issue_message_ref_dicts = [] @@ -349,7 +350,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) def process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source): diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 437909e78e..3b57de143a 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -9,6 +9,7 @@ from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, 
Message, Contributor, PullRequestAssignee from augur.application.db.util import execute_session_query from augur.tasks.util.worker_util import remove_duplicate_dicts +from augur.application.db.lib import bulk_insert_dicts platform_id = 2 @@ -126,13 +127,13 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] pr_return_columns = ["pull_request_id", "pr_src_id"] - pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + pr_return_data = bulk_insert_dicts(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) mr_assignee_dicts = [] @@ -154,11 +155,11 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + bulk_insert_dicts(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) return mr_ids @@ -250,13 +251,13 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} mr message contributors") - augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} mr messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) mr_message_ref_dicts = [] @@ -273,7 +274,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - augur_db.insert_data(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + bulk_insert_dicts(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -339,7 +340,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 
'pr_sha'] - augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(all_metadata, PullRequestMeta, pr_metadata_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -406,7 +407,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] - # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + # bulk_insert_dicts(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) @@ -475,7 +476,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(all_commits,PullRequestCommit,pr_commits_natural_keys) @@ -530,7 +531,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(all_files, PullRequestFile, pr_file_natural_keys) def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index f0906007b2..aaf7ddaaec 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -27,6 +27,7 @@ from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor +from augur.application.db.lib import execute_sql CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -153,7 +154,7 @@ def non_repo_domain_tasks(self): tasks.apply_async() -def build_primary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_primary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] primary_gitlab_enabled_phases = [] @@ -173,10 +174,10 @@ def core_task_success_util_gen(repo_git): primary_gitlab_enabled_phases.append(core_task_success_util_gen) primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) - primary_request.get_valid_repos(session) + primary_request.get_valid_repos(session, logger) return primary_request -def build_secondary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_secondary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] @@ -192,11 +193,11 @@ def secondary_task_success_util_gen(repo_git): secondary_enabled_phases.append(secondary_task_success_util_gen) request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) - request.get_valid_repos(session) + request.get_valid_repos(session, logger) return request -def build_facade_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def 
build_facade_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with facade collection facade_enabled_phases = [] @@ -214,10 +215,10 @@ def facade_task_update_weight_util_gen(repo_git): request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) - request.get_valid_repos(session) + request.get_valid_repos(session, logger) return request -def build_ml_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): +def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) @@ -228,7 +229,7 @@ def ml_task_success_util_gen(repo_git): ml_enabled_phases.append(ml_task_success_util_gen) request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) - request.get_valid_repos(session) + request.get_valid_repos(session, logger) return request @celery.task(bind=True) @@ -247,18 +248,18 @@ def augur_collection_monitor(self): enabled_collection_hooks = [] if primary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_primary_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) if secondary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_secondary_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_secondary_repo_collect_request(session, logger, enabled_phase_names)) #start_secondary_collection(session, max_repo=10) if facade_phase.__name__ in enabled_phase_names: #start_facade_collection(session, max_repo=30) - enabled_collection_hooks.append(build_facade_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) if machine_learning_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_ml_repo_collect_request(session,enabled_phase_names)) + enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) #start_ml_collection(session,max_repo=5) logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") @@ -324,27 +325,26 @@ def retry_errored_repos(self): #TODO: Isaac needs to normalize the status's to be abstract in the #collection_status table once augur dev is less unstable. 
- with DatabaseSession(logger,engine) as session: - query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" - f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" - f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" - f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is NULL;""" - f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" - f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is NULL;""" - f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" - f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is NULL;""" - - f"""UPDATE collection_status SET secondary_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is not NULL;""" - f"""UPDATE collection_status SET core_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is not NULL;;""" - f"""UPDATE collection_status SET facade_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is not NULL;;""" - f"""UPDATE collection_status SET ml_status = '{CollectionState.SUCCESS.value}'""" - f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is not NULL;;""" - ) + query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is NULL;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is NULL;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is NULL;""" + + f"""UPDATE collection_status SET secondary_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is not NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is not NULL;;""" + ) - session.execute_sql(query) + execute_sql(query) @@ -366,11 +366,11 @@ def create_collection_status_records(self): SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) """) - repo = session.execute_sql(query).first() + repo = execute_sql(query).first() while repo is not None: CollectionStatus.insert(session,repo[0]) - repo = 
session.execute_sql(query).first() + repo = execute_sql(query).first() #Check for new repos every seven minutes to be out of step with the clone_repos task create_collection_status_records.si().apply_async(countdown=60*7) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index b4ff09ecb9..bffd9b69a3 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -14,11 +14,12 @@ from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue from augur.application.db.session import DatabaseSession from augur.application.db import get_engine +from augur.application.db.lib import execute_sql from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.tasks.util.collection_state import CollectionState -def get_list_of_all_users(session): +def get_list_of_all_users(): #Get a list of all users. query = s.sql.text(""" SELECT @@ -26,7 +27,7 @@ def get_list_of_all_users(session): FROM augur_operations.users """) - users = session.execute_sql(query).fetchall() + users = execute_sql(query).fetchall() return users @@ -133,7 +134,7 @@ def get_active_repo_count(self,session): return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) #Get repo urls based on passed in info. - def get_valid_repos(self,session): + def get_valid_repos(self,session, logger): #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook #Get the count of repos that are currently running this collection hook #status_column = f"{hook}_status" @@ -145,16 +146,16 @@ def get_valid_repos(self,session): limit = self.max_repo-active_repo_count #Extract the user id from the randomized list and split into four chunks - split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + split_user_list = split_random_users_list(f"{self.name}_status",self.new_status) - session.logger.info(f"User_list: {split_user_list}") + logger.info(f"User_list: {split_user_list}") #Iterate through each fourth of the users fetched for quarter_list in split_user_list: if limit <= 0: return - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + collection_list = get_valid_repos_for_users(logger,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) self.repo_list.extend(collection_list) #Update limit with amount of repos started @@ -165,7 +166,7 @@ def get_valid_repos(self,session): return - user_list = get_list_of_all_users(session) + user_list = get_list_of_all_users() random.shuffle(user_list) #Extract the user id from the randomized list and split into four chunks @@ -180,7 +181,7 @@ def get_valid_repos(self,session): #only start repos older than the specified amount of days #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored #Order by the relevant weight for the collection hook - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + collection_list = get_valid_repos_for_users(logger,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) self.repo_list.extend(collection_list) limit -= 
len(collection_list) @@ -625,7 +626,7 @@ def send_messages(self): # # return len(repo_git_identifiers) -def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): +def get_valid_repos_for_users(logger,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): condition_string = "1" @@ -652,10 +653,10 @@ def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook=" """).bindparams(list_of_user_ids=users,limit_num=limit) #Get a list of valid repo ids, limit set to 2 times the usual - valid_repos = session.execute_sql(repo_query).fetchall() + valid_repos = execute_sql(repo_query).fetchall() valid_repo_git_list = [repo[1] for repo in valid_repos] - session.logger.info(f"valid {hook} repo git list: {tuple(valid_repo_git_list)}") + logger.info(f"valid {hook} repo git list: {tuple(valid_repo_git_list)}") #start repos for new primary collection hook #collection_size = start_block_of_repos( @@ -666,7 +667,7 @@ def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook=" return valid_repo_git_list -def split_random_users_list(session,status_col, status_new): +def split_random_users_list(status_col, status_new): #Split all users that have new repos into four lists and randomize order query = s.sql.text(f""" SELECT @@ -679,7 +680,7 @@ def split_random_users_list(session,status_col, status_new): GROUP BY user_id """) - user_list = session.execute_sql(query).fetchall() + user_list = execute_sql(query).fetchall() random.shuffle(user_list) #Extract the user id from the randomized list and split into four chunks diff --git a/tests/test_tasks/test_github_tasks/test_pull_requests.py b/tests/test_tasks/test_github_tasks/test_pull_requests.py index a047ca1698..d23d2f0f3d 100644 --- a/tests/test_tasks/test_github_tasks/test_pull_requests.py +++ b/tests/test_tasks/test_github_tasks/test_pull_requests.py @@ -259,7 +259,7 @@ def test_insert_pr_contributors(github_api_key_headers, test_db_session, pr_numb unique_contributors.append(cntrb["login"]) - insert_pr_contributors(contributors_to_pass_to_insert, test_db_session, "Insert contrbibutors test") + insert_pr_contributors(contributors_to_pass_to_insert, logger, "Insert contrbibutors test") with test_db_session.engine.connect() as connection: @@ -332,7 +332,7 @@ def test_insert_prs(github_api_key_headers, test_db_session, repo): ) - return_data = insert_prs(prs_insert, test_db_session, "Insert contrbibutors test") + return_data = insert_prs(prs_insert, logger, "Insert contrbibutors test") with test_db_session.engine.connect() as connection: From c5687e7da535dde04e75b5401fe299ad62f71f35 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 10:01:27 -0500 Subject: [PATCH 031/122] Remove references of session.logger Signed-off-by: Andrew Brain --- augur/api/routes/dei.py | 2 +- .../application/db/models/augur_operations.py | 10 +++--- .../contributor_interface.py | 34 +++++++++---------- augur/tasks/github/facade_github/core.py | 2 +- augur/tasks/start_tasks.py | 4 +-- augur/tasks/util/collection_util.py | 6 ++-- 6 files changed, 29 insertions(+), 29 deletions(-) diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index 990a6e7368..621c89604d 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -92,7 +92,7 @@ def core_task_success_util_gen(repo_git): deiHook = CollectionRequest("core",primary_enabled_phases) deiHook.repo_list = [repo_url] - singleRoutine = AugurTaskRoutine(session,[deiHook]) + 
singleRoutine = AugurTaskRoutine(logger, session,[deiHook]) singleRoutine.start_data_collection() #start_block_of_repos(logger, session, [repo_url], primary_enabled_phases, "new") diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 47f28b12f2..0ce224fec6 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1224,7 +1224,7 @@ class CollectionStatus(Base): repo = relationship("Repo", back_populates="collection_status") @staticmethod - def insert(session, repo_id): + def insert(session, logger, repo_id): from augur.tasks.github.util.util import get_repo_weight_by_issue from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -1237,13 +1237,13 @@ def insert(session, repo_id): if "github" in repo_git: try: - pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + pr_issue_count = get_repo_weight_by_issue(logger, repo_git) #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) except Exception as e: pr_issue_count = None github_weight = None - session.logger.error( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) else: try: @@ -1252,7 +1252,7 @@ def insert(session, repo_id): except Exception as e: pr_issue_count = None github_weight = None - session.logger.error( + logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) @@ -1267,7 +1267,7 @@ def insert(session, repo_id): result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) - session.logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") + logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") if not result: return False diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 8da45315df..a746bc79f2 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -25,8 +25,8 @@ # Hit the endpoint specified by the url and return the json that it returns if it returns a dict. # Returns None on failure. # NOTE: This function is being deprecated in favor of retrieve_dict_from_endpoint -def request_dict_from_endpoint(session, url, timeout_wait=10): - #session.logger.info(f"Hitting endpoint: {url}") +def request_dict_from_endpoint(logger, session, url, timeout_wait=10): + #logger.info(f"Hitting endpoint: {url}") attempts = 0 response_data = None @@ -34,9 +34,9 @@ def request_dict_from_endpoint(session, url, timeout_wait=10): while attempts < 10: try: - response = hit_api(session.oauths, url, session.logger) + response = hit_api(session.oauths, url, logger) except TimeoutError: - session.logger.info( + logger.info( f"User data request for enriching contributor data failed with {attempts} attempts! 
Trying again...") time.sleep(timeout_wait) continue @@ -51,34 +51,34 @@ def request_dict_from_endpoint(session, url, timeout_wait=10): response_data = json.loads(json.dumps(response.text)) if type(response_data) == dict: - err = process_dict_response(session.logger,response,response_data) + err = process_dict_response(logger,response,response_data) #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: attempts += 1 - session.logger.info(f"err: {err}") + logger.info(f"err: {err}") continue - #session.logger.info(f"Returned dict: {response_data}") + #logger.info(f"Returned dict: {response_data}") success = True break elif type(response_data) == list: - session.logger.warning("Wrong type returned, trying again...") - session.logger.info(f"Returned list: {response_data}") + logger.warning("Wrong type returned, trying again...") + logger.info(f"Returned list: {response_data}") elif type(response_data) == str: - session.logger.info( + logger.info( f"Warning! page_data was string: {response_data}") if "" in response_data: - session.logger.info("HTML was returned, trying again...\n") + logger.info("HTML was returned, trying again...\n") elif len(response_data) == 0: - session.logger.warning("Empty string, trying again...\n") + logger.warning("Empty string, trying again...\n") else: try: # Sometimes raw text can be converted to a dict response_data = json.loads(response_data) - err = process_dict_response(session.logger,response,response_data) + err = process_dict_response(logger,response,response_data) #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: @@ -120,7 +120,7 @@ def create_endpoint_from_commit_sha(logger,db,commit_sha, repo_id): raise KeyError # Else put into a more readable local var - #session.logger.info(f"Result: {result}") + #logger.info(f"Result: {result}") split_git = result.repo_git.split('/') repo_name_and_org = split_git[-2] + "/" + result.repo_name @@ -174,9 +174,9 @@ def insert_alias(logger,db, contributor, email): logger.info( f"There are more than one contributors in the table with gh_user_id={contributor['gh_user_id']}") - #session.logger.info(f"Creating alias for email: {email}") + #logger.info(f"Creating alias for email: {email}") - #session.logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") + #logger.info(f"{contributor_table_data} has type {type(contributor_table_data)}") # Insert a new alias that corresponds to where the contributor was found # use the email of the new alias for canonical_email if the api returns NULL # TODO: It might be better to have the canonical_email allowed to be NUll because right now it has a null constraint. 
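These contributor-interface hunks apply one rule throughout: a function that only needs to log and run SQL takes a Logger argument instead of reaching through a session object. A rough sketch of the resulting shape, assuming an illustrative function name and query rather than code copied from the patch:

    import logging

    import sqlalchemy as s

    from augur.application.db.lib import execute_sql

    def login_exists(logger: logging.Logger, login: str) -> bool:
        # The shared helper owns the connection; the caller only supplies the logger.
        query = s.sql.text(
            "SELECT cntrb_id FROM contributors WHERE cntrb_login = :login"
        ).bindparams(login=login)

        if len(execute_sql(query).fetchall()) >= 1:
            return True

        logger.info("Contributor not found in contributors table but can be added.")
        return False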
@@ -275,7 +275,7 @@ def fetch_username_from_email(logger, auth, commit): # Default to failed state login_json = None - #session.logger.info(f"Here is the commit: {commit}") + #logger.info(f"Here is the commit: {commit}") # email = commit['email_raw'] if 'email_raw' in commit else commit['email_raw'] diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index 962efb766b..849bce5ce6 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -57,7 +57,7 @@ def query_github_contributors(logger, key_auth, github_url): logger.info("Hitting endpoint: " + cntrb_url + " ...\n") - #r = hit_api(session.oauths, cntrb_url, session.logger) + #r = hit_api(session.oauths, cntrb_url, logger) #contributor = r.json() contributor, result = retrieve_dict_from_endpoint(logger, key_auth, cntrb_url) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index aaf7ddaaec..709f57494d 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -263,7 +263,7 @@ def augur_collection_monitor(self): #start_ml_collection(session,max_repo=5) logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - main_routine = AugurTaskRoutine(session,enabled_collection_hooks) + main_routine = AugurTaskRoutine(logger, session,enabled_collection_hooks) main_routine.start_data_collection() @@ -369,7 +369,7 @@ def create_collection_status_records(self): repo = execute_sql(query).first() while repo is not None: - CollectionStatus.insert(session,repo[0]) + CollectionStatus.insert(session, logger, repo[0]) repo = execute_sql(query).first() #Check for new repos every seven minutes to be out of step with the clone_repos task diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index bffd9b69a3..d9726b4634 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -536,8 +536,8 @@ class to keep track of various groups of collection tasks for a group of repos. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. 
core session: Database session to use """ - def __init__(self,session,collection_hooks): - self.logger = session.logger + def __init__(self, logger, session,collection_hooks): + self.logger = logger self.collection_hooks = collection_hooks self.session = session @@ -660,7 +660,7 @@ def get_valid_repos_for_users(logger,limit,users,allow_old_repos = False,hook="c #start repos for new primary collection hook #collection_size = start_block_of_repos( - # session.logger, session, + # logger, session, # valid_repo_git_list, # phases, repos_type=repos_type, hook=hook #) From 7683067697755907efa136091fb07a8e137a1461 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 10:10:08 -0500 Subject: [PATCH 032/122] Reduce usage of GithubTaskManifest where only key auth is needed Signed-off-by: Andrew Brain --- .../contributor_breadth_worker.py | 57 +++++++++---------- augur/tasks/github/contributors/tasks.py | 6 +- augur/tasks/github/issues/tasks.py | 30 +++++----- .../pull_requests/commits_model/tasks.py | 6 +- augur/tasks/github/pull_requests/tasks.py | 17 +++--- augur/tasks/github/repo_info/tasks.py | 5 +- augur/tasks/github/traffic/tasks.py | 14 ++--- .../tasks/github/util/github_task_session.py | 1 - 8 files changed, 69 insertions(+), 67 deletions(-) diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index c8f65c7902..075da26831 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -8,6 +8,7 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import ContributorRepo from augur.application.db.lib import bulk_insert_dicts +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth ### This worker scans all the platform users in Augur, and pulls their platform activity ### logs. 
Those are then used to analyze what repos each is working in (which will include repos not @@ -26,6 +27,8 @@ def contributor_breadth_model(self) -> None: tool_version = '0.0.1' data_source = 'GitHub API' + key_auth = GithubRandomKeyAuth(logger) + # This version of the query pulls contributors who have not had any data collected yet # To the top of the list cntrb_login_query = s.sql.text(""" @@ -61,7 +64,6 @@ def contributor_breadth_model(self) -> None: current_cntrb_logins = [dict(row) for row in result.mappings()] - cntrb_newest_events_query = s.sql.text(""" SELECT c.gh_login, MAX(cr.created_at) as newest_event_date FROM contributor_repo AS cr @@ -82,43 +84,40 @@ def contributor_breadth_model(self) -> None: cntrb_newest_events_map[gh_login] = newest_event_date + index = 1 + total = len(current_cntrb_logins) + for cntrb in current_cntrb_logins: - with GithubTaskManifest(logger) as manifest: - - index = 1 - total = len(current_cntrb_logins) - for cntrb in current_cntrb_logins: - - print(f"Processing cntrb {index} of {total}") - index += 1 + print(f"Processing cntrb {index} of {total}") + index += 1 - repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" + repo_cntrb_url = f"https://api.github.com/users/{cntrb['gh_login']}/events" - newest_event_in_db = datetime(1970, 1, 1) - if cntrb["gh_login"] in cntrb_newest_events_map: - newest_event_in_db = cntrb_newest_events_map[cntrb["gh_login"]] - + newest_event_in_db = datetime(1970, 1, 1) + if cntrb["gh_login"] in cntrb_newest_events_map: + newest_event_in_db = cntrb_newest_events_map[cntrb["gh_login"]] + - cntrb_events = [] - for page_data, page in GithubPaginator(repo_cntrb_url, manifest.key_auth, logger).iter_pages(): + cntrb_events = [] + for page_data, page in GithubPaginator(repo_cntrb_url, key_auth, logger).iter_pages(): - if page_data: - cntrb_events += page_data + if page_data: + cntrb_events += page_data - oldest_event_on_page = datetime.strptime(page_data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ") - if oldest_event_on_page < newest_event_in_db: - print("Found cntrb events we already have...skipping the rest") - break + oldest_event_on_page = datetime.strptime(page_data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ") + if oldest_event_on_page < newest_event_in_db: + print("Found cntrb events we already have...skipping the rest") + break - if len(cntrb_events) == 0: - logger.info("There are no cntrb events, or new events for this user.\n") - continue + if len(cntrb_events) == 0: + logger.info("There are no cntrb events, or new events for this user.\n") + continue - events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) + events = process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source) - logger.info(f"Inserting {len(events)} events") - natural_keys = ["event_id", "tool_version"] - bulk_insert_dicts(events, ContributorRepo, natural_keys) + logger.info(f"Inserting {len(events)} events") + natural_keys = ["event_id", "tool_version"] + bulk_insert_dicts(events, ContributorRepo, natural_keys) def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source): diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 080647eace..7804fd0a57 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -9,6 +9,8 @@ from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query 
from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + @celery.task @@ -115,8 +117,8 @@ def grab_comitters(self, repo_git,platform="github"): repo_id = repo.repo_id try: - with GithubTaskManifest(logger) as manifest: - grab_committer_list(logger, manifest.key_auth, repo_id,platform) + key_auth = GithubRandomKeyAuth(logger) + grab_committer_list(logger, key_auth, repo_id,platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 85ecbd315a..18d631afb8 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -8,7 +8,7 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor @@ -27,24 +27,24 @@ def collect_issues(repo_git : str) -> int: owner, repo = get_owner_repo(repo_git) - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - logger.info(f'this is the manifest.key_auth value: {str(manifest.key_auth)}') + logger.info(f'this is the manifest.key_auth value: {str(key_auth)}') - try: - issue_data = retrieve_all_issue_data(repo_git, logger, manifest.key_auth) + try: + issue_data = retrieve_all_issue_data(repo_git, logger, key_auth) - if issue_data: - total_issues = len(issue_data) - process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) + if issue_data: + total_issues = len(issue_data) + process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger) - return total_issues - else: - logger.info(f"{owner}/{repo} has no issues") - return 0 - except Exception as e: - logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") - return -1 + return total_issues + else: + logger.info(f"{owner}/{repo} has no issues") + return 0 + except Exception as e: + logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index f3184f8b74..b1d920e986 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -2,7 +2,7 @@ from augur.tasks.github.pull_requests.commits_model.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.lib import get_repo_by_repo_git @@ -14,6 +14,6 @@ def process_pull_request_commits(repo_git: 
str) -> None: repo = get_repo_by_repo_git(repo_git) - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - pull_request_commits_model(repo, logger, manifest.key_auth) + pull_request_commits_model(repo, logger, key_auth) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 6e1326e9f4..3f1df36e50 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -12,6 +12,7 @@ from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth platform_id = 1 @@ -25,17 +26,17 @@ def collect_pull_requests(repo_git: str) -> int: owner, repo = get_owner_repo(repo_git) - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) + pr_data = retrieve_all_pr_data(repo_git, logger, key_auth) - if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger) + if pr_data: + process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger) - return len(pr_data) - else: - logger.info(f"{owner}/{repo} has no pull requests") - return 0 + return len(pr_data) + else: + logger.info(f"{owner}/{repo} has no pull requests") + return 0 # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index 2d07368f39..0d0222ea51 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -5,6 +5,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth #Task to get regular misc github info @@ -15,9 +16,9 @@ def collect_repo_info(repo_git: str): repo = get_repo_by_repo_git(repo_git) - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - repo_info_model(manifest.key_auth, repo, logger) + repo_info_model(key_auth, repo, logger) #Task to get CII api data for linux badge info using github data. 
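The per-file changes in this patch repeat the same rewrite: collection tasks that only need GitHub credentials construct a GithubRandomKeyAuth directly instead of opening a full GithubTaskManifest, which would also create a DatabaseSession they never use. A minimal before/after sketch of that rewrite, with `collect_example` and `retrieve_example_data` as hypothetical stand-ins for the real task bodies:

from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth

def collect_example(repo_git: str, logger, retrieve_example_data):
    # Before: with GithubTaskManifest(logger) as manifest:
    #             data = retrieve_example_data(repo_git, logger, manifest.key_auth)
    # After: build only the key auth the task actually uses; no database session is opened.
    key_auth = GithubRandomKeyAuth(logger)
    return retrieve_example_data(repo_git, logger, key_auth)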
diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index a7c1fca998..43d5963372 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -8,7 +8,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import RepoClone from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts - +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @celery.task @@ -23,14 +23,14 @@ def collect_github_repo_clones_data(repo_git: str) -> None: logger.info(f"Collecting Github repository clone data for {owner}/{repo}") - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - clones_data = retrieve_all_clones_data(repo_git, logger, manifest.key_auth) + clones_data = retrieve_all_clones_data(repo_git, logger, key_auth) - if clones_data: - process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id) - else: - logger.info(f"{owner}/{repo} has no clones") + if clones_data: + process_clones_data(clones_data, f"{owner}/{repo}: Traffic task", repo_id) + else: + logger.info(f"{owner}/{repo} has no clones") def retrieve_all_clones_data(repo_git: str, logger, key_auth): owner, repo = get_owner_repo(repo_git) diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index 4699fb7ef6..80c12feb81 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -12,7 +12,6 @@ def __init__(self, logger): self.augur_db = DatabaseSession(logger, engine) self.key_auth = GithubRandomKeyAuth(logger) - self.logger = logger self.platform_id = 1 def __enter__(self): From 3e8a6e56c4ad6e03d6016210d9bc4abbc5a6d119 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 10:18:29 -0500 Subject: [PATCH 033/122] Use get_repo_by_repo_git in gitlab tasks Signed-off-by: Andrew Brain --- augur/tasks/gitlab/events_task.py | 20 +++----- augur/tasks/gitlab/issues_task.py | 42 +++++++--------- augur/tasks/gitlab/merge_request_task.py | 61 +++++++++--------------- 3 files changed, 47 insertions(+), 76 deletions(-) diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index 186560d4e2..519e6b3f8c 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -11,7 +11,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git platform_id = 2 @@ -27,19 +27,16 @@ def collect_gitlab_issue_events(repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) if events: logger.info(f"Length of gitlab issue events: {len(events)}") - process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db) + 
process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab issue events") @@ -57,19 +54,16 @@ def collect_gitlab_merge_request_events(repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) if events: logger.info(f"Length of gitlab merge request events: {len(events)}") - process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db) + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request events") diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 9b50b20b39..531a78ea7a 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -13,7 +13,8 @@ from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo, Contributor from augur.application.db.util import execute_session_query from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.lib import bulk_insert_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth platform_id = 2 @@ -27,28 +28,24 @@ def collect_gitlab_issues(repo_git : str) -> int: """ logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - try: - - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + key_auth = GitlabRandomKeyAuth(logger) - owner, repo = get_owner_repo(repo_git) - - issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) + try: + owner, repo = get_owner_repo(repo_git) + + issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) - if issue_data: - issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger) + if issue_data: + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger) - return issue_ids - else: - logger.info(f"{owner}/{repo} has no issues") - return [] - except Exception as e: + return issue_ids + else: + logger.info(f"{owner}/{repo} has no issues") + return [] + except Exception as e: logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 @@ -215,19 +212,16 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issues.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = 
execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) if comments: logger.info(f"Length of comments: {len(comments)}") - process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab issue comments") diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 3b57de143a..99f787cc9a 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -9,7 +9,7 @@ from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee from augur.application.db.util import execute_session_query from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.lib import bulk_insert_dicts +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git platform_id = 2 @@ -25,18 +25,16 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: logger = logging.getLogger(collect_gitlab_merge_requests.__name__) - with GitlabTaskManifest(logger) as manifest: + repo_id = get_repo_by_repo_git(repo_git).repo_id - augur_db = manifest.augur_db + owner, repo = get_owner_repo(repo_git) - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id + with GitlabTaskManifest(logger) as manifest: - owner, repo = get_owner_repo(repo_git) mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) if mr_data: - mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, manifest.augur_db) return mr_ids else: @@ -178,20 +176,17 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_comments.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") if comments: logger.info(f"Length of merge request comments: {len(comments)}") - process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request comments") @@ -290,20 +285,17 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_metadata.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = 
get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") if metadata_list: logger.info(f"Length of merge request metadata: {len(metadata_list)}") - process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request metadata") @@ -356,20 +348,17 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_reviewers.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") if reviewers: logger.info(f"Length of merge request reviewers: {len(reviewers)}") - process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") @@ -424,20 +413,17 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_merge_request_commits.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") if commits: logger.info(f"Length of merge request commits: {len(commits)}") - process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request commits") @@ -490,23 +476,20 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) - logger = logging.getLogger(collect_merge_request_files.__name__) - with GitlabTaskManifest(logger) as manifest: - augur_db = manifest.augur_db + owner, repo = get_owner_repo(repo_git) + + repo_id = get_repo_by_repo_git(repo_git).repo_id - query = 
augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) - repo_obj = execute_session_query(query, 'one') - repo_id = repo_obj.repo_id + with GitlabTaskManifest(logger) as manifest: url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") if files: logger.info(f"Length of merge request files: {len(files)}") - process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, manifest.augur_db) else: logger.info(f"{owner}/{repo} has no gitlab merge request files") From 7196b787b36ee7d8abb2865cbff6b7bf91646e23 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 11:16:46 -0500 Subject: [PATCH 034/122] Database improvements Signed-off-by: Andrew Brain --- augur/api/view/api.py | 4 ++-- augur/application/cli/backend.py | 4 ++-- augur/application/cli/collection.py | 2 +- augur/application/db/models/augur_data.py | 6 +++-- .../application/db/models/augur_operations.py | 17 +++++++------- .../contributor_breadth_worker.py | 2 +- augur/tasks/github/contributors/tasks.py | 5 +---- augur/tasks/github/detect_move/core.py | 2 +- augur/tasks/github/events/tasks.py | 6 ++--- .../contributor_interface.py | 4 ++-- augur/tasks/github/facade_github/core.py | 2 +- augur/tasks/github/facade_github/tasks.py | 2 +- augur/tasks/github/issues/tasks.py | 8 +++---- augur/tasks/github/messages/tasks.py | 8 +++---- .../pull_requests/commits_model/core.py | 2 +- augur/tasks/github/pull_requests/core.py | 8 +++---- .../github/pull_requests/files_model/core.py | 2 +- augur/tasks/github/pull_requests/tasks.py | 22 +++++++++---------- augur/tasks/github/releases/core.py | 2 +- augur/tasks/github/traffic/tasks.py | 2 +- augur/tasks/gitlab/events_task.py | 4 ++-- augur/tasks/gitlab/issues_task.py | 14 ++++++------ augur/tasks/gitlab/merge_request_task.py | 20 ++++++++--------- augur/tasks/start_tasks.py | 15 ++++++------- augur/util/repo_load_controller.py | 2 +- 25 files changed, 82 insertions(+), 83 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index cbd7e4a0f1..3003495ac7 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -23,7 +23,7 @@ def add_existing_repo_to_group(session, user_id, group_name, repo_id): if group_id is None: return False - result = UserRepo.insert(session, repo_id, group_id) + result = UserRepo.insert(repo_id, group_id) if not result: return False @@ -38,7 +38,7 @@ def add_existing_org_to_group(session, user_id, group_name, rg_id): repos = session.query(Repo).filter(Repo.repo_group_id == rg_id).all() logger.info("Length of repos in org: " + str(len(repos))) for repo in repos: - result = UserRepo.insert(session, repo.repo_id, group_id) + result = UserRepo.insert(repo.repo_id, group_id) if not result: logger.info("Failed to add repo to group") diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index a0480adab4..64da534875 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -100,7 +100,7 @@ def start(ctx, disable_collection, development, port): create_collection_status_records.si().apply_async() time.sleep(3) - contributor_breadth_model.si().apply_async() + #contributor_breadth_model.si().apply_async() # start cloning repos when augur starts clone_repos.si().apply_async() @@ -317,7 +317,7 @@ def 
assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(session,repo[0],1) + UserRepo.insert(repo[0],1) @cli.command('export-env') diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 63c433a79e..7fefb35d4c 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -301,4 +301,4 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(session,repo[0],1) + UserRepo.insert(repo[0],1) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 606236df21..1a1ac0dab7 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -29,6 +29,8 @@ from augur.application.db.models.base import Base from augur.application.db.util import execute_session_query +from augur.application.db.lib import bulk_insert_dicts + DEFAULT_REPO_GROUP_ID = 1 metadata = Base.metadata @@ -1072,7 +1074,7 @@ def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): repo_unique = ["repo_git"] return_columns = ["repo_id"] - result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + result = bulk_insert_dicts(logger, repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) if not result: return None @@ -1120,7 +1122,7 @@ def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_ repo_unique = ["repo_git"] return_columns = ["repo_id"] - result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + result = bulk_insert_dicts(logger, repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) if not result: return None diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 0ce224fec6..2191a19a6b 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -13,6 +13,7 @@ from augur.application.db.models import Repo, RepoGroup from augur.application.db.session import DatabaseSession +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_id from augur.application.db.models.base import Base FRONTEND_REPO_GROUP_NAME = "Frontend Repos" @@ -673,7 +674,7 @@ def insert(session, user_id:int, group_name:str) -> dict: return False, {"status": "Group already exists"} try: - result = session.insert_data(user_group_data, UserGroup, ["name", "user_id"], return_columns=["group_id"]) + result = bulk_insert_dicts(logger, user_group_data, UserGroup, ["name", "user_id"], return_columns=["group_id"]) except IntegrityError: return False, {"status": "Error: User id does not exist"} @@ -756,7 +757,7 @@ class UserRepo(Base): group = relationship("UserGroup", back_populates="repos") @staticmethod - def insert(session, repo_id: int, group_id:int = 1) -> bool: + def insert(repo_id: int, group_id:int = 1) -> bool: """Add a repo to a user in the user_repos table. 
Args: @@ -777,7 +778,7 @@ def insert(session, repo_id: int, group_id:int = 1) -> bool: return_columns = ["group_id", "repo_id"] try: - data = session.insert_data(repo_user_group_data, UserRepo, repo_user_group_unique, return_columns) + data = bulk_insert_dicts(logger, repo_user_group_data, UserRepo, repo_user_group_unique, return_columns) except IntegrityError: return False @@ -832,7 +833,7 @@ def add_gitlab_repo(session, url: List[str], user_id: int, group_name=None, grou if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} - result = UserRepo.insert(session, repo_id, group_id) + result = UserRepo.insert(repo_id, group_id) if not result: return False, {"status": "repo_user insertion failed", "repo_url": url} @@ -897,7 +898,7 @@ def add_github_repo(session, url: List[str], user_id: int, group_name=None, grou if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} - result = UserRepo.insert(session, repo_id, group_id) + result = UserRepo.insert(repo_id, group_id) if not result: return False, {"status": "repo_user insertion failed", "repo_url": url} @@ -1224,11 +1225,11 @@ class CollectionStatus(Base): repo = relationship("Repo", back_populates="collection_status") @staticmethod - def insert(session, logger, repo_id): + def insert(logger, repo_id): from augur.tasks.github.util.util import get_repo_weight_by_issue from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps - repo = Repo.get_by_id(session, repo_id) + repo = get_repo_by_repo_id(repo_id) repo_git = repo.repo_git collection_status_unique = ["repo_id"] @@ -1265,7 +1266,7 @@ def insert(session, logger, repo_id): } - result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) + result = bulk_insert_dicts(logger, record, CollectionStatus, collection_status_unique, on_conflict_update=False) logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 075da26831..0fd419ccb9 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -117,7 +117,7 @@ def contributor_breadth_model(self) -> None: logger.info(f"Inserting {len(events)} events") natural_keys = ["event_id", "tool_version"] - bulk_insert_dicts(events, ContributorRepo, natural_keys) + bulk_insert_dicts(logger, events, ContributorRepo, natural_keys) def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source): diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 7804fd0a57..e7ac4b902c 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -113,12 +113,9 @@ def grab_comitters(self, repo_git,platform="github"): logger = logging.getLogger(grab_comitters.__name__) - repo = get_repo_by_repo_git(repo_git) - repo_id = repo.repo_id - try: key_auth = GithubRandomKeyAuth(logger) - grab_committer_list(logger, key_auth, repo_id,platform) + grab_committer_list(logger, key_auth, repo_git, platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git 
a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index da8449f760..ca916d744e 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -26,7 +26,7 @@ def update_repo_with_dict(repo,new_dict,logger): del to_insert['_sa_instance_state'] to_insert.update(new_dict) - result = bulk_insert_dicts(to_insert, Repo, ['repo_id']) + result = bulk_insert_dicts(logger, to_insert, Repo, ['repo_id']) url = to_insert['repo_git'] logger.info(f"Updated repo for {url}\n") diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index ae7e466178..cb48a236ec 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -154,7 +154,7 @@ def process_events(events, task_name, repo_id, logger, augur_db): # remove contributors that were found in the data more than once contributors = remove_duplicate_dicts(contributors) - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) issue_events_len = len(issue_event_dicts) pr_events_len = len(pr_event_dicts) @@ -168,10 +168,10 @@ def process_events(events, task_name, repo_id, logger, augur_db): # TODO: Could replace this with "id" but it isn't stored on the table for some reason pr_event_natural_keys = ["node_id"] - bulk_insert_dicts(pr_event_dicts, PullRequestEvent, pr_event_natural_keys) + bulk_insert_dicts(logger, pr_event_dicts, PullRequestEvent, pr_event_natural_keys) issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - bulk_insert_dicts(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index a746bc79f2..be2358ff00 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -191,7 +191,7 @@ def insert_alias(logger,db, contributor, email): # Insert new alias - bulk_insert_dicts(alias, ContributorsAlias, ['alias_email']) + bulk_insert_dicts(logger, alias, ContributorsAlias, ['alias_email']) return @@ -328,7 +328,7 @@ def get_login_with_supplemental_data(logger, auth, commit_data): try: unresolved_natural_keys = ['email'] - bulk_insert_dicts(unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) except Exception as e: logger.error( f"Could not create new unresolved email {unresolved['email']}. Error: {e}") diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index 849bce5ce6..d8a35ca582 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -119,7 +119,7 @@ def query_github_contributors(logger, key_auth, github_url): cntrb_natural_keys = ['cntrb_id'] #insert cntrb to table. 
#session.logger.info(f"Contributor: {cntrb} \n") - bulk_insert_dicts(cntrb,Contributor,cntrb_natural_keys) + bulk_insert_dicts(logger, cntrb,Contributor,cntrb_natural_keys) except Exception as e: logger.error("Caught exception: {}".format(e)) diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index d363ab4307..6c44879603 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -131,7 +131,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Executes an upsert with sqlalchemy cntrb_natural_keys = ['cntrb_id'] - bulk_insert_dicts(cntrb,Contributor,cntrb_natural_keys) + bulk_insert_dicts(logger, cntrb,Contributor,cntrb_natural_keys) try: # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 18d631afb8..98a8067eb5 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -137,7 +137,7 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) # insert the issues into the issues table. @@ -148,7 +148,7 @@ def process_issues(issues, task_name, repo_id, logger) -> None: issue_return_columns = ["issue_url", "issue_id"] issue_string_columns = ["issue_title", "issue_body"] try: - issue_return_data = bulk_insert_dicts(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) except IntegrityError as e: logger.error(f"Ran into integrity error:{e} \n Offending data: \n{issue_dicts}") @@ -181,13 +181,13 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - bulk_insert_dicts(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. 
issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - bulk_insert_dicts(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 80e3e195a5..d387ec8171 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -170,13 +170,13 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -199,11 +199,11 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(pr_message_ref_dicts)} pr messages ref rows") pr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - bulk_insert_dicts(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) + bulk_insert_dicts(logger, pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - bulk_insert_dicts(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) logger.info(f"{task_name}: Inserted {len(message_dicts)} messages. 
{len(issue_message_ref_dicts)} from issues and {len(pr_message_ref_dicts)} from prs") diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index 8635c9b4e0..7c6f36abfb 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -52,7 +52,7 @@ def pull_request_commits_model(repo,logger, key_auth): if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - bulk_insert_dicts(all_data,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_data,PullRequestCommit,pr_commits_natural_keys) diff --git a/augur/tasks/github/pull_requests/core.py b/augur/tasks/github/pull_requests/core.py index 02837c6f72..38d1136eb6 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/augur/tasks/github/pull_requests/core.py @@ -144,7 +144,7 @@ def insert_pr_contributors(contributors: List[dict], logger, task_name: str) -> # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[dict]]: @@ -163,7 +163,7 @@ def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[di logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") pr_natural_keys = ["pr_url"] pr_return_columns = ["pull_request_id", "pr_url"] - pr_return_data = bulk_insert_dicts(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) + pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) return pr_return_data @@ -239,7 +239,7 @@ def insert_pr_assignees(assignees: List[dict], logger: logging.Logger) -> None: """ # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(assignees, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(logger, assignees, PullRequestAssignee, pr_assignee_natural_keys) def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger) -> None: @@ -270,7 +270,7 @@ def insert_pr_metadata(metadata: List[dict], logger: logging.Logger) -> None: # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - bulk_insert_dicts(metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, metadata, PullRequestMeta, pr_metadata_natural_keys) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 98af66220a..24cd81574e 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -75,4 +75,4 @@ def pull_request_files_model(repo,logger, key_auth): if len(pr_file_rows) > 0: #Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(pr_file_rows, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 3f1df36e50..02ca264a54 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -93,7 +93,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) # insert the prs into the pull_requests table. @@ -103,7 +103,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): pr_natural_keys = ["repo_id", "pr_src_id"] pr_return_columns = ["pull_request_id", "pr_url"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = bulk_insert_dicts(pr_dicts, PullRequest, pr_natural_keys, + pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) if pr_return_data is None: @@ -142,24 +142,24 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - bulk_insert_dicts(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(logger, pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) # inserting pr assignees # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(logger, pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) # inserting pr requested reviewers # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - bulk_insert_dicts(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) + bulk_insert_dicts(logger, pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] pr_metadata_string_fields = ["pr_src_meta_label"] - bulk_insert_dicts(pr_metadata_dicts, PullRequestMeta, + bulk_insert_dicts(logger, pr_metadata_dicts, PullRequestMeta, pr_metadata_natural_keys, string_fields=pr_metadata_string_fields) @@ -249,7 +249,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: contributors.append(contributor) logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) pr_review_comment_dicts = [] @@ -276,7 +276,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = bulk_insert_dicts(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) + message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) if message_return_data is None: return @@ -306,7 +306,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - bulk_insert_dicts(pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) @@ -370,7 +370,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: contributors.append(contributor) logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) pr_reviews = [] @@ -384,7 +384,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") pr_review_natural_keys = ["pr_review_src_id",] - bulk_insert_dicts(pr_reviews, PullRequestReview, pr_review_natural_keys) + bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index 9a15539bd4..20d9c3bb32 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -78,7 +78,7 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): #Do an upsert string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] - bulk_insert_dicts(release_inf,Release,['release_id'], string_fields=string_fields) + bulk_insert_dicts(logger, release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 43d5963372..a1678c68c2 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -66,4 +66,4 @@ def process_clones_data(clones_data, task_name, repo_id, logger) -> None: clone_history_data = remove_duplicate_dicts(clone_history_data_dicts, 'clone_data_timestamp') logger.info(f"{task_name}: Inserting {len(clone_history_data_dicts)} clone history 
records") - bulk_insert_dicts(clone_history_data_dicts, RepoClone, ['repo_id']) + bulk_insert_dicts(logger, clone_history_data_dicts, RepoClone, ['repo_id']) diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index 519e6b3f8c..6a5b4ce5d3 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -148,7 +148,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - bulk_insert_dicts(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) def process_mr_events(events, task_name, repo_id, logger, augur_db): @@ -198,6 +198,6 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") mr_event_natural_keys = ["platform_id", "node_id"] - bulk_insert_dicts(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + bulk_insert_dicts(logger, mr_event_dicts, PullRequestEvent, mr_event_natural_keys) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 531a78ea7a..b6f7a6b2b6 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -140,14 +140,14 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = ["repo_id", "gh_issue_id"] issue_string_columns = ["issue_title", "issue_body"] issue_return_columns = ["gh_issue_id", "issue_id"] - issue_return_data = bulk_insert_dicts(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) issue_label_dicts = [] issue_assignee_dicts = [] @@ -174,12 +174,12 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. 
issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - bulk_insert_dicts(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - bulk_insert_dicts(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids @@ -321,13 +321,13 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) issue_message_ref_dicts = [] @@ -344,7 +344,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - bulk_insert_dicts(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) def process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source): diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 99f787cc9a..e36d0b6acc 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -125,13 +125,13 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] pr_return_columns = ["pull_request_id", "pr_src_id"] - pr_return_data = bulk_insert_dicts(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + pr_return_data = bulk_insert_dicts(logger, merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) mr_assignee_dicts = [] @@ -153,11 +153,11 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + bulk_insert_dicts(logger, 
mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - bulk_insert_dicts(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(logger, mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) return mr_ids @@ -246,13 +246,13 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} mr message contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} mr messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) mr_message_ref_dicts = [] @@ -269,7 +269,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - bulk_insert_dicts(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + bulk_insert_dicts(logger, mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -332,7 +332,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - bulk_insert_dicts(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, all_metadata, PullRequestMeta, pr_metadata_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -462,7 +462,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - bulk_insert_dicts(all_commits,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_commits,PullRequestCommit,pr_commits_natural_keys) @@ -514,7 +514,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(all_files, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, all_files, PullRequestFile, pr_file_natural_keys) def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 709f57494d..0b3a3017df 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -361,16 +361,15 @@ def create_collection_status_records(self): engine = self.app.engine logger = logging.getLogger(create_collection_status_records.__name__) - with DatabaseSession(logger,engine) as session: - query = s.sql.text(""" - SELECT repo_id 
FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) - """) + query = s.sql.text(""" + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.collection_status) + """) + + repo = execute_sql(query).first() + while repo is not None: + CollectionStatus.insert(logger, repo[0]) repo = execute_sql(query).first() - while repo is not None: - CollectionStatus.insert(session, logger, repo[0]) - repo = execute_sql(query).first() - #Check for new repos every seven minutes to be out of step with the clone_repos task create_collection_status_records.si().apply_async(countdown=60*7) diff --git a/augur/util/repo_load_controller.py b/augur/util/repo_load_controller.py index 7021a215fb..c35bfab0f7 100644 --- a/augur/util/repo_load_controller.py +++ b/augur/util/repo_load_controller.py @@ -74,7 +74,7 @@ def add_cli_repo(self, repo_data: Dict[str, Any], from_org_list=False, repo_type logger.warning(f"Invalid repo group id specified for {url}, skipping.") return False, {"status": f"Invalid repo group id specified for {url}, skipping."} - UserRepo.insert(self.session, repo_id) + UserRepo.insert(repo_id) #collection_status records are now only added during collection -IM 5/1/23 #CollectionStatus.insert(self.session, repo_id) From d02c440ab26761fdb04323148fc84eaea8b60cd5 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 11:26:26 -0500 Subject: [PATCH 035/122] get good changes --- .../contributor_breadth_worker.py | 2 +- augur/tasks/github/contributors/tasks.py | 5 +---- augur/tasks/github/detect_move/core.py | 2 +- augur/tasks/github/events/tasks.py | 6 ++--- .../contributor_interface.py | 4 ++-- augur/tasks/github/facade_github/core.py | 2 +- augur/tasks/github/facade_github/tasks.py | 2 +- augur/tasks/github/issues/tasks.py | 8 +++---- augur/tasks/github/messages/tasks.py | 8 +++---- .../pull_requests/commits_model/core.py | 2 +- augur/tasks/github/pull_requests/core.py | 8 +++---- .../github/pull_requests/files_model/core.py | 2 +- augur/tasks/github/pull_requests/tasks.py | 22 +++++++++---------- augur/tasks/github/releases/core.py | 2 +- augur/tasks/github/traffic/tasks.py | 2 +- augur/tasks/gitlab/events_task.py | 4 ++-- augur/tasks/gitlab/issues_task.py | 14 ++++++------ augur/tasks/gitlab/merge_request_task.py | 20 ++++++++--------- 18 files changed, 56 insertions(+), 59 deletions(-) diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 075da26831..0fd419ccb9 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -117,7 +117,7 @@ def contributor_breadth_model(self) -> None: logger.info(f"Inserting {len(events)} events") natural_keys = ["event_id", "tool_version"] - bulk_insert_dicts(events, ContributorRepo, natural_keys) + bulk_insert_dicts(logger, events, ContributorRepo, natural_keys) def process_contributor_events(cntrb, cntrb_events, logger, tool_source, tool_version, data_source): diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 7804fd0a57..e7ac4b902c 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -113,12 +113,9 @@ def grab_comitters(self, repo_git,platform="github"): logger = logging.getLogger(grab_comitters.__name__) - repo = 
get_repo_by_repo_git(repo_git) - repo_id = repo.repo_id - try: key_auth = GithubRandomKeyAuth(logger) - grab_committer_list(logger, key_auth, repo_id,platform) + grab_committer_list(logger, key_auth, repo_git, platform) except Exception as e: logger.error(f"Could not grab committers from github endpoint!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index da8449f760..ca916d744e 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -26,7 +26,7 @@ def update_repo_with_dict(repo,new_dict,logger): del to_insert['_sa_instance_state'] to_insert.update(new_dict) - result = bulk_insert_dicts(to_insert, Repo, ['repo_id']) + result = bulk_insert_dicts(logger, to_insert, Repo, ['repo_id']) url = to_insert['repo_git'] logger.info(f"Updated repo for {url}\n") diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index ae7e466178..cb48a236ec 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -154,7 +154,7 @@ def process_events(events, task_name, repo_id, logger, augur_db): # remove contributors that were found in the data more than once contributors = remove_duplicate_dicts(contributors) - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) issue_events_len = len(issue_event_dicts) pr_events_len = len(pr_event_dicts) @@ -168,10 +168,10 @@ def process_events(events, task_name, repo_id, logger, augur_db): # TODO: Could replace this with "id" but it isn't stored on the table for some reason pr_event_natural_keys = ["node_id"] - bulk_insert_dicts(pr_event_dicts, PullRequestEvent, pr_event_natural_keys) + bulk_insert_dicts(logger, pr_event_dicts, PullRequestEvent, pr_event_natural_keys) issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - bulk_insert_dicts(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index a746bc79f2..be2358ff00 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -191,7 +191,7 @@ def insert_alias(logger,db, contributor, email): # Insert new alias - bulk_insert_dicts(alias, ContributorsAlias, ['alias_email']) + bulk_insert_dicts(logger, alias, ContributorsAlias, ['alias_email']) return @@ -328,7 +328,7 @@ def get_login_with_supplemental_data(logger, auth, commit_data): try: unresolved_natural_keys = ['email'] - bulk_insert_dicts(unresolved, UnresolvedCommitEmail, unresolved_natural_keys) + bulk_insert_dicts(logger, unresolved, UnresolvedCommitEmail, unresolved_natural_keys) except Exception as e: logger.error( f"Could not create new unresolved email {unresolved['email']}. 
Error: {e}") diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index 849bce5ce6..d8a35ca582 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -119,7 +119,7 @@ def query_github_contributors(logger, key_auth, github_url): cntrb_natural_keys = ['cntrb_id'] #insert cntrb to table. #session.logger.info(f"Contributor: {cntrb} \n") - bulk_insert_dicts(cntrb,Contributor,cntrb_natural_keys) + bulk_insert_dicts(logger, cntrb,Contributor,cntrb_natural_keys) except Exception as e: logger.error("Caught exception: {}".format(e)) diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index d363ab4307..6c44879603 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -131,7 +131,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Executes an upsert with sqlalchemy cntrb_natural_keys = ['cntrb_id'] - bulk_insert_dicts(cntrb,Contributor,cntrb_natural_keys) + bulk_insert_dicts(logger, cntrb,Contributor,cntrb_natural_keys) try: # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 18d631afb8..98a8067eb5 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -137,7 +137,7 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) # insert the issues into the issues table. @@ -148,7 +148,7 @@ def process_issues(issues, task_name, repo_id, logger) -> None: issue_return_columns = ["issue_url", "issue_id"] issue_string_columns = ["issue_title", "issue_body"] try: - issue_return_data = bulk_insert_dicts(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) except IntegrityError as e: logger.error(f"Ran into integrity error:{e} \n Offending data: \n{issue_dicts}") @@ -181,13 +181,13 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - bulk_insert_dicts(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. 
issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - bulk_insert_dicts(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 80e3e195a5..d387ec8171 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -170,13 +170,13 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -199,11 +199,11 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(pr_message_ref_dicts)} pr messages ref rows") pr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - bulk_insert_dicts(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) + bulk_insert_dicts(logger, pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - bulk_insert_dicts(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) logger.info(f"{task_name}: Inserted {len(message_dicts)} messages. 
{len(issue_message_ref_dicts)} from issues and {len(pr_message_ref_dicts)} from prs") diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index 8635c9b4e0..7c6f36abfb 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -52,7 +52,7 @@ def pull_request_commits_model(repo,logger, key_auth): if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - bulk_insert_dicts(all_data,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_data,PullRequestCommit,pr_commits_natural_keys) diff --git a/augur/tasks/github/pull_requests/core.py b/augur/tasks/github/pull_requests/core.py index 02837c6f72..38d1136eb6 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/augur/tasks/github/pull_requests/core.py @@ -144,7 +144,7 @@ def insert_pr_contributors(contributors: List[dict], logger, task_name: str) -> # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[dict]]: @@ -163,7 +163,7 @@ def insert_prs(pr_dicts: List[dict], logger, task_name: str) -> Optional[List[di logger.info(f"{task_name}: Inserting prs of length: {len(pr_dicts)}") pr_natural_keys = ["pr_url"] pr_return_columns = ["pull_request_id", "pr_url"] - pr_return_data = bulk_insert_dicts(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) + pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns) return pr_return_data @@ -239,7 +239,7 @@ def insert_pr_assignees(assignees: List[dict], logger: logging.Logger) -> None: """ # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(assignees, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(logger, assignees, PullRequestAssignee, pr_assignee_natural_keys) def insert_pr_reviewers(reviewers: List[dict], logger: logging.Logger) -> None: @@ -270,7 +270,7 @@ def insert_pr_metadata(metadata: List[dict], logger: logging.Logger) -> None: # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
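
One detail worth calling out in these hunks: every bulk_insert_dicts call passes a natural-key list so the write behaves as an idempotent upsert, and callers that need the generated primary keys back request them with return_columns. A rough sketch of that round trip, assuming the logger, Message model, and message_dicts from the message hunks above:

    message_natural_keys = ["platform_msg_id", "pltfrm_id"]
    message_return_columns = ["msg_id", "platform_msg_id"]

    message_return_data = bulk_insert_dicts(logger, message_dicts, Message,
                                            message_natural_keys,
                                            return_columns=message_return_columns,
                                            string_fields=["msg_text"])

    # Map the platform's comment id to augur's generated msg_id so the
    # *_message_ref rows inserted afterwards can point at the right message.
    msg_id_by_platform_id = {row["platform_msg_id"]: row["msg_id"]
                             for row in (message_return_data or [])}
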
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - bulk_insert_dicts(metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, metadata, PullRequestMeta, pr_metadata_natural_keys) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 98af66220a..24cd81574e 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -75,4 +75,4 @@ def pull_request_files_model(repo,logger, key_auth): if len(pr_file_rows) > 0: #Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(pr_file_rows, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, pr_file_rows, PullRequestFile, pr_file_natural_keys) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 3f1df36e50..02ca264a54 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -93,7 +93,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) # insert the prs into the pull_requests table. @@ -103,7 +103,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): pr_natural_keys = ["repo_id", "pr_src_id"] pr_return_columns = ["pull_request_id", "pr_url"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = bulk_insert_dicts(pr_dicts, PullRequest, pr_natural_keys, + pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) if pr_return_data is None: @@ -142,24 +142,24 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - bulk_insert_dicts(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(logger, pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) # inserting pr assignees # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) + bulk_insert_dicts(logger, pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) # inserting pr requested reviewers # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - bulk_insert_dicts(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) + bulk_insert_dicts(logger, pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] pr_metadata_string_fields = ["pr_src_meta_label"] - bulk_insert_dicts(pr_metadata_dicts, PullRequestMeta, + bulk_insert_dicts(logger, pr_metadata_dicts, PullRequestMeta, pr_metadata_natural_keys, string_fields=pr_metadata_string_fields) @@ -249,7 +249,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: contributors.append(contributor) logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) pr_review_comment_dicts = [] @@ -276,7 +276,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = bulk_insert_dicts(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) + message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) if message_return_data is None: return @@ -306,7 +306,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - bulk_insert_dicts(pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) @@ -370,7 +370,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: contributors.append(contributor) logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) pr_reviews = [] @@ -384,7 +384,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") pr_review_natural_keys = ["pr_review_src_id",] - bulk_insert_dicts(pr_reviews, PullRequestReview, pr_review_natural_keys) + bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index 9a15539bd4..20d9c3bb32 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -78,7 +78,7 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): #Do an upsert string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] - bulk_insert_dicts(release_inf,Release,['release_id'], string_fields=string_fields) + bulk_insert_dicts(logger, release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 43d5963372..a1678c68c2 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -66,4 +66,4 @@ def process_clones_data(clones_data, task_name, repo_id, logger) -> None: clone_history_data = remove_duplicate_dicts(clone_history_data_dicts, 'clone_data_timestamp') logger.info(f"{task_name}: Inserting {len(clone_history_data_dicts)} clone history 
records") - bulk_insert_dicts(clone_history_data_dicts, RepoClone, ['repo_id']) + bulk_insert_dicts(logger, clone_history_data_dicts, RepoClone, ['repo_id']) diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index 519e6b3f8c..6a5b4ce5d3 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -148,7 +148,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") issue_event_natural_keys = ["issue_id", "issue_event_src_id"] - bulk_insert_dicts(issue_event_dicts, IssueEvent, issue_event_natural_keys) + bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) def process_mr_events(events, task_name, repo_id, logger, augur_db): @@ -198,6 +198,6 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") mr_event_natural_keys = ["platform_id", "node_id"] - bulk_insert_dicts(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + bulk_insert_dicts(logger, mr_event_dicts, PullRequestEvent, mr_event_natural_keys) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 531a78ea7a..b6f7a6b2b6 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -140,14 +140,14 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # insert contributors from these issues logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = ["repo_id", "gh_issue_id"] issue_string_columns = ["issue_title", "issue_body"] issue_return_columns = ["gh_issue_id", "issue_id"] - issue_return_data = bulk_insert_dicts(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + issue_return_data = bulk_insert_dicts(logger, issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) issue_label_dicts = [] issue_assignee_dicts = [] @@ -174,12 +174,12 @@ def process_issues(issues, task_name, repo_id, logger) -> None: # we are using label_src_id and issue_id to determine if the label is already in the database. 
issue_label_natural_keys = ['label_src_id', 'issue_id'] issue_label_string_fields = ["label_text", "label_description"] - bulk_insert_dicts(issue_label_dicts, IssueLabel, + bulk_insert_dicts(logger, issue_label_dicts, IssueLabel, issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - bulk_insert_dicts(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + bulk_insert_dicts(logger, issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids @@ -321,13 +321,13 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) issue_message_ref_dicts = [] @@ -344,7 +344,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - bulk_insert_dicts(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) def process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source): diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 99f787cc9a..e36d0b6acc 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -125,13 +125,13 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] pr_return_columns = ["pull_request_id", "pr_src_id"] - pr_return_data = bulk_insert_dicts(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + pr_return_data = bulk_insert_dicts(logger, merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) mr_assignee_dicts = [] @@ -153,11 +153,11 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + bulk_insert_dicts(logger, 
mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - bulk_insert_dicts(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + bulk_insert_dicts(logger, mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) return mr_ids @@ -246,13 +246,13 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} mr message contributors") - bulk_insert_dicts(contributors, Contributor, ["cntrb_id"]) + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} mr messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(message_dicts, Message, message_natural_keys, + message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) mr_message_ref_dicts = [] @@ -269,7 +269,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - bulk_insert_dicts(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + bulk_insert_dicts(logger, mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -332,7 +332,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] - bulk_insert_dicts(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + bulk_insert_dicts(logger, all_metadata, PullRequestMeta, pr_metadata_natural_keys) @celery.task(base=AugurCoreRepoCollectionTask) @@ -462,7 +462,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] - bulk_insert_dicts(all_commits,PullRequestCommit,pr_commits_natural_keys) + bulk_insert_dicts(logger, all_commits,PullRequestCommit,pr_commits_natural_keys) @@ -514,7 +514,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(all_files, PullRequestFile, pr_file_natural_keys) + bulk_insert_dicts(logger, all_files, PullRequestFile, pr_file_natural_keys) def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): From 4d0baaa73d1ae850d4a5465709d3d512ccb651ad Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 13 Apr 2024 11:33:36 -0500 Subject: [PATCH 036/122] Fix circular import Signed-off-by: Andrew Brain --- augur/api/view/api.py | 4 ++-- augur/application/cli/backend.py | 2 +- augur/application/cli/collection.py | 2 +- augur/application/db/models/augur_data.py | 5 ++--- augur/application/db/models/augur_operations.py | 17 
++++++++--------- augur/tasks/start_tasks.py | 8 +++++--- augur/util/repo_load_controller.py | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 3003495ac7..cbd7e4a0f1 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -23,7 +23,7 @@ def add_existing_repo_to_group(session, user_id, group_name, repo_id): if group_id is None: return False - result = UserRepo.insert(repo_id, group_id) + result = UserRepo.insert(session, repo_id, group_id) if not result: return False @@ -38,7 +38,7 @@ def add_existing_org_to_group(session, user_id, group_name, rg_id): repos = session.query(Repo).filter(Repo.repo_group_id == rg_id).all() logger.info("Length of repos in org: " + str(len(repos))) for repo in repos: - result = UserRepo.insert(repo.repo_id, group_id) + result = UserRepo.insert(session, repo.repo_id, group_id) if not result: logger.info("Failed to add repo to group") diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 64da534875..62c5bbbe2d 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -317,7 +317,7 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(repo[0],1) + UserRepo.insert(session, repo[0],1) @cli.command('export-env') diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 7fefb35d4c..47730f5303 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -301,4 +301,4 @@ def assign_orphan_repos_to_default_user(session): repos = session.execute_sql(query).fetchall() for repo in repos: - UserRepo.insert(repo[0],1) + UserRepo.insert(session, repo[0],1) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 1a1ac0dab7..82ef7b7ca4 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -29,7 +29,6 @@ from augur.application.db.models.base import Base from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts DEFAULT_REPO_GROUP_ID = 1 @@ -1074,7 +1073,7 @@ def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): repo_unique = ["repo_git"] return_columns = ["repo_id"] - result = bulk_insert_dicts(logger, repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) if not result: return None @@ -1122,7 +1121,7 @@ def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_ repo_unique = ["repo_git"] return_columns = ["repo_id"] - result = bulk_insert_dicts(logger, repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) if not result: return None diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 2191a19a6b..9e0661228f 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -13,7 +13,6 @@ from augur.application.db.models import Repo, RepoGroup from augur.application.db.session import DatabaseSession -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_id from augur.application.db.models.base import Base 
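
Context for the reverts below: augur.application.db.lib imports the ORM models, so the model layer cannot import bulk_insert_dicts from it without re-creating the import cycle this patch removes. The model methods therefore go back to writing through the session they are handed. A sketch of that pattern, using the same table and key names as the UserGroup hunk that follows (the sample values are hypothetical):

    from sqlalchemy.exc import IntegrityError

    # Inside a model method that already receives a DatabaseSession as `session`:
    user_group_data = {"user_id": 1, "name": "example-group"}  # hypothetical values

    try:
        result = session.insert_data(user_group_data, UserGroup,
                                     ["name", "user_id"], return_columns=["group_id"])
    except IntegrityError:
        result = None  # e.g. the referenced user id does not exist
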
FRONTEND_REPO_GROUP_NAME = "Frontend Repos" @@ -674,7 +673,7 @@ def insert(session, user_id:int, group_name:str) -> dict: return False, {"status": "Group already exists"} try: - result = bulk_insert_dicts(logger, user_group_data, UserGroup, ["name", "user_id"], return_columns=["group_id"]) + result = session.insert_data(user_group_data, UserGroup, ["name", "user_id"], return_columns=["group_id"]) except IntegrityError: return False, {"status": "Error: User id does not exist"} @@ -757,7 +756,7 @@ class UserRepo(Base): group = relationship("UserGroup", back_populates="repos") @staticmethod - def insert(repo_id: int, group_id:int = 1) -> bool: + def insert(session, repo_id: int, group_id:int = 1) -> bool: """Add a repo to a user in the user_repos table. Args: @@ -778,7 +777,7 @@ def insert(repo_id: int, group_id:int = 1) -> bool: return_columns = ["group_id", "repo_id"] try: - data = bulk_insert_dicts(logger, repo_user_group_data, UserRepo, repo_user_group_unique, return_columns) + data = session.insert_data(repo_user_group_data, UserRepo, repo_user_group_unique, return_columns) except IntegrityError: return False @@ -833,7 +832,7 @@ def add_gitlab_repo(session, url: List[str], user_id: int, group_name=None, grou if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} - result = UserRepo.insert(repo_id, group_id) + result = UserRepo.insert(session, repo_id, group_id) if not result: return False, {"status": "repo_user insertion failed", "repo_url": url} @@ -898,7 +897,7 @@ def add_github_repo(session, url: List[str], user_id: int, group_name=None, grou if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} - result = UserRepo.insert(repo_id, group_id) + result = UserRepo.insert(session, repo_id, group_id) if not result: return False, {"status": "repo_user insertion failed", "repo_url": url} @@ -1225,11 +1224,11 @@ class CollectionStatus(Base): repo = relationship("Repo", back_populates="collection_status") @staticmethod - def insert(logger, repo_id): + def insert(session, logger, repo_id): from augur.tasks.github.util.util import get_repo_weight_by_issue from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps - repo = get_repo_by_repo_id(repo_id) + repo = Repo.get_by_id(repo_id) repo_git = repo.repo_git collection_status_unique = ["repo_id"] @@ -1266,7 +1265,7 @@ def insert(logger, repo_id): } - result = bulk_insert_dicts(logger, record, CollectionStatus, collection_status_unique, on_conflict_update=False) + result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 0b3a3017df..401e8bcb5a 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -367,9 +367,11 @@ def create_collection_status_records(self): repo = execute_sql(query).first() - while repo is not None: - CollectionStatus.insert(logger, repo[0]) - repo = execute_sql(query).first() + with DatabaseSession(logger) as session: + + while repo is not None: + CollectionStatus.insert(session, logger, repo[0]) + repo = execute_sql(query).first() #Check for new repos every seven minutes to be out of step with the clone_repos task create_collection_status_records.si().apply_async(countdown=60*7) diff --git a/augur/util/repo_load_controller.py b/augur/util/repo_load_controller.py index c35bfab0f7..7021a215fb 100644 --- 
a/augur/util/repo_load_controller.py +++ b/augur/util/repo_load_controller.py @@ -74,7 +74,7 @@ def add_cli_repo(self, repo_data: Dict[str, Any], from_org_list=False, repo_type logger.warning(f"Invalid repo group id specified for {url}, skipping.") return False, {"status": f"Invalid repo group id specified for {url}, skipping."} - UserRepo.insert(repo_id) + UserRepo.insert(self.session, repo_id) #collection_status records are now only added during collection -IM 5/1/23 #CollectionStatus.insert(self.session, repo_id) From 6df1bbf7e4a9259f2234bb225ab51f84a516d7f7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 15 Apr 2024 18:54:29 -0500 Subject: [PATCH 037/122] Reduce use of github task manifest Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 52 ++++++++++++++- .../contributor_breadth_worker.py | 1 - augur/tasks/github/contributors/tasks.py | 3 +- augur/tasks/github/events/tasks.py | 65 ++++--------------- augur/tasks/github/releases/tasks.py | 1 - augur/tasks/github/traffic/tasks.py | 1 - 6 files changed, 66 insertions(+), 57 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 07679ac64b..d1f5dc6163 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -9,7 +9,7 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts @@ -423,3 +423,53 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys +def get_issues_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(Issue).filter(Issue.repo_id == repo_id).all() + +def get_pull_requests_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + + +def update_issue_closed_cntrbs_by_repo_id(repo_id): + + engine = get_engine() + + get_ranked_issues = s.text(f""" + WITH RankedIssues AS ( + SELECT repo_id, issue_id, cntrb_id, + ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn + FROM issue_events + WHERE "action" = 'closed' + ) + + SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL + """) + + with engine.connect() as conn: + result = conn.execute(get_ranked_issues).fetchall() + + update_data = [] + for row in result: + update_data.append( + { + 'issue_id': row[0], + 'cntrb_id': row[1], + 'repo_id': repo_id + } + ) + + if update_data: + with engine.connect() as connection: + update_stmt = s.text(""" + UPDATE issues + SET cntrb_id = :cntrb_id + WHERE issue_id = :issue_id + AND repo_id = :repo_id + """) + connection.execute(update_stmt, update_data) \ No newline at end of file diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 0fd419ccb9..232614ad1c 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -4,7 +4,6 @@ from datetime import datetime from 
augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import ContributorRepo from augur.application.db.lib import bulk_insert_dicts diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index e7ac4b902c..8e6b3fca8d 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -4,11 +4,10 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.github.util.github_paginator import hit_api -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.facade_github.tasks import * from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.application.db.lib import bulk_insert_dicts from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index cb48a236ec..ee4f407616 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -6,11 +6,12 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id + platform_id = 1 @@ -28,14 +29,14 @@ def collect_events(repo_git: str): logger.info(f"Collecting Github events for {owner}/{repo}") - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - event_data = retrieve_all_event_data(repo_git, logger, manifest.key_auth) + event_data = retrieve_all_event_data(repo_git, logger, key_auth) - if event_data: - process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger, manifest.augur_db) - else: - logger.info(f"{owner}/{repo} has no events") + if event_data: + process_events(event_data, f"{owner}/{repo}: Event task", repo_id, logger) + else: + logger.info(f"{owner}/{repo} has no events") except Exception as e: logger.error(f"Could not collect events for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") @@ -71,7 +72,7 @@ def retrieve_all_event_data(repo_git: str, logger, key_auth): return all_data -def process_events(events, task_name, repo_id, logger, augur_db): +def process_events(events, task_name, repo_id, logger): tool_source = "Github events task" tool_version = "2.0" @@ -84,13 +85,13 @@ def process_events(events, task_name, repo_id, logger, augur_db): # create mapping from 
issue url to issue id of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = get_issues_by_repo_id(repo_id) for issue in issues: issue_url_to_id_map[issue.issue_url] = issue.issue_id # create mapping from pr url to pr id of current pull requests pr_url_to_id_map = {} - prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + prs = get_pull_requests_by_repo_id(repo_id) for pr in prs: pr_url_to_id_map[pr.pr_url] = pr.pull_request_id @@ -173,7 +174,7 @@ def process_events(events, task_name, repo_id, logger, augur_db): issue_event_natural_keys = ["issue_id", "issue_event_src_id"] bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) - update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) + update_issue_closed_cntrbs_by_repo_id(repo_id) # TODO: Should we skip an event if there is no contributor to resolve it o def process_github_event_contributors(logger, event, tool_source, tool_version, data_source): @@ -189,41 +190,3 @@ def process_github_event_contributors(logger, event, tool_source, tool_version, return event, event_cntrb - -def update_issue_closed_cntrbs_from_events(engine, repo_id): - - get_ranked_issues = s.text(f""" - WITH RankedIssues AS ( - SELECT repo_id, issue_id, cntrb_id, - ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn - FROM issue_events - WHERE "action" = 'closed' - ) - - SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL - """) - - with engine.connect() as conn: - result = conn.execute(get_ranked_issues).fetchall() - - update_data = [] - for row in result: - update_data.append( - { - 'issue_id': row[0], - 'cntrb_id': row[1], - 'repo_id': repo_id - } - ) - - if update_data: - with engine.connect() as connection: - update_stmt = s.text(""" - UPDATE issues - SET cntrb_id = :cntrb_id - WHERE issue_id = :issue_id - AND repo_id = :repo_id - """) - connection.execute(update_stmt, update_data) - - diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index ab947ed3a9..2bfc802d06 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -4,7 +4,6 @@ from augur.tasks.github.releases.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.util import execute_session_query from augur.application.db.lib import get_repo_by_repo_git diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index a1678c68c2..573c691301 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -3,7 +3,6 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.data_parse import extract_needed_clone_history_data from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import RepoClone From 4f50ffa3dbb2968ad87fc4b83fe21447675af47c Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 15 Apr 2024 18:59:49 -0500 Subject: [PATCH 038/122] Reduce use of GithubTaskManifest Signed-off-by: Andrew Brain --- augur/tasks/github/repo_info/tasks.py | 9 ++++++--- 
augur/tasks/github/util/util.py | 8 +++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index 0d0222ea51..85d639d2a6 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -1,11 +1,12 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.session import DatabaseSession from augur.tasks.github.repo_info.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db import get_engine #Task to get regular misc github info @@ -25,10 +26,12 @@ def collect_repo_info(repo_git: str): @celery.task(base=AugurCoreRepoCollectionTask) def collect_linux_badge_info(repo_git: str): + engine = get_engine() + logger = logging.getLogger(collect_linux_badge_info.__name__) repo = get_repo_by_repo_git(repo_git) - with GithubTaskManifest(logger) as manifest: + with DatabaseSession(logger, engine=engine) as session: - badges_model(logger, repo_git, repo.repo_id, manifest.augur_db) + badges_model(logger, repo_git, repo.repo_id, session) diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 432f674512..f338d67d05 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -4,6 +4,7 @@ import json import httpx from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.lib import get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -71,9 +72,10 @@ def get_repo_weight_by_issue(logger,repo_git): owner,name = get_owner_repo(repo_git) - with GithubTaskManifest(logger) as manifest: - repo_graphql = GitHubRepoGraphql(logger, manifest.key_auth, owner, name) - number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) + key_auth = GithubRandomKeyAuth(logger) + + repo_graphql = GitHubRepoGraphql(logger, key_auth, owner, name) + number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) return number_of_issues_and_prs From 674a5a71a307c64088875915aeeecf4d1bc0857f Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 15 Apr 2024 19:04:43 -0500 Subject: [PATCH 039/122] Reduce usage of github task manifest Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 8 +- .../github/pull_requests/files_model/tasks.py | 10 +- augur/tasks/github/pull_requests/tasks.py | 141 +++++++++--------- 3 files changed, 83 insertions(+), 76 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index d1f5dc6163..fc41fc00e8 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -9,7 +9,7 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query from 
augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts @@ -435,6 +435,12 @@ def get_pull_requests_by_repo_id(repo_id): return session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() +def get_pull_request_reviews_by_repo_id(repo_id): + + with get_session() as session: + + return session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id).all() + def update_issue_closed_cntrbs_by_repo_id(repo_id): diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 762a8c24f8..b5248aa8ba 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -1,18 +1,22 @@ import logging from augur.tasks.github.pull_requests.files_model.core import * -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.session import DatabaseSession from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db import get_engine + @celery.task(base=AugurSecondaryRepoCollectionTask) def process_pull_request_files(repo_git: str) -> None: + engine = get_engine() + logger = logging.getLogger(process_pull_request_files.__name__) repo = get_repo_by_repo_git(repo_git) - with GithubTaskManifest(logger) as manifest: + with DatabaseSession(logger, engine=engine) as session: - pull_request_files_model(repo, logger, manifest.key_auth) \ No newline at end of file + pull_request_files_model(repo, logger, session) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 02ca264a54..682a4aa1e3 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -9,7 +9,7 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @@ -207,106 +207,103 @@ def collect_pull_request_review_comments(repo_git: str) -> None: repo_id = get_repo_by_repo_git(repo_git).repo_id - # define GithubTaskSession to handle insertions, and store oauth keys - with GithubTaskManifest(logger) as manifest: - - query = manifest.augur_db.session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id) - pr_reviews = execute_session_query(query, 'all') - - # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id - pr_review_id_mapping = {} - for review in pr_reviews: - pr_review_id_mapping[review.pr_review_src_id] = review.pr_review_id + pr_reviews = get_pull_request_reviews_by_repo_id(repo_id) + # maps the github pr_review id to the auto incrementing pk that augur stores as pr_review id + pr_review_id_mapping = {} 
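
Taken together, these hunks drop the GithubTaskManifest from the review-comment task entirely: the stored reviews come from the new get_pull_request_reviews_by_repo_id helper and the API keys from a standalone GithubRandomKeyAuth. Roughly, under the same assumptions as the surrounding hunk (repo_id, review_msg_url, and logger already in scope):

    from augur.application.db.lib import get_pull_request_reviews_by_repo_id
    from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
    from augur.tasks.github.util.github_paginator import GithubPaginator

    # github pr_review_src_id -> augur's auto-incremented pr_review_id
    pr_review_id_mapping = {r.pr_review_src_id: r.pr_review_id
                            for r in get_pull_request_reviews_by_repo_id(repo_id)}

    key_auth = GithubRandomKeyAuth(logger)
    for page_data, page in GithubPaginator(review_msg_url, key_auth, logger).iter_pages():
        if not page_data:
            break
        # collect the page of review comments here...
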
+ for review in pr_reviews: + pr_review_id_mapping[review.pr_review_src_id] = review.pr_review_id - tool_source = "Pr review comment task" - tool_version = "2.0" - data_source = "Github API" - pr_review_messages = GithubPaginator(review_msg_url, manifest.key_auth, logger) - num_pages = pr_review_messages.get_num_pages() + tool_source = "Pr review comment task" + tool_version = "2.0" + data_source = "Github API" - all_raw_pr_review_messages = [] - for page_data, page in pr_review_messages.iter_pages(): + key_auth = GithubRandomKeyAuth(logger) + pr_review_messages = GithubPaginator(review_msg_url, key_auth, logger) + num_pages = pr_review_messages.get_num_pages() - if page_data is None: - break + all_raw_pr_review_messages = [] + for page_data, page in pr_review_messages.iter_pages(): - if len(page_data) == 0: - logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning") - logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") - break + if page_data is None: + break + if len(page_data) == 0: + logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning") logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") + break - all_raw_pr_review_messages += page_data + logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}") - contributors = [] - for comment in all_raw_pr_review_messages: - - _, contributor = process_github_comment_contributors(comment, tool_source, tool_version, data_source) - if contributor is not None: - contributors.append(contributor) + all_raw_pr_review_messages += page_data - logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + contributors = [] + for comment in all_raw_pr_review_messages: + + _, contributor = process_github_comment_contributors(comment, tool_source, tool_version, data_source) + if contributor is not None: + contributors.append(contributor) + + logger.info(f"{owner}/{repo} Pr review messages: Inserting {len(contributors)} contributors") + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) - pr_review_comment_dicts = [] - pr_review_msg_mapping_data = {} + pr_review_comment_dicts = [] + pr_review_msg_mapping_data = {} - pr_review_comments_len = len(all_raw_pr_review_messages) - logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}") - for index, comment in enumerate(all_raw_pr_review_messages): + pr_review_comments_len = len(all_raw_pr_review_messages) + logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}") + for index, comment in enumerate(all_raw_pr_review_messages): - # pull_request_review_id is required to map it to the correct pr review - if not comment["pull_request_review_id"]: - continue + # pull_request_review_id is required to map it to the correct pr review + if not comment["pull_request_review_id"]: + continue - pr_review_comment_dicts.append( - extract_needed_message_data(comment, platform_id, repo_id, tool_source, tool_version, data_source) - ) + pr_review_comment_dicts.append( + extract_needed_message_data(comment, platform_id, repo_id, tool_source, tool_version, data_source) + ) - # map github message id to the data that maps it to the pr review - github_msg_id = comment["id"] - pr_review_msg_mapping_data[github_msg_id] = comment + # map github message id to the data that maps it to the pr review + github_msg_id = comment["id"] + 
pr_review_msg_mapping_data[github_msg_id] = comment - logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") - message_natural_keys = ["platform_msg_id", "pltfrm_id"] - message_return_columns = ["msg_id", "platform_msg_id"] - message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) - if message_return_data is None: - return + logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") + message_natural_keys = ["platform_msg_id", "pltfrm_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_return_data = bulk_insert_dicts(logger, pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) + if message_return_data is None: + return - pr_review_message_ref_insert_data = [] - for data in message_return_data: + pr_review_message_ref_insert_data = [] + for data in message_return_data: - augur_msg_id = data["msg_id"] - github_msg_id = data["platform_msg_id"] + augur_msg_id = data["msg_id"] + github_msg_id = data["platform_msg_id"] - comment = pr_review_msg_mapping_data[github_msg_id] - comment["msg_id"] = augur_msg_id + comment = pr_review_msg_mapping_data[github_msg_id] + comment["msg_id"] = augur_msg_id - github_pr_review_id = comment["pull_request_review_id"] + github_pr_review_id = comment["pull_request_review_id"] - try: - augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] - except KeyError: - logger.info(f"{owner}/{repo}: Could not find related pr review") - logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}") - logger.info("Skipping") - continue + try: + augur_pr_review_id = pr_review_id_mapping[github_pr_review_id] + except KeyError: + logger.info(f"{owner}/{repo}: Could not find related pr review") + logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}") + logger.info("Skipping") + continue - pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source) - pr_review_message_ref_insert_data.append(pr_review_message_ref) + pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source) + pr_review_message_ref_insert_data.append(pr_review_message_ref) - logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") - pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] - bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) + logger.info(f"Inserting {len(pr_review_message_ref_insert_data)} pr review refs") + pr_comment_ref_natural_keys = ["pr_review_msg_src_id"] + bulk_insert_dicts(logger, pr_review_message_ref_insert_data, PullRequestReviewMessageRef, pr_comment_ref_natural_keys) From 8e29c8eea9159b512109ced80c1d8393b5ad2900 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 15 Apr 2024 19:23:37 -0500 Subject: [PATCH 040/122] Continue decoupling code from github task manifest Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 26 +++- .../contributor_interface.py | 14 +- augur/tasks/github/facade_github/tasks.py | 138 +++++++++--------- augur/tasks/github/messages/tasks.py | 28 ++-- 4 files changed, 112 insertions(+), 94 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index fc41fc00e8..1260d2de6b 100644 --- a/augur/application/db/lib.py +++ 
b/augur/application/db/lib.py @@ -9,7 +9,7 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias, UnresolvedCommitEmail, Contributor from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts @@ -441,6 +441,30 @@ def get_pull_request_reviews_by_repo_id(repo_id): return session.query(PullRequestReview).filter(PullRequestReview.repo_id == repo_id).all() +def get_contributor_aliases_by_email(email): + + with get_session() as session: + + return session.query(ContributorsAlias).filter_by(alias_email=email).all() + +def get_unresolved_commit_emails_by_name(name): + + with get_session() as session: + + return session.query(UnresolvedCommitEmail).filter_by(name=name).all() + +def get_contributors_by_full_name(full_name): + + with get_session() as session: + + return session.query(Contributor).filter_by(cntrb_full_name=full_name).all() + +def get_contributors_by_github_user_id(id): + + with get_session() as session: + + # Look into this, where it was used was doing .all() but this query should really only return one + return session.query(Contributor).filter_by(gh_user_id=id).all() def update_issue_closed_cntrbs_by_repo_id(repo_id): diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index be2358ff00..49ff2dc14b 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -7,8 +7,7 @@ # Debugger import traceback from augur.tasks.github.util.github_paginator import GithubApiResult -from augur.application.db.util import execute_session_query -from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql +from augur.application.db.lib import get_repo_by_repo_id, bulk_insert_dicts, execute_sql, get_contributors_by_github_user_id ##TODO: maybe have a TaskSession class that holds information about the database, logger, config, etc. @@ -106,7 +105,7 @@ def create_endpoint_from_email(email): return url -def create_endpoint_from_commit_sha(logger,db,commit_sha, repo_id): +def create_endpoint_from_commit_sha(logger, commit_sha, repo_id): logger.info( f"Trying to create endpoint from commit hash: {commit_sha}") @@ -153,14 +152,13 @@ def create_endpoint_from_name(contributor): return url -def insert_alias(logger,db, contributor, email): +def insert_alias(logger, contributor, email): # Insert cntrb_id and email of the corresponding record into the alias table # Another database call to get the contributor id is needed because its an autokeyincrement that is accessed by multiple workers # Same principle as enrich_cntrb_id method. 
- query = db.query(Contributor).filter_by(gh_user_id=contributor["gh_user_id"]) - contributor_table_data = execute_session_query(query, 'all') + contributor_table_data = get_contributors_by_github_user_id(contributor["gh_user_id"]) # self.logger.info(f"Contributor query: {contributor_table_data}") # Handle potential failures @@ -371,11 +369,11 @@ def get_login_with_supplemental_data(logger, auth, commit_data): return match['login'] -def get_login_with_commit_hash(logger,db,auth, commit_data, repo_id): +def get_login_with_commit_hash(logger, auth, commit_data, repo_id): # Get endpoint for login from hash url = create_endpoint_from_commit_sha( - logger,db,commit_data['hash'], repo_id) + logger, commit_data['hash'], repo_id) #TODO: here. # Send api request diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 6c44879603..574ac4a133 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -4,15 +4,14 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask from augur.tasks.github.util.github_paginator import retrieve_dict_from_endpoint -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * -from augur.application.db.lib import execute_sql -from augur.application.db.util import execute_session_query +from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * -def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id): +def process_commit_metadata(logger, auth, contributorQueue, repo_id, platform_id): for contributor in contributorQueue: # Get the email from the commit data @@ -23,8 +22,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) # check the email to see if it already exists in contributor_aliases # Look up email to see if resolved - query = db.query(ContributorsAlias).filter_by(alias_email=email) - alias_table_data = execute_session_query(query, 'all') + alias_table_data = get_contributor_aliases_by_email(email) if len(alias_table_data) >= 1: # Move on if email resolved logger.info( @@ -35,8 +33,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Check the unresolved_commits table to avoid hitting endpoints that we know don't have relevant data needlessly - query = db.query(UnresolvedCommitEmail).filter_by(name=name) - unresolved_query_result = execute_session_query(query, 'all') + unresolved_query_result = get_unresolved_commit_emails_by_name(name) if len(unresolved_query_result) >= 1: @@ -47,8 +44,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) #Check the contributors table for a login for the given name - query = db.query(Contributor).filter_by(cntrb_full_name=name) - contributors_with_matching_name = execute_session_query(query, 'first') + contributors_with_matching_name = get_contributors_by_full_name(name) if not contributors_with_matching_name: logger.debug("Failed local login lookup") @@ -58,7 +54,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) # Try to get the login 
from the commit sha if login == None or login == "": - login = get_login_with_commit_hash(logger,db,auth,contributor, repo_id) + login = get_login_with_commit_hash(logger, auth, contributor, repo_id) if login == None or login == "": logger.info("Failed to get login from commit hash") @@ -135,7 +131,7 @@ def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id) try: # Update alias after insertion. Insertion needs to happen first so we can get the autoincrementkey - insert_alias(logger, db,cntrb, emailFromCommitData) + insert_alias(logger, cntrb, emailFromCommitData) except LookupError as e: logger.info( ''.join(traceback.format_exception(None, e, e.__traceback__))) @@ -196,71 +192,71 @@ def link_commits_to_contributor(logger, facade_helper, contributorQueue): @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) def insert_facade_contributors(self, repo_id): + # Set platform id to 1 since this task is github specific + platform_id = 1 + engine = self.app.engine logger = logging.getLogger(insert_facade_contributors.__name__) - with GithubTaskManifest(logger) as manifest: - + # Get all of the commit data's emails and names from the commit table that do not appear + # in the contributors table or the contributors_aliases table. + + logger.info( + "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) + new_contrib_sql = s.sql.text(""" + SELECT DISTINCT + commits.cmt_author_name AS NAME, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw, + 'not_unresolved' as resolution_status + FROM + commits + WHERE + commits.repo_id = :repo_id + AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) + or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) + AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) + GROUP BY + commits.cmt_author_name, + commits.cmt_commit_hash, + commits.cmt_author_raw_email + UNION + SELECT DISTINCT + commits.cmt_author_name AS NAME,--commits.cmt_id AS id, + commits.cmt_commit_hash AS hash, + commits.cmt_author_raw_email AS email_raw, + 'unresolved' as resolution_status + FROM + commits + WHERE + commits.repo_id = :repo_id + AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) + AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) + GROUP BY + commits.cmt_author_name, + commits.cmt_commit_hash, + commits.cmt_author_raw_email + ORDER BY + hash + """).bindparams(repo_id=repo_id) - # Get all of the commit data's emails and names from the commit table that do not appear - # in the contributors table or the contributors_aliases table. 
- - logger.info( - "Beginning process to insert contributors from facade commits for repo w entry info: {}\n".format(repo_id)) - new_contrib_sql = s.sql.text(""" - SELECT DISTINCT - commits.cmt_author_name AS NAME, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'not_unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) - or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - UNION - SELECT DISTINCT - commits.cmt_author_name AS NAME,--commits.cmt_id AS id, - commits.cmt_commit_hash AS hash, - commits.cmt_author_raw_email AS email_raw, - 'unresolved' as resolution_status - FROM - commits - WHERE - commits.repo_id = :repo_id - AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) - AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) - GROUP BY - commits.cmt_author_name, - commits.cmt_commit_hash, - commits.cmt_author_raw_email - ORDER BY - hash - """).bindparams(repo_id=repo_id) - - #Execute statement with session. - result = execute_sql(new_contrib_sql) - new_contribs = [dict(row) for row in result.mappings()] - - #print(new_contribs) - - #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ - # 'repo_id': repo_id}).to_json(orient="records")) - - - - process_commit_metadata(logger,manifest.augur_db,manifest.key_auth,list(new_contribs),repo_id,manifest.platform_id) - - logger.debug("DEBUG: Got through the new_contribs") - + #Execute statement with session. + result = execute_sql(new_contrib_sql) + new_contribs = [dict(row) for row in result.mappings()] + + #print(new_contribs) + #json.loads(pd.read_sql(new_contrib_sql, self.db, params={ + # 'repo_id': repo_id}).to_json(orient="records")) + + + key_auth = GithubRandomKeyAuth(logger) + + process_commit_metadata(logger, key_auth, list(new_contribs), repo_id, platform_id) + + logger.debug("DEBUG: Got through the new_contribs") + facade_helper = FacadeHelper(logger) # sql query used to find corresponding cntrb_id's of emails found in the contributor's table # i.e., if a contributor already exists, we use it! 
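
The facade-contributor hunks above and the message hunks below make the same move: instead of entering a GithubTaskManifest that bundles a long-lived ORM session, a key-auth object, and a platform id, each task now constructs a GithubRandomKeyAuth itself and calls small helpers in augur/application/db/lib.py that open a short-lived session, run one query, and return the results. The sketch below is only an illustration of that helper shape under ordinary SQLAlchemy assumptions; the connection string, the generic model argument, and the helper name get_rows_for_repo are placeholders, not Augur's actual definitions.

    # Illustrative sketch of the short-lived-session helper pattern (assumed setup, not Augur's code).
    import logging
    from contextlib import contextmanager

    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    logger = logging.getLogger(__name__)

    # Placeholder URL; in Augur the engine would come from augur.application.db.get_engine().
    engine = create_engine("postgresql+psycopg2://user:password@localhost:5432/augur")
    Session = sessionmaker(bind=engine)

    @contextmanager
    def get_session():
        """Yield an ORM session and always close it when the caller is done."""
        session = Session()
        try:
            yield session
        finally:
            session.close()

    def get_rows_for_repo(model, repo_id):
        """Generic form of helpers such as get_pull_request_reviews_by_repo_id:
        open a session, run one filtered query, return the rows, release the session."""
        with get_session() as session:
            return session.query(model).filter(model.repo_id == repo_id).all()

With helpers shaped like this, a Celery task needs only its logger: it builds its own GithubRandomKeyAuth and calls the lib functions, so no task has to hold a session or manifest object for the lifetime of the run.
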
diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index d387ec8171..ae96f0efe5 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -5,11 +5,11 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts +from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor +from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id platform_id = 1 @@ -23,15 +23,15 @@ def collect_github_messages(repo_git: str) -> None: owner, repo = get_owner_repo(repo_git) - with GithubTaskManifest(logger) as manifest: - - task_name = f"{owner}/{repo}: Message Task" - message_data = retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) + key_auth = GithubRandomKeyAuth(logger) - if message_data: - process_messages(message_data, task_name, repo_id, logger, manifest.augur_db) - else: - logger.info(f"{owner}/{repo} has no messages") + task_name = f"{owner}/{repo}: Message Task" + message_data = retrieve_all_pr_and_issue_messages(repo_git, logger, key_auth, task_name) + + if message_data: + process_messages(message_data, task_name, repo_id, logger) + else: + logger.info(f"{owner}/{repo} has no messages") @@ -74,7 +74,7 @@ def retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_nam return all_data -def process_messages(messages, task_name, repo_id, logger, augur_db): +def process_messages(messages, task_name, repo_id, logger): tool_source = "Pr comment task" tool_version = "2.0" @@ -93,13 +93,13 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): # create mapping from issue url to issue id of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = get_issues_by_repo_id(repo_id) for issue in issues: issue_url_to_id_map[issue.issue_url] = issue.issue_id # create mapping from pr url to pr id of current pull requests pr_issue_url_to_id_map = {} - prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + prs = get_pull_requests_by_repo_id(repo_id) for pr in prs: pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id From cce031d83a48c37f963f11985e18186b456c5aa3 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 27 Apr 2024 09:54:42 -0500 Subject: [PATCH 041/122] Remove more references to session in domain Signed-off-by: Andrew Brain --- augur/application/db/lib.py | 9 ++- augur/application/db/models/augur_data.py | 12 ++-- augur/tasks/start_tasks.py | 67 ++++++++++++----------- augur/tasks/util/collection_util.py | 28 +++++----- 4 files changed, 63 insertions(+), 53 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 1260d2de6b..0b4ebbdd6b 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ 
-9,7 +9,8 @@ from psycopg2.errors import DeadlockDetected from typing import List, Any, Optional, Union -from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias, UnresolvedCommitEmail, Contributor +from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias, UnresolvedCommitEmail, Contributor, CollectionStatus +from augur.tasks.util.collection_state import CollectionState from augur.application.db import get_session, get_engine from augur.application.db.util import execute_session_query from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts @@ -188,6 +189,12 @@ def get_worker_oauth_keys(platform: str): results = session.query(WorkerOauth).filter(WorkerOauth.platform == platform).order_by(func.random()).all() return [row.access_token for row in results] + +def get_active_repo_count(collection_type): + + with get_session() as session: + + return session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{collection_type}_status" ) == CollectionState.COLLECTING.value).count() def facade_bulk_insert_commits(logger, records): diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 82ef7b7ca4..544c88b2aa 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -29,6 +29,7 @@ from augur.application.db.models.base import Base from augur.application.db.util import execute_session_query +from augur.application.db import get_session DEFAULT_REPO_GROUP_ID = 1 @@ -588,12 +589,13 @@ def is_valid_repo_group_id(session, repo_group_id: int) -> bool: @staticmethod def get_by_name(session, rg_name): - query = session.query(RepoGroup).filter(RepoGroup.rg_name == rg_name) + with get_session() as session: - try: - result = execute_session_query(query, 'one') - except NoResultFound: - return None + try: + query = session.query(RepoGroup).filter(RepoGroup.rg_name == rg_name) + result = execute_session_query(query, 'one') + except NoResultFound: + return None return result diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 401e8bcb5a..4fbf70c5bd 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -154,7 +154,7 @@ def non_repo_domain_tasks(self): tasks.apply_async() -def build_primary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): +def build_primary_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] primary_gitlab_enabled_phases = [] @@ -174,10 +174,10 @@ def core_task_success_util_gen(repo_git): primary_gitlab_enabled_phases.append(core_task_success_util_gen) primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) - primary_request.get_valid_repos(session, logger) + primary_request.get_valid_repos(logger) return primary_request -def build_secondary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): +def build_secondary_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] @@ -193,11 +193,11 @@ def secondary_task_success_util_gen(repo_git): 
secondary_enabled_phases.append(secondary_task_success_util_gen) request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) - request.get_valid_repos(session, logger) + request.get_valid_repos(logger) return request -def build_facade_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): +def build_facade_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): #Deal with facade collection facade_enabled_phases = [] @@ -215,10 +215,10 @@ def facade_task_update_weight_util_gen(repo_git): request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) - request.get_valid_repos(session, logger) + request.get_valid_repos(logger) return request -def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): +def build_ml_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) @@ -229,7 +229,7 @@ def ml_task_success_util_gen(repo_git): ml_enabled_phases.append(ml_task_success_util_gen) request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) - request.get_valid_repos(session, logger) + request.get_valid_repos(logger) return request @celery.task(bind=True) @@ -241,31 +241,32 @@ def augur_collection_monitor(self): logger.info("Checking for repos to collect") - with DatabaseSession(logger, engine) as session: - #Get list of enabled phases - enabled_phase_names = get_enabled_phase_names_from_config() - - enabled_collection_hooks = [] - - if primary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) - - if secondary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_secondary_repo_collect_request(session, logger, enabled_phase_names)) - #start_secondary_collection(session, max_repo=10) - - if facade_phase.__name__ in enabled_phase_names: - #start_facade_collection(session, max_repo=30) - enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) - - if machine_learning_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) - #start_ml_collection(session,max_repo=5) - - logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - main_routine = AugurTaskRoutine(logger, session,enabled_collection_hooks) - - main_routine.start_data_collection() + + #Get list of enabled phases + enabled_phase_names = get_enabled_phase_names_from_config() + + enabled_collection_hooks = [] + + if primary_repo_collect_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_primary_repo_collect_request(logger, enabled_phase_names)) + + if secondary_repo_collect_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_secondary_repo_collect_request(logger, enabled_phase_names)) + #start_secondary_collection(session, max_repo=10) + + if facade_phase.__name__ in enabled_phase_names: + #start_facade_collection(session, max_repo=30) + enabled_collection_hooks.append(build_facade_repo_collect_request(logger, enabled_phase_names)) + + if machine_learning_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_ml_repo_collect_request(logger, 
enabled_phase_names)) + #start_ml_collection(session,max_repo=5) + + logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") + + main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) + + main_routine.start_data_collection() # have a pipe of 180 diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index d9726b4634..e81b25d82e 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -14,7 +14,7 @@ from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue from augur.application.db.session import DatabaseSession from augur.application.db import get_engine -from augur.application.db.lib import execute_sql +from augur.application.db.lib import execute_sql, get_session, get_active_repo_count, get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.tasks.util.collection_state import CollectionState @@ -130,15 +130,12 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab if name == "facade": self.new_status = CollectionState.UPDATE.value - def get_active_repo_count(self,session): - return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - #Get repo urls based on passed in info. - def get_valid_repos(self,session, logger): + def get_valid_repos(self, logger): #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook #Get the count of repos that are currently running this collection hook #status_column = f"{hook}_status" - active_repo_count = self.get_active_repo_count(session) + active_repo_count = get_active_repo_count(self.name) #Will always disallow errored repos and repos that are already collecting @@ -536,21 +533,21 @@ class to keep track of various groups of collection tasks for a group of repos. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. core session: Database session to use """ - def __init__(self, logger, session,collection_hooks): + def __init__(self, logger,collection_hooks): self.logger = logger self.collection_hooks = collection_hooks - self.session = session - def update_status_and_id(self,repo_git, task_id, name): - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + def update_status_and_id(self,repo_git, task_id, name, session): + # NOTE: Can't simply replace with lib method because it is doing .collection_status[0] afterwards + repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() #Set status in database to collecting repoStatus = repo.collection_status[0] # setattr(repoStatus,f"{name}_task_id",task_id) setattr(repoStatus,f"{name}_status", CollectionState.COLLECTING.value) - self.session.commit() + session.commit() def start_data_collection(self): @@ -564,8 +561,11 @@ def start_data_collection(self): #Send messages starts each repo and yields its running info #to concurrently update the correct field in the database. 
- for repo_git, task_id, hook_name in self.send_messages(): - self.update_status_and_id(repo_git,task_id,hook_name) + + with get_session() as session: + + for repo_git, task_id, hook_name in self.send_messages(): + self.update_status_and_id(repo_git,task_id,hook_name, session) def send_messages(self): augur_collection_list = [] @@ -576,7 +576,7 @@ def send_messages(self): for repo_git in col_hook.repo_list: - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + repo = get_repo_by_repo_git(repo_git) if "github" in repo.repo_git: augur_collection_sequence = [] for job in col_hook.phases: From 25b6adc0680f8749a2c1a57a8c8e2322e28370c7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 27 Apr 2024 09:59:11 -0500 Subject: [PATCH 042/122] Remove more database Signed-off-by: Andrew Brain --- augur/tasks/github/contributors/tasks.py | 53 ++++++++++++------------ 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 8e6b3fca8d..1976f4de1a 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -8,6 +8,7 @@ from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query from augur.application.db.lib import bulk_insert_dicts +from augur.application.db import get_engine from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @@ -21,48 +22,48 @@ def process_contributors(): tool_version = "2.0" data_source = "Github API" - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - augur_db = manifest.augur_db + with DatabaseSession(logger, get_engine()) as session: - query = augur_db.session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) + query = session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) contributors = execute_session_query(query, 'all') - contributors_len = len(contributors) + contributors_len = len(contributors) - if contributors_len == 0: - logger.info("No contributors to enrich...returning...") - return + if contributors_len == 0: + logger.info("No contributors to enrich...returning...") + return - print(f"Length of contributors to enrich: {contributors_len}") - enriched_contributors = [] - for index, contributor in enumerate(contributors): + print(f"Length of contributors to enrich: {contributors_len}") + enriched_contributors = [] + for index, contributor in enumerate(contributors): - logger.info(f"Contributor {index + 1} of {contributors_len}") + logger.info(f"Contributor {index + 1} of {contributors_len}") - contributor_dict = contributor.__dict__ + contributor_dict = contributor.__dict__ - del contributor_dict["_sa_instance_state"] + del contributor_dict["_sa_instance_state"] - url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" + url = f"https://api.github.com/users/{contributor_dict['cntrb_login']}" - data = retrieve_dict_data(url, manifest.key_auth, logger) + data = retrieve_dict_data(url, key_auth, logger) - if data is None: - print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") - continue + if data is None: + print(f"Unable to get contributor data for: {contributor_dict['cntrb_login']}") + continue - new_contributor_data = { - "cntrb_created_at": data["created_at"], - 
"cntrb_last_used": data["updated_at"] - } + new_contributor_data = { + "cntrb_created_at": data["created_at"], + "cntrb_last_used": data["updated_at"] + } - contributor_dict.update(new_contributor_data) + contributor_dict.update(new_contributor_data) - enriched_contributors.append(contributor_dict) + enriched_contributors.append(contributor_dict) - logger.info(f"Enriching {len(enriched_contributors)} contributors") - bulk_insert_dicts(enriched_contributors, Contributor, ["cntrb_id"]) + logger.info(f"Enriching {len(enriched_contributors)} contributors") + bulk_insert_dicts(enriched_contributors, Contributor, ["cntrb_id"]) From 2153305dd3c7716d87137fda73233e4066a3a20d Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 27 Apr 2024 10:04:20 -0500 Subject: [PATCH 043/122] Remove references to github task manifest Signed-off-by: Andrew Brain --- augur/tasks/github/detect_move/tasks.py | 18 ++-- augur/tasks/github/pull_requests/tasks.py | 89 ++++++++++--------- augur/tasks/github/releases/tasks.py | 10 ++- .../tasks/github/util/github_task_session.py | 18 ---- augur/tasks/github/util/util.py | 1 - 5 files changed, 66 insertions(+), 70 deletions(-) diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index 708173629f..44ba766d4f 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -1,10 +1,12 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.detect_move.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db import get_engine +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db.session import DatabaseSession @@ -19,10 +21,13 @@ def detect_github_repo_move_core(repo_git : str) -> None: logger.info(f"Pinging repo: {repo_git}") - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) + + with DatabaseSession(logger, get_engine()) as session: + #Ping each repo with the given repo_git to make sure #that they are still in place. - ping_github_for_repo_move(manifest.augur_db, manifest.key_auth, repo, logger) + ping_github_for_repo_move(session, key_auth, repo, logger) @celery.task(base=AugurSecondaryRepoCollectionTask) @@ -36,7 +41,10 @@ def detect_github_repo_move_secondary(repo_git : str) -> None: logger.info(f"Pinging repo: {repo_git}") - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) + + with DatabaseSession(logger, get_engine()) as session: + #Ping each repo with the given repo_git to make sure #that they are still in place. 
- ping_github_for_repo_move(manifest.augur_db, manifest.key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file + ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='secondary') \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 682a4aa1e3..d8c2a3e07d 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -5,14 +5,15 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from augur.application.db import get_engine from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db.session import DatabaseSession platform_id = 1 @@ -321,67 +322,69 @@ def collect_pull_request_reviews(repo_git: str) -> None: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) + + with DatabaseSession(logger, get_engine()) as session: - query = manifest.augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) + query = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) prs = execute_session_query(query, 'all') - pr_count = len(prs) + pr_count = len(prs) - all_pr_reviews = {} - for index, pr in enumerate(prs): + all_pr_reviews = {} + for index, pr in enumerate(prs): - pr_number = pr.pr_src_number - pull_request_id = pr.pull_request_id + pr_number = pr.pr_src_number + pull_request_id = pr.pull_request_id - logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") + logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") - pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" + pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" - pr_reviews = [] - pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) - for page_data, page in pr_reviews_generator.iter_pages(): + pr_reviews = [] + pr_reviews_generator = GithubPaginator(pr_review_url, key_auth, logger) + for page_data, page in pr_reviews_generator.iter_pages(): - if page_data is None: - break + if page_data is None: + break - if len(page_data) == 0: - break + if len(page_data) == 0: + break - pr_reviews.extend(page_data) - - if pr_reviews: - all_pr_reviews[pull_request_id] = pr_reviews + pr_reviews.extend(page_data) + + if pr_reviews: + all_pr_reviews[pull_request_id] = pr_reviews - if not list(all_pr_reviews.keys()): - logger.info(f"{owner}/{repo} No pr reviews for repo") - return + if not 
list(all_pr_reviews.keys()): + logger.info(f"{owner}/{repo} No pr reviews for repo") + return - contributors = [] - for pull_request_id in all_pr_reviews.keys(): + contributors = [] + for pull_request_id in all_pr_reviews.keys(): - reviews = all_pr_reviews[pull_request_id] - for review in reviews: - contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) - if contributor: - contributors.append(contributor) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) + if contributor: + contributors.append(contributor) - logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") + bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) - pr_reviews = [] - for pull_request_id in all_pr_reviews.keys(): + pr_reviews = [] + for pull_request_id in all_pr_reviews.keys(): - reviews = all_pr_reviews[pull_request_id] - for review in reviews: - - if "cntrb_id" in review: - pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + + if "cntrb_id" in review: + pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) - logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") - pr_review_natural_keys = ["pr_review_src_id",] - bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) + logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") + pr_review_natural_keys = ["pr_review_src_id",] + bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index 2bfc802d06..1ab86441ba 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -1,10 +1,12 @@ import logging -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.releases.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git +from augur.application.db import get_engine +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.application.db.session import DatabaseSession @celery.task(base=AugurCoreRepoCollectionTask) @@ -15,6 +17,8 @@ def collect_releases(repo_git): repo_obj = get_repo_by_repo_git(repo_git) repo_id = repo_obj.repo_id - with GithubTaskManifest(logger) as manifest: + key_auth = GithubRandomKeyAuth(logger) - releases_model(manifest.augur_db, manifest.key_auth, logger, repo_git, repo_id) \ No newline at end of file + with DatabaseSession(logger, get_engine()) as session: + + releases_model(session, key_auth, logger, repo_git, repo_id) \ No newline at end of file diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index 80c12feb81..a21fbc233a 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -4,24 +4,6 @@ from 
augur.application.db.session import DatabaseSession from augur.application.db import get_engine -class GithubTaskManifest: - - def __init__(self, logger): - - engine = get_engine() - - self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GithubRandomKeyAuth(logger) - self.platform_id = 1 - - def __enter__(self): - - return self - - def __exit__(self, exception_type, exception_value, exception_traceback): - - self.augur_db.close() - class GithubTaskSession(DatabaseSession): """ORM session used in github tasks. diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index f338d67d05..8dd6e4d81b 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -3,7 +3,6 @@ import logging import json import httpx -from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.lib import get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps From 88789597968446a4048297bd41cdc3c33f00d3ef Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 27 Apr 2024 10:11:33 -0500 Subject: [PATCH 044/122] Continue reducing instances of DatabaseSession Signed-off-by: Andrew Brain --- augur/api/routes/config.py | 3 ++- augur/application/config.py | 4 +++- augur/tasks/github/pull_requests/files_model/tasks.py | 7 +++++-- augur/tasks/github/releases/core.py | 10 +++++----- augur/tasks/github/releases/tasks.py | 6 ++---- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/augur/api/routes/config.py b/augur/api/routes/config.py index 6a2f82976e..08618091a9 100644 --- a/augur/api/routes/config.py +++ b/augur/api/routes/config.py @@ -8,6 +8,7 @@ # Disable the requirement for SSL by setting env["AUGUR_DEV"] = True from augur.application.config import get_development_flag +from augur.application.db.lib import get_session from augur.application.db.models import Config from augur.application.config import AugurConfig from augur.application.db.session import DatabaseSession @@ -45,7 +46,7 @@ def update_config(): update_dict = request.get_json() - with DatabaseSession(logger, engine=current_app.engine) as session: + with get_session() as session: for section, data in update_dict.items(): diff --git a/augur/application/config.py b/augur/application/config.py index 8998d6094e..0f6ba1acec 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -4,6 +4,7 @@ from typing import List, Any, Optional import os from augur.application.db.models import Config +from augur.application.db.lib import get_session from augur.application.db.util import execute_session_query def get_development_flag_from_config(): @@ -12,7 +13,7 @@ def get_development_flag_from_config(): from augur.application.db.session import DatabaseSession logger = getLogger(__name__) - with DatabaseSession(logger) as session: + with get_session() as session: config = AugurConfig(logger, session) @@ -288,6 +289,7 @@ def add_or_update_settings(self, settings: List[dict]): query = self.session.query(Config).filter(and_(Config.section_name == setting["section_name"],Config.setting_name == setting["setting_name"]) ) if execute_session_query(query, 'first') is None: + # TODO: Update to use bulk insert dicts so config doesn't require database session self.session.insert_data(setting,Config, ["section_name", "setting_name"]) else: #If setting exists. 
use raw update to not increase autoincrement diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index b5248aa8ba..564278227b 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -5,6 +5,9 @@ from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git from augur.application.db import get_engine +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth + + @@ -17,6 +20,6 @@ def process_pull_request_files(repo_git: str) -> None: repo = get_repo_by_repo_git(repo_git) - with DatabaseSession(logger, engine=engine) as session: + key_auth = GithubRandomKeyAuth(logger) - pull_request_files_model(repo, logger, session) \ No newline at end of file + pull_request_files_model(repo, logger, key_auth) \ No newline at end of file diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index 20d9c3bb32..3192401ae3 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -64,11 +64,11 @@ def get_release_inf(repo_id, release, tag_only): return release_inf -def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): +def insert_release(session, logger, repo_id, owner, release, tag_only = False): # Get current table values logger.info('Getting release table values\n') - query = augur_db.session.query(Release.release_id).filter(Release.repo_id == repo_id) + query = session.query(Release.release_id).filter(Release.repo_id == repo_id) release_id_data = execute_session_query(query, 'all')#pd.read_sql(release_id_data_sql, self.db, params={'repo_id': repo_id}) release_id_data = [str(r_id).strip() for r_id in release_id_data]#release_id_data.apply(lambda x: x.str.strip()) @@ -167,7 +167,7 @@ def fetch_data(key_auth, logger, github_url, repo_id, tag_only = False): return data -def releases_model(augur_db, key_auth, logger, repo_git, repo_id): +def releases_model(session, key_auth, logger, repo_git, repo_id): try: data = fetch_data(key_auth, logger, repo_git, repo_id) @@ -182,7 +182,7 @@ def releases_model(augur_db, key_auth, logger, repo_git, repo_id): if 'node' in n: release = n['node'] #self.insert_release(task, repo_id, data['owner'], release) - insert_release(augur_db, logger, repo_id, data['owner'], release) + insert_release(session, logger, repo_id, data['owner'], release) else: logger.info("There's no release to insert. Current node is not available in releases: {}\n".format(n)) elif 'edges' in data['releases'] and not data['releases']['edges']: @@ -195,7 +195,7 @@ def releases_model(augur_db, key_auth, logger, repo_git, repo_id): if 'node' in n: release = n['node'] #self.insert_release(task, repo_id, data['owner'], release, True) - insert_release(augur_db,logger, repo_id, data['owner'], release, True) + insert_release(session, logger, repo_id, data['owner'], release, True) else: logger.info("There's no release to insert. 
Current node is not available in releases: {}\n".format(n)) else: diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index 1ab86441ba..3e2210a7c9 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -3,10 +3,8 @@ from augur.tasks.github.releases.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git -from augur.application.db import get_engine +from augur.application.db.lib import get_repo_by_repo_git, get_session from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.application.db.session import DatabaseSession @celery.task(base=AugurCoreRepoCollectionTask) @@ -19,6 +17,6 @@ def collect_releases(repo_git): key_auth = GithubRandomKeyAuth(logger) - with DatabaseSession(logger, get_engine()) as session: + with get_session() as session: releases_model(session, key_auth, logger, repo_git, repo_id) \ No newline at end of file From 59d5ee45e7a7ba5c75f3d16cce5896bf475b89cb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 27 Apr 2024 10:45:00 -0500 Subject: [PATCH 045/122] Remove GitlabTaskManifest Signed-off-by: Andrew Brain --- augur/tasks/github/contributors/tasks.py | 5 +- augur/tasks/github/detect_move/tasks.py | 9 +- .../github/pull_requests/files_model/tasks.py | 4 - augur/tasks/github/pull_requests/tasks.py | 6 +- augur/tasks/gitlab/events_task.py | 33 +++---- augur/tasks/gitlab/gitlab_task_session.py | 29 ------ augur/tasks/gitlab/issues_task.py | 18 ++-- augur/tasks/gitlab/merge_request_task.py | 90 ++++++++++--------- augur/tasks/init/celery_app.py | 4 +- augur/tasks/start_tasks.py | 4 +- augur/tasks/util/collection_util.py | 17 ++-- 11 files changed, 95 insertions(+), 124 deletions(-) diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 1976f4de1a..8c2eed255a 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -7,8 +7,7 @@ from augur.tasks.github.facade_github.tasks import * from augur.application.db.models import Contributor from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts -from augur.application.db import get_engine +from augur.application.db.lib import bulk_insert_dicts, get_session from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @@ -24,7 +23,7 @@ def process_contributors(): key_auth = GithubRandomKeyAuth(logger) - with DatabaseSession(logger, get_engine()) as session: + with get_session() as session: query = session.query(Contributor).filter(Contributor.data_source == data_source, Contributor.cntrb_created_at is None, Contributor.cntrb_last_used is None) contributors = execute_session_query(query, 'all') diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index 44ba766d4f..f542d89289 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -3,11 +3,8 @@ from augur.tasks.github.detect_move.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git -from augur.application.db import get_engine +from augur.application.db.lib import get_repo_by_repo_git, 
get_session from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.application.db.session import DatabaseSession - @celery.task(base=AugurCoreRepoCollectionTask) @@ -23,7 +20,7 @@ def detect_github_repo_move_core(repo_git : str) -> None: key_auth = GithubRandomKeyAuth(logger) - with DatabaseSession(logger, get_engine()) as session: + with get_session() as session: #Ping each repo with the given repo_git to make sure #that they are still in place. @@ -43,7 +40,7 @@ def detect_github_repo_move_secondary(repo_git : str) -> None: key_auth = GithubRandomKeyAuth(logger) - with DatabaseSession(logger, get_engine()) as session: + with get_session() as session: #Ping each repo with the given repo_git to make sure #that they are still in place. diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 564278227b..134e05e900 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -1,6 +1,5 @@ import logging from augur.tasks.github.pull_requests.files_model.core import * -from augur.application.db.session import DatabaseSession from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask from augur.application.db.lib import get_repo_by_repo_git @@ -8,9 +7,6 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth - - - @celery.task(base=AugurSecondaryRepoCollectionTask) def process_pull_request_files(repo_git: str) -> None: diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index d8c2a3e07d..3a36f638c0 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -8,12 +8,10 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id +from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors -from augur.application.db import get_engine from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth -from augur.application.db.session import DatabaseSession platform_id = 1 @@ -324,7 +322,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: key_auth = GithubRandomKeyAuth(logger) - with DatabaseSession(logger, get_engine()) as session: + with get_session() as session: query = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) prs = execute_session_query(query, 'all') diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index 6a5b4ce5d3..5d36e88556 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -6,12 +6,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from 
augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent -from augur.application.db.util import execute_session_query -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git +from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth + platform_id = 2 @@ -30,13 +30,15 @@ def collect_gitlab_issue_events(repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) + + events = retrieve_all_gitlab_event_data("issue", repo_git, logger, key_auth) - events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) + with get_session() as session: if events: logger.info(f"Length of gitlab issue events: {len(events)}") - process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, manifest.augur_db) + process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab issue events") @@ -50,20 +52,21 @@ def collect_gitlab_merge_request_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) + + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, key_auth) - events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) + with get_session() as session: if events: logger.info(f"Length of gitlab merge request events: {len(events)}") - process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, manifest.augur_db) + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request events") @@ -105,7 +108,7 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: return all_data -def process_issue_events(events, task_name, repo_id, logger, augur_db): +def process_issue_events(events, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -125,7 +128,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): # create mapping from issue number to issue id of current issues issue_url_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id @@ -151,7 +154,7 @@ def process_issue_events(events, task_name, repo_id, logger, augur_db): bulk_insert_dicts(logger, issue_event_dicts, IssueEvent, issue_event_natural_keys) -def process_mr_events(events, task_name, repo_id, logger, augur_db): +def process_mr_events(events, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr events from the api response 
@@ -175,7 +178,7 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py index 52b1cf879b..3f65f89f42 100644 --- a/augur/tasks/gitlab/gitlab_task_session.py +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -7,35 +7,6 @@ from augur.application.db.session import DatabaseSession from augur.application.db import get_engine -class GitlabTaskManifest: - """ - Manifest object that represents the state and common elements of - the specified task. GitLab version for the GitLab tasks. - - Attributes: - augur_db: sqlalchemy db object - key_auth: GitLab specific key auth retrieval collection - logger: logging object - platform_id: GitLab specific platform id (github is 1) - """ - - def __init__(self, logger): - - engine = get_engine() - - self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GitlabRandomKeyAuth(logger) - self.logger = logger - self.platform_id = 2 - - def __enter__(self): - - return self - - def __exit__(self, exception_type, exception_value, exception_traceback): - - self.augur_db.close() - class GitlabTaskSession(DatabaseSession): """ORM session used in gitlab tasks. This class adds the platform_id and the gitlab key authentication class, diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index b6f7a6b2b6..495dd11e5f 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -7,13 +7,11 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo, Contributor -from augur.application.db.util import execute_session_query +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth platform_id = 2 @@ -215,13 +213,15 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) + + comments = retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git) - comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + with get_session() as session: if 
comments: logger.info(f"Length of comments: {len(comments)}") - process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, manifest.augur_db) + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab issue comments") @@ -266,7 +266,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): return all_comments -def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): +def process_gitlab_issue_messages(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for issue messages from the api response @@ -284,7 +284,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs issue_number_to_id_map = {} - issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + issues = session.session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index e36d0b6acc..f5c2895941 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -3,13 +3,12 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler -from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee -from augur.application.db.util import execute_session_query +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git +from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session platform_id = 2 @@ -29,17 +28,17 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: owner, repo = get_owner_repo(repo_git) - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) - mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) + mr_data = retrieve_all_mr_data(repo_git, logger, key_auth) - if mr_data: - mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, manifest.augur_db) + if mr_data: + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger) - return mr_ids - else: - logger.info(f"{owner}/{repo} has no merge requests") - return [] + return mr_ids + else: + logger.info(f"{owner}/{repo} has no merge requests") + return [] def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: @@ 
-79,7 +78,7 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: return all_data -def process_merge_requests(data, task_name, repo_id, logger, augur_db): +def process_merge_requests(data, task_name, repo_id, logger): """ Retrieve only the needed data for mr label data from the api response @@ -88,7 +87,6 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object Returns: List of parsed MR ids. @@ -179,19 +177,21 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") - comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list") + + with get_session() as session: if comments: logger.info(f"Length of merge request comments: {len(comments)}") - process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, manifest.augur_db) + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request comments") -def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): +def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -209,7 +209,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -288,18 +288,20 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") - metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") + with get_session() as session: if metadata_list: logger.info(f"Length of merge request metadata: {len(metadata_list)}") - process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, manifest.augur_db) + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request metadata") -def process_mr_metadata(data, task_name, 
repo_id, logger, augur_db): +def process_mr_metadata(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr label data from the api response @@ -317,7 +319,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -351,18 +353,20 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") - reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") + with get_session() as session: if reviewers: logger.info(f"Length of merge request reviewers: {len(reviewers)}") - process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, manifest.augur_db) + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") -def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): +def process_mr_reviewers(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr Reviewer data from the api response @@ -381,7 +385,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -416,19 +420,21 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") - commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list") + + with get_session() as session: if commits: logger.info(f"Length of merge request commits: {len(commits)}") - process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, manifest.augur_db) + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request commits") -def process_mr_commits(data, task_name, repo_id, logger, augur_db): +def 
process_mr_commits(data, task_name, repo_id, logger, session): """ Retrieve only the needed data for mr commits from the api response @@ -446,7 +452,7 @@ def process_mr_commits(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id @@ -482,14 +488,16 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - with GitlabTaskManifest(logger) as manifest: + key_auth = GitlabRandomKeyAuth(logger) + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict") - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") - files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") + with get_session() as session: if files: logger.info(f"Length of merge request files: {len(files)}") - process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, manifest.augur_db) + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, session) else: logger.info(f"{owner}/{repo} has no gitlab merge request files") diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index a18284186c..e57fb674d2 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -13,8 +13,8 @@ from augur.application.logs import TaskLogConfig, AugurLogger from augur.application.db.session import DatabaseSession -from augur.application.db.engine import DatabaseEngine from augur.application.db import get_engine +from augur.application.db.lib import get_session from augur.application.config import AugurConfig from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from augur.application.db.models import Repo @@ -83,7 +83,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h logger.error(f"Task {task_id} raised exception: {exc}\n Traceback: {''.join(traceback.format_exception(None, exc, exc.__traceback__))}") - with DatabaseSession(logger,engine) as session: + with get_session() as session: logger.info(f"Repo git: {repo_git}") repo = session.query(Repo).filter(Repo.repo_git == repo_git).one() diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 4fbf70c5bd..ca1401d88d 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -27,7 +27,7 @@ from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor -from augur.application.db.lib import execute_sql +from augur.application.db.lib import execute_sql, get_session CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -280,7 +280,7 @@ def augur_collection_update_weights(self): logger.info("Updating stale collection weights") - with DatabaseSession(logger,engine) as session: + with get_session() as session: core_weight_update_repos = 
session.query(CollectionStatus).filter(CollectionStatus.core_weight != None).all() diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index e81b25d82e..efec0b1904 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -12,7 +12,6 @@ from augur.application.db.util import execute_session_query from augur.application.db.lib import get_section from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue -from augur.application.db.session import DatabaseSession from augur.application.db import get_engine from augur.application.db.lib import execute_sql, get_session, get_active_repo_count, get_repo_by_repo_git from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps @@ -222,7 +221,7 @@ def task_failed_util(self, request,exc,traceback): # log traceback to error file logger.error(f"Task {request.id} raised exception: {exc}\n{traceback}") - with DatabaseSession(logger,engine) as session: + with get_session() as session: core_id_match = CollectionStatus.core_task_id == request.id secondary_id_match = CollectionStatus.secondary_task_id == request.id facade_id_match = CollectionStatus.facade_task_id == request.id @@ -281,7 +280,7 @@ def issue_pr_task_update_weight_util(self, issue_and_pr_nums,repo_git=None,sessi if session is not None: update_issue_pr_weights(logger, session, repo_git, sum(issue_and_pr_nums)) else: - with DatabaseSession(logger,engine=engine) as session: + with get_session() as session: update_issue_pr_weights(logger,session,repo_git,sum(issue_and_pr_nums)) @@ -294,7 +293,7 @@ def core_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through core collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -361,7 +360,7 @@ def secondary_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through secondary collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -387,7 +386,7 @@ def get_repo_weight_secondary(logger,repo_git): engine = get_engine() - with DatabaseSession(logger,engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") @@ -414,7 +413,7 @@ def facade_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through facade task collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -437,7 +436,7 @@ def ml_task_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through machine learning task collection") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: @@ -462,7 +461,7 @@ def facade_clone_success_util(self, repo_git): logger.info(f"Repo '{repo_git}' succeeded through facade update/clone") - with DatabaseSession(logger, engine) as session: + with get_session() as session: repo = Repo.get_by_repo_git(session, repo_git) if not repo: From a3a5406bbb4060e16a098b2c9e1800419e530b1b Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 27 Apr 2024 10:47:09 -0500 Subject: [PATCH 046/122] Fix some syntax errors and comments 
Signed-off-by: Andrew Brain --- augur/tasks/github/detect_move/core.py | 6 +++--- augur/tasks/github/pull_requests/tasks.py | 1 - augur/tasks/gitlab/events_task.py | 2 +- augur/tasks/gitlab/issues_task.py | 4 ++-- augur/tasks/gitlab/merge_request_task.py | 12 ++++++------ 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index ca916d744e..db005ce22c 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -44,7 +44,7 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger): return splits[0], splits[-1] -def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook='core'): +def ping_github_for_repo_move(session, key_auth, repo, logger,collection_hook='core'): owner, name = get_owner_repo(repo.repo_git) url = f"https://api.github.com/repos/{owner}/{name}" @@ -93,7 +93,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' update_repo_with_dict(repo, repo_update_dict, logger) - statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + statusQuery = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) collectionRecord = execute_session_query(statusQuery,'one') @@ -114,7 +114,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - augur_db.session.commit() + session.commit() raise Exception("ERROR: Repo has moved! Resetting Collection!") diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 3a36f638c0..5b823d4f8c 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -79,7 +79,6 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): task_name: Name of the calling task and the repo repo_id: augur id of the repository logger: logging object - augur_db: sqlalchemy db object """ tool_source = "Pr Task" tool_version = "2.0" diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index 5d36e88556..c8d9a8f8a7 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -117,7 +117,7 @@ def process_issue_events(events, task_name, repo_id, logger, session): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab issue events task" diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 495dd11e5f..8a987a7744 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -92,7 +92,7 @@ def process_issues(issues, task_name, repo_id, logger) -> None: task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ # get repo_id or have it passed @@ -275,7 +275,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, session): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab issue comments" diff --git a/augur/tasks/gitlab/merge_request_task.py 
b/augur/tasks/gitlab/merge_request_task.py index f5c2895941..5e56067c53 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -200,7 +200,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, session): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Gitlab mr comments" @@ -310,7 +310,7 @@ def process_mr_metadata(data, task_name, repo_id, logger, session): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Metadata Task" @@ -374,7 +374,7 @@ def process_mr_reviewers(data, task_name, repo_id, logger, session): data: List of dictionaries of mr Reviewer data repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Reviewer Task" @@ -443,7 +443,7 @@ def process_mr_commits(data, task_name, repo_id, logger, session): task_name: name of the task as well as the repo being processed repo_id: augur id of the repo logger: logging object - augur_db: sqlalchemy db object + session: sqlalchemy db object """ tool_source = "Mr Commit Task" @@ -501,7 +501,7 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: else: logger.info(f"{owner}/{repo} has no gitlab merge request files") -def process_mr_files(data, task_name, repo_id, logger, augur_db): +def process_mr_files(data, task_name, repo_id, logger, session): tool_source = "Mr files Task" tool_version = "2.0" @@ -509,7 +509,7 @@ def process_mr_files(data, task_name, repo_id, logger, augur_db): # create mapping from mr number to pull request id of current mrs mr_number_to_id_map = {} - mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + mrs = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for mr in mrs: mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id From 1495ef2c54d00cd2a947d2badf6e271b3c54dc88 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sun, 28 Apr 2024 16:36:50 -0500 Subject: [PATCH 047/122] Fix multiprocessing issue Signed-off-by: Andrew Brain --- augur/application/config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/augur/application/config.py b/augur/application/config.py index 0f6ba1acec..bfda4c8773 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -4,7 +4,6 @@ from typing import List, Any, Optional import os from augur.application.db.models import Config -from augur.application.db.lib import get_session from augur.application.db.util import execute_session_query def get_development_flag_from_config(): @@ -13,7 +12,7 @@ def get_development_flag_from_config(): from augur.application.db.session import DatabaseSession logger = getLogger(__name__) - with get_session() as session: + with DatabaseSession(logger) as session: config = AugurConfig(logger, session) From 33e639c273fec52dd9a8fe8ca795de0d9ff608ca Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Mon, 6 May 2024 04:12:04 -0500 Subject: [PATCH 048/122] Attempt to clarify profile page terminology, fix redirects Signed-off-by: Ulincsys --- augur/templates/settings.j2 | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/augur/templates/settings.j2 b/augur/templates/settings.j2 
index c75b6522ad..c10a0c914c 100644 --- a/augur/templates/settings.j2 +++ b/augur/templates/settings.j2 @@ -56,7 +56,7 @@
- Repo Tracker + My Repos
@@ -170,7 +170,7 @@