From 4db0b6a7f83f1e87ebe8d52ad29c5d19e8f05202 Mon Sep 17 00:00:00 2001 From: Liza Kozlova Date: Thu, 12 Oct 2023 09:44:03 +0000 Subject: [PATCH 1/3] chore: update gitignore --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1de96b7..8918e90 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,8 @@ rcsbsearch/ pulchra304 tmp/ all_structures/ -all_structures.zip \ No newline at end of file +all_structures.zip +esmfold_output/ +igfold_output/ +*.csv +run_metrics.py \ No newline at end of file From ffb0038310d98f91df024dd09ff10cf9a13bcdd0 Mon Sep 17 00:00:00 2001 From: Liza Kozlova Date: Thu, 12 Oct 2023 10:44:24 +0000 Subject: [PATCH 2/3] fix: excluded clusters dictionary bug --- .gitignore | 1 - proteinflow/split/__init__.py | 24 +++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8918e90..a6f66b6 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ rcsbsearch/ .pytest_cache/ *ipynb ./*zip -pulchra304 tmp/ all_structures/ all_structures.zip diff --git a/proteinflow/split/__init__.py b/proteinflow/split/__init__.py index 3cadc4a..838be28 100644 --- a/proteinflow/split/__init__.py +++ b/proteinflow/split/__init__.py @@ -1430,8 +1430,8 @@ def _split_data( if len(excluded_files) > 0: set_to_exclude = set(excluded_files) excluded_files = set() - excluded_clusters_dict = defaultdict(set) if exclude_clusters: + excluded_clusters_dict = defaultdict(set) for clusters_dict in [ train_clusters_dict, valid_clusters_dict, @@ -1442,6 +1442,28 @@ def _split_data( ) excluded_files.update(subset_excluded_set) excluded_clusters_dict.update(subset_excluded_dict) + else: + excluded_clusters_dict = defaultdict(list) + for clusters_dict in [ + train_clusters_dict, + valid_clusters_dict, + test_clusters_dict, + ]: + for cluster in list(clusters_dict.keys()): + idx_to_include = [] + for i, chain in enumerate(clusters_dict[cluster]): + if chain[0] in set_to_exclude: + excluded_clusters_dict[cluster].append(chain) + else: + idx_to_include.append(i) + if len(idx_to_include) == 0: + clusters_dict.pop(cluster) + else: + clusters_dict[cluster] = clusters_dict[cluster][idx_to_include] + if cluster in excluded_clusters_dict: + excluded_clusters_dict[cluster] = np.array( + excluded_clusters_dict[cluster] + ) excluded_files.update(set_to_exclude) excluded_clusters_dict = {k: list(v) for k, v in excluded_clusters_dict.items()} excluded_path = os.path.join(dataset_path, "excluded") From 9d538b6b3e30006170c2e053224977583df7f7e9 Mon Sep 17 00:00:00 2001 From: Liza Kozlova Date: Thu, 12 Oct 2023 11:13:51 +0000 Subject: [PATCH 3/3] fix: splitting issues with excluding clusters + creating cluster dictionaries --- proteinflow/split/__init__.py | 68 ++++++++++++++++++----------------- proteinflow/split/utils.py | 2 +- 2 files changed, 36 insertions(+), 34 deletions(-) diff --git a/proteinflow/split/__init__.py b/proteinflow/split/__init__.py index 838be28..a14e9a2 100644 --- a/proteinflow/split/__init__.py +++ b/proteinflow/split/__init__.py @@ -1430,40 +1430,38 @@ def _split_data( if len(excluded_files) > 0: set_to_exclude = set(excluded_files) excluded_files = set() - if exclude_clusters: - excluded_clusters_dict = defaultdict(set) - for clusters_dict in [ - train_clusters_dict, - valid_clusters_dict, - test_clusters_dict, - ]: - subset_excluded_set, subset_excluded_dict = _exclude( - clusters_dict, set_to_exclude, exclude_based_on_cdr - ) - excluded_files.update(subset_excluded_set) - excluded_clusters_dict.update(subset_excluded_dict) - else: - excluded_clusters_dict = defaultdict(list) - for clusters_dict in [ - train_clusters_dict, - valid_clusters_dict, - test_clusters_dict, - ]: - for cluster in list(clusters_dict.keys()): - idx_to_include = [] - for i, chain in enumerate(clusters_dict[cluster]): - if chain[0] in set_to_exclude: - excluded_clusters_dict[cluster].append(chain) - else: - idx_to_include.append(i) - if len(idx_to_include) == 0: + excluded_clusters_dict = defaultdict(list) + for clusters_dict in [ + train_clusters_dict, + valid_clusters_dict, + test_clusters_dict, + ]: + for cluster in list(clusters_dict.keys()): + idx_to_exclude = [] + exclude_whole_cluster = False + for i, chain in enumerate(clusters_dict[cluster]): + if chain[0] in set_to_exclude: + if exclude_clusters: + if exclude_based_on_cdr is not None and cluster.endswith( + exclude_based_on_cdr + ): + exclude_whole_cluster = True + elif exclude_based_on_cdr is None: + exclude_whole_cluster = True + if exclude_whole_cluster: + break + excluded_clusters_dict[cluster].append(chain) + idx_to_exclude.append(i) + if exclude_whole_cluster: + excluded_clusters_dict[cluster] = clusters_dict.pop(cluster) + else: + clusters_dict[cluster] = [ + x + for i, x in enumerate(clusters_dict[cluster]) + if i not in idx_to_exclude + ] + if len(clusters_dict[cluster]) == 0: clusters_dict.pop(cluster) - else: - clusters_dict[cluster] = clusters_dict[cluster][idx_to_include] - if cluster in excluded_clusters_dict: - excluded_clusters_dict[cluster] = np.array( - excluded_clusters_dict[cluster] - ) excluded_files.update(set_to_exclude) excluded_clusters_dict = {k: list(v) for k, v in excluded_clusters_dict.items()} excluded_path = os.path.join(dataset_path, "excluded") @@ -1472,6 +1470,10 @@ def _split_data( print("Updating the split dictionaries...") with open(os.path.join(dict_folder, "train.pickle"), "wb") as f: pickle.dump(train_clusters_dict, f) + with open(os.path.join(dict_folder, "valid.pickle"), "wb") as f: + pickle.dump(valid_clusters_dict, f) + with open(os.path.join(dict_folder, "test.pickle"), "wb") as f: + pickle.dump(test_clusters_dict, f) with open(os.path.join(dict_folder, "excluded.pickle"), "wb") as f: pickle.dump(excluded_clusters_dict, f) print("Moving excluded files...") diff --git a/proteinflow/split/utils.py b/proteinflow/split/utils.py index 996fbea..2f9408b 100644 --- a/proteinflow/split/utils.py +++ b/proteinflow/split/utils.py @@ -184,7 +184,7 @@ def _exclude(clusters_dict, set_to_exclude, exclude_based_on_cdr=None): files = clusters_dict[cluster] exclude = False for biounit in files: - if biounit in set_to_exclude: + if biounit[0] in set_to_exclude: exclude = True break if exclude: