From 4db0b6a7f83f1e87ebe8d52ad29c5d19e8f05202 Mon Sep 17 00:00:00 2001
From: Liza Kozlova <liza@adaptyvbio.com>
Date: Thu, 12 Oct 2023 09:44:03 +0000
Subject: [PATCH 1/3] chore: update gitignore

---
 .gitignore | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 1de96b7..8918e90 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,8 @@ rcsbsearch/
 pulchra304
 tmp/
 all_structures/
-all_structures.zip
\ No newline at end of file
+all_structures.zip
+esmfold_output/
+igfold_output/
+*.csv
+run_metrics.py
\ No newline at end of file

From ffb0038310d98f91df024dd09ff10cf9a13bcdd0 Mon Sep 17 00:00:00 2001
From: Liza Kozlova <liza@adaptyvbio.com>
Date: Thu, 12 Oct 2023 10:44:24 +0000
Subject: [PATCH 2/3] fix: record excluded chains when exclude_clusters is off

---
 .gitignore                    |  1 -
 proteinflow/split/__init__.py | 24 +++++++++++++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8918e90..a6f66b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,6 @@ rcsbsearch/
 .pytest_cache/
 *ipynb
 ./*zip
-pulchra304
 tmp/
 all_structures/
 all_structures.zip
diff --git a/proteinflow/split/__init__.py b/proteinflow/split/__init__.py
index 3cadc4a..838be28 100644
--- a/proteinflow/split/__init__.py
+++ b/proteinflow/split/__init__.py
@@ -1430,8 +1430,8 @@ def _split_data(
     if len(excluded_files) > 0:
         set_to_exclude = set(excluded_files)
         excluded_files = set()
-        excluded_clusters_dict = defaultdict(set)
         if exclude_clusters:
+            excluded_clusters_dict = defaultdict(set)
             for clusters_dict in [
                 train_clusters_dict,
                 valid_clusters_dict,
@@ -1442,6 +1442,28 @@ def _split_data(
                 )
                 excluded_files.update(subset_excluded_set)
                 excluded_clusters_dict.update(subset_excluded_dict)
+        else:
+            excluded_clusters_dict = defaultdict(list)
+            for clusters_dict in [
+                train_clusters_dict,
+                valid_clusters_dict,
+                test_clusters_dict,
+            ]:
+                for cluster in list(clusters_dict.keys()):
+                    idx_to_include = []
+                    for i, chain in enumerate(clusters_dict[cluster]):
+                        if chain[0] in set_to_exclude:
+                            excluded_clusters_dict[cluster].append(chain)
+                        else:
+                            idx_to_include.append(i)
+                    if len(idx_to_include) == 0:
+                        clusters_dict.pop(cluster)
+                    else:
+                        clusters_dict[cluster] = clusters_dict[cluster][idx_to_include]
+                    if cluster in excluded_clusters_dict:
+                        excluded_clusters_dict[cluster] = np.array(
+                            excluded_clusters_dict[cluster]
+                        )
         excluded_files.update(set_to_exclude)
         excluded_clusters_dict = {k: list(v) for k, v in excluded_clusters_dict.items()}
         excluded_path = os.path.join(dataset_path, "excluded")

From 9d538b6b3e30006170c2e053224977583df7f7e9 Mon Sep 17 00:00:00 2001
From: Liza Kozlova <liza@adaptyvbio.com>
Date: Thu, 12 Oct 2023 11:13:51 +0000
Subject: [PATCH 3/3] fix: splitting issues with excluding clusters + creating
 cluster dictionaries

---
 proteinflow/split/__init__.py | 68 ++++++++++++++++++-----------------
 proteinflow/split/utils.py    |  2 +-
 2 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/proteinflow/split/__init__.py b/proteinflow/split/__init__.py
index 838be28..a14e9a2 100644
--- a/proteinflow/split/__init__.py
+++ b/proteinflow/split/__init__.py
@@ -1430,40 +1430,38 @@ def _split_data(
     if len(excluded_files) > 0:
         set_to_exclude = set(excluded_files)
         excluded_files = set()
-        if exclude_clusters:
-            excluded_clusters_dict = defaultdict(set)
-            for clusters_dict in [
-                train_clusters_dict,
-                valid_clusters_dict,
-                test_clusters_dict,
-            ]:
-                subset_excluded_set, subset_excluded_dict = _exclude(
-                    clusters_dict, set_to_exclude, exclude_based_on_cdr
-                )
-                excluded_files.update(subset_excluded_set)
-                excluded_clusters_dict.update(subset_excluded_dict)
-        else:
-            excluded_clusters_dict = defaultdict(list)
-            for clusters_dict in [
-                train_clusters_dict,
-                valid_clusters_dict,
-                test_clusters_dict,
-            ]:
-                for cluster in list(clusters_dict.keys()):
-                    idx_to_include = []
-                    for i, chain in enumerate(clusters_dict[cluster]):
-                        if chain[0] in set_to_exclude:
-                            excluded_clusters_dict[cluster].append(chain)
-                        else:
-                            idx_to_include.append(i)
-                    if len(idx_to_include) == 0:
+        excluded_clusters_dict = defaultdict(list)
+        for clusters_dict in [
+            train_clusters_dict,
+            valid_clusters_dict,
+            test_clusters_dict,
+        ]:
+            for cluster in list(clusters_dict.keys()):
+                idx_to_exclude = []
+                exclude_whole_cluster = False
+                for i, chain in enumerate(clusters_dict[cluster]):
+                    if chain[0] in set_to_exclude:
+                        if exclude_clusters:
+                            if exclude_based_on_cdr is not None and cluster.endswith(
+                                exclude_based_on_cdr
+                            ):
+                                exclude_whole_cluster = True
+                            elif exclude_based_on_cdr is None:
+                                exclude_whole_cluster = True
+                        if exclude_whole_cluster:
+                            break
+                        excluded_clusters_dict[cluster].append(chain)
+                        idx_to_exclude.append(i)
+                if exclude_whole_cluster:
+                    excluded_clusters_dict[cluster] = clusters_dict.pop(cluster)
+                else:
+                    clusters_dict[cluster] = [
+                        x
+                        for i, x in enumerate(clusters_dict[cluster])
+                        if i not in idx_to_exclude
+                    ]
+                    if len(clusters_dict[cluster]) == 0:
                         clusters_dict.pop(cluster)
-                    else:
-                        clusters_dict[cluster] = clusters_dict[cluster][idx_to_include]
-                    if cluster in excluded_clusters_dict:
-                        excluded_clusters_dict[cluster] = np.array(
-                            excluded_clusters_dict[cluster]
-                        )
         excluded_files.update(set_to_exclude)
         excluded_clusters_dict = {k: list(v) for k, v in excluded_clusters_dict.items()}
         excluded_path = os.path.join(dataset_path, "excluded")
@@ -1472,6 +1470,10 @@ def _split_data(
         print("Updating the split dictionaries...")
         with open(os.path.join(dict_folder, "train.pickle"), "wb") as f:
             pickle.dump(train_clusters_dict, f)
+        with open(os.path.join(dict_folder, "valid.pickle"), "wb") as f:
+            pickle.dump(valid_clusters_dict, f)
+        with open(os.path.join(dict_folder, "test.pickle"), "wb") as f:
+            pickle.dump(test_clusters_dict, f)
         with open(os.path.join(dict_folder, "excluded.pickle"), "wb") as f:
             pickle.dump(excluded_clusters_dict, f)
         print("Moving excluded files...")
diff --git a/proteinflow/split/utils.py b/proteinflow/split/utils.py
index 996fbea..2f9408b 100644
--- a/proteinflow/split/utils.py
+++ b/proteinflow/split/utils.py
@@ -184,7 +184,7 @@ def _exclude(clusters_dict, set_to_exclude, exclude_based_on_cdr=None):
         files = clusters_dict[cluster]
         exclude = False
         for biounit in files:
-            if biounit in set_to_exclude:
+            if biounit[0] in set_to_exclude:
                 exclude = True
                 break
         if exclude: