remove old constraint in optimization #6

Open · wants to merge 1 commit into master
2 changes: 1 addition & 1 deletion eda.smk
@@ -34,7 +34,7 @@ rule eda_msa:
             mv_problems()
 
     # list of (path to) msas
-    list_paths = list(Path(config["PATH_MSAS"]).rglob("*.fa"))
+    list_paths = list(Path(config["PATH_MSAS"]).rglob("*.[fa]*"))
 
     with ThreadPoolExecutor(max_workers=16) as pool:
         with tqdm(total=len(list_paths)) as progress:
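Note on the widened glob: "[fa]" is a one-character class, so "*.[fa]*" accepts any extension whose first letter is "f" or "a" (.fa, .fasta, .fna, .aln, ...), not only .fa. A quick check with fnmatch, which uses the same pattern syntax as pathlib globbing (the file names here are hypothetical):

    from fnmatch import fnmatch

    # The class [fa] matches a single 'f' or 'a'; the trailing * matches the
    # rest of the name, possibly empty.
    names = ["seqs.fa", "seqs.fasta", "genes.fna", "block.aln", "notes.txt"]
    print([n for n in names if fnmatch(n, "*.[fa]*")])
    # ['seqs.fa', 'seqs.fasta', 'genes.fna', 'block.aln']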
8 changes: 4 additions & 4 deletions pangeblock.smk
@@ -11,7 +11,7 @@
 STATS_MSAS = pd.read_csv(
     index_col=False, sep="\t"
 )
-NAMES = STATS_MSAS["path_msa"].apply(lambda path: Path(path).stem) # txt file with names of the MSAs
+NAMES = STATS_MSAS[["path_msa","n_seqs"]].query("n_seqs>1")["path_msa"].apply(lambda path: Path(path).stem)
 # ---
 
 rule all:
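The new NAMES expression also filters out single-sequence MSAs before deriving target names. A small self-contained sketch of the same pandas chain on toy data (paths and counts are made up):

    import pandas as pd
    from pathlib import Path

    # Toy stand-in for the stats table produced by the EDA step.
    stats = pd.DataFrame({
        "path_msa": ["msas/a.fasta", "msas/b.fasta", "msas/c.fasta"],
        "n_seqs": [1, 5, 12],
    })

    # query("n_seqs>1") drops MSAs with a single sequence; Path(...).stem
    # strips directory and extension, leaving the bare MSA name.
    names = stats[["path_msa", "n_seqs"]].query("n_seqs>1")["path_msa"].apply(
        lambda path: Path(path).stem
    )
    print(list(names))  # ['b', 'c']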
@@ -27,7 +27,7 @@ rule all:
 
 rule compute_blocks:
     input:
-        expand("{path_msas}/{{name_msa}}.fa", path_msas=PATH_MSAS)
+        expand("{path_msas}/{{name_msa}}.fasta", path_msas=PATH_MSAS)
     output:
         expand("{path_output}/max_blocks/{{name_msa}}.json", path_output=PATH_OUTPUT)
     log:
@@ -62,7 +62,7 @@ rule decompose_blocks:
 rule pangeblock:
     input:
         path_blocks=expand("{path_output}/block_decomposition/{{name_msa}}.json", path_output=PATH_OUTPUT),
-        path_msa=expand("{path_msas}/{{name_msa}}.fa", path_msas=PATH_MSAS)
+        path_msa=expand("{path_msas}/{{name_msa}}.fasta", path_msas=PATH_MSAS)
     output:
         path_gfa=expand("{path_output}/gfa/{{name_msa}}.gfa", path_output=PATH_OUTPUT),
         path_oc=expand("{path_output}/opt-coverage/{{name_msa}}.json", path_output=PATH_OUTPUT)
@@ -92,7 +92,7 @@ rule bandage_labels:
 rule coverage:
     input:
         path_blocks=expand("{path_output}/block_decomposition/{{name_msa}}.json", path_output=PATH_OUTPUT),
-        path_msa=expand("{path_msa}/{{name_msa}}.fa", path_msa=PATH_MSAS)
+        path_msa=expand("{path_msa}/{{name_msa}}.fasta", path_msa=PATH_MSAS)
     output:
         path_gray=expand("{path_output}/coverage/{{name_msa}}-gray.jpg", path_output=PATH_OUTPUT),
         path_color=expand("{path_output}/coverage/{{name_msa}}-color.jpg", path_output=PATH_OUTPUT)
6 changes: 3 additions & 3 deletions params.yaml
@@ -1,7 +1,7 @@
-PATH_MSAS: "msa-difficile"
-PATH_OUTPUT: "output-msa-difficile-weighted"
+PATH_MSAS: "/data/pangeblocks-experiments/data/msas-dna"
+PATH_OUTPUT: "/data/pangeblocks-experiments/output/gfa/pangeblocks-strings"
 OPTIMIZATION:
-  OBJECTIVE_FUNCTION: "weighted" # nodes, strings, weighted
+  OBJECTIVE_FUNCTION: "strings" # nodes, strings, weighted
   PENALIZATION: 3 # used only with "weighted"
   MIN_LEN: 2 # used only with "weighted"
 LOG_LEVEL: "INFO"
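For context, these keys reach the Snakefiles through Snakemake's config object (e.g. config["PATH_MSAS"] in eda.smk). A minimal sketch of inspecting the same file by hand, assuming PyYAML is available:

    import yaml

    # Snakemake normally loads this via its configfile mechanism; reading it
    # directly is just for illustration.
    with open("params.yaml") as fh:
        params = yaml.safe_load(fh)

    print(params["PATH_MSAS"])                           # input directory of MSAs
    print(params["OPTIMIZATION"]["OBJECTIVE_FUNCTION"])  # "strings" after this change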
75 changes: 43 additions & 32 deletions src/eda_msas.py
@@ -1,42 +1,53 @@
 import sys
 import time
 
 from pathlib import Path
 from Bio import AlignIO #load MSA
-from utils.monitor_values import MonitorValues
-from rich.progress import track
-
-def run(path_msas: str):
-
-    out_file = Path("out/stats_msas.tsv")
-    out_file.parent.mkdir(exist_ok=True, parents=True)
-    problematic_out_file = Path("out/problematic_files.tsv")
+from concurrent.futures import ThreadPoolExecutor
+from src.utils import MonitorValuesPlus
+from src.msa import AnalyzerMSA
+from tqdm import tqdm
+import argparse
 
-    # log values
-    mv_msas = MonitorValues(["path_msa","n_seqs","n_unique_seqs","n_cols"])
-    mv_msas_problems = MonitorValues(["path_msa"])
+def main(path_msas: str, path_output: str):
 
-    list_msa=list(Path(path_msas).rglob("*fa"))
-    for path_msa in track(list_msa, description="Working on MSAs"):
+    mv = MonitorValuesPlus(list_vars=["path_msa","n_seqs","n_unique_seqs","n_identical_cols","n_cols", "perc_identical_cols"],
+                           out_file=f"{path_output}/analysis-msa/stats_msas.tsv",
+                           overwrite=True)
+
+    mv_problems = MonitorValuesPlus(list_vars=["path_msa"],
+                                    out_file=f"{path_output}/analysis-msa/problematic_files.tsv",
+                                    overwrite=True)
+
+    analyzer = AnalyzerMSA()
+    def run(path_msa):
+        "Function to run with ThreadPoolExecutor"
         try:
             # load MSA, count seqs and columns
-            align=AlignIO.read(path_msa, "fasta")
-            n_cols = align.get_alignment_length()
-            n_seqs = len(align)
-            n_unique_seqs = len(set([str(record.seq) for record in align]))
+            n_cols, n_seqs, n_unique_seqs, n_identical_cols = analyzer(path_msa)
+            perc_identical_cols = round(100*n_identical_cols/n_cols, 2)
+            mv()
+        except:
+            mv_problems()
 
-            mv_msas()
-        except:
-            mv_msas_problems()
+    # list of (path to) msas
+    list_paths = list(Path(path_msas).rglob("*.[fa]*"))
 
+    with ThreadPoolExecutor(max_workers=16) as pool:
+        with tqdm(total=len(list_paths)) as progress:
+            progress.set_description("Analyzing MSA")
+            futures=[]
+            for path_msa in list_paths:
+
+                future = pool.submit(run, path_msa)
+                future.add_done_callback(lambda p: progress.update())
+                futures.append(future)
+
+            for future in futures:
+                future.result()
 
-    df = mv_msas.get_values_asdf()
-    df.sort_values(by=["n_cols","n_seqs"], inplace=True)
-    df.to_csv(out_file,sep="\t")
-    mv_msas_problems.get_values_asdf().to_csv(problematic_out_file, sep="\t")
 
 if __name__ == "__main__":
 
-    # TODO: use argparse
-    path_msas = sys.argv[-1]
-    run(path_msas)
+    # Command line options
+    parser = argparse.ArgumentParser()
+    parser.add_argument("path_msas", help="directory to MSAs")
+    parser.add_argument("path_output", help="output directory")
+    args = parser.parse_args()
+
+    main(args.path_msas, args.path_output)
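The rewritten script drives a tqdm bar from ThreadPoolExecutor completion callbacks instead of the old sequential rich.progress loop. A self-contained sketch of that pattern, with a placeholder work function standing in for the per-MSA analysis:

    import time
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    def work(item):
        time.sleep(0.01)  # placeholder for loading and analyzing one MSA
        return item

    items = list(range(100))
    with ThreadPoolExecutor(max_workers=16) as pool:
        with tqdm(total=len(items)) as progress:
            progress.set_description("Analyzing MSA")
            futures = []
            for item in items:
                future = pool.submit(work, item)
                # Each completed future advances the bar by one.
                future.add_done_callback(lambda p: progress.update())
                futures.append(future)
            # result() re-raises any exception thrown inside a worker; in the
            # PR's version, run() additionally catches and logs per-file errors.
            for future in futures:
                future.result()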
15 changes: 9 additions & 6 deletions src/ilp/optimization.py
@@ -142,6 +142,9 @@ def __call__(self, return_times: bool = False):
 
         c_variables = list(disjoint_vertical) + [item["idx"]
             for item in vertical_blocks.values()] + list(range(first_private_block, len(all_blocks)))
+        for idx, cvar in enumerate(c_variables):
+            logging.debug(f"c variables: idx {idx}, value {cvar}")
 
         # msa_positions is a list of all positions (r,c) that are required to be
         # covered. We exclude the positions covered by vertical blocks, since
         # they will be guaranteed to be covered, as an effect of the fact that
@@ -205,12 +208,12 @@
         tf = time.time()
         times["constraints1-2-3"] = round(tf - ti, 3)
 
-        # constraint 4: vertical blocks are part of the solution
-        for idx in vertical_blocks:
-            model.addConstr(C[idx] == 1)
-            logging.info(
-                f"constraint4: vertical block ({idx}) - {self.input_blocks[idx].str()}"
-            )
+        # # constraint 4: vertical blocks are part of the solution
+        # for idx in vertical_blocks:
+        #     model.addConstr(C[idx] == 1)
+        #     logging.info(
+        #         f"constraint4: vertical block ({idx}) - {self.input_blocks[idx].str()}"
+        #     )
 
         ti = time.time()
         # TODO: include input to decide which objective function to use
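For reference, the removed constraint 4 forced every vertical block into the solution by fixing its binary indicator to 1. A minimal gurobipy sketch of such a constraint, with hypothetical block indices and assuming a working Gurobi installation:

    import gurobipy as gp

    model = gp.Model("pangeblock-sketch")

    # Hypothetical stand-ins: 5 candidate blocks, of which 1 and 3 are vertical.
    C = model.addVars(5, vtype=gp.GRB.BINARY, name="C")
    vertical_blocks = [1, 3]

    # The now-removed constraint: force each vertical block into the solution.
    for idx in vertical_blocks:
        model.addConstr(C[idx] == 1, name=f"constraint4_{idx}")

    model.update()
    print(model.getConstrs())

With the constraint commented out, vertical blocks remain in c_variables as ordinary candidates, so the solver may cover those columns with other blocks when that scores better under the chosen objective.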