remove old constraint in optimization #6

Open · wants to merge 1 commit into master
2 changes: 1 addition & 1 deletion eda.smk
@@ -34,7 +34,7 @@ rule eda_msa:
             mv_problems()
 
     # list of (path to) msas
-    list_paths = list(Path(config["PATH_MSAS"]).rglob("*.fa"))
+    list_paths = list(Path(config["PATH_MSAS"]).rglob("*.[fa]*"))
 
     with ThreadPoolExecutor(max_workers=16) as pool:
         with tqdm(total=len(list_paths)) as progress:
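Note on the widened glob: "[fa]" is a one-character class, so "*.[fa]*" accepts any extension whose first letter is "f" or "a" (.fa, .fasta, .fna, .aln, ...), not only .fa. A quick check with fnmatch, which uses the same pattern syntax as pathlib globbing (the file names here are hypothetical):

    from fnmatch import fnmatch

    # The class [fa] matches a single 'f' or 'a'; the trailing * matches the
    # rest of the name, possibly empty.
    names = ["seqs.fa", "seqs.fasta", "genes.fna", "block.aln", "notes.txt"]
    print([n for n in names if fnmatch(n, "*.[fa]*")])
    # ['seqs.fa', 'seqs.fasta', 'genes.fna', 'block.aln']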
8 changes: 4 additions & 4 deletions pangeblock.smk
@@ -11,7 +11,7 @@
 STATS_MSAS = pd.read_csv(
     index_col=False, sep="\t"
 )
-NAMES = STATS_MSAS["path_msa"].apply(lambda path: Path(path).stem) # txt file with names of the MSAs
+NAMES = STATS_MSAS[["path_msa","n_seqs"]].query("n_seqs>1")["path_msa"].apply(lambda path: Path(path).stem)
 # ---
 
 rule all:
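The new NAMES expression also filters out single-sequence MSAs before deriving target names. A small self-contained sketch of the same pandas chain on toy data (paths and counts are made up):

    import pandas as pd
    from pathlib import Path

    # Toy stand-in for the stats table produced by the EDA step.
    stats = pd.DataFrame({
        "path_msa": ["msas/a.fasta", "msas/b.fasta", "msas/c.fasta"],
        "n_seqs": [1, 5, 12],
    })

    # query("n_seqs>1") drops MSAs with a single sequence; Path(...).stem
    # strips directory and extension, leaving the bare MSA name.
    names = stats[["path_msa", "n_seqs"]].query("n_seqs>1")["path_msa"].apply(
        lambda path: Path(path).stem
    )
    print(list(names))  # ['b', 'c']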
@@ -27,7 +27,7 @@ rule all:
 
 rule compute_blocks:
     input:
-        expand("{path_msas}/{{name_msa}}.fa", path_msas=PATH_MSAS)
+        expand("{path_msas}/{{name_msa}}.fasta", path_msas=PATH_MSAS)
     output:
         expand("{path_output}/max_blocks/{{name_msa}}.json", path_output=PATH_OUTPUT)
     log:
@@ -62,7 +62,7 @@ rule decompose_blocks:
 rule pangeblock:
     input:
         path_blocks=expand("{path_output}/block_decomposition/{{name_msa}}.json", path_output=PATH_OUTPUT),
-        path_msa=expand("{path_msas}/{{name_msa}}.fa", path_msas=PATH_MSAS)
+        path_msa=expand("{path_msas}/{{name_msa}}.fasta", path_msas=PATH_MSAS)
     output:
         path_gfa=expand("{path_output}/gfa/{{name_msa}}.gfa", path_output=PATH_OUTPUT),
         path_oc=expand("{path_output}/opt-coverage/{{name_msa}}.json", path_output=PATH_OUTPUT)
@@ -92,7 +92,7 @@ rule bandage_labels:
 rule coverage:
     input:
         path_blocks=expand("{path_output}/block_decomposition/{{name_msa}}.json", path_output=PATH_OUTPUT),
-        path_msa=expand("{path_msa}/{{name_msa}}.fa", path_msa=PATH_MSAS)
+        path_msa=expand("{path_msa}/{{name_msa}}.fasta", path_msa=PATH_MSAS)
     output:
         path_gray=expand("{path_output}/coverage/{{name_msa}}-gray.jpg", path_output=PATH_OUTPUT),
         path_color=expand("{path_output}/coverage/{{name_msa}}-color.jpg", path_output=PATH_OUTPUT)
6 changes: 3 additions & 3 deletions params.yaml
@@ -1,7 +1,7 @@
-PATH_MSAS: "msa-difficile"
-PATH_OUTPUT: "output-msa-difficile-weighted"
+PATH_MSAS: "/data/pangeblocks-experiments/data/msas-dna"
+PATH_OUTPUT: "/data/pangeblocks-experiments/output/gfa/pangeblocks-strings"
 OPTIMIZATION:
-  OBJECTIVE_FUNCTION: "weighted" # nodes, strings, weighted
+  OBJECTIVE_FUNCTION: "strings" # nodes, strings, weighted
   PENALIZATION: 3 # used only with "weighted"
   MIN_LEN: 2 # used only with "weighted"
 LOG_LEVEL: "INFO"
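For context, these keys reach the Snakefiles through Snakemake's config object (e.g. config["PATH_MSAS"] in eda.smk). A minimal sketch of inspecting the same file by hand, assuming PyYAML is available:

    import yaml

    # Snakemake normally loads this via its configfile mechanism; reading it
    # directly is just for illustration.
    with open("params.yaml") as fh:
        params = yaml.safe_load(fh)

    print(params["PATH_MSAS"])                           # input directory of MSAs
    print(params["OPTIMIZATION"]["OBJECTIVE_FUNCTION"])  # "strings" after this change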
75 changes: 43 additions & 32 deletions src/eda_msas.py
@@ -1,42 +1,53 @@
 import sys
 import time
 
 from pathlib import Path
 from Bio import AlignIO #load MSA
-from utils.monitor_values import MonitorValues
-from rich.progress import track
-
-def run(path_msas: str):
-
-    out_file = Path("out/stats_msas.tsv")
-    out_file.parent.mkdir(exist_ok=True, parents=True)
-    problematic_out_file = Path("out/problematic_files.tsv")
+from concurrent.futures import ThreadPoolExecutor
+from src.utils import MonitorValuesPlus
+from src.msa import AnalyzerMSA
+from tqdm import tqdm
+import argparse
 
-    # log values
-    mv_msas = MonitorValues(["path_msa","n_seqs","n_unique_seqs","n_cols"])
-    mv_msas_problems = MonitorValues(["path_msa"])
+def main(path_msas: str, path_output: str):
 
-    list_msa=list(Path(path_msas).rglob("*fa"))
-    for path_msa in track(list_msa, description="Working on MSAs"):
+    mv = MonitorValuesPlus(list_vars=["path_msa","n_seqs","n_unique_seqs","n_identical_cols","n_cols", "perc_identical_cols"],
+                           out_file=f"{path_output}/analysis-msa/stats_msas.tsv",
+                           overwrite=True)
+
+    mv_problems = MonitorValuesPlus(list_vars=["path_msa"],
+                                    out_file=f"{path_output}/analysis-msa/problematic_files.tsv",
+                                    overwrite=True)
+
+    analyzer = AnalyzerMSA()
+    def run(path_msa):
+        "Function to run with ThreadPoolExecutor"
         try:
             # load MSA, count seqs and columns
-            align=AlignIO.read(path_msa, "fasta")
-            n_cols = align.get_alignment_length()
-            n_seqs = len(align)
-            n_unique_seqs = len(set([str(record.seq) for record in align]))
+            n_cols, n_seqs, n_unique_seqs, n_identical_cols = analyzer(path_msa)
+            perc_identical_cols = round(100*n_identical_cols/n_cols, 2)
+            mv()
+        except:
+            mv_problems()
 
-            mv_msas()
-        except:
-            mv_msas_problems()
+    # list of (path to) msas
+    list_paths = list(Path(path_msas).rglob("*.[fa]*"))
 
+    with ThreadPoolExecutor(max_workers=16) as pool:
+        with tqdm(total=len(list_paths)) as progress:
+            progress.set_description("Analyzing MSA")
+            futures=[]
+            for path_msa in list_paths:
+
+                future = pool.submit(run, path_msa)
+                future.add_done_callback(lambda p: progress.update())
+                futures.append(future)
+
+            for future in futures:
+                future.result()
 
-    df = mv_msas.get_values_asdf()
-    df.sort_values(by=["n_cols","n_seqs"], inplace=True)
-    df.to_csv(out_file,sep="\t")
-    mv_msas_problems.get_values_asdf().to_csv(problematic_out_file, sep="\t")
 
 if __name__ == "__main__":
 
-    # TODO: use argparse
-    path_msas = sys.argv[-1]
-    run(path_msas)
+    # Command line options
+    parser = argparse.ArgumentParser()
+    parser.add_argument("path_msas", help="directory to MSAs")
+    parser.add_argument("path_output", help="output directory")
+    args = parser.parse_args()
+
+    main(args.path_msas, args.path_output)
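The rewritten script drives a tqdm bar from ThreadPoolExecutor completion callbacks instead of the old sequential rich.progress loop. A self-contained sketch of that pattern, with a placeholder work function standing in for the per-MSA analysis:

    import time
    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    def work(item):
        time.sleep(0.01)  # placeholder for loading and analyzing one MSA
        return item

    items = list(range(100))
    with ThreadPoolExecutor(max_workers=16) as pool:
        with tqdm(total=len(items)) as progress:
            progress.set_description("Analyzing MSA")
            futures = []
            for item in items:
                future = pool.submit(work, item)
                # Each completed future advances the bar by one.
                future.add_done_callback(lambda p: progress.update())
                futures.append(future)
            # result() re-raises any exception thrown inside a worker; in the
            # PR's version, run() additionally catches and logs per-file errors.
            for future in futures:
                future.result()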
15 changes: 9 additions & 6 deletions src/ilp/optimization.py
@@ -142,6 +142,9 @@ def __call__(self, return_times: bool = False):
 
         c_variables = list(disjoint_vertical) + [item["idx"]
             for item in vertical_blocks.values()] + list(range(first_private_block, len(all_blocks)))
+        for idx, cvar in enumerate(c_variables):
+            logging.debug(f"c variables: idx {idx}, value {cvar}")
 
         # msa_positions is a list of all positions (r,c) that are required to be
         # covered. We exclude the positions covered by vertical blocks, since
         # they will be guaranteed to be covered, as an effect of the fact that
@@ -205,12 +208,12 @@
         tf = time.time()
         times["constraints1-2-3"] = round(tf - ti, 3)
 
-        # constraint 4: vertical blocks are part of the solution
-        for idx in vertical_blocks:
-            model.addConstr(C[idx] == 1)
-            logging.info(
-                f"constraint4: vertical block ({idx}) - {self.input_blocks[idx].str()}"
-            )
+        # # constraint 4: vertical blocks are part of the solution
+        # for idx in vertical_blocks:
+        #     model.addConstr(C[idx] == 1)
+        #     logging.info(
+        #         f"constraint4: vertical block ({idx}) - {self.input_blocks[idx].str()}"
+        #     )
 
         ti = time.time()
         # TODO: include input to decide which objective function to use
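For reference, the removed constraint 4 forced every vertical block into the solution by fixing its binary indicator to 1. A minimal gurobipy sketch of such a constraint, with hypothetical block indices and assuming a working Gurobi installation:

    import gurobipy as gp

    model = gp.Model("pangeblock-sketch")

    # Hypothetical stand-ins: 5 candidate blocks, of which 1 and 3 are vertical.
    C = model.addVars(5, vtype=gp.GRB.BINARY, name="C")
    vertical_blocks = [1, 3]

    # The now-removed constraint: force each vertical block into the solution.
    for idx in vertical_blocks:
        model.addConstr(C[idx] == 1, name=f"constraint4_{idx}")

    model.update()
    print(model.getConstrs())

With the constraint commented out, vertical blocks remain in c_variables as ordinary candidates, so the solver may cover those columns with other blocks when that scores better under the chosen objective.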