seqverify

#!/usr/bin/env python
 
import argparse
import os
import sys
import configparser
from seqver_functions import *
from seqver_plots import *
from seqver_genomeupdate import *
from seqver_lofFinder import *
from seqver_gtf import *

# Command-line interface of the pipeline. Every option is optional; the
# descriptions below are also what `--help` prints.
parser = argparse.ArgumentParser()

# Main arguments
parser.add_argument('--output', type=str, default="output",
                    help='Name used to build the output folder and output file names')
parser.add_argument('--reads_1', type=str,
                    help='Name/path of the first .fq file of reads')
parser.add_argument('--reads_2', type=str,
                    help='Name/path of the second .fq file of reads')
parser.add_argument('--untargeted', type=str, nargs='+',
                    help='Name/path of transgene files to be appended to the genome as extra sequences')
parser.add_argument('--targeted', type=str,
                    help='Name/path of the command file instructing where to insert known-insertion transgenes')
parser.add_argument('--keepgoing', action='store_true',
                    help='Continue with the following stages after the stage selected by --start completes')
parser.add_argument('--keep_temp', action='store_true',
                    help='Keep the temporary folder instead of deleting it when the pipeline is done '
                         '(temporary SAM files can be very large)')

# Genome arguments (defaults point into ./seqverify_defaults of the current working directory)
parser.add_argument('--genome', type=str, default=f"{os.getcwd()}/seqverify_defaults/chm13v2.0.fa",
                    help='Name/path of the genome file (CHM13 recommended) used for everything but variant calling')
parser.add_argument('--gtf', type=str, default=f"{os.getcwd()}/seqverify_defaults/chm13v2.0_RefSeq_Liftoff_v5.1.gff3",
                    help='Name/path of the GTF/GFF3 annotation file to edit')

# KRAKEN2 arguments
parser.add_argument('--kraken', action='store_true',
                    help='Enable the KRAKEN2/BRACKEN analysis')
parser.add_argument('--database', type=str, default=f"{os.getcwd()}/seqverify_defaults/seqverify_database",
                    help='Path to the KRAKEN2 database')

# Insertion site detection arguments
parser.add_argument('--granularity', type=int, default=500,
                    help='How far apart two insertions can be to count as the same insertion site')
parser.add_argument('--threads', type=int, default=1,
                    help='How many threads seqverify can use')
parser.add_argument('--max_mem', type=str, default='16G',
                    help='Maximum memory allowed, ending in capital "G" or "M"')
parser.add_argument('--min_matches', type=int, default=1,
                    help='How many times an insertion needs to align to the reference genome to show up in the readout')
parser.add_argument('--start', type=str, default="all",
                    help='Which stage to run: all, beginning, align, markers, readout, cnv, plots, kraken, '
                         'variant or snp_filtering')
parser.add_argument('--mitochondrial', action='store_true',
                    help='Run insertion site detection on mitochondrial DNA')
parser.add_argument('--stringency', type=float, default=0.005,
                    help='Stringency for the confidence score calculations')
parser.add_argument('--spurious_filtering_threshold', type=float, default=0.00001,
                    help='Threshold passed to the insertion filtering and scoring step')

# CNV/plotting arguments
parser.add_argument('--manual_plots', action='store_true',
                    help='Use matplotlib plots instead of IGV plots')
parser.add_argument('--bin_size', type=int, default=100000,
                    help='Bin size used for the copy-number analysis and histograms (must be divisible by 100)')

# Variant calling arguments
parser.add_argument('--variant_calling', nargs=2, type=str,
                    help='Two space-separated paths: the genome to realign to for variant calling, '
                         'then the ClinVar VCF database')
parser.add_argument('--variant_intensity', type=str, default="MODERATE",
                    help='Minimum severity needed for a variant to show up in the readout: '
                         '"MODIFIER", "LOW", "MODERATE" or "HIGH"')
parser.add_argument('--variant_window_size', type=int, default=10000,
                    help='All variants within this distance of any command start are reported, '
                         'regardless of their quality or intensity')

# Other
parser.add_argument('--download_defaults', action='store_true',
                    help='Download the default genomes, annotation and KRAKEN2 database into '
                         './seqverify_defaults, then exit')
parser.add_argument('--similarity', nargs=2, type=str,
                    help='Run the SNP similarity checker on two inputs: the original cell line, '
                         'then the modified cell line')
parser.add_argument('--min_quality', type=int, default=3,
                    help='Minimum quality used by the similarity checker and the variant readout')
parser.add_argument('--config', type=str, default=None,
                    help='Path to a configuration file whose values replace the command-line options')

args = parser.parse_args()

# When --config is given, the values in that file replace the parsed
# command-line options listed below (sections MAIN, GENOME, KRAKEN2,
# INSERTION, CNV, VARIANT and OTHER are all read).
if args.config is not None:
    import ast

    config = configparser.ConfigParser(allow_no_value=True)
    # ConfigParser.read() silently skips unreadable files; fail early with a
    # clear message instead of a KeyError on the first section lookup.
    if not config.read(args.config):
        raise FileNotFoundError(f"Could not read config file: {args.config}")

    def _config_flag(section, key):
        """Return a config entry as a real boolean.

        A key without a value (or with an empty value) counts as False.
        Anything else must be one of the spellings accepted by
        ConfigParser.getboolean (1/yes/true/on, 0/no/false/off), otherwise
        ValueError is raised. bool() on the raw string is not used because
        bool("False") is True.
        """
        raw = config[section][key]
        if raw is None or not raw.strip():
            return False
        return config.getboolean(section, key)

    args.output = config["MAIN"]["output"]
    args.reads_1 = config["MAIN"]["reads_1"]
    args.reads_2 = config["MAIN"]["reads_2"]
    args.untargeted = config["MAIN"]["untargeted"].split(" ")
    args.targeted = config["MAIN"]["targeted"]
    args.keepgoing = _config_flag("MAIN", "keepgoing")
    args.keep_temp = _config_flag("MAIN", "keep_temp")
    args.genome = config["GENOME"]["genome"]
    args.gtf = config["GENOME"]["gtf"]
    args.kraken = _config_flag("KRAKEN2", "kraken")
    args.database = config["KRAKEN2"]["database"]
    args.granularity = int(config["INSERTION"]["granularity"])
    args.threads = int(config["INSERTION"]["threads"])
    args.max_mem = config["INSERTION"]["max_mem"]
    args.min_matches = int(config["INSERTION"]["min_matches"])
    args.start = config["INSERTION"]["start"]
    args.mitochondrial = _config_flag("INSERTION", "mitochondrial")
    args.stringency = float(config["INSERTION"]["stringency"])
    args.spurious_filtering_threshold = float(config["INSERTION"]["spurious_filtering_threshold"])
    args.manual_plots = _config_flag("CNV", "manual_plots")
    args.bin_size = int(config["CNV"]["bin_size"])
    # SECURITY: this value used to go through eval(), which executes arbitrary
    # code found in the config file. ast.literal_eval only accepts Python
    # literals (e.g. a list of two paths, or None).
    raw_variant_calling = config["VARIANT"]["variant_calling"]
    args.variant_calling = ast.literal_eval(raw_variant_calling) if raw_variant_calling else None
    args.variant_intensity = config["VARIANT"]["variant_intensity"]
    args.variant_window_size = int(config["VARIANT"]["variant_window_size"])
    args.min_quality = int(config["OTHER"]["min_quality"])

if args.download_defaults: #With --download_defaults the reference data is fetched and the program then exits
    #Note:  These FTP links are likely to change over time and break the command.
    #       Please open an issue on GitHub if they do! https://github.com/mpiersonsmela/SeqVerify

    def _curl(url):
        """Build the curl command that downloads `url` into seqverify_defaults."""
        return f'curl -OJX GET "{url}" --output-dir seqverify_defaults'

    #Shell commands, run strictly in this order; their exit codes are not checked
    download_steps = (
        'mkdir seqverify_defaults',

        #chm13v2.0: reference genome for everything except the variant calling
        _curl("https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz"),
        'gunzip seqverify_defaults/chm13v2.0.fa.gz',

        #chm13v2.0's gff3 file, used for gtf editing
        _curl("https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/annotation/chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz"),
        'gunzip seqverify_defaults/chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz',

        #GRCh38 version 105 primary assembly (no alt contigs): reference genome for the variant calling
        _curl("ftp://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"),
        'gunzip seqverify_defaults/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz',

        #KRAKEN-valid 8GB database, PlusPF_8GB
        'mkdir seqverify_defaults/seqverify_database',
        _curl("https://genome-idx.s3.amazonaws.com/kraken/k2_pluspf_08gb_20230605.tar.gz"),
        'tar -xzf seqverify_defaults/k2_pluspf_08gb_20230605.tar.gz -C seqverify_defaults/seqverify_database',

        #snpEff config file, for ease of access during variant calling
        _curl("https://raw.githubusercontent.com/pcingola/SnpEff/master/config/snpEff.config"),

        #Latest ClinVar database (plus its index) for variant calling purposes
        _curl("https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"),
        _curl("https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi"),
        'gunzip seqverify_defaults/clinvar.vcf.gz',
    )
    for step in download_steps:
        os.system(step)

    sys.exit(0)

output_name = args.output

#Folder and file names used throughout the pipeline, all derived from --output
_sv_prefix = f"seqverify_{output_name}"

#Main and temporary folders
folder = f"{output_name}_seqverify"
temp_folder = f"{folder}_temp"

#Collated genome / alignment files
marker_list_file = f"{_sv_prefix}_transgene_list.fa"
genome_with_markers = f"{_sv_prefix}_genome_with_markers.fa"
aligned_with_markers = f"{_sv_prefix}_markers.sam"
aligned_diff_chr = f"{_sv_prefix}_markers_diff_chr.sam"
aligned_diff_chr_bam = f"{_sv_prefix}_markers_diff_chr.bam"
header = f"{_sv_prefix}_markers_header.sam"
overall_coverage = f"{_sv_prefix}_markers_coverage.cov"

#CNVpytor files
pytor = f"{output_name}.pytor"
pytor_gc = f"{output_name}_gc.pytor"
pytor_conf = f"{output_name}_genome_conf.py"

#Unaligned reads (KRAKEN2 input)
unaligned = f"{_sv_prefix}_unaligned.sam"
unaligned_R1 = f"{_sv_prefix}_unaligned_R1.fastq"
unaligned_R2 = f"{_sv_prefix}_unaligned_R2.fastq"

#Variant calling files
variant_genome_aligned = f"{_sv_prefix}_variant.sam"
variant_genome_aligned_bam = f"{_sv_prefix}_variant.bam"
variant_vcf = f"{_sv_prefix}.vcf"
variant_vcf_ann = f"{_sv_prefix}.ann.vcf"
variant_lof = f"{_sv_prefix}_variants.tsv"

#Plot regions, similarity checker and GTF files
bed_file = f"{output_name}.bed"
vcf_similarity = f"{_sv_prefix}_bcfstats.txt"
vcf_isec = f"{_sv_prefix}_unique.vcf"
new_gtf_unsorted = f"{_sv_prefix}_unsorted.gtf"
new_gtf_sorted = f"{_sv_prefix}.gtf"

#Output subdirectories
folder_insertion = f"{folder}/insertion"
folder_cnv = f"{folder}/copy_number"
folder_kraken = f"{folder}/kraken"
folder_snp = f"{folder}/variant_calling"

#Similarity checker mode: uses --output for its folder/file names and takes two inputs,
#first the original cell line, then the modified cell line. --min_quality (default: 3)
#is forwarded as the minimum quality. The program exits once the comparison is done.
if args.similarity is not None:
    original_line, modified_line = args.similarity
    compare(original_line, modified_line, args.min_quality, temp_folder, folder, vcf_similarity, vcf_isec)
    sys.exit(0)

#pathFinder (imported helper) is applied to every input so that each argument can be
#given either as a path or as a plain name.
reads_1_path = pathFinder(args.reads_1)
reads_2_path = pathFinder(args.reads_2)
genome_source_path = pathFinder(args.genome)

gtf = None
granularity_threshold = args.granularity

#Optional inputs stay None when the corresponding argument was not supplied
marker_sources_path = [pathFinder(source) for source in args.untargeted] if args.untargeted is not None else None
exact_path = pathFinder(args.targeted) if args.targeted is not None else None
database = pathFinder(args.database) if args.database is not None else None
gtf_file = pathFinder(args.gtf) if args.gtf is not None else None

#Creates the main output folder and the temporary working folder
for required_folder in (folder, temp_folder):
    os.system(f'mkdir -p {required_folder}')

#With a --targeted command file, commandHandler is run on the reference genome;
#`commands` is left undefined otherwise.
if exact_path is not None:
    commands = commandHandler(
        genome_source_path,
        exact_path,
        temp_folder,
        genome_with_markers,
        return_commands_only=False,
    )

threads = args.threads
min_matches = args.min_matches
bin_size = args.bin_size

#Checks for a bin size that CNVPytor will recognize (the default, 100000, passes as well)
if bin_size % 100 != 0:
    raise ValueError("Custom bin size not divisible by 100")

#Converts the --max_mem argument into a number of bytes, used to derive the BWA indexing block size.
#Accepted forms: "<n>M" (n * 1,000,000 bytes), "<n>G" (n * 1,000,000,000 bytes) or a plain byte count.
#Anything else raises ValueError instead of leaving max_mem undefined.
_mem_spec = str(args.max_mem)
if _mem_spec.endswith("M"):
    max_mem = int(_mem_spec[:-1]) * 1000000
elif _mem_spec.endswith("G"):
    max_mem = int(_mem_spec[:-1]) * 1000000000
elif _mem_spec.isdigit():
    max_mem = int(_mem_spec)
else:
    raise ValueError(f'--max_mem must be a byte count or end in capital "G" or "M", got {_mem_spec!r}')

#Integer division: the value is passed to `bwa index -b`, which expects an integer block size
base_block = max_mem // 8
#Set Java max mem for Java subprocesses (snpEff, etc)
os.environ['_JAVA_OPTIONS'] = '-Xmx'+args.max_mem
os.system('echo $_JAVA_OPTIONS')

if args.start in ("beginning", "all"):
    #Stage "beginning": prepares the collated genome (reference plus transgenes) and indexes it with BWA
    os.system(f'mkdir -p {folder_insertion}')
    collated_genome = f'{temp_folder}/{genome_with_markers}'

    print("started gtf")
    #The annotation is only edited when a --targeted command file was supplied
    if exact_path is not None and args.gtf is not None:
        gtfEdit(gtf_file, commands, temp_folder, folder_insertion, new_gtf_unsorted, new_gtf_sorted)

    if marker_sources_path is not None:
        transgene_list = f'{temp_folder}/{marker_list_file}'

        if exact_path is None:
            #Copies the reference genome into a new FASTA file that the transgenes will also be added to
            #NOTE(review): with a --targeted file the collated genome is presumably written by commandHandler - confirm
            os.system(f'cp {genome_source_path} {collated_genome}')

        #Collates all markers/transgenes into a single FASTA file for ease of handling
        for marker_source in marker_sources_path:
            append_marker = f'echo "$(cat {marker_source})" >> {transgene_list}'
            print(f'COMMAND FOR MARKER SOURCES PATH: {append_marker}') #DEBUG
            os.system(append_marker)

        #Adds the transgenes to the new "collated" FASTA file
        append_list = f'echo "$(cat {transgene_list})" >> {collated_genome}'
        print(f'ADDING TO COLLATED FILE: {append_list}') #DEBUG
        os.system(append_list)

    #Builds the BWA indices for the augmented genome (needed for alignment) and keeps a copy in the output folder
    os.system(f'bwa index -b {base_block} {collated_genome}')
    os.system(f'cp {collated_genome} {folder_insertion}/{genome_with_markers}')

    if args.keepgoing:
        print("Finished beginning, continuing to align")
        args.start = "align"

if args.start in ("align", "all"):
    #Stage "align" (multithreaded): aligns both read files to the collated genome with BWA-MEM,
    #writing the alignment as a SAM file in the temp folder
    collated_genome_path = f'{temp_folder}/{genome_with_markers}'
    alignment_sam_path = f'{temp_folder}/{aligned_with_markers}'
    os.system(f'bwa mem -M -t {threads} {collated_genome_path} {reads_1_path} {reads_2_path} > {alignment_sam_path}')

    if args.keepgoing:
        print("Finished align, continuing to markers")
        args.start = "markers"
    
if args.start in ("markers", "all"):
    #Stage "markers": extracts transgene-aligned reads with a mate on a human chromosome and builds the sorted BAM
    os.system(f'mkdir -p {folder_insertion}')
    if marker_sources_path != []:
        #Reads the names of all transgenes/markers (FASTA header lines) and joins them into a regular expression
        with open(f'{temp_folder}/{marker_list_file}') as listing:
            marker_list = [line[1:].strip() for line in listing if line.startswith('>')]
        markers = "|".join(marker_list)

        #With --mitochondrial, chrM is treated as one more marker and is dropped from the accepted mate chromosomes
        if args.mitochondrial:
            markers += "|chrM"
            mate_class = "[XY]"
        else:
            mate_class = "[XYM]"

        #Keeps header lines plus reads that align to a marker and whose mate is on a chromosome; output is "diff_chr"
        alignment_sam = f"{temp_folder}/{aligned_with_markers}"
        os.system(f"samtools view -h -@ {threads} {alignment_sam} |gawk '{{if ((($3 ~ /^({markers})/)&&($7 ~ /chr([0-9]+|{mate_class})/))||($1 ~ /^@/)) print $0}};' > {temp_folder}/{aligned_diff_chr}")

        #Sorts the full alignment into a BAM file in the output folder (used by later stages) and indexes it
        sorted_bam = f"{folder_insertion}/{aligned_diff_chr_bam}"
        os.system(f"samtools sort -@ {threads} {alignment_sam} > {sorted_bam}")
        os.system(f"samtools index -@ {threads} {sorted_bam}")

        #Saves the BAM header to the temp folder
        os.system(f'samtools view -@ {threads} -H {sorted_bam} > {temp_folder}/{header}')

    if args.keepgoing:
        print("Finished markers, continuing to readout")
        args.start = "readout"
        
if args.start == "readout" or args.start == "all":
    #Stage "readout": turns the "diff_chr" SAM file into a human-readable insertion-site readout.
    #The marker names are re-read here so that the stage also works when started directly with --start readout.
    with open(f'{temp_folder}/{marker_list_file}') as f:
        marker_list = [i[1:].strip() for i in f if i.startswith('>')]
    #Functions imported from seqver_functions, refer to comments there for details on their functioning.
    data = group(f'{temp_folder}/{aligned_diff_chr}')
    insertions = compress(data,granularity_threshold)
    filteredInsertions = filterAndScore(temp_folder,folder_insertion,aligned_diff_chr_bam,insertions,args.spurious_filtering_threshold,args.stringency)
    readout(folder_insertion,filteredInsertions[1],filteredInsertions[0],marker_list,min_matches)

    if args.keepgoing:
        #The message names the stage that just finished (it previously said "markers")
        print("Finished readout, continuing to CNV")
        args.start = "cnv"

if args.start in ("cnv", "all"):
    #Stage "cnv": runs the CNVpytor steps on the sorted BAM file, ending with a manhattan plot of the whole genome
    #Refer to github.com/abyzovlab/CNVpytor/tree/master/cnvpytor for more information
    os.system(f'mkdir -p {folder_cnv}')

    if exact_path is None:
        #No --targeted file, so there were no changes to the default genome: CNVpytor runs without -conf
        cnvpytor_cmd = 'cnvpytor'
    else:
        #Otherwise a genome configuration is generated first and passed to every CNVpytor call
        genome_configurator(temp_folder, pytor_conf, pytor_gc, genome_with_markers, header)
        cnvpytor_cmd = f'cnvpytor -conf {temp_folder}/{pytor_conf}'

    pytor_root = f'{folder_cnv}/{pytor}'
    cnv_steps = (
        f'-rd {folder_insertion}/{aligned_diff_chr_bam}',
        f'-his {bin_size}',
        f'-partition {bin_size}',
        f'-call {bin_size} > {folder_cnv}/calls.{bin_size}.tsv',
        f'-plot manhattan {bin_size} -o {folder_cnv}/{output_name}.png',
    )
    for cnv_step in cnv_steps:
        os.system(f'{cnvpytor_cmd} -root {pytor_root} {cnv_step}')

    if args.keepgoing:
        print("Finished CNVpytor, continuing to plots")
        args.start = "plots"

#Stage "plots": builds a .bed file of regions of interest, then renders either IGV screenshots (default)
#or matplotlib histograms (--manual_plots).
if args.start == "plots" or args.start == "all":
    os.system(f'mkdir -p {folder_cnv}')
    try:
        region_bed(temp_folder,header,commands,marker_list,bed_file) #Creates a bed file to be used as coordinates for the IGV screenshot engine.
    except NameError: #regenerate commands if the beginning step was skipped
        #NOTE(review): `commands` is only assigned earlier in this script when a --targeted file was given, and
        #`marker_list` only by the markers/readout stages, so this fallback also runs in those cases; it is then
        #called with whatever exact_path holds (possibly None) - confirm commandHandler supports that.
        #A NameError raised inside region_bed itself would be caught here as well.
        commands = commandHandler(genome_source_path,exact_path,temp_folder,genome_with_markers, return_commands_only = True)
        #Re-reads the marker names from the FASTA header lines of the transgene list
        with open(f'{temp_folder}/{marker_list_file}') as f:
            marker_list = [i[1:].strip() for i in f if i.startswith('>')]
        markers = "|".join(marker_list)
        region_bed(temp_folder,header,commands,marker_list,bed_file)
    #Plotting Logic
    if not args.manual_plots:
        #If IGV plots are used (default option), creates plots of the transgenes through IGV: the .bed file here is used to direct IGV towards the relevant regions of the genome
        if args.gtf is not None:
            gtf = new_gtf_sorted #don't put {folder_insertion} at the front
        igvScreenshot_new(temp_folder,folder_insertion,f'{folder_insertion}/{aligned_diff_chr_bam}',f'{folder_insertion}/{genome_with_markers}',bed_file, gtf_file=gtf)
    else:
        #If IGV plots are not used, creates matplotlib plots of the transgene copy numbers: the .bed file here is used to direct samtools depth towards the relevant regions to cut down time.
        os.system(f'samtools depth -b {bed_file} {folder_insertion}/{aligned_diff_chr_bam} > {temp_folder}/{overall_coverage}')
        #Creates the histograms (refer to seqver_plots.py for more information)
        chrHistograms(f'{temp_folder}/{overall_coverage}',marker_list)
        #Moves the plots and the coverage maps to the main and temp folders, respectively.
        os.system(f'mv fig_* {folder_insertion}')
        os.system(f'mv *.cov {temp_folder}')

    #With --keepgoing, hand over to the next stage
    if args.keepgoing:
        print("Finished plots, continuing to Kraken (if enabled)")
        args.start = "kraken"

if args.start in ("kraken", "all"):
    #Stage "kraken": KRAKEN2/BRACKEN classification of the reads that did not align
    os.system(f'mkdir -p {folder_kraken}')

    if args.kraken:
        #Only runs when --kraken is set (default: not enabled)
        unaligned_sam = f'{temp_folder}/{unaligned}'
        unaligned_fwd = f'{temp_folder}/{unaligned_R1}'
        unaligned_rev = f'{temp_folder}/{unaligned_R2}'
        classified_prefix = f'{folder_kraken}/classified_seqs_{output_name}'

        #Pulls out all unaligned read pairs into a SAM file. -f 13 is decimal 13 (0xD):
        #paired (1) + unmapped (4) + mate unmapped (8)
        os.system(f'samtools view -f 13 -@ {threads} {folder_insertion}/{aligned_diff_chr_bam}> {unaligned_sam}')

        #Splits the unaligned reads into their forward and backward FASTQ files
        os.system(f'samtools fastq -@ {threads} {unaligned_sam} -1 {unaligned_fwd} -2 {unaligned_rev}')

        #Runs KRAKEN2, writing its report and classified reads to the kraken output folder
        os.system(f'kraken2 --threads {threads} --db {database} --report {classified_prefix}.kreport --paired --classified-out {classified_prefix}#.fq {unaligned_fwd} {unaligned_rev} > {folder_kraken}/classified_{output_name}.kraken')

        #Runs BRACKEN on the KRAKEN2 report, writing its own report next to it
        os.system(f'bracken -d {database} -i {classified_prefix}.kreport -o {classified_prefix}.bracken -r 150')

    if args.keepgoing:
        print("Finished kraken, continuing to variant calling (if enabled)")
        args.start = "variant"

if args.start in ("variant", "all"):
    #Stage "variant": re-aligns the reads to the variant-calling genome and calls variants with bcftools
    os.system(f'mkdir -p {folder_snp}')
    if args.variant_calling is not None and args.variant_calling != []:
        #--variant_calling carries two paths:
        # 1) the genome to be re-aligned to (GRCh38, given that CHM13 snpEff does not work)
        # 2) the ClinVar annotation DB (its .vcf.gz.tbi index must also be present)
        variant_genome_source_path = pathFinder(args.variant_calling[0]) #example: "seqverify_database/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
        clinvardb_source_path = pathFinder(args.variant_calling[1]) #example: "seqverify_database/clinvar.vcf.gz"

        #Builds the BWA index only when one of its five files is missing next to the genome
        bwa_index_suffixes = (".bwt", ".pac", ".sa", ".ann", ".amb")
        if all(os.path.isfile(variant_genome_source_path + suffix) for suffix in bwa_index_suffixes):
            print("Detected that BWA indexing was already performed. Skipping BWA indexing.")
        else:
            print("Existing BWA index not detected. Performing indexing.")
            os.system(f"bwa index {variant_genome_source_path}")

        #Same for the samtools FASTA index
        if os.path.isfile(variant_genome_source_path + ".fai"):
            print("Detected that samtools faidx was already performed. Skipping samtools faidx.")
        else:
            print("Existing FASTA index not detected. Performing indexing.")
            os.system(f"samtools faidx {variant_genome_source_path}")

        #Re-aligns the reads to the variant-calling genome; a failed alignment aborts the pipeline
        realigned_sam = f"{temp_folder}/{variant_genome_aligned}"
        realigned_bam = f"{temp_folder}/{variant_genome_aligned_bam}"
        if os.system(f"bwa mem -M -t {threads} {variant_genome_source_path} {reads_1_path} {reads_2_path} > {realigned_sam}") != 0:
            sys.exit("BWA failed")

        os.system(f"samtools sort -@ {threads} {realigned_sam} > {realigned_bam}")
        os.system(f"samtools index -@ {threads} {realigned_bam}")

        #Generates the VCF through bcftools mpileup, then calls variants with bcftools call
        #(-mv outputs only variant sites)
        #TODO: add multithreading (maybe split by chromosomes and then recombine)
        os.system(f"bcftools mpileup -Ou -f {variant_genome_source_path} {realigned_bam} | bcftools call -mv -Ov -o {temp_folder}/{variant_vcf}")

        if args.keepgoing:
            args.start = "snp_filtering"
        
if args.start == "snp_filtering" or args.start == "all":
    #Stage "snp_filtering": annotates the called variants and writes the filtered readout.
    #Skipped when --variant_calling was not given, mirroring the "variant" stage; without this guard a
    #default run (--start all) crashed on args.variant_calling[0] and never reached the cleanup below.
    if args.variant_calling is not None and args.variant_calling != []:
        #Makes sure the output folder exists when the pipeline is started directly at this stage
        os.system(f'mkdir -p {folder_snp}')

        variant_genome_source_path = pathFinder(args.variant_calling[0]) #example: "seqverify_database/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"
        clinvardb_source_path = pathFinder(args.variant_calling[1]) #example: "seqverify_database/clinvar.vcf.gz" Note, vcf.gz.tbi file must also be present

        #Runs snpEff, SnpSift to do a first pass of filtering the variants, saves everything in the main folder for post-pipeline analysis
        os.system(f"snpEff -dataDir seqverify_defaults -v GRCh38.105 {temp_folder}/{variant_vcf} > {temp_folder}/{variant_vcf_ann}")
        os.system(f"SnpSift annotate -v {clinvardb_source_path} {temp_folder}/{variant_vcf_ann} > {folder_snp}/{variant_vcf_ann}")

        #Further filter the reads, saves everything to a readout in the main folder.
        #The --targeted command file is only passed along when one was supplied.
        if exact_path is None:
            mutation_logger(folder_snp,variant_lof,variant_vcf_ann,args.min_quality,args.variant_intensity, args.variant_window_size)
        else:
            mutation_logger(folder_snp,variant_lof,variant_vcf_ann,args.min_quality,args.variant_intensity, args.variant_window_size, exact_path)

#Unless --keep_temp is set, deletes the temp folder (this is the default behaviour).
#This is recommended, especially since temp files contain SAM files that can get to 100+ GB depending on the size of the original inputs.
if not args.keep_temp:
    os.system(f'rm -r {temp_folder}')