no message

Imamachi-n · Feb 15, 2017 · 528493a · 528493a
1 parent bcd2dfc
commit 528493a
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 98 deletions.
diff --git a/RNA-seq_analysis/RNA-seq_unstr_single-end_1_mapping.sh b/RNA-seq_analysis/RNA-seq_unstr_single-end_1_mapping.sh
@@ -14,7 +14,7 @@ mkdir fastqc_${file}
 fastqc -o ./fastqc_${file} ./${file}.fastq -f fastq
 
 ## 2. Quality filtering (Option)
-fastq_quality_trimmer -Q33 -t 20 -l 10 -i ./${file}.fastq | fastq_quality_filter -Q33 -q 20 -p 80 -o ${file}_1_filtered.fastq
+# fastq_quality_trimmer -Q33 -t 20 -l 10 -i ./${file}.fastq | fastq_quality_filter -Q33 -q 20 -p 80 -o ${file}_1_filtered.fastq
 
 ## 3. rRNA removal (Option)
 # bowtie -p 8 --un ./${file}_2_norrna.fastq ${indexFile} ./${file}_1_filtered.fastq > rRNA_${file}.fastq 2>> ./log_${file}.txt
@@ -27,7 +27,7 @@ maxRAM=32000000000
 
 mkdir STAR_output_${file}
 STAR --runMode alignReads --runThreadN 8 --genomeDir ${indexFile} \
---readFilesIn ./${file}_1_filtered.fastq --outFilterType BySJout \
+--readFilesIn ./${file}.fastq --outFilterType BySJout \
 --outFileNamePrefix ./STAR_output_${file}/${file}_4_STAR_result_ \
 --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical \
 --outSAMattributes All --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM ${maxRAM} \

diff --git a/RNA-seq_analysis/prep_cuffdiff_settings.py b/RNA-seq_analysis/prep_cuffdiff_settings.py
@@ -3,78 +3,145 @@
 from __future__ import print_function
 import sys
 
-input_file = open(sys.argv[1], 'r')
-mode_infor = sys.argv[2] # SINGLE, PAIRED
-project_dict = sys.argv[3] # 20_PRJNA312917
-
-filename_srrid_dict = {}
-filename_label_dict = {}
-Project_id = ""
-
-output_file = open("RNA-seq_unstr_0_sra2fastq_for_run.sh", 'w')
-sra2fastq_count = 0
-core_line_1 = """#!/bin/bash
+## Shell Script Blocks ##############
+header_line_4G = """#!/bin/bash
 #$ -S /bin/bash
 #$ -cwd
 #$ -soft -l ljob,lmem
 #$ -l s_vmem=4G
 #$ -l mem_req=4G
 
 """
-print(core_line_1, end="\n", file=output_file)
 
-sra2fastq_job_list = []
-for line in input_file:
+header_line_32G = """#!/bin/bash
+#$ -S /bin/bash
+#$ -cwd
+#$ -soft -l ljob,lmem
+#$ -l s_vmem=32G
+#$ -l mem_req=32G
+
+"""
+
+cuffdiff_core_line = """## 5. Quantification
+cuffdiff -p 8 --multi-read-correct -o ./cuffdiff_out_${filename} ${gtfFile} \\"""
+
+cuffdiff_anno_line = """
+## Annotation
+gene_list="/home/akimitsu/database/gencode.v19.annotation_filtered_symbol_type_list.txt" #Required
+cuffnorm_data="gene_exp.diff"
+# mRNA
+gene_type="mRNA"
+result_file="cuffdiff_result_mRNA.txt"
+python ~/custom_command/cuffdiff_result.py ${gene_list} ./cuffdiff_out_${filename}/${cuffnorm_data} ${gene_type} ./cuffdiff_out_${filename}/${result_file}
+
+# lncRNA
+gene_type="lncRNA"
+result_file="cuffdiff_result_lncRNA.txt"
+python ~/custom_command/cuffdiff_result.py ${gene_list} ./cuffdiff_out_${filename}/${cuffnorm_data} ${gene_type} ./cuffdiff_out_${filename}/${result_file}
+"""
+
+edger_line = """
+# Add Annotation
+gene_list="/home/akimitsu/database/gencode.v19.annotation_filtered_symbol_type_list.txt"
+python2 ~/custom_command/annotate_gene_symbol_type.py  ${gene_list} edgeR_output_${filename}/edgeR_test_result.txt edgeR_output_${filename}/edgeR_test_result_anno_plus.txt
+python2 ~/custom_command/split_into_each_gene_type.py edgeR_output_${filename}/edgeR_test_result_anno_plus.txt
+"""
+
+slack_line ="""# Slack comment
+dirpath=`pwd`
+dirname=`basename ${dirpath}`
+python ~/custom_command/slack_bot.py ${dirname}_analysis_was_finished
+"""
+
+
+###############################
+
+## Input args
+sample_label_file = open(sys.argv[1], 'r')    # srafile_name_label.txt
+mode_infor = sys.argv[2]    # SINGLE, PAIRED
+project_dict = sys.argv[3]    # 20_PRJNA312917
+
+
+## Create "sra2fastq" run shell script
+output_file = open("RNA-seq_unstr_0_sra2fastq_for_run.sh", 'w')
+print(header_line_4G, end="\n", file=output_file)
+
+
+sra2fastq_job_list = []    # sra2fastq qsub job ID
+filename_srrid_dict = {}    # SRR5011314,SRR5011315,...
+filename_label_dict = {}    # C0,C1,C2,C3,T0,T1,T2,T3,...
+Project_id = ""
+sra2fastq_count = 0    # The number of SRA files
+
+
+for line in sample_label_file:
     line = line.rstrip()
     data = line.split("\t")
-    Project_id = data[0]
-    srr_id = data[1]
-    file_name = data[2]
 
-    # create each sra2fastq script
-    output_sra_file = open("RNA-seq_unstr_0_sra2fastq_No{0}.sh".format(sra2fastq_count), 'w')
+    # Get basic sample information
+    Project_id = data[0]    # PRJNA312917
+    srr_id = data[1]    # SRR5011314
+    file_name = data[2]    # C0,C1,C2 || T1 || C0,C1,T2
+
+    # Create each "sra2fastq" number shell script for qsub command
+    sra2fastq_each_filename = "RNA-seq_unstr_0_sra2fastq_No{0}.sh".format(sra2fastq_count)
+    output_sra2fastq_number_file = open(sra2fastq_each_filename, 'w')
+
+    # Prepare SRA file name
     sra_name = "{0}_{1}.sra".format(srr_id, file_name)
-    sra_line_2 = ""
+
+    # Add fastq-dump command line in "sra2fastq" number shell script for qsub command
+    sra2fastq_No_line_1 = ""
     if mode_infor == "SINGLE":
-        sra_line_2 = "fastq-dump {0}".format(sra_name)
+        sra2fastq_No_line_1 = "fastq-dump {0}".format(sra_name)
     elif mode_infor == "PAIRED":
-        sra_line_2 = "fastq-dump --split-files {0}".format(sra_name)
-    print(core_line_1, end="\n", file=output_sra_file)
-    print(sra_line_2, end="\n", file=output_sra_file)
+        sra2fastq_No_line_1 = "fastq-dump --split-files {0}".format(sra_name)
+    print(header_line_4G, end="\n", file=output_sra2fastq_number_file)
+    print(sra2fastq_No_line_1, end="\n", file=output_sra2fastq_number_file)
 
-    # Create qsub line for sra2fastq
+    # Add qsub command line in "sra2fastq" run shell script
     job_id = "sra2fastq_{0}_{1}".format(project_dict, sra2fastq_count)
     sra2fastq_job_list.append(job_id)
     qsub_line = "qsub -N {1} ./RNA-seq_unstr_0_sra2fastq_No{0}.sh".format(sra2fastq_count, job_id)
     print(qsub_line, end="\n", file=output_file)
 
     sra2fastq_count += 1
-    output_sra_file.close()
+    output_sra2fastq_number_file.close()
 
 
     # Skip non-labeled samples
     if data[3] == '':
         continue
-    sample_label_list = data[3].split(',')
+
+    # Prep sample labels
+    sample_label_list = data[3].split(',')    # C0,C1,C2 || T0
+
     # Prep srr_id & sample_label for each sample
     if not file_name in filename_srrid_dict:
+        # Store sample SRA ID
         filename_srrid_dict[file_name] = [srr_id]
+
+        # Store sample labels
         filename_label_dict[file_name] = []
         for sample_label in sample_label_list:
             filename_label_dict[file_name].append(sample_label)
     else:
+        # Store sample SRA ID
         filename_srrid_dict[file_name].append(srr_id)
+
+        # Store sample labels
         for sample_label in sample_label_list:
             filename_label_dict[file_name].append(sample_label)
 
-# sra2fstq run script
+## Add qsub command in "sra2fastq" run shell script
 x_analysis_name = "RNA-seq_unstr_0_X_analysis_for_run.sh"    # Merge, mapping, quant
 merge_fastq_line = "qsub -hold_jid {0} {1}".format(','.join(sra2fastq_job_list), x_analysis_name)
 print(merge_fastq_line, end="\n", file=output_file)
 
-# merge, mapping, quant run script
+# merge, mapping, quant run shell script
 merge_fastq_file = open(x_analysis_name, 'w')
-print(core_line_1, end="\n", file=merge_fastq_file)
+print(header_line_4G, end="\n", file=merge_fastq_file)
+
 x_analysis_line=""
 if mode_infor == "SINGLE":
     x_analysis_line="""
@@ -93,42 +160,52 @@
 merge_fastq_file.close()
 output_file.close()
 
-ref_infor = {}
-ref_infor_edger = {}
-count_list = []
 
+ref_infor = {}    # Cuffdiff filepath list
+ref_infor_edger = {}    # edgeR filepath list
+count_list = []    # The number of pairs (Control vs Treatment)
+
+# Mapping run shell script
 output_file = open("RNA-seq_unstr_1_mapping_for_run.sh", 'w')
 print("#!/bin/bash", file=output_file)
-job_count = 0
-job_id_list = []
+
+job_count = 0    # The number of Cuffdiff, edgeR jobs
+job_id_list = []    # Mapping job ID list
 
 for file_name in filename_srrid_dict.keys():
-    srr_list = filename_srrid_dict[file_name]    # SRR001, SRR002
+    srr_list = filename_srrid_dict[file_name]    # SRR5011314, SRR5011315
     label_list = filename_label_dict[file_name]    # C0, T0, C1, T1
 
-    # Prep filepass
+    # Prep filename
     srr_name = "{0}_{1}".format("-".join(srr_list), file_name)
 
     # Mapping shell scripts
     fastq_name = "{0}.fastq".format(srr_name)
     if mode_infor == "SINGLE":
-        print("bash ./RNA-seq_unstr_single-end_1_mapping.sh {0}".format(fastq_name), end="\n", file=output_file)
+        print("qsub -N job_{1}_{2} ./RNA-seq_unstr_single-end_1_mapping.sh {0}".format(fastq_name, Project_id, str(job_count)), end="\n", file=output_file)
+        # Prep job ID list
         job_id_list.append("job_{0}_{1}".format(Project_id, str(job_count)))
     elif mode_infor == "PAIRED":
-        print("bash ./RNA-seq_unstr_paired-end_1_mapping.sh {0}".format(fastq_name), end="\n", file=output_file)
+        print("qsub -N job_{1}_{2} ./RNA-seq_unstr_paired-end_1_mapping.sh {0}".format(fastq_name, Project_id, str(job_count)), end="\n", file=output_file)
+        # Prep job ID list
         job_id_list.append("job_{0}_{1}".format(Project_id, str(job_count)))
     else:
         print("Error: mode_infor should be SINGLE or PAIRED")
         sys.exit()
     job_count += 1
 
+    # Prep sample filepath (Cuffdiff, edgeR)
     filepass = "./STAR_output_{0}/{0}_4_STAR_result_Aligned.sortedByCoord.out.bam".format(srr_name)
     filepass_edger = "./featureCounts_result_{0}/featureCounts_result_{0}_for_R.txt".format(srr_name)
+
     for sample_label in label_list:
+        # Count the number of cuffdiff, edgeR jobs.
         sample_label_counter = sample_label.replace("C", "")
         sample_label_counter = sample_label_counter.replace("T", "")
         if not sample_label_counter in count_list:
             count_list.append(sample_label_counter)
+
+        # Prep sample filepath list (Cuffdiff, edgeR)
         if not sample_label in ref_infor:
             ref_infor[sample_label] = [filepass]
             ref_infor_edger[sample_label] = [filepass_edger]
@@ -138,108 +215,83 @@
 
 output_file.close()
 
-# Cuffdiff shell scripts
+# Cuffdiff/edgeR run shell scripts
 cuffdiff_run_file = open("RNA-seq_unstr_2_quantification_for_run.sh", 'w')
 print("#!/bin/bash", file=cuffdiff_run_file)
+
+# Prep job ID
 job_id = '"job_{0}_*"'.format(Project_id)
 
-counter = len(count_list)
-cuff_job_id_list = []
+counter = len(count_list)    # The number of pairs (Control vs Treatment)
+cuff_job_id_list = []    # Cuffdiff, edgeR job ID list
+
 for x in range(counter):
+    # Create each "Cuffdiff, edgeR" number shell script for qsub command
     output_file = open("RNA-seq_unstr_2_quantification_No{0}.sh".format(x), 'w')
+
+    # Add bash command in cuffdiff, edgeR run shell script.
     print("qsub -N cuffdiff_{2}_No{0} -hold_jid {1} RNA-seq_unstr_2_quantification_No{0}.sh".format(x, ",".join(job_id_list), Project_id), file=cuffdiff_run_file)
+
+    # Prep cuffdiff, edgeR job ID
     cuff_job_id_list.append("cuffdiff_{1}_No{0}".format(x, Project_id))
+
+    # Prep cuffdiff filepath lists (Control & Treatment)
     control = "C{0}".format(str(x))
     treated = "T{0}".format(str(x))
     control_filepass = ref_infor[control]
     treated_filepass = ref_infor[treated]
 
-    # edgeR
+    # Prep edgeR filepath lists (Control + Treatment)
     edger_filepass = []
     control_filepass_edger = ref_infor_edger[control]
     treated_filepass_edger = ref_infor_edger[treated]
     edger_filepass.extend(control_filepass_edger)
     edger_filepass.extend(treated_filepass_edger)
+
+    # Prepr edgeR label lists (Control + Treatment)
     edger_control = ["C"] * len(control_filepass_edger)
     edger_treatment = ["T"] * len(treated_filepass_edger)
     edger_label = []
     edger_label.extend(edger_control)
     edger_label.extend(edger_treatment)
 
-    test ="""#!/bin/bash
-#$ -S /bin/bash
-#$ -cwd
-#$ -soft -l ljob,lmem
-#$ -l s_vmem=32G
-#$ -l mem_req=32G
-
-gtfFile="/home/akimitsu/database/gencode.v19.annotation_filtered.gtf"
-"""
+    # qsub job ID
     filename = 'filename="No{0}"'.format(x)
-    test2= """## 5. Quantification
-cuffdiff -p 8 --multi-read-correct -o ./cuffdiff_out_${filename} ${gtfFile} \\"""
-
-    test3 ="""
-## Annotation
-gene_list="/home/akimitsu/database/gencode.v19.annotation_filtered_symbol_type_list.txt" #Required
-cuffnorm_data="gene_exp.diff"
-# mRNA
-gene_type="mRNA"
-result_file="cuffdiff_result_mRNA.txt"
-python ~/custom_command/cuffdiff_result.py ${gene_list} ./cuffdiff_out_${filename}/${cuffnorm_data} ${gene_type} ./cuffdiff_out_${filename}/${result_file}
 
-# lncRNA
-gene_type="lncRNA"
-result_file="cuffdiff_result_lncRNA.txt"
-python ~/custom_command/cuffdiff_result.py ${gene_list} ./cuffdiff_out_${filename}/${cuffnorm_data} ${gene_type} ./cuffdiff_out_${filename}/${result_file}
-"""
-
-    # Cuffdiff
-    print(test, file=output_file)
+    # Add Cuffdiff number shell script for qsub command
+    print(header_line_32G, file=output_file)
+    print('gtfFile="/home/akimitsu/database/gencode.v19.annotation_filtered.gtf"', file=output_file)
     print(filename, file=output_file)
     print("\n", file=output_file)
-    print(test2, file=output_file)
+    print(cuffdiff_core_line, file=output_file)
     print(",".join(control_filepass), " \\", sep=" ", end="\n", file=output_file)
     print(",".join(treated_filepass), sep=" ", end="\n", file=output_file)
-    print(test3, file=output_file)
+    print(cuffdiff_anno_line, file=output_file)
 
-    # edgeR
+    # Add edgeR number shell script for qsub command
     print("mkdir edgeR_output_${filename}", end="\n", file=output_file)
     print("Rscript ~/custom_command/edgeR_test.R", " \\", sep=" ", end="\n", file=output_file)
     print(",".join(edger_filepass), " \\", sep=" ", end="\n", file=output_file)
     print(",".join(edger_label), " \\", sep=" ", end="\n", file=output_file)
     print("edgeR_output_${filename}", " \\", sep=" ", end="\n", file=output_file)
-    if len(control_filepass) == 1 and len(treated_filepass) == 1:
+
+    # Check Duplicate
+    if len(control_filepass_edger) == 1 and len(treated_filepass_edger) == 1:
         print("No", end="\n", file=output_file)
     else:
         print("Yes", end="\n", file=output_file)
 
-    rdger_line ="""
-# Add Annotation
-gene_list="/home/akimitsu/database/gencode.v19.annotation_filtered_symbol_type_list.txt"
-python2 ~/custom_command/annotate_gene_symbol_type.py  ${gene_list} edgeR_output_${filename}/edgeR_test_result.txt edgeR_output_${filename}/edgeR_test_result_anno_plus.txt
-python2 ~/custom_command/split_into_each_gene_type.py edgeR_output_${filename}/edgeR_test_result_anno_plus.txt
-"""
-    print(rdger_line, end="\n", file=output_file)
+    print(edger_line, end="\n", file=output_file)
 
     output_file.close()
 
+## Add bash command in cuffdiff, edgeR run shell script.
 print("qsub -hold_jid {1} RNA-seq_unstr_3_finish.sh".format(x, ",".join(cuff_job_id_list)), file=cuffdiff_run_file)
 
 cuffdiff_run_file.close()
 
+# Slack finish shell script
 finish_run_file = open("RNA-seq_unstr_3_finish.sh", 'w')
-test4 ="""#!/bin/bash
-#$ -S /bin/bash
-#$ -cwd
-#$ -soft -l ljob,lmem
-#$ -l s_vmem=4G
-#$ -l mem_req=4G
-
-# Slack comment
-dirpath=`pwd`
-dirname=`basename ${dirpath}`
-python ~/custom_command/slack_bot.py ${dirname}_analysis_was_finished
-"""
-print(test4, file=finish_run_file)
+print(header_line_4G, file=finish_run_file)
+print(slack_line, file=finish_run_file)
 finish_run_file.close()