diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..623ad15
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,180 @@
+
+
+
+4.0.0
+customprodbj
+1.0.0
+
+
+
+
+
+ org.apache.commons
+ commons-lang3
+ 3.4
+
+
+
+
+ org.apache.commons
+ commons-math3
+ 3.6.1
+
+
+
+ org.apache.maven.plugins
+ maven-resources-plugin
+ 2.4.3
+
+
+
+
+ JFASTA
+ JFASTA
+ 2.2.0
+
+
+
+
+
+
+
+
+
+
+ src
+
+
+ src
+
+ **/*.java
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+ 2.5.3
+
+
+
+ main.java.DbCreator
+
+
+
+
+ jar-with-dependencies
+
+
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
+
+
+
+ maven-compiler-plugin
+ 2.3.2
+
+
+ 1.8
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+ 2.4
+
+
+
+ true
+ true
+ main.java.DbCreator
+ true
+ lib
+
+
+ ${project.build.directory}/${project.artifactId}-${project.version}
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ copy-dependencies
+ package
+
+ copy-dependencies
+
+
+ ${project.build.directory}/${project.artifactId}-${project.version}/lib
+
+ true
+ true
+ false
+
+
+
+
+
+
+
+
+
+
+
+
+ org.eclipse.m2e
+ lifecycle-mapping
+ 1.0.0
+
+
+
+
+
+
+ org.apache.maven.plugins
+
+
+ maven-dependency-plugin
+
+ [2.1,)
+
+ copy-dependencies
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+pg
+
+
diff --git a/src/main/java/DbCreator.java b/src/main/java/DbCreator.java
new file mode 100644
index 0000000..845c36d
--- /dev/null
+++ b/src/main/java/DbCreator.java
@@ -0,0 +1,1165 @@
+package main.java;
+
+import net.sf.jfasta.FASTAElement;
+import net.sf.jfasta.FASTAFileReader;
+import net.sf.jfasta.impl.FASTAElementIterator;
+import net.sf.jfasta.impl.FASTAFileReaderImpl;
+import org.apache.commons.cli.*;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Build a customized protein database
+ */
+public class DbCreator {
+
+ public static HashMap codons;
+
+
+ /**
+ * The prefix of variant protein ID, default is VAR_
+ */
+ public String prefix4variantID = "VAR";
+
+
+ public String output_var_table_suffix = "-varInfo.txt";
+
+ public String output_db_table_suffix = "-var.fasta";
+
+ public String output_stat_table_suffix = "-varStat.txt";
+
+ /**
+ * mRNA annotation data and mRNA sequences
+ */
+ private HashMap gMap = new HashMap<>();
+
+
+ public String outdir = "./";
+
+ /**
+ * Whether or not to add reference protein sequences separately
+ */
+ public String outputRefProteinDB = "";
+
+
+ /**
+ * Construct a new DbCreator object
+ * @param mRNA_db *_refGeneMrna.fa
+ * @param gene_anno_file *__refGene.txt
+ */
+ public DbCreator(String mRNA_db, String gene_anno_file){
+ try {
+ this.readAnnotationData(mRNA_db,gene_anno_file);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ codons = getCodon();
+ }
+
+
+ public static void main(String[] args) throws ParseException, IOException {
+ Options options = new Options();
+
+ // input file
+ // sample somatic germline rna
+ // s1 s.txt g.txt r.txt
+ // s2 s.txt g.txt r.txt
+ // ...
+
+ // output file
+ // one sample one column,
+ // s1 s2 ...
+ // 0,1,1 1,1,1
+
+ options.addOption("i", true,"Annovar annotation result file. Multiple files are separated by ','.");
+ options.addOption("f", true,"A file which includes multiple samples. This parameter is used to build a customized database for ");
+ options.addOption("r", true,"Gene annotation data");
+ options.addOption("d", true,"mRNA fasta database");
+ options.addOption("o", true,"Output folder");
+ options.addOption("p1", true,"The prefix of variant protein ID, default is VAR_");
+ options.addOption("p2", true,"The prefix of final output files, default is merge");
+ options.addOption("t",false,"Whether or not to add reference protein sequences to the output database file");
+ options.addOption("ref", true, "Output reference protein database file");
+ options.addOption("v", false, "Verbose");
+ options.addOption("h", false, "Help");
+
+
+ CommandLineParser parser = new PosixParser();
+ CommandLine cmd = parser.parse(options, args);
+ if (cmd.hasOption("h") || cmd.hasOption("help") || args.length == 0) {
+ HelpFormatter f = new HelpFormatter();
+ System.out.println("java -Xmx2G -jar customprodbj.jar");
+ // System.out.println(version);
+ f.printHelp("Options", options);
+ System.exit(0);
+ }
+
+
+ String geneAnno = cmd.getOptionValue("r");
+ String genefa = cmd.getOptionValue("d");
+
+ DbCreator createDB4Annovar = new DbCreator(genefa,geneAnno);
+
+ if(cmd.hasOption("o")){
+ createDB4Annovar.outdir = cmd.getOptionValue("o");
+ File OD = new File(createDB4Annovar.outdir);
+ if(!OD.isDirectory()){
+ OD.mkdirs();
+ }
+ }
+
+ boolean includeRefProteins = false;
+ if(cmd.hasOption("t")){
+ includeRefProteins = true;
+ }
+
+ if(cmd.hasOption("p1")){
+ createDB4Annovar.prefix4variantID = cmd.getOptionValue("p1");
+ }
+
+ boolean verbose = false;
+ if(cmd.hasOption("v")){
+ verbose = true;
+ }
+
+ if(cmd.hasOption("f")) {
+ // input is a file which includes multiple samples or multiple variant annotation files for the same sample
+ // This is used to build a customized protein database for a TMT/iTRAQ experiment which usually contains
+ // multiple samples. This can also be used to build a customized protein database for a sample which contains
+ // multiple variant results, for example, a variant calling result from WES data, a variant calling result
+ // from RNA-Seq data.
+ String input_list = cmd.getOptionValue("f");
+ File IFL = new File(input_list);
+ BufferedReader iReader = new BufferedReader(new FileReader(IFL));
+ String headLine = iReader.readLine().trim();
+ String h[] = headLine.split("\\s+");
+ HashMap headerMap = new HashMap<>();
+ ArrayList hList = new ArrayList<>();
+ for (int i = 0; i < h.length; i++) {
+ headerMap.put(h[i], i);
+ hList.add(h[i]);
+ }
+
+ // read sample list file
+ // please note that a file may not exist
+ String line;
+ HashMap>> resfileMap = new HashMap<>();
+ while ((line = iReader.readLine()) != null) {
+ String d[] = line.trim().split("\\s+");
+ String sample_name = d[0];
+ System.out.println("process file:"+sample_name);
+ HashMap> smap = new HashMap<>();
+ for (int i = 1; i < d.length; i++) {
+ System.out.println(" => "+h[i]+" => "+ d[i]);
+ String prefix = sample_name + "-" + h[i];
+ ArrayList rfile = createDB4Annovar.run(d[i], prefix, false,true, true);
+ // h[i] is sample name
+ smap.put(h[i], rfile);
+
+ }
+ resfileMap.put(sample_name, smap);
+ }
+
+ iReader.close();
+
+ // merge
+ System.out.println("Merge ...");
+ String oprefix = IFL.getName().replaceAll(".txt","");
+ createDB4Annovar.merge(resfileMap, hList, oprefix, includeRefProteins);
+ }else{
+
+ String annovarRes = cmd.getOptionValue("i");
+ // we don't need to merge
+ String oprefix;
+ if(cmd.hasOption("p2")){
+ oprefix = cmd.getOptionValue("p2");
+ }else{
+ oprefix = "merge";
+ }
+ createDB4Annovar.run(annovarRes,oprefix, includeRefProteins, verbose,true);
+ if(cmd.hasOption("ref")){
+ createDB4Annovar.outputRefProteinDB = cmd.getOptionValue("ref");
+ }else{
+ createDB4Annovar.outputRefProteinDB = "";
+ }
+
+ if(!createDB4Annovar.outputRefProteinDB.isEmpty()){
+ createDB4Annovar.addReferenceProteins(createDB4Annovar.gMap, createDB4Annovar.outputRefProteinDB);
+ }
+
+ }
+
+ }
+
+
+ public void merge(HashMap>> resfileMap, ArrayList h, String prefix, boolean addRef) throws IOException {
+
+
+ HashMap mergeVarMap = new HashMap<>();
+
+ // varID => sample+head
+ HashMap> varID2sample = new HashMap<>();
+
+ // Get all varIDs across all samples
+ ArrayList sampleNames = new ArrayList<>();
+ for(String sample_name : resfileMap.keySet()){
+ sampleNames.add(sample_name);
+ for(int i=1;i varMap = readData(annofile, varfa);
+
+ // varID => String varID = d[hMap.get("Chr")]+"_"+d[hMap.get("Start")]+"_"+d[hMap.get("End")]+"_"+d[hMap.get("Ref")]+"_"+d[hMap.get("Alt")]+"_"+d[hMap.get("mRNA")];
+ String sample_head = sample_name + " " + fileClass;
+ for (String varID : varMap.keySet()) {
+ if(varID2sample.containsKey(varID)){
+ varID2sample.get(varID).add(sample_head);
+ }else {
+ HashSet newMap = new HashSet<>();
+ newMap.add(sample_head);
+ varID2sample.put(varID,newMap);
+ }
+
+ mergeVarMap.put(varID, varMap.get(varID));
+
+ }
+ }
+ }
+
+
+ String out_annofile = this.outdir + "/"+ prefix + this.output_var_table_suffix;
+ String out_varfa = this.outdir + "/"+ prefix + this.output_db_table_suffix;
+
+ BufferedWriter annoWriter = new BufferedWriter(new FileWriter(new File(out_annofile)));
+ BufferedWriter pWriter = new BufferedWriter(new FileWriter(new File(out_varfa)));
+
+ ArrayList outSampleNames = new ArrayList<>();
+ for(int i=0;i out = new ArrayList<>();
+ for(String sample_name : sampleNames){
+ ArrayList ex = new ArrayList<>();
+ for(int i=1;i"+newVarID+" "+mergeVarMap.get(varID).protein_header+"\n"+mergeVarMap.get(varID).protein_seq+"\n");
+
+
+ k++;
+ }
+
+ if(addRef){
+ addReferenceProteins(gMap, pWriter);
+ }
+
+ annoWriter.close();
+ pWriter.close();
+
+ String stat_file = this.outdir + "/"+ prefix + this.output_stat_table_suffix;
+ printSummary(out_annofile,stat_file);
+
+
+ }
+
+
+ public HashMap readData(String annofile, String varfa) throws IOException {
+
+
+ HashMap proMap = new HashMap<>();
+ HashMap proInfoMap = new HashMap<>();
+
+ File dbFile = new File(varfa);
+ FASTAFileReader dbReader = new FASTAFileReaderImpl(dbFile);
+ FASTAElementIterator it = dbReader.getIterator();
+ // int num = 0;
+ while (it.hasNext()) {
+ FASTAElement el = it.next();
+ el.setLineLength(1);
+ String hLine = el.getHeader().trim();
+ String headLine[] = hLine.split("\\s+");
+ String pID = headLine[0];
+ proMap.put(pID,el.getSequence());
+ proInfoMap.put(pID,hLine.replaceAll("^[^ ]*",""));
+
+ }
+ dbReader.close();
+
+ HashMap gMap = new HashMap<>();
+
+ BufferedReader aReader = new BufferedReader(new FileReader(new File(annofile)));
+ String headLine = aReader.readLine().trim();
+ String h[] = headLine.split("\t");
+ HashMap hMap = new HashMap<>();
+ for(int i=0;i run(String variantAnno, String outfilePrefix, boolean addRef, boolean verbose, boolean addAnnoInfo) throws IOException {
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // output variant information
+ String variant_table_file = this.outdir + "/" + outfilePrefix + this.output_var_table_suffix;
+ String outdb = this.outdir + "/" + outfilePrefix + this.output_db_table_suffix;
+ //String variant_table_file = this.outdb;
+ //variant_table_file = variant_table_file.replaceAll(".fa$","_variant.txt");
+ //if(variant_table_file.equalsIgnoreCase(this.outdb)){
+ // variant_table_file = variant_table_file.replaceAll(".fasta$","_variant.txt");
+ //}
+
+ //if(variant_table_file.equalsIgnoreCase(this.outdb)){
+ // variant_table_file = variant_table_file+"_variant.txt";
+ //}
+
+ BufferedWriter vWriter = new BufferedWriter(new FileWriter(new File(variant_table_file)));
+ vWriter.write("Variant_ID\tChr\tStart\tEnd\tRef\tAlt\tVariant_Type\tVariant_Function\tGene\tmRNA\tProtein\tmRNA_Change\tProtein_Change\tAA_Ref\tAA_Pos\tAA_Var\n");
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // read variant annotation result by annovar
+ BufferedReader vReader = new BufferedReader(new FileReader(new File(variantAnno)));
+ // there is no header information
+ String line ;
+ Pattern p = Pattern.compile("^[\\w\\-\\.]+:([\\w\\.]+):wholegene");
+ Pattern p2 = Pattern.compile("^[\\w\\-\\.]+?:([\\w\\.]+?):exon\\d+:c.([\\w\\->]+)(:p.([\\w\\*]+))?");
+
+ Pattern pcc1 = Pattern.compile("^([ACGTacgt])(\\d+)([ACGTacgt])$");
+ Pattern pcc2 = Pattern.compile("^(\\d+)([ACGTacgt])>([ACGTacgt])$");
+ // block substitution
+ Pattern pcc3 = Pattern.compile("^(\\d+)_(\\d+)delins(\\w+)");
+ // block substitution for a single nucleotide such as c.3delinsGT
+ Pattern pcc4 = Pattern.compile("^(\\d+)delins(\\w+)");
+ // single base deletion
+ Pattern pcc5 = Pattern.compile("^(\\d+)del(\\w+)");
+ // multi-base deletion
+ Pattern pcc6 = Pattern.compile("^(\\d+)_(\\d+)del(\\w*)");
+ // if end is equal to start, this is an insertion
+ Pattern pcc7 = Pattern.compile("^(\\d+)_(\\d+)ins(\\w+)");
+ // insertion
+ Pattern pcc8 = Pattern.compile("^(\\d+)dup(\\w+)");
+ // non-frameshift substitution (example: c.1825_1826TT, now the ref is not included in the string)
+ Pattern pcc9 = Pattern.compile("^(\\d+)_(\\d+)(\\w+)");
+
+ //
+ HashMap> vMap = new HashMap<>();
+
+ BufferedWriter pWriter = new BufferedWriter(new FileWriter(new File(outdb)));
+
+ String headLine = vReader.readLine().trim();
+ String hv[] = headLine.split("\t");
+ HashMap vheadMap = new HashMap<>();
+ for(int i=0;i colNameMap = new HashMap<>();
+ if(vheadMap.containsKey("AAChange.refGene")){
+
+ colNameMap.put("Func","Func.refGene");
+ colNameMap.put("Gene","Gene.refGene");
+ colNameMap.put("ExonicFunc","ExonicFunc.refGene");
+ colNameMap.put("AAChange","AAChange.refGene");
+
+ }else if(vheadMap.containsKey("AAChange.ensGene")){
+
+ colNameMap.put("Func","Func.ensGene");
+ colNameMap.put("Gene","Gene.ensGene");
+ colNameMap.put("ExonicFunc","ExonicFunc.ensGene");
+ colNameMap.put("AAChange","AAChange.ensGene");
+
+ }else if(vheadMap.containsKey("AAChange.knownGene")){
+
+ colNameMap.put("Func","Func.knownGene");
+ colNameMap.put("Gene","Gene.knownGene");
+ colNameMap.put("ExonicFunc","ExonicFunc.knownGene");
+ colNameMap.put("AAChange","AAChange.knownGene");
+
+ }
+
+ // Func.refGene
+ HashSet funcMap = new HashSet<>();
+ funcMap.add("exonic");
+ funcMap.add("exonic;splicing");
+ funcMap.add("splicing;exonic");
+
+
+ int n_variant_id = 0;
+ while((line = vReader.readLine())!=null){
+ line = line.trim();
+
+ String d[] = line.split("\t");
+
+ String func_gene = d[vheadMap.get(colNameMap.get("Func"))];
+ // exonic;splicing
+ // exonic
+ // ncRNA_exonic
+ if(!funcMap.contains(func_gene)){
+ continue;
+ }
+
+ String AAChange_Gene = d[vheadMap.get(colNameMap.get("AAChange"))];
+ if(AAChange_Gene.equalsIgnoreCase("unknown")){
+ continue;
+ }else if(AAChange_Gene.equalsIgnoreCase(".")){
+ continue;
+ }
+
+ // AAChange.refGene
+ // $field[2] =~ m/^[\w\-\.]+:([\w\.]+):wholegene/ and next;
+ Matcher m = p.matcher(AAChange_Gene);
+ if(m.find()){
+ continue;
+ }
+
+ // synonymous SNV removed
+ String ExonicFunc_Gene = d[vheadMap.get(colNameMap.get("ExonicFunc"))];
+ if(ExonicFunc_Gene.equalsIgnoreCase("synonymous SNV")){
+ continue;
+ }
+
+ Matcher m2 = p2.matcher(AAChange_Gene);
+ if(!m2.find()){
+ System.err.println("Error: invalid record found in exonic_variant_function file (exonic format error): <"+line+">");
+ System.exit(1);
+ }
+
+ String chr = d[vheadMap.get("Chr")];
+
+ int genome_start = Integer.valueOf(d[vheadMap.get("Start")]);
+ int genome_end = Integer.valueOf(d[vheadMap.get("End")]);
+ String variant_type = ExonicFunc_Gene;
+
+ String genome_ref = d[vheadMap.get("Ref")];
+ String genome_var = d[vheadMap.get("Alt")];
+
+
+
+ // PERM1:NM_001291366:exon2:c.T1183C:p.L395L,PERM1:NM_001291367:exon3:c.T901C:p.L301L
+ String md[] = AAChange_Gene.split(",");
+ for(int i=0;i=5) {
+ pchange = dd[4].replaceAll("^p.","");
+ }
+
+ Matcher mc1 = pcc1.matcher(cchange);
+ Matcher mc2 = pcc2.matcher(cchange);
+ Matcher mc3 = pcc3.matcher(cchange);
+ Matcher mc4 = pcc4.matcher(cchange);
+ Matcher mc5 = pcc5.matcher(cchange);
+ Matcher mc6 = pcc6.matcher(cchange);
+ Matcher mc7 = pcc7.matcher(cchange);
+ Matcher mc8 = pcc8.matcher(cchange);
+ Matcher mc9 = pcc9.matcher(cchange);
+
+ int start = -100;
+ int end = -100;
+ String ref = "";
+ String obs = "";
+
+ if(mc1.find()){
+ start = Integer.valueOf(mc1.group(2));
+ end = start;
+ ref = mc1.group(1);
+ obs = mc1.group(3);
+ }else if(mc2.find()){
+ start = Integer.valueOf(mc2.group(1));
+ end = start;
+ ref = mc2.group(2);
+ obs = mc2.group(3);
+ }else if(mc3.find()){
+ start = Integer.valueOf(mc3.group(1));
+ end = Integer.valueOf(mc3.group(2));
+ obs = mc3.group(3);
+ }else if(mc4.find()){
+ start = Integer.valueOf(mc4.group(1));
+ end = start;
+ ref = "REF";
+ obs = mc4.group(2);
+ }else if(mc5.find()){
+ start = Integer.valueOf(mc5.group(1));
+ end = start;
+ ref = mc5.group(2);
+ obs = "";
+ }else if(mc6.find()){
+ start = Integer.valueOf(mc6.group(1));
+ end = Integer.valueOf(mc6.group(2));
+ ref = mc6.group(3);
+ obs = "";
+ }else if(mc7.find()){
+ start = Integer.valueOf(mc7.group(1));
+ end = start;
+ ref = "";
+ obs = mc7.group(3);
+ }else if(mc8.find()){
+ start = Integer.valueOf(mc8.group(1));
+ end = start;
+ ref = "";
+ obs = mc8.group(2);
+ }else if(mc9.find()){
+ start = Integer.valueOf(mc9.group(1));
+ end = Integer.valueOf(mc9.group(2));
+ ref = "REF";
+ obs = mc9.group(3);
+ }else{
+ System.err.println("Warning: invalid coding change format: <"+cchange+">"+ " within <"+ md[i]);
+ continue;
+ }
+
+ // variant
+
+ JVariant jVariant = new JVariant();
+ n_variant_id++;
+ jVariant.ID = String.valueOf(n_variant_id);// d[0];
+ jVariant.transcriptID = transcript;
+ jVariant.start = start;
+ jVariant.end = end;
+ jVariant.ref = ref;
+ jVariant.obs = obs;
+ jVariant.cchange = cchange;
+ jVariant.pchange = pchange;
+ jVariant.chr = chr;
+ jVariant.genome_start = genome_start;
+ jVariant.genome_end = genome_end;
+ jVariant.variant_type = variant_type;
+ jVariant.geneID = geneID;
+
+ jVariant.genome_ref = genome_ref;
+ jVariant.genome_var = genome_var;
+
+ if(vMap.containsKey(jVariant.ID)){
+ vMap.get(jVariant.ID).add(jVariant);
+ }else{
+ ArrayList js = new ArrayList<>();
+ js.add(jVariant);
+ vMap.put(jVariant.ID,js);
+ }
+
+
+
+ }
+ }
+
+
+ }
+
+ vReader.close();
+
+
+
+
+
+ // process variants
+ HashMap flagged_transcript = new HashMap<>();
+
+ System.out.println("Transcripts with variant: "+vMap.size());
+
+ int nvar = 0;
+
+ int cpu = Runtime.getRuntime().availableProcessors();
+
+ ExecutorService fixedThreadPool = Executors.newFixedThreadPool(cpu);
+
+ TranslateVariantWorker.gMap = gMap;
+
+
+ for(String vID: vMap.keySet()){
+ for(JVariant jv : vMap.get(vID)){
+ nvar++;
+ fixedThreadPool.execute(new TranslateVariantWorker(jv,nvar,verbose));
+ }
+ }
+
+ fixedThreadPool.shutdown();
+
+ try {
+ fixedThreadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.HOURS);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+
+ for(String vID: vMap.keySet()){
+ for(JVariant jv : vMap.get(vID)){
+ if(jv.valid) {
+ vWriter.write(jv.line + "\n");
+
+ // it's possible to have a variant in which the mutant protein sequence is zero.
+ // In this case, we cannot output this item because it will cause problem when reading the fasta
+ if(jv.protein_seq != null && jv.protein_seq.length()>=1) {
+ pWriter.write(">" + jv.ID + " " + jv.protein_header + "\n");
+ pWriter.write(jv.protein_seq + "\n");
+ }else{
+ System.err.println("Protein length is zero: "+jv.ID);
+ }
+ }
+ }
+ }
+
+
+
+
+ // Add the reference protein sequences into the output database
+ if(addRef){
+ addReferenceProteins(gMap,pWriter);
+ }
+
+ pWriter.close();
+ vWriter.close();
+
+ String stat_file = this.outdir + "/" + outfilePrefix + this.output_stat_table_suffix;
+ printSummary(variant_table_file,stat_file);
+
+ ArrayList resfiles = new ArrayList<>();
+ resfiles.add(variant_table_file);
+ resfiles.add(outdb);
+ return(resfiles);
+
+ }
+
+ public void readAnnotationData(String db, String geneAnno) throws IOException {
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // read mRNA DNA sequences: hg19_refGeneMrna.fa
+ File dbFile = new File(db);
+ FASTAFileReader dbReader = new FASTAFileReaderImpl(dbFile);
+ FASTAElementIterator it = dbReader.getIterator();
+ int num = 0;
+ //HashMap gMap = new HashMap<>();
+
+ // (leftmost exon at chr1:222001007)
+ Pattern gInforPattern = Pattern.compile("leftmost exon at (.+?:\\d+)");
+
+ while (it.hasNext()) {
+ FASTAElement el = it.next();
+ el.setLineLength(1);
+ String hLine = el.getHeader().trim();
+ // some transcripts (such as NM_001075) occur multiple times in a file, sometimes with bad ORF annotation
+ if(hLine.contains("does not have correct ORF annotation")){
+ continue;
+ }
+ String headLine[] = hLine.split("\\s+");
+ String pID = headLine[0];
+
+ Matcher gInforMatcher = gInforPattern.matcher(hLine);
+ String infor = "";
+ if(gInforMatcher.find()){
+ infor = gInforMatcher.group(1);
+ }
+
+ String pSeq = el.getSequence().toUpperCase();
+
+ if(gMap.containsKey(pID)){
+ if(gMap.get(pID).mRNA_seq.length() < pSeq.length()){
+ gMap.get(pID).mRNA_seq = pSeq;
+ gMap.get(pID).infor = infor;
+ }
+ }else{
+ JTranscript jTranscript = new JTranscript();
+ jTranscript.transcriptID = pID;
+ jTranscript.mRNA_seq = pSeq;
+ jTranscript.infor = infor;
+ gMap.put(pID,jTranscript);
+ }
+ num++;
+
+ }
+ System.out.println("Read mRNA sequences: "+num);
+ it.close();
+ dbReader.close();
+
+
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
+ // read gene information: hg19_refGene.txt
+ // no head
+ // 585 NR_024540 chr1 - 14361 29370 29370 29370 11 14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320, 14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370, 0 WASH7P unk unk -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+ BufferedReader gReader = new BufferedReader(new FileReader(new File(geneAnno)));
+
+ // refGene and ensGene has bin as the first column
+ Pattern pGene = Pattern.compile("^\\d+$");
+
+ // delete any transcripts that occurs multiple times in human genome
+ HashMap transcriptCopy = new HashMap<>();
+ String line;
+ while((line = gReader.readLine())!=null){
+ line = line.trim();
+ String d[] = line.split("\t");
+ if(d.length<11){
+ System.err.println("Error: invalid record found in gene file (>=11 fields expected): <"+line+">");
+ System.exit(1);
+ }
+
+
+
+ String name;
+ String chr;
+ String strand;
+ int txstart;
+ int txend;
+ int cdsstart;
+ int cdsend;
+ String exonstart;
+ String exonend;
+
+ //
+ Matcher pMatch = pGene.matcher(d[0]);
+ if(pMatch.find()) { // refGene and ensGene has bin as the first column
+ // remove the first element
+ name = d[1];
+ chr = d[2];
+ strand = d[3];
+ txstart = Integer.valueOf(d[4]);
+ txend = Integer.valueOf(d[5]);
+ cdsstart = Integer.valueOf(d[6]);
+ cdsend = Integer.valueOf(d[7]);
+ exonstart = d[9];
+ exonend = d[10];
+ }else{
+ name = d[0];
+ chr = d[1];
+ strand = d[2];
+ txstart = Integer.valueOf(d[3]);
+ txend = Integer.valueOf(d[4]);
+ cdsstart = Integer.valueOf(d[5]);
+ cdsend = Integer.valueOf(d[6]);
+ exonstart = d[8];
+ exonend = d[9];
+ }
+
+ if(transcriptCopy.containsKey(name)){
+ transcriptCopy.put(name,transcriptCopy.get(name)+1);
+ }else{
+ transcriptCopy.put(name,1);
+ }
+
+ if(!gMap.containsKey(name)){
+ System.out.println("Warning: remove ("+name+") "+line);
+ continue;
+ }
+
+ if(!gMap.get(name).infor.isEmpty()){
+ String infor = chr+":"+exonstart.split(",")[0];
+ if(!gMap.get(name).infor.equalsIgnoreCase(infor)){
+ System.out.println("Warning: remove (used "+name+" "+gMap.get(name).infor+") "+line);
+ continue;
+ }
+ }
+
+
+ // we must save the original cds position information. we can know which gene is coding gene based on
+ // these information.
+ gMap.get(name).cds_start = cdsstart;
+ gMap.get(name).cds_end = cdsend;
+
+
+ // next we need to make sure that there is no intron between transcription start and translation start (this is rare but it happens when cdsstart is not in the first exon)
+ exonstart = exonstart.replaceAll(",$","");
+ exonend = exonend.replaceAll(",$","");
+ String sexonstart[] = exonstart.split(",");
+ String sexonend[] = exonend.split(",");
+
+ // System.out.println("exonstart:"+exonstart+"===exonend:"+exonend);
+
+ int dexonstart[] = new int[sexonstart.length];
+ int dexonend[] = new int[sexonend.length];
+ for(int i=0;i= dexonstart[i] && cdsstart <= dexonend[i]){
+ mrnastart = cdsstart - txstart + 1 - intron;
+ }
+
+
+ if(cdsend >= dexonstart[i] && cdsend <= dexonend[i]){
+ mrnaend = cdsend - txstart + 1 - intron;
+ }
+
+ //System.out.println("i="+i+","+mrnastart+","+mrnaend);
+
+ }
+
+ }else if(strand.equalsIgnoreCase("-")){
+ int intron = 0;
+ for(int i=dexonstart.length-1;i>=0;i--){
+ if(i < (dexonstart.length-1)){
+ intron = intron + dexonstart[i+1] - dexonend[i] - 1;
+ }
+ if(cdsend >= dexonstart[i] && cdsend <= dexonend[i]){
+ mrnastart = txend - cdsend + 1 - intron;
+ }
+ if(cdsstart >= dexonstart[i] && cdsstart <= dexonend[i]){
+ mrnaend = txend - cdsstart + 1 - intron;
+ }
+ }
+ }
+
+ gMap.get(name).start = mrnastart;
+ gMap.get(name).end = mrnaend;
+ gMap.get(name).transcriptID = name;
+
+
+ // System.out.println(name+"\t"+mrnastart+"\t"+mrnaend);
+ // gMap.put(name,jTranscript);
+
+ }
+
+ gReader.close();
+
+ // delete any transcripts that occurs multiple times in human genome
+ int n_remove_transcript = 0;
+ for(String tname : transcriptCopy.keySet()){
+ if(transcriptCopy.get(tname) >=2){
+ gMap.remove(tname);
+ n_remove_transcript++;
+ }
+ }
+ System.out.println("Remove transcripts: "+n_remove_transcript);
+
+ }
+
+ public void addReferenceProteins(HashMap gMap, BufferedWriter pWriter){
+
+ Pattern pStop = Pattern.compile("\\*.+");
+ int nrefProteins = 0;
+
+ for(String transcript: gMap.keySet()){
+ if(gMap.containsKey(transcript)){
+ // String dna_seq = mrnaMap.get(transcript).substring(gMap.get(transcript).start-1, gMap.get(transcript).end);
+
+ // remove non-coding genes;
+ if(gMap.get(transcript).cds_start == gMap.get(transcript).cds_end){
+ continue;
+ }
+ // System.out.println(transcript+"\t"+gMap.get(transcript).start+"\t"+gMap.get(transcript).end);
+ String dna_seq = gMap.get(transcript).mRNA_seq.substring(gMap.get(transcript).start-1, gMap.get(transcript).end);
+ String protein = translateDNA(dna_seq);
+ Matcher pStopMatcher = pStop.matcher(protein);
+ if(pStopMatcher.find()){
+ System.out.println("Warning:"+transcript+" "+gMap.get(transcript).cds_start+" "+ gMap.get(transcript).cds_end+"\t"+protein);
+ }
+
+ protein = protein.replaceAll("\\*.*$","");
+ try {
+ pWriter.write(">"+transcript+"\n"+protein+"\n");
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ nrefProteins++;
+
+
+ }
+
+ }
+ System.out.println("Reference proteins: "+nrefProteins);
+
+ }
+
+ public void addReferenceProteins(HashMap gMap, String file) throws IOException {
+ BufferedWriter pWriter = new BufferedWriter(new FileWriter(new File(file)));
+ addReferenceProteins(gMap,pWriter);
+ pWriter.close();
+ }
+
+
+ public static String translateDNA(String dna){
+ String protein = "";
+ dna = dna.toUpperCase();
+ Pattern p3 = Pattern.compile("(...)");
+ Matcher matcher = p3.matcher(dna);
+ while(matcher.find()){
+ String t3 = matcher.group(1);
+ if(codons.containsKey(t3)){
+ protein = protein+codons.get(t3);
+ }else{
+ System.err.println("Error: invalid triplets found in DNA sequence to be translated: <"+t3+"> in <"+dna+">");
+ System.exit(1);
+ }
+ }
+
+ return(protein);
+ }
+
+ public HashMap getCodon(){
+
+ HashMap codonMap = new HashMap<>();
+ codonMap.put("TTT","F");
+ codonMap.put("TTC","F");
+ codonMap.put("TCT","S");
+ codonMap.put("TCC","S");
+ codonMap.put("TAT","Y");
+ codonMap.put("TAC","Y");
+ codonMap.put("TGT","C");
+ codonMap.put("TGC","C");
+ codonMap.put("TTA","L");
+ codonMap.put("TCA","S");
+ codonMap.put("TAA","*");
+ codonMap.put("TGA","*");
+
+ codonMap.put("TTG","L");
+ codonMap.put("TCG","S");
+ codonMap.put("TAG","*");
+ codonMap.put("TGG","W");
+ codonMap.put("CTT","L");
+ codonMap.put("CTC","L");
+ codonMap.put("CCT","P");
+ codonMap.put("CCC","P");
+ codonMap.put("CAT","H");
+ codonMap.put("CAC","H");
+ codonMap.put("CGT","R");
+ codonMap.put("CGC","R");
+ codonMap.put("CTA","L");
+ codonMap.put("CTG","L");
+ codonMap.put("CCA","P");
+ codonMap.put("CCG","P");
+ codonMap.put("CAA","Q");
+ codonMap.put("CAG","Q");
+ codonMap.put("CGA","R");
+ codonMap.put("CGG","R");
+ codonMap.put("ATT","I");
+ codonMap.put("ATC","I");
+ codonMap.put("ACT","T");
+ codonMap.put("ACC","T");
+ codonMap.put("AAT","N");
+ codonMap.put("AAC","N");
+ codonMap.put("AGT","S");
+ codonMap.put("AGC","S");
+ codonMap.put("ATA","I");
+ codonMap.put("ACA","T");
+ codonMap.put("AAA","K");
+ codonMap.put("AGA","R");
+ codonMap.put("ATG","M");
+ codonMap.put("ACG","T");
+ codonMap.put("AAG","K");
+ codonMap.put("AGG","R");
+ codonMap.put("GTT","V");
+ codonMap.put("GTC","V");
+ codonMap.put("GCT","A");
+ codonMap.put("GCC","A");
+ codonMap.put("GAT","D");
+ codonMap.put("GAC","D");
+ codonMap.put("GGT","G");
+ codonMap.put("GGC","G");
+ codonMap.put("GTA","V");
+ codonMap.put("GTG","V");
+ codonMap.put("GCA","A");
+ codonMap.put("GCG","A");
+ codonMap.put("GAA","E");
+ codonMap.put("GAG","E");
+ codonMap.put("GGA","G");
+ codonMap.put("GGG","G");
+ return(codonMap);
+ }
+
+
+ /**
+ *
+ * @param anno_table_file The input is *-varInfo.txt
+ */
+ public void printSummary(String anno_table_file, String stat_file) throws IOException {
+
+
+ //
+ HashMap> typeMap = new HashMap<>();
+
+ HashMap typeMap_mRNA = new HashMap<>();
+
+ HashMap> funMap = new HashMap<>();
+
+ HashMap funMap_mRNA = new HashMap<>();
+
+ BufferedReader aReader = new BufferedReader(new FileReader(new File(anno_table_file)));
+
+ String headLine = aReader.readLine().trim();
+ String h[] = headLine.split("\t");
+ HashMap hMap = new HashMap<>();
+ for(int i=0;i allTypes = new HashSet<>();
+ HashSet allFunctions = new HashSet<>();
+
+ HashSet all_variant_no_mRNA = new HashSet<>();
+
+ int total_variants = 0;
+
+ while((line = aReader.readLine())!=null){
+ String d[] = line.trim().split("\t");
+ String variant_type = d[hMap.get("Variant_Type")];
+ String variant_fun = d[hMap.get("Variant_Function")];
+
+ allTypes.add(variant_type);
+ allFunctions.add(variant_fun);
+
+ // don't consider mRNA
+ String varID = d[hMap.get("Chr")]+"_"+d[hMap.get("Start")]+"_"+d[hMap.get("End")]+"_"+d[hMap.get("Ref")]+"_"+d[hMap.get("Alt")];
+
+ all_variant_no_mRNA.add(varID);
+ // variant type
+ if(typeMap.containsKey(variant_type)){
+ typeMap.get(variant_type).add(varID);
+ }else{
+ HashSet vlist = new HashSet<>();
+ vlist.add(varID);
+ typeMap.put(variant_type,vlist);
+ }
+
+ // function type
+ if(funMap.containsKey(variant_fun)){
+ funMap.get(variant_fun).add(varID);
+ }else{
+ HashSet flist = new HashSet<>();
+ flist.add(varID);
+ funMap.put(variant_fun,flist);
+ }
+
+ // consider mRNA
+ // varID = d[hMap.get("Chr")]+"_"+d[hMap.get("Start")]+"_"+d[hMap.get("End")]+"_"+d[hMap.get("Ref")]+"_"+d[hMap.get("Alt")]+"_"+d[hMap.get("mRNA")];
+ if(typeMap_mRNA.containsKey(variant_type)){
+ typeMap_mRNA.put(variant_type,typeMap_mRNA.get(variant_type)+1);
+ }else{
+ typeMap_mRNA.put(variant_type,1);
+ }
+ if(funMap_mRNA.containsKey(variant_fun)){
+ funMap_mRNA.put(variant_fun,funMap_mRNA.get(variant_fun)+1);
+ }else{
+ funMap_mRNA.put(variant_fun,1);
+ }
+
+ total_variants++;
+
+
+ }
+
+ aReader.close();
+
+ BufferedWriter pWriter = new BufferedWriter(new FileWriter(new File(stat_file)));
+
+ pWriter.write("#Variant_Type\n");
+ for(String type: allTypes){
+ double ratio1 = 1.0*typeMap.get(type).size()/all_variant_no_mRNA.size();
+ double ratio2 = 1.0*typeMap_mRNA.get(type)/total_variants;
+ pWriter.write(type+"\t"+typeMap.get(type).size()+"\t"+ratio1+"\t"+typeMap_mRNA.get(type)+"\t"+ratio2+"\n");
+ }
+
+ pWriter.write("#Variant_Function\n");
+ for(String fun: allFunctions){
+ double ratio1 = 1.0*funMap.get(fun).size()/all_variant_no_mRNA.size();
+ double ratio2 = 1.0*funMap_mRNA.get(fun)/total_variants;
+ pWriter.write(fun+"\t"+funMap.get(fun).size()+"\t"+ratio1+"\t"+funMap_mRNA.get(fun)+"\t"+ratio2+"\n");
+ }
+
+
+ pWriter.close();
+
+
+
+ }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+}
diff --git a/src/main/java/JTranscript.java b/src/main/java/JTranscript.java
new file mode 100644
index 0000000..c8a5d64
--- /dev/null
+++ b/src/main/java/JTranscript.java
@@ -0,0 +1,29 @@
+package main.java;
+
+public class JTranscript {
+
+ public int start;
+ public int end;
+ public String transcriptID;
+
+ /**
+ * can be used to distinguish whether it's coding gene or not.
+ */
+ public int cds_start = 0;
+ public int cds_end = 0;
+
+ /**
+ * DNA sequence from hg19_refGeneMrna.fa
+ */
+ public String mRNA_seq = "";
+
+ /**
+ *
+ * This variable is used to save the information for confirming the transcript in hg19_refGeneMrna.fa and
+ * hg19_refGene.txt. Sometimes, there are multiple rows from the same transcript. In this situation, we need to
+ * select the true transcript based on the following information from hg19_refGeneMrna.fa
+ * Comment: this sequence (leftmost exon at chr1:222001007) is generated ...
+ * The format is chr1:222001007
+ */
+ public String infor = "";
+}
diff --git a/src/main/java/JVariant.java b/src/main/java/JVariant.java
new file mode 100644
index 0000000..f6222f3
--- /dev/null
+++ b/src/main/java/JVariant.java
@@ -0,0 +1,50 @@
+package main.java;
+
+import java.util.ArrayList;
+
+/**
+ * sava a variant record
+ */
+public class JVariant {
+
+ /**
+ * Variant ID
+ */
+ public String ID;
+
+ public int start;
+ public int end;
+ public String ref;
+ public String obs;
+ public String transcriptID;
+ public String geneID;
+ public String cchange;
+ public String pchange;
+ public String proteinID;
+ public String chr;
+
+ public int genome_start;
+ public int genome_end;
+ public String variant_type;
+
+ public String variant_function;
+
+ public String genome_ref = "";
+ public String genome_var = "";
+
+ public String protein_seq;
+
+ /**
+ * The information will be exported in fasta header position
+ */
+ public String protein_header = "";
+ /**
+ * This is used to save the information of output string.
+ */
+ public String line = "";
+ public static String lineHead = "";
+ public ArrayList samples = new ArrayList<>();
+
+ public boolean valid = false;
+
+}
diff --git a/src/main/java/TranslateVariantWorker.java b/src/main/java/TranslateVariantWorker.java
new file mode 100644
index 0000000..a0bf171
--- /dev/null
+++ b/src/main/java/TranslateVariantWorker.java
@@ -0,0 +1,465 @@
+package main.java;
+
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static main.java.DbCreator.translateDNA;
+
+
+public class TranslateVariantWorker implements Runnable{
+
+
+ private JVariant jv = new JVariant();
+ private boolean verbose = false;
+
+ public static HashMap gMap = new HashMap<>();
+
+ public static String prefix4variantID = "VAR";
+
+ private int nvar = 0;
+
+ public TranslateVariantWorker(JVariant jVariant, int nv, boolean vb) {
+ this.jv = jVariant;
+ this.nvar = nv;
+ this.verbose = vb;
+ }
+
+
+ @Override
+ public void run() {
+ String ID = jv.ID;
+ String transcript = jv.transcriptID;
+ int start = jv.start;
+ int end = jv.end;
+ String ref = jv.ref;
+ String obs = jv.obs;
+ String cchange = jv.cchange;
+ String pchange = jv.pchange;
+ String chr = jv.chr;
+
+ if(verbose){
+ System.out.println("Process: "+jv.ID+" with "+jv.transcriptID+", "+jv.start+", "+jv.end+", "+jv.ref+", "+jv.obs+", "+jv.cchange+", "+jv.pchange);
+ }
+
+ if(!gMap.containsKey(jv.transcriptID)){
+ System.out.println("WARNING: cannot find mRNA sequence for "+jv.transcriptID+" in the fasta file!");
+ jv.valid = false;
+ return;
+ }
+ if(!gMap.containsKey(jv.transcriptID)){
+ System.out.println("WARNING: cannot find annotation for "+jv.transcriptID+" in the gene file or cannot infer the transcription start site");
+ jv.valid = false;
+ return;
+ }
+
+ if(jv.end > gMap.get(jv.transcriptID).mRNA_seq.length()){
+ System.out.println("ERROR: transcript end ("+gMap.get(jv.transcriptID).end+") for "+jv.transcriptID+" is longer than transcript length "+gMap.get(jv.transcriptID).mRNA_seq.length()+", skipping this transcript");
+ jv.valid = false;
+ return;
+ }
+
+ // not used
+ if(false){
+ String utr5 = gMap.get(transcript).mRNA_seq.substring(0,gMap.get(transcript).start-1);
+ String utr3 = gMap.get(transcript).mRNA_seq.substring(gMap.get(transcript).end);
+ String mrna1;
+ String mrna2;
+ mrna1 = gMap.get(transcript).mRNA_seq;
+
+ String dna_seq = gMap.get(jv.transcriptID).mRNA_seq.substring(gMap.get(jv.transcriptID).start-1, gMap.get(jv.transcriptID).end - gMap.get(jv.transcriptID).start + 1);
+ String dna[] = dna_seq.split("");
+ ArrayList dnaList = new ArrayList<>();
+ for(int i=0;i dna.length){
+ System.out.println("ERROR in "+jv.ID+": end position of variant ("+jv.end+") in "+jv.transcriptID+" is longer than coding portion length "+dna.length+", skipping this transcript");
+ jv.valid = false;
+ return;
+ }
+
+ protein1 = translateDNA(dna_seq);
+ Pattern pPro = Pattern.compile("\\*\\w");
+ Matcher matcher = pPro.matcher(protein1);
+ if(!matcher.find()){
+ warning = "(WARNING: Potential FASTA sequence error!!!)";
+ }
+
+ // we need to confirm here
+ if(jv.start == jv.end && jv.ref.isEmpty() && !jv.obs.isEmpty()){
+ // this is an insertion
+ dnaList.add(jv.start,jv.obs);
+ }else{
+ for(int i=jv.start;i<(jv.end-jv.start+1);i++){
+ dnaList.remove(i);
+ }
+ dnaList.add(jv.start,jv.obs);
+ }
+
+ String dna_seq_variant = StringUtils.join(dnaList,"");
+
+ mrna2 = utr5 + dna_seq_variant + utr3;
+
+
+ }else{
+
+ String dna_seq = gMap.get(transcript).mRNA_seq.substring(gMap.get(transcript).start-1, gMap.get(transcript).end);
+
+
+ String dna_seq_utr = gMap.get(transcript).mRNA_seq.substring(gMap.get(transcript).start-1);
+ String dna[] = dna_seq_utr.split("");
+ ArrayList dnaList = new ArrayList<>();
+ for(int i=0;i dna.length){
+ System.out.println("WARNING in "+ID+": end position of variant ("+end+") in "+transcript+" is longer than coding portion length "+dna.length+", possibly due to a deletion on a stop codon");
+ }
+
+ if(verbose){
+ System.out.println(gMap.get(transcript).mRNA_seq+"\t"+(gMap.get(transcript).start-1)+","+(gMap.get(transcript).end - gMap.get(transcript).start + 1));
+ System.out.println("NOTICE: wild-type DNA sequence is "+dna_seq);
+ System.out.println("Include UTR: "+dna_seq_utr);
+ }
+
+ protein1 = translateDNA(dna_seq);
+
+ // System.out.println(protein1);
+
+ Pattern pStop = Pattern.compile("\\*.+");
+ Matcher pStopMatcher = pStop.matcher(protein1);
+
+ if(pStopMatcher.find()){
+
+ warning = "(WARNING: Potential FASTA sequence error!!!)";
+ jv.valid = false;
+ return;
+ }
+
+ if(start == end && ref.isEmpty() && !obs.isEmpty()){
+ // this is an insertion (ref is treated as 0 when we read the EVF file previously)
+ dnaList.add(start,obs);
+ start = start + 1;
+ end = end + 1;
+ if(obs.length() % 3 == 0){
+ inframe = inframe + 1;
+ }
+
+ }else { // this is a substitution
+ if(obs.isEmpty()){ // deletion
+ // AAA -> ""
+ // fromIndex, inclusive, and toIndex, exclusive.
+ dnaList.subList(start-1,end).clear();
+ if((end - start + 1) % 3 == 0){
+ inframe = inframe + 1;
+ }
+
+ }else {
+ // absolutely important here to split obs into individual elements in an array!!!
+ dnaList.subList(start-1,end).clear();
+ // Inserts the specified element at the specified position in this
+ // list. Shifts the element currently at that position (if any) and
+ // any subsequent elements to the right (adds one to their indices).
+ // index index at which the specified element is to be inserted
+ dnaList.add(start-1,obs);
+
+
+ if((end - start + 1 - obs.length() % 3 == 0) ){
+ inframe = inframe + 1;
+ }
+
+
+ }
+ }
+
+ int aastart = (int) ((start-1)/3) + 1 ;
+ int aaend1 = (int)((end-1)/3)+1;
+ int aaend2 = (int)((start+obs.length()-1-1)/3) + 1;
+ String aachange = ""; // assign default value, since synonymous variants do not have aachange
+ String dna_seq_variant = StringUtils.join(dnaList,"");
+
+ // protein2 should be truncated so that anything after stop codon be deleted
+ protein2 = translateDNA(dna_seq_variant);
+
+ protein2 = protein2.replaceAll("\\*.+","*");
+
+ if(verbose){
+
+ }
+
+ String function;
+
+ String protein2_tmp;
+ if(aaend2 <= protein2.length()) {
+ protein2_tmp = protein2.substring(aastart - 1, aaend2);
+ }else{
+ protein2_tmp = protein2.substring(aastart - 1);
+ }
+
+ Pattern pStopOnly = Pattern.compile("\\*");
+ Matcher pStopOnlyMatcher = pStopOnly.matcher(protein2_tmp);
+ if(pStopOnlyMatcher.find()){
+ function = "immediate-stopgain";
+ }else if(end >= (gMap.get(transcript).end-2)){
+ function = "immediate-stoploss";
+ }else {
+ if(protein1.equalsIgnoreCase(protein2)){
+ function = "silent";
+ if(verbose){
+ System.out.println("NOTICE: silent variant!");
+ jv.valid = false;
+ return;
+ }
+ }else{
+ function = "protein-altering"; // change nonsynonymous to protein-altering to reduce confusion among users
+ }
+ }
+
+
+ // The goal is to find the exact change of all amino acids (not just the first one), until a stop codon is found
+ // pos1: first position with difference aa1:peptide with difference
+
+ int pos1 = 0;
+ int pos2 = 0;
+ String aa1 = "";
+ String aa2 = "";
+ int len_max = protein1.length();
+ if(len_max < protein2.length()){
+ len_max = protein2.length();
+ }
+
+ String pro1List[] = protein1.split("");
+ String pro2List[] = protein2.split("");
+
+ for(int i=0;i=1){
+
+ if(i < protein1.length()){
+ aa1 = aa1 + pro1List[i];
+ }
+ if(i < protein2.length()){
+ aa2 = aa2 + pro2List[i];
+ }
+ }else{
+ if(i > protein1.length()){
+ continue;
+ }
+
+ if(i > protein2.length()){
+ continue;
+ }
+
+ if(i == protein1.length()){
+ pos1 = i+1;
+ aa1 = "";
+ aa2 = pro2List[i];
+ }else if(i == protein2.length()){
+ pos1 = i+1;
+ aa1 = pro1List[i];
+ aa2 = "";
+ }else if(!pro1List[i].equalsIgnoreCase(pro2List[i])){
+ pos1 = i+1;
+ aa1 = pro1List[i];
+ aa2 = pro2List[i];
+ }
+ }
+
+ }
+
+ // maximum of pos2 is the end of the wildtype protein
+ if(pos2==0){
+ pos2 = protein1.length();
+ }
+
+
+ String aa1d[] = aa1.split("");
+ String aa2d[] = aa2.split("");
+ ArrayList aa1List = new ArrayList<>();
+ ArrayList aa2List = new ArrayList<>();
+ for(int i=0;i=1 && aa2List.size()>=1) {
+ String curAA1 = StringUtils.join(aa1List, "");
+ String lastAA1 = aa1List.get(aa1List.size() - 1);
+ String lastAA2 = aa2List.get(aa2List.size() - 1);
+
+ // System.out.println("aa1list size:" + aa1List.size() + ", aa2list size:" + aa2List.size() + "\t" + lastAA1 + "\t" + lastAA2);
+
+ if (pos1 >= 1 && !curAA1.equalsIgnoreCase("*") && lastAA1.equalsIgnoreCase(lastAA2)) {
+ aa1List.remove(aa1List.size() - 1);
+ aa2List.remove(aa2List.size() - 1);
+ pos2 = pos2 - 1;
+ } else {
+ break;
+ }
+ }else{
+ break;
+ }
+ }
+
+ aa1 = StringUtils.join(aa1List,"");
+ aa2 = StringUtils.join(aa2List,"");
+
+ // protein position
+ String position = "-";
+
+ if(pos1 == 0 ){
+ aachange = " (no amino acid change)";
+
+ Pattern fsPattern = Pattern.compile("([\\w\\*]+?)(\\d+)fs");
+ Matcher fsMatcher = fsPattern.matcher(pchange);
+ if(pchange.isEmpty()){
+ // it is actually not that simple to handle this situation of silent change. We have to calculate it from c change instead
+ Pattern numPattern = Pattern.compile("(\\d+)");
+ Matcher numMatcher = numPattern.matcher(cchange);
+ if(numMatcher.find()){
+ int pchange_pos = (int)((Integer.valueOf(numMatcher.group(1)) -1)/3);
+ if(pchange_pos >= protein1.length()){ // something is wrong here, the wildtype sequence is not complete possibly
+
+ jv.valid = false;
+ return;
+ }
+ String pchange_aa = protein1.substring(pchange_pos-1,pchange_pos);
+ pchange = pchange_aa+pchange_pos+pchange_aa;
+ position = String.valueOf(pchange_pos);
+ }
+ }else if(fsMatcher.find()){
+ pchange = fsMatcher.group(1)+fsMatcher.group(2)+fsMatcher.group(1);
+ position = fsMatcher.group(2);
+ }
+
+ function = "silent";
+
+ }else if(pos1 == pos2){ // a simple non-synonymous change
+ // a simple non-synonymous change, or an inframe insertion if length of aa1!=aa2
+
+ aachange = " (position "+pos1+" changed from "+aa1+" to "+aa2+")";
+ pchange = aa1+pos1+aa2;
+ position = String.valueOf(pos1);
+
+ if(aa1.equalsIgnoreCase("*")){
+ function = "immediate-stoploss";
+ }
+ if(aa2.equalsIgnoreCase("*")){
+ function = "immediate-stopgain";
+ }
+
+ if(aa2.isEmpty()){
+ pchange = pchange + "del";
+ }else if(aa1.length() != aa2.length()){
+ pchange = aa1 + pos1 + "delins" + aa2;
+ }
+
+ }else if(pos1 > pos2){ // when there is an insertion, pos2 will be 1 less than pos1 (example: 1 1647893 1647893 - TTTCTT)
+ aachange = " (position "+pos2+"-"+pos1+" has insertion "+aa2+")";
+ position = pos2+"-"+pos1;
+
+ if(pos1<=protein1.length()){
+ pchange = protein1.substring(pos2-1,pos2)+pos2+"_"+protein1.substring(pos1-1,pos1)+pos1+"ins"+aa2;
+ }else{
+ pchange = protein1.substring(pos2-1,pos2)+pos2+"_"+pos1+"ins"+aa2;
+ }
+
+
+ }else{
+
+ position = pos1+"-"+pos2;
+
+ if(inframe>0){ // inframe substitution
+ aachange = " (position "+pos1+"-"+pos2+" changed from "+aa1+" to "+aa2+")";
+ pchange = aa1.substring(0,1)+pos1+"_"+aa1.substring(aa1.length()-1)+pos2+"delins"+aa2;
+ if(aa2.isEmpty()){
+ // when aa2 is empty, it should be just deletion, not delins
+ pchange.replaceAll("ins$","");
+ }
+ }else {
+ aachange = " (position "+pos1+"-"+pos2+" changed from "+aa1+" to "+aa2+")";
+ if(aa2.length()==0) {
+ pchange = aa1.substring(0, 1) + pos1 + "fs*" + aa2.length();
+ }else{
+ pchange = aa1.substring(0, 1) + pos1 + aa2.substring(0, 1) + "fs*" + aa2.length();
+ }
+ }
+ }
+
+ protein2 = protein2.replaceAll("\\*$","");
+
+
+ if(function.equalsIgnoreCase("silent")){
+ System.out.println("Warning: silent => "+ transcript+"\tc."+cchange+"\tp."+pchange+"\t"+aachange);
+ jv.valid = false;
+ return;
+ }
+
+ jv.variant_function = function;
+
+ // nvar++;
+ // ID:
+ String new_variant_protein_ID = prefix4variantID+"|"+transcript+"|"+nvar;
+
+
+ if(aa2.isEmpty()){
+ if(jv.variant_function.equalsIgnoreCase("immediate-stopgain")){
+ aa2 = "*";
+ }else{
+ aa2 = "-";
+ }
+ }
+
+ StringBuilder vBuilder = new StringBuilder();
+ vBuilder.append(new_variant_protein_ID).append("\t")
+ .append(chr).append("\t")
+ .append(jv.genome_start).append("\t")
+ .append(jv.genome_end).append("\t")
+ .append(jv.genome_ref).append("\t")
+ .append(jv.genome_var).append("\t")
+ .append(jv.variant_type).append("\t")
+ .append(jv.variant_function).append("\t")
+ .append(jv.geneID).append("\t")
+ .append(jv.transcriptID).append("\t")
+ .append(jv.transcriptID).append("\t") // protein id
+ .append(jv.cchange).append("\t")
+ .append(jv.pchange).append("\t")
+ .append(aa1).append("\t")
+ .append(position).append("\t")
+ .append(aa2);
+ jv.valid = true;
+ jv.ID = new_variant_protein_ID;
+ jv.line = vBuilder.toString();
+ jv.protein_seq = protein2;
+ jv.protein_header = transcript+" c."+cchange+" p."+pchange+" "+function+" "+aachange;
+ // vWriter.write(vBuilder.toString());
+
+ }
+
+ }
+}