precimed · ofrei · Apr 20, 2020 · Apr 20, 2020 · Apr 20, 2020 · May 5, 2020
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,8 @@
 *.pyc
 *.log
 *.json
+*.sh.o*
+*.sh.e*
 *.png
 /.metadata/
 src/build/*

diff --git a/README.md b/README.md
diff --git a/annotations/readme_annot.txt b/annotations/readme_annot.txt
@@ -0,0 +1,63 @@
+1.1. Download KnownGene table
+wget -O /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.txt.gz ftp://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz
+
+
+
+1.2. Download MiRNA target regions and regulatory features from Ensembl BioMart.
+MiRNA:
+wget -O /mnt/seagate10/projects/annotations/data/test/mirna_targets.txt 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_mirna_target_feature" interface = "default" ><Attribute name = "chromosome_name" /><Attribute name = "chromosome_start" /><Attribute name = "chromosome_end" /><Attribute name = "accession" /></Dataset></Query>'
+
+Regulatory features:
+wget -O /mnt/seagate10/projects/annotations/data/test/regulatory_features.txt 'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_regulatory_feature" interface = "default" ><Attribute name = "chromosome_name" /><Attribute name = "chromosome_start" /><Attribute name = "chromosome_end" /><Attribute name = "regulatory_stable_id" /><Attribute name = "feature_type_name" /></Dataset></Query>'
+
+http://grch37.ensembl.org/biomart/martview/891d411f1bfeb1cd8d2a0f46fe62cff2?VIRTUALSCHEMANAME=default&ATTRIBUTES=hsapiens_mirna_target_feature.default.mirna_target_feature.chromosome_name|hsapiens_mirna_target_feature.default.mirna_target_feature.chromosome_start|hsapiens_mirna_target_feature.default.mirna_target_feature.chromosome_end|hsapiens_mirna_target_feature.default.mirna_target_feature.accession&FILTERS=&VISIBLEPANEL=attributepanel
+
+
+
+2. Process UCSC annotations with knownGene2annot.py.
+python knownGene2annot.py /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.txt.gz /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.txt.gz
+
+
+
+3.1. Convert gene, mirna and regulatury feature files into bed format (mark features accordingly in the 4-th column of bed file).
+zcat /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.txt.gz | awk -F$'\t' 'BEGIN{OFS="\t"} {if($2 ~ "chr[0-9]+") print($2,$6,$7,$5)}' | gzip -c > /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.bed.gz
+
+awk -F$'\t' 'BEGIN{OFS="\t"} {print("chr"$1,$2-1,$3,"mirna"}' /mnt/seagate10/projects/annotations/data/test/mirna_targets.txt | gzip -c > /mnt/seagate10/projects/annotations/data/test/mirna_targets.bed.gz
+awk -F$'\t' 'BEGIN{OFS="\t"} {print("chr"$1,$2-1,$3,"tfbs"}' /mnt/seagate10/projects/annotations/data/test/regulatory_features.txt | gzip -c > /mnt/seagate10/projects/annotations/data/test/regulatory_features.bed.gz
+
+
+
+3.2. Merge and sort all bed files for known genes, mirna and tfbs.
+zcat /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.bed.gz /mnt/seagate10/projects/annotations/data/test/mirna_targets.bed.gz /mnt/seagate10/projects/annotations/data/test/regulatory_features.bed.gz | sort -k1,1 -k2,2n | gzip -c > /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.complete.sorted.bed.gz
+
+
+
+4. Create sorted template bed file.
+cat /mnt/seagate10/genotypes/1000genomes503eur9m/chr[0-9]*.bim | awk 'BEGIN{OFS="\t";} {print("chr"$1, $4-1, $4-1+length($5), $2)}' | sort -k1,1 -k2,2n | gzip -c > /mnt/seagate10/projects/annotations/data/test/template.1000genomes503eur9m.sorted.bed.gz
+
+
+
+5. Intersect annotations bed file with template bed file using bedtools.
+/mnt/seagate10/projects/github/bedtools2/bedtools intersect -a /mnt/seagate10/projects/annotations/data/test/template.1000genomes503eur9m.sorted.bed.gz -b /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.complete.sorted.bed.gz -wa -wb -sorted | gzip -c > /mnt/seagate10/projects/annotations/data/test/template.1000genomes503eur9m.complete_annot_hg19.intersect.txt.gz
+
+
+
+6. Create binary annotations (this step is slow).
+python annot2annomat.py /mnt/seagate10/projects/annotations/data/test/template.1000genomes503eur9m.complete_annot_hg19.intersect.txt.gz /mnt/seagate10/projects/annotations/data/test/template.1000genomes503eur9m.sorted.bed.gz /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annomat.txt.gz
+
+
+
+7. Create unique annotations (many variants belong to multiple annotation categories, this step assigns a single category to each variant, prioritizing categoryes according to "All SNPs ...").
+python uniq_annot.py /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annomat.txt.gz /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annomat.uniq.txt.gz
+
+
+
+8. Calculate LD r2 coefficients with parameters from "All SNPs are not created equal" supplement.
+for ((i=1;i<23;i++)); do plink --bfile /mnt/seagate10/genotypes/1000genomes503eur9m/chr${i} --r2 inter-chr gz yes-really --ld-window-r2 0.2 --out /mnt/seagate10/genotypes/1000genomes503eur9m/schork/chr${i}.schork.r2; done
+
+
+
+9. Create ld-induced categories for the experiment (this step is very slow).
+python ld_informed_annot.py
+python ld_informed_annot.py /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annomat.uniq.txt.gz /mnt/seagate10/genotypes/1000genomes503eur9m/schork/ /mnt/seagate10/projects/annotations/data/test/knownGene_ucsc_hg19.annot.ld_informed.txt.gz
+
diff --git a/annotations/src/annot2annomat.py b/annotations/src/annot2annomat.py
@@ -0,0 +1,45 @@
+import sys
+import pandas as pd
+from collections import defaultdict
+import numpy as np
+import argparse
+
+
+def parseArgs(args):
+    parser = argparse.ArgumentParser(
+        description="Creates binary annotation matrix from the output of knownGene2annot.py")
+    parser.add_argument("annot_file", help="A file with snp id in column 4 and annotation category name in column 8")
+    parser.add_argument("template_file", help="Template bed file")
+    parser.add_argument("out_file", help="Output file name")
+    return parser.parse_args(args)
+
+
+if __name__ == "__main__":
+    args = parseArgs(sys.argv[1:])
+    print("Processing %s" % args.annot_file)
+    df = pd.read_csv(args.annot_file, header=None, sep='\t')
+    dd = defaultdict(set)
+
+    for t in df.itertuples():
+        dd[t[4]].add(t[8]) # t[0] = index
+
+    template_snps = pd.read_csv(args.template_file,sep='\t',usecols=[3],
+                                header=None,names=["SNP"],squeeze=True)
+    # hardcoded list of categories supported by knownGene2annot.py
+    l = ["100kDown", "10kDown", "1kDown", "100kUp", "10kUp", "1kUp", "3UTR",
+        "5UTR", "Exon", "Intron", "ProteinCoding", "NoncodingTranscript",
+        "mirna", "tfbs"]
+
+    data = np.zeros((len(template_snps), len(l)), dtype=int)
+    for i,s in enumerate(template_snps):
+        for j, k in enumerate(l):
+            if k in dd[s]:
+                data[i,j] = 1
+
+    dfa = pd.DataFrame(index=template_snps, columns=l, data=data)
+    print("Writing results to %s" % args.out_file)
+    # out_file = "../data/9m_template_knownGene_ucsc_hg19_annomat.txt.gz"
+    dfa.to_csv(args.out_file, sep='\t',
+        compression='gzip' if args.out_file.endswith('.gz') else None)
+    print("Done")
+
diff --git a/annotations/src/knownGene2annot.py b/annotations/src/knownGene2annot.py
@@ -0,0 +1,187 @@
+import gzip
+import sys
+import argparse
+
+SHOW_WARNINGS = False
+
+def parseArgs(args):
+    parser = argparse.ArgumentParser(
+        description="Create annotation file from UCSC knownGene file.")
+    parser.add_argument("known_gene_file", help="UCSC knownGene file")
+    parser.add_argument("out_file", help="Output file name")
+    parser.add_argument("--show-warns", action="store_true",
+        help="Suppress warnings")
+    return parser.parse_args(args)
+
+def myOpen(f_name, mode='r'):
+    if f_name.endswith(".gz"):
+        return gzip.open(f_name, mode)
+    else:
+        return open(f_name, mode)
+
+
+def write2file(columnList, f):
+    line = '\t'.join(columnList)
+    if columnList[-2] == columnList[-1]:
+        msg = "Warning generating annotation:\n%s\n" % line
+        msg += "Empty segment generated. It will be ignored."
+        if SHOW_WARNINGS: print(msg)
+    else:
+        f.write("%s\n" % line)
+
+
+def parseLine(line, outFile):
+    """
+    Parse a line from UCSC knownGene file. Write annotated segments to outFile.
+    UCSC knownGene file for hg19 can be downloaded from here:
+        http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/
+    knownGene format description is here:
+        http://genome.ucsc.edu/cgi-bin/hgTables
+    Supported categories:
+        5UTR, 3UTR, Exon, ProteinConding, Intron, 1kUp, 10kUp, 100kUp, 1kDown,
+        10kDown, 100kDown
+        for coding transcripts and a single category NoncodingTranscript for
+        noncoding transcripts (to be consistent wit classification used in 
+        "All SNPs are not vreated equal" paper by A. Schork).
+    Output file contains 7 tab-delimited columns:
+        gene name, chromosome, strand, protein ID, category, start, end
+    Algorithm (assuming the gene is on "+" strand):
+                                 Gene
+                start                             end
+     -------------[--Exon--|----Intron----|--Exon--]-------------
+           [-1kUp-]                                [-1kDown-]
+      [---10kUp---]                                [---10kUp---]
+    All Exons are taken as they are reported in the knownGene file, all segments
+    between consequent exons are taken as Introns.
+    If cdsStart == cdsEnd: no coding sequence, no further processing
+    else:
+        each exon is compared to cds start/end as follows (6 different cases):
+                s                  e
+     -----------|-------Exon-------|-----------
+        cs   ce
+     1: [-cds-]                                  (s,e)   - 3UTR
+     2: [----cds----]                            (s,ce)  - ProtCod
+                                                 (ce,e)  - 3UTR
+     3: [----------------cds----------------]    (s,e)   - ProtCod
+     4:             [---cds---]                  (s,cs)  - 5UTR
+                                                 (cs,ce) - ProtCod
+                                                 (ce,e)  - 3UTR
+     5:             [----------cds----------]    (s,cs)  - 5UTR
+                                                 (cs,e)  - ProtCod
+     6:                               [-cds-]    (s,e)   - 5UTR
+
+     additional special sub-cases are when cds limits are equal to exon limits.
+    """
+    spltLine = line.split("\t")
+    (name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount,
+        exonStarts, exonEnds, proteinID, alignID) = spltLine
+    txStart, txEnd, cdsStart, cdsEnd, exonCount = map(int, spltLine[3:8])
+    exonStarts = exonStarts.split(",")[:-1]
+    exonEnds = exonEnds.split(",")[:-1]
+    columnList = [name, chrom, strand, proteinID]
+    if cdsStart == cdsEnd:
+        # noncoding transcript
+        s,e = str(txStart), str(txEnd)
+        write2file(columnList + ["NoncodingTranscript", s, e], outFile)
+    else:
+        for s,e in zip(exonEnds[:-1], exonStarts[1:]):
+            write2file(columnList + ["Intron", s, e], outFile)
+        for s,e in zip(exonStarts, exonEnds):
+            write2file(columnList + ["Exon", s, e], outFile)
+        exonStarts = map(int,exonStarts)
+        exonEnds = map(int, exonEnds)
+        if strand == "+":
+            s, e = max(0, txStart-1000), txStart
+            write2file(columnList + ["1kUp", "%d" % s, "%d" % e], outFile)
+            s, e = max(0, txStart-10000), txStart
+            write2file(columnList + ["10kUp", "%d" % s, "%d" % e], outFile)
+            s, e = max(0, txStart-100000), txStart
+            write2file(columnList + ["100kUp", "%d" % s, "%d" % e], outFile)
+            s, e = txEnd, txEnd+1000
+            write2file(columnList + ["1kDown", "%d" % s, "%d" % e], outFile)
+            s, e = txEnd, txEnd+10000
+            write2file(columnList + ["10kDown", "%d" % s, "%d" % e], outFile)
+            s, e = txEnd, txEnd+100000
+            write2file(columnList + ["100kDown", "%d" % s, "%d" % e], outFile)
+            if cdsStart < cdsEnd: # otherwise this gene is non-coding
+                # consider 6 different cases described above
+                for s,e in zip(exonStarts, exonEnds):
+                    if cdsEnd <= s: # case 1
+                        write2file(columnList + ["3UTR", "%d" % s, "%d" % e], outFile)
+                    elif e <= cdsStart: # case 6
+                        write2file(columnList + ["5UTR", "%d" % s, "%d" % e], outFile)
+                    elif cdsStart < s and cdsEnd < e: # case 2
+                        write2file(columnList + ["ProteinCoding", "%d" % s, "%d" % cdsEnd], outFile)
+                        write2file(columnList + ["3UTR", "%d" % cdsEnd, "%d" % e], outFile)
+                    elif cdsStart <= s and cdsEnd >= e: # case 3
+                        write2file(columnList + ["ProteinCoding", "%d" % s, "%d" % e], outFile)
+                    elif s <= cdsStart and cdsEnd <= e: # case 4
+                        write2file(columnList + ["ProteinCoding", "%d" % cdsStart, "%d" % cdsEnd], outFile)
+                        if s < cdsStart:
+                            write2file(columnList + ["5UTR", "%d" % s, "%d" % cdsStart], outFile)
+                        if cdsEnd < e:
+                            write2file(columnList + ["3UTR", "%d" % cdsEnd, "%d" % e], outFile)
+                    elif s < cdsStart and cdsEnd > e: # case 5
+                        write2file(columnList + ["ProteinCoding", "%d" % cdsStart, "%d" % e], outFile)
+                        write2file(columnList + ["5UTR", "%d" % s, "%d" % cdsStart], outFile)
+                    else:
+                        msg = "Error processing line:\n%s\n" % line
+                        msg += "unclassified exon: (start, end) = (%d,%d)" % (s, e)
+                        raise ValueError(msg)
+        elif strand == "-":
+            s, e = max(0, txStart-1000), txStart
+            write2file(columnList + ["1kDown", "%d" % s, "%d" % e], outFile)
+            s, e = max(0, txStart-10000), txStart
+            write2file(columnList + ["10kDown", "%d" % s, "%d" % e], outFile)
+            s, e = max(0, txStart-100000), txStart
+            write2file(columnList + ["100kDown", "%d" % s, "%d" % e], outFile)
+            s, e = txEnd, txEnd+1000
+            write2file(columnList + ["1kUp", "%d" % s, "%d" % e], outFile)
+            s, e = txEnd, txEnd+10000
+            write2file(columnList + ["10kUp", "%d" % s, "%d" % e], outFile)
+            s, e = txEnd, txEnd+100000
+            write2file(columnList + ["100kUp", "%d" % s, "%d" % e], outFile)
+            if cdsStart < cdsEnd: # otherwise this gene is non-coding
+                # consider 6 different cases described above, but for "-" strand
+                for s,e in zip(exonStarts, exonEnds):
+                    if cdsEnd <= s: # case 1
+                        write2file(columnList + ["5UTR", "%d" % s, "%d" % e], outFile)
+                    elif e <= cdsStart: # case 6
+                        write2file(columnList + ["3UTR", "%d" % s, "%d" % e], outFile)
+                    elif cdsStart < s and cdsEnd < e: # case 2
+                        write2file(columnList + ["ProteinCoding", "%d" % s, "%d" % cdsEnd], outFile)
+                        write2file(columnList + ["5UTR", "%d" % cdsEnd, "%d" % e], outFile)
+                    elif cdsStart <= s and cdsEnd >= e: # case 3
+                        write2file(columnList + ["ProteinCoding", "%d" % s, "%d" % e], outFile)
+                    elif s <= cdsStart and cdsEnd <= e: # case 4
+                        write2file(columnList + ["ProteinCoding", "%d" % cdsStart, "%d" % cdsEnd], outFile)
+                        if s < cdsStart:
+                            write2file(columnList + ["3UTR", "%d" % s, "%d" % cdsStart], outFile)
+                        if cdsEnd < e:
+                            write2file(columnList + ["5UTR", "%d" % cdsEnd, "%d" % e], outFile)
+                    elif s < cdsStart and cdsEnd > e: # case 5
+                        write2file(columnList + ["ProteinCoding", "%d" % cdsStart, "%d" % e], outFile)
+                        write2file(columnList + ["3UTR", "%d" % s, "%d" % cdsStart], outFile)
+                    else:
+                        msg = "Error processing line:\n%s\n" % line
+                        msg += "unclassified exon: (start, end) = (%d,%d)" % (s, e)
+                        raise ValueError(msg)
+        else:
+            msg = "Error processing line:\n%s\n" % line
+            msg += "unknown strand: '%s'" % strand
+            raise ValueError(msg)
+
+
+
+if __name__ == "__main__":
+    args = parseArgs(sys.argv[1:])
+    SHOW_WARNINGS = args.show_warns
+    with myOpen(args.known_gene_file, 'rt') as f, myOpen(args.out_file, 'wt') as of:
+        print("Processing UCSC knownGene file %s" % args.known_gene_file)
+        print("Writing to %s" % args.out_file)
+        for i, l in enumerate(f):
+            parseLine(l, of)
+            if (i+1)%10000 == 0:
+                print("%d lines processed" % (i+1))
+        print("%d lines processed in total" % (i+1))
+    print("Completed")
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,6 +2,8 @@ @@
     *.pyc
     *.log
     *.json
+    *.sh.o*
+    *.sh.e*
     *.png
     /.metadata/
     src/build/*
@@ Expand Down @@