-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpacked_sam2coverage_per_gene.php
executable file
·191 lines (164 loc) · 7.71 KB
/
packed_sam2coverage_per_gene.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/php
<?php
/*******************************************************************************
*
* This script generates a file with a list of genes with their coverage based
* on the packed sam file, using the mm10_gene_list.txt file to retrieve the
* gene symbol for the transcripts.
*
* Created : 2013-08-29
* Modified : 2013-08-29
* Version : 0.2
*
* Copyright : 2013-2015 Leiden University Medical Center; http://www.LUMC.nl/
* Programmer : Ing. Ivo F.A.C. Fokkema <I.F.A.C.Fokkema@LUMC.nl>
*
*
* This work is licensed under the Creative Commons
* Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
* copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
* or send a letter to:
* Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
*
*************/
$_SETT =
array(
'version' => '0.2',
'output_suffix' => '.coverage_per_gene.txt',
'show_top_unknown_transcripts' => 5,
);
print('PackedSam2CoveragePerGene v.' . $_SETT['version'] . "\n");
$aFiles = $_SERVER['argv'];
$sScriptName = array_shift($aFiles); // Shifts first argument off the array (this script's name), we don't need it anymore after this.
if (count($aFiles) < 2) {
die('Usage: ' . $sScriptName . ' path_to_mm10_gene_list.txt PACKED_SAM_FILE1 [ PACKED_SAM_FILE2 [ ... ]]' . "\n\n");
}
// Check if all files can be read.
foreach ($aFiles as $sFile) {
if (!is_readable($sFile)) {
die('Unable to open ' . $sFile . '.' . "\n");
}
}
// Load transcript/gene info file.
$sGeneFile = array_shift($aFiles); // Shifts next argument off the array (the gene info file's name), we don't need it anymore after this.
$aGeneFile = file($sGeneFile, FILE_IGNORE_NEW_LINES);
$aTranscripts = array();
print('Parsing gene file... ');
foreach ($aGeneFile as $sLine) {
if (!trim($sLine) || $sLine{0} == '#') {
// Comment, header.
continue; // Skip this line, continue to next.
}
list($sTranscript, $sStrand, $sGene) = explode("\t", $sLine);
$aTranscripts[$sTranscript] = $sGene;
}
print('done, loaded ' . count($aTranscripts) . ' transcripts in memory.' . "\n\n");
// Now, loop the SAM files, load them one by one in the memory.
$nSamples = count($aFiles);
$aCoveragePerGene = array();
$aTranscriptsNotFoundPerSample = array();
$aTranscriptsNotFoundWithCoveragePerSample = array();
foreach ($aFiles as $nFile => $sFile) {
$aSAMFile = file($sFile, FILE_IGNORE_NEW_LINES);
$aTranscriptsNotFoundWithCoverage = array();
print('Parsing ' . $sFile . '... ');
foreach ($aSAMFile as $sLine) {
list($sTranscript, $nPosition, $nCoverage) = explode("\t", $sLine);
// Take away version since it doesn't matter and the gene list doesn't have it either.
$sTranscript = substr($sTranscript, 0, strpos($sTranscript . '.', '.'));
if (isset($aTranscripts[$sTranscript])) {
$sGene = $aTranscripts[$sTranscript];
if (!isset($aCoveragePerGene[$sGene])) {
$aCoveragePerGene[$sGene] = array_fill(0, $nSamples, 0);
}
$aCoveragePerGene[$sGene][$nFile] += $nCoverage;
} else {
// We don't know the transcript, it's not in our mm10_gene_list.txt file. Store it, so we can report it.
if (!isset($aTranscriptsNotFoundWithCoverage[$sTranscript])) {
$aTranscriptsNotFoundWithCoverage[$sTranscript] = 0;
}
$aTranscriptsNotFoundWithCoverage[$sTranscript] += $nCoverage;
}
}
$aTranscriptsNotFoundPerSample[$nFile] = array_keys($aTranscriptsNotFoundWithCoverage); // Copy the list of transcripts not found, without coverage info.
sort($aTranscriptsNotFoundPerSample[$nFile]); // Then sort this list.
arsort($aTranscriptsNotFoundWithCoverage, SORT_NUMERIC); // Associated array Reverse Sort transcripts without genes on coverage.
// Isolate the top # transcripts based on their coverage.
$aTranscriptsNotFoundWithCoveragePerSample[$nFile] = array();
for ($i = 0; $i < $_SETT['show_top_unknown_transcripts']; $i++) {
list($sTranscript, $nCoverage) = each($aTranscriptsNotFoundWithCoverage);
$aTranscriptsNotFoundWithCoveragePerSample[$nFile][$sTranscript] = $nCoverage;
}
print('done.' . "\n" .
' ' . count($aTranscriptsNotFoundWithCoverage) . ' transcripts were not found, details will be in the output file.' . "\n");
}
// How should we call the new file?
if ($nSamples == 1) {
$sFileOut = $aFiles[0] . $_SETT['output_suffix'];
} else {
// Find prefix and suffix for file(s), to have an output file that matches the name.
$nPrefixLength = $nSuffixLength = min(array_map('strlen', $aFiles)); // Length of the shortest file name in argument list.
$sPrefix = substr($aFiles[0], 0, $nPrefixLength); // Limit prefix already to length of shortest file name.
$sSuffix = substr($aFiles[0], -$nSuffixLength); // Limit suffix already to length of shortest file name.
foreach ($aFiles as $sFile) {
for ($i = 0; $i < $nPrefixLength; $i++) {
if ($sPrefix{$i} != $sFile{$i}) {
// No match!
$sPrefix = substr($sPrefix, 0, $i);
$nPrefixLength = strlen($sPrefix);
break; // Go to next file.
}
}
}
foreach ($aFiles as $sFile) {
for ($i = 1; $i < $nSuffixLength; $i++) {
if (substr($sSuffix, -$i, 1) != substr($sFile, -$i, 1)) {
// No match!
$sSuffix = substr($sSuffix, -($i-1));
$nSuffixLength = strlen($sSuffix);
break; // Go to next file.
}
}
}
$sFileOut = $sPrefix . preg_replace('/^' . preg_quote($sPrefix, '/') . '(.+)' . preg_quote($sSuffix, '/') . '$/', '$1', $aFiles[0]) . '-' . preg_replace('/^' . preg_quote($sPrefix, '/') . '(.+)' . preg_quote($sSuffix, '/') . '$/', '$1', $aFiles[$nSamples-1]) . $sSuffix . $_SETT['output_suffix'];
}
// Open the file and, looping through the samples, print the basic info.
$f = fopen($sFileOut, 'w');
if (!$f) {
die('FAILED, can\'t create file.' . "\n");
}
fputs($f, '# PackedSam2CoveragePerGene v.' . $_SETT['version'] . "\n" .
'# NOTE: Transcripts not found happens for transcripts that can not be mapped on the genome. UCSC does not know about these transcripts at all.' . "\n");
foreach ($aFiles as $nFile => $sFile) {
$nSampleID = $nFile + 1;
fputs($f, "\n" .
'# SAMPLE' . str_pad($nSampleID, 2, '0', STR_PAD_LEFT) . "\t" . $sFile . "\n" .
'# Transcripts not found (' . count($aTranscriptsNotFoundPerSample[$nFile]) . '):' . "\t" . implode(',', $aTranscriptsNotFoundPerSample[$nFile]) . "\n" .
'# Top unknown transcripts ordered on coverage:' . "\n" .
'# Transcript' . "\t" . 'Total coverage' . "\n");
foreach ($aTranscriptsNotFoundWithCoveragePerSample[$nFile] as $sTranscript => $nCoverage) {
fputs($f, $sTranscript . "\t" . $nCoverage . "\n");
}
}
// Let user know we're working here...
print('Writing output to ' . $sFileOut . '... ');
// Now, sort the list of genes properly, so we can loop through it to print the results.
$aGenesInOrder = array_map('array_sum', $aCoveragePerGene);
arsort($aGenesInOrder, SORT_NUMERIC);
// Print the header.
fputs($f, "\n" .
'# Gene' . "\t" . 'Total coverage');
for ($i = 1; $i <= $nSamples; $i ++) {
fputs($f, "\t" . 'SAMPLE ' . str_pad($i, 2, '0', STR_PAD_LEFT));
}
fputs($f, "\n");
// Now, loop through these genes, to print the results per sample.
foreach ($aGenesInOrder as $sGene => $nCoverage) {
fputs($f, $sGene . "\t" . $nCoverage);
for ($i = 0; $i < $nSamples; $i ++) {
fputs($f, "\t" . $aCoveragePerGene[$sGene][$i]);
}
fputs($f, "\n");
}
print('done.' . "\n\n");
?>