-
Notifications
You must be signed in to change notification settings - Fork 20
/
earlGreyLibConstruct
377 lines (335 loc) · 15.6 KB
/
earlGreyLibConstruct
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#!/bin/bash
usage()
{
echo " #############################
earlGrey version 5.1.0 (Library Construction Only)
Required Parameters:
-g == genome.fasta
-s == species name
-o == output directory
Optional Parameters:
-t == Number of Threads (DO NOT specify more than are available)
-r == RepeatMasker search term (e.g arthropoda/eukarya)
-l == Starting consensus library for an initial mask (in fasta format)
-i == Number of Iterations to BLAST, Extract, Extend (Default: 10)
-f == Number flanking basepairs to extract (Default: 1000)
-n == Max number of sequences used to generate consensus sequences (Default: 20)
-a == minimum number of sequences required to build a consensus sequence (Default: 3)
-h == Show help
Example Usage:
earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16
Queries can be sent to:
tobias.baril[at]unine.ch
Please make use of the GitHub Issues and Discussion Tabs at: https://github.com/TobyBaril/EarlGrey
#############################"
}
# Subprocess Make Directories #
makeDirectory()
{
directory=$(realpath ${directory})
mkdir -p ${directory}/${species}_EarlGrey/ && OUTDIR=${directory}/${species}_EarlGrey
if [ ! -z "$RepSpec" ] || [ ! -z "$startCust" ] ; then
mkdir -p $OUTDIR/${species}_RepeatMasker/
fi
mkdir -p $OUTDIR/${species}_Database/
mkdir -p $OUTDIR/${species}_RepeatModeler/
mkdir -p $OUTDIR/${species}_strainer/
mkdir -p ${OUTDIR}/${species}_summaryFiles/
}
# Subprocess PrepGenome #
prepGenome()
{
if [ ! -L ${genome} ]; then
genome=$(realpath ${genome})
fi
if [ -L $genome ]; then
genome=$(realpath -s ${genome})
fi
if [ ! -f ${genome}.prep ] || [ ! -f ${genome}.dict ]; then
cp ${genome} ${genome}.bak && gzip -f ${genome}.bak
sed '/>/ s/ .*//g; /^$/d' ${genome} > ${genome}.tmp
${SCRIPT_DIR}/headSwap.sh -i ${genome}.tmp -o ${genome}.prep && rm ${genome}.tmp
mv ${genome}.tmp.dict ${genome}.dict
sed -i.bak '/^>/! s/[DVHBPE]/N/g' ${genome}.prep
dict=${genome}.dict
genOrig=${genome}
genome=${genome}.prep
else
dict=${genome}.dict
genOrig=${genome}
genome=${genome}.prep
fi
}
# Subprocess getRepeatMaskerFasta
# Generate a copy of the RepeatMasker library subset used for the initial TE mask in FASTA format (only used if a RepeatMasker run is specified)
getRepeatMaskerFasta()
{
if [[ $RepSpec = *" "* ]]; then
echo "ERROR: You have entered a species name that contains a space, please use the NCBI TaxID rather than name. E.G In place of \"Homo sapiens\" use \"9606\""
exit 2
fi
if [[ $(which RepeatMasker) == *"bin"* ]]; then
libpath="$(which RepeatMasker | sed 's|bin.*|share/RepeatMasker/Libraries/RepeatMaskerLib.h5|g')"
else
libpath="$(which RepeatMasker | sed 's|/[^/]*$||g')/Libraries/RepeatMaskerLib.h5"
fi
if [[ $(which RepeatMasker) == *"bin"* ]]; then
PATH=$PATH:"$(which RepeatMasker | sed 's|bin.*|share/RepeatMasker/|g')"
fi
mkdir -p $OUTDIR/${species}_Curated_Library/
famdb.py -i $libpath families -f fasta_name --include-class-in-name -a -d $RepSpec > ${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
RepSub=${OUTDIR}/${species}_Curated_Library/${RepSpec}.RepeatMasker.lib
}
# Subprocess firstMask
# Run the initial RepeatMasker run with the specified species subset (only used if a RepeatMasker run is specified)
firstMask()
{
cd ${OUTDIR}/${species}_RepeatMasker
rmthreads=$(expr $ProcNum / 4)
RepeatMasker -species $RepSpec -norna -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid species search term, if issue persists please use NCBI Taxids (E.G Drosophila is replaced with 7125)"; exit 2
fi
}
# Subprocess firstMaskCustomLib
# Run the initial RepeatMasker run with the specific fasta consensus library
firstMaskCustomLib()
{
cd ${OUTDIR}/${species}_RepeatMasker
rmthreads=$(expr $ProcNum / 4)
RepeatMasker -lib ${startCust} -norna -lcambig -s -a -pa $rmthreads -dir $OUTDIR/${species}_RepeatMasker $genome
RepSub=${startCust}
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
echo "ERROR: RepeatMasker failed, please check logs. This is likely because of an invalid custom consensus file, check the fasta file looks as expected"; exit 2
fi
}
# Subprocess buildDB
# if masked genome exists, build database from this. If not, build database from original input genome
buildDB()
{
if [ -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
cd ${OUTDIR}/${species}_Database
BuildDatabase -name ${species} ${OUTDIR}/${species}_RepeatMasker/*.masked
else
cd ${OUTDIR}/${species}_Database
BuildDatabase -name ${species} $genome
fi
}
# Subprocess deNovo1
# Run the initial de novo annotation run with RepeatModeler2, with error catches for weird genome sizes
deNovo1()
{
cd ${OUTDIR}/${species}_RepeatModeler
RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species}
if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 5"
RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 81000000
if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
echo "ERROR: RepeatModeler Failed, Retrying with limit set as Round 4"
RepeatModeler -threads ${ProcNum} -database ${OUTDIR}/${species}_Database/${species} -genomeSampleSizeMax 27000000
if [ ! -e ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
echo "ERROR: RepeatModeler Failed"
exit 2
fi
fi
fi
}
# Subprocess strainer # CHECK FILE STRUCTURE FOR EARL GREY RUN
# contains the BLAST, Extract, Extend, Trim pipeline from James Galbraith
strainer()
{
cd ${OUTDIR}/${species}_strainer/
${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -g $genome -l ${OUTDIR}/${species}_Database/${species}-families.fa -t ${ProcNum} -f $Flank -r $num -n $no_seq -m $min_seq
latestFile="$(realpath $(ls -td -- ${OUTDIR}/${species}_strainer/*/ | head -n 1))/${species}-families.fa.strained"
cp $latestFile ${OUTDIR}/${species}_strainer/
}
# Subprocess sweepUp
# Puts required files into a summary folder
sweepUp()
{
cd ${OUTDIR}/${species}_summaryFiles/
if [ -s ${latestFile} ]; then
cp $latestFile ${OUTDIR}/${species}_summaryFiles/
else
echo "ERROR: TEstrainer failed to generate a TE library, please check logs" && exit 1
fi
}
# Subprocess runningTea
runningTea()
{
echo "
) (
( ) )
) ( (
_______)_
.-'---------|
( C|/\/\/\/\/|
'-./\/\/\/\/|
'_________'
'-------'
<<< $stage >>>"
}
# Subprocess GetTime
convertsecs()
{
h=$(bc <<< "${1}/3600")
m=$(bc <<< "(${1}%3600)/60")
s=$(bc <<< "${1}%60")
printf "%02d:%02d:%05.2f\n" $h $m $s
}
# Subprocess Checks
Checks()
{
if [ -z "$genome" ] || [ -z "$species" ] || [ -z "$directory" ] ; then
usage; exit 1
fi
if [ -z "$ProcNum" ] ; then
ProcNum=1; echo "$ProcNum Cores Will Be Used"
else
echo "$ProcNum Cores Will Be Used"
fi
if [ -z "$RepSpec" ] && [ -z "$startCust" ] ; then
echo "RepeatMasker species not specified, running Earl Grey without an initial mask with known repeats"
else
echo "Running Earl Grey with an intial mask with known repeats"
fi
if [ -z "$num" ] ; then
num=10; echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
else
echo "De Novo Sequences Will Be Extended Through a Maximum of $num Iterations"
fi
if [ -z "$no_seq" ] ; then
no_seq=20; echo "$no_seq sequences will be used in BEAT consensus generation"
else
echo "$no_seq sequences will be used in BEAT consensus generation"
fi
if [ -z "$Flank" ]; then
Flank=1000; echo "Blast, Extend, Align, Trim Process Will Add 1000bp to Each End in Each Iteration"
else
echo "Blast, Extract, Extend, Trim Process Will Add $Flank to Each End in Each Iteration"
fi
if [ -z "$min_seq" ]; then
min_seq=3; echo "Blast, Extend, Align, Trim Process Will Require 3 Sequences to Generate a New Consensus Sequence"
else
echo "Blast, Extend, Align, Trim Process Will Require $min_seq Sequences to Generate a New Consensus Sequence"
fi
if [ ! -d "$SCRIPT_DIR" ]; then
echo "ERROR: Script directory variable not set, please run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
fi
if [ ! -d "$SCRIPT_DIR/TEstrainer/" ]; then
echo "ERROR: teStrainer module not found, please check all modules are present and run the configure script in the Earl Grey directory before attempting to run Earl Grey"; exit 1
fi
# biocontainer checks
if [ -d "/usr/local/share/RepeatMasker/Libraries/" ]; then
if grep -q Placeholder /usr/local/share/RepeatMasker/Libraries/Dfam.h5 ; then
while true; do
read -p "Are you using the biocontainer installation? If yes, we can configure RepeatMasker to work for you:[YyNn]" yn
case $yn in
[Yy]* ) cd /usr/local/share/RepeatMasker/Libraries/
curl -O https://dfam.org/releases/Dfam_3.7/families/Dfam_curatedonly.h5.gz
gunzip Dfam_curatedonly.h5.gz && mv Dfam.h5 Dfam.h5.bak
mv Dfam_curatedonly.h5 Dfam.h5
cd /usr/local/share/RepeatMasker/
perl configure -rmblast_dir=/usr/local/bin -libdir=/usr/local/share/RepeatMasker/Libraries -trf_prgm=/usr/local/bin/trf -default_search_engine=rmblast
sed -i.bak 's|${SCRIPT_DIR}/LTR_FINDER_parallel|perl ${SCRIPT_DIR}/LTR_FINDER_parallel|g' /usr/local/share/earlgrey*/scripts/rcMergeRepeatsLoose
sed -i.bak 's|${SCRIPT_DIR}/LTR_FINDER_parallel|perl ${SCRIPT_DIR}/LTR_FINDER_parallel|g' /usr/local/share/earlgrey*/scripts/rcMergeRepeats
break;;
[Nn]* ) exit;;
* ) echo "Please answer yes or no.";;
esac
done
fi
fi
}
# Main #
while getopts g:s:o:t:f:i:r:l:n:a:h option
do
case "${option}"
in
g) genome=${OPTARG};;
s) species=${OPTARG};;
o) directory=${OPTARG};;
t) ProcNum=${OPTARG};;
f) Flank=${OPTARG};;
i) num=${OPTARG};;
r) RepSpec=${OPTARG};;
l) startCust=${OPTARG};;
n) no_seq=${OPTARG};;
a) min_seq=${OPTARG};;
h) usage; exit 0;;
esac
done
SECONDS=0
# Step 1 - set up the directories and make sure all modules are present
stage="Checking Parameters" && runningTea
SCRIPT_DIR=/data/toby/mambaforge/envs/earlgrey_440/share/earlgrey-4.4.0-0/scripts/
Checks
stage="Making Directories" && runningTea
makeDirectory
# Start Logs - only bother starting if everything is present and correct
exec > >(tee "${OUTDIR}/${species}EarlGrey.log") 2>&1
stage="Cleaning Genome" && runningTea
prepGenome
sleep 1
# Step 2 - initial annotation, either de novo, or if required an initial RepeatMasker followed by a de novo run
if [ -z "$RepSpec" ]; then
sleep 1
else
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
stage="Getting RepeatMasker Sequences for $RepSpec and Saving as Fasta" && runningTea
getRepeatMaskerFasta
stage="Running Initial Mask with Known Repeats" && runningTea
firstMask
else
stage="Genome has already been masked, skipping..." && runningTea
getRepeatMaskerFasta
fi
fi
if [ -z "$startCust" ]; then
sleep 1
else
if [ ! -f ${OUTDIR}/${species}_RepeatMasker/*.masked ]; then
stage="Running Initial Mask with Known Repeats in Custom Library" && runningTea
firstMaskCustomLib
else
stage="Genome has already been masked, skipping..." && runningTea
fi
fi
# Step 3 - make a database from the genome, and then run a de novo TE annotation using RepeatModeler2
if [ ! -f ${OUTDIR}/${species}_Database/${species}-families.fa ]; then
stage="Detecting Novel Repeats" && runningTea
buildDB
deNovo1
sleep 1
else
stage="De novo repeats have already been detected, skipping..." && runningTea
sleep 1
fi
# Step 4 - run TEstrainer, which runs a BLAST, extract, extent, trim, on the the de novo repeat library to refine consensus sequences
if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
stage="Straining TEs and Refining de novo Consensus Sequences" && runningTea
if [ -f ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
rm -r ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ${OUTDIR}/${species}_strainer/TS*
fi
strainer
if [ ! -s ${OUTDIR}/${species}_strainer/${species}-families.fa.strained ]; then
echo "WARNING: TEstrainer failed to produce a strain file, please check the log file for more information. If you have run an intial mask with known repeats, this could be due RepeatModeler2 failing to identify any new repeats. Please check if this is expected."
fi
sleep 1
else
stage="TEs already strained, skipping..." && runningTea
latestFile=${OUTDIR}/${species}_strainer/${species}-families.fa.strained
cp ${latestFile} ${OUTDIR}/${species}_summaryFiles/
sleep 1
fi
# Stage 5 - Tidy up!
stage="Tidying Directories and Organising Important Files" && runningTea
sweepUp
sleep 1
# Stage 6
time=$(convertsecs $SECONDS)
stage="Done in $time" && runningTea
sleep 5
# Stage 10
stage="TE library in Standard Format Can Be Found in ${OUTDIR}/${species}_summaryFiles/" && runningTea
sleep 5