-
Notifications
You must be signed in to change notification settings - Fork 9
/
HybPhyloMaker0f_createPlastomeRef.sh
395 lines (361 loc) · 16.7 KB
/
HybPhyloMaker0f_createPlastomeRef.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=1:0:0
#PBS -l select=1:ncpus=1:mem=1gb:scratch_local=1gb
#PBS -j oe
#PBS -N cpRefFromGB
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -pe mthread 4
#$ -q sThM.q
#$ -l mres=4G,h_data=4G,h_vmem=4G,himem
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker8a2_Astral4
#$ -o HybPhyloMaker8a2_Astral4.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * https://github.com/tomas-fer/HybPhyloMaker *
# * Script 0f - Prepare plastome reference from GenBank file *
# * v.1.8.0 *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2024 *
# * tomas.fer@natur.cuni.cz *
# ********************************************************************************
#optionally DOWNLOAD full plastome accession from GenBank (requires Entrez Direct)
#(https://www.ncbi.nlm.nih.gov/books/NBK179288/)
#EXTRACT sequences (in FASTA format) from full plastome provided in GenBank format
#- all features (CDS, tRNA, rRNA), separated to exons
#- all sequences among them (introns, spacers)
#PREPARE reference for HybPhyloMaker
#- modify names
#- delete regions shorter than $short (defined below!)
#KNOWN BUGS:
#1. limited working in case the '/gene' descriptor is missing in CDS description (like, e.g., in MK460223)
# in this case this script requires 'product.txt' for proper modification to gene name ('/gene' descriptor is added)
# however, some '/product' descriptions matching appropriate gene names have to be added/modified to 'product.txt'
# known problem: 'RNA polymerase beta subunit' or 'RNA polymerase beta' describes rpoB, rpoC1 and rpoC2 - manual check necessary!!!
# known problem: some CDS have '/product' description as 'no product string in file'
# also, if tRNAs and rRNAs are not annotated in GB file, some of the non-coding regions extracted by this script include them
#2. Does not properly work for plastomes of parazitic plants, e.g. HG514460
# in this case only remaining functional genes are extracted (those annotated as CDS)
# non-coding regions then contain non-coding parts as well as non-functional genes (i.e. genes with '/pseudo')
# tRNAs are extracted both functional and pseudo
#Complete path and set configuration for selected location
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\nHybPhyloMaker0f is running on MetaCentrum..."
#settings for MetaCentrum
#Move to scratch
cd $SCRATCHDIR
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
#. /packages/run/modules-2.0/init/bash
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
#Add necessary modules
module add entrezdirect
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\nHybPhyloMaker0f is running on Hydra..."
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir0f
cd workdir0f
#Add necessary modules
module load entrezdirect???
else
echo -e "\nHybPhyloMaker0f is running locally..."
#settings for local run
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir0f
cd workdir0f
fi
if [[ ! $location == "1" ]]; then
if [ "$(ls -A ../workdir0f)" ]; then
echo -e "Directory 'workdir0f' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir0f 2>/dev/null
exit 3
fi
fi
#Write log
logname=HPM0f
echo -e "HybPhyloMaker0f: Prepare plastome reference from GenBank file" > ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "run on MetaCentrum: $PBS_O_HOST" >> ${logname}.log
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "run on Hydra: $HOSTNAME" >> ${logname}.log
else
echo -e "local run: "`hostname`"/"`whoami` >> ${logname}.log
fi
echo -e "\nBegin:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
echo -e "\nSettings" >> ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
printf "%-25s %s\n" `echo -e "\nServer:\t$server"` >> ${logname}.log
fi
for set in cpGBfile cpGBnr mincplength; do
printf "%-25s %s\n" `echo -e "${set}:\t" ${!set}` >> ${logname}.log
done
#Add necessary files
cp $source/product.txt .
#remove all comments from product.txt
grep -v "^#" product.txt > tmp && mv tmp product.txt
#Make dir for results
mkdir -p $path/cp/00reference
#download GenBank file using efetch
if [[ "$cpGBnr" == "" && "$cpGBfile" == "" ]]; then
echo -e "Name of the GenBank file (cpGBfile) OR GenBank AccessionNumber (cpGBnr) not provided.\nExiting...\n" && exit 3
elif [[ -z $cpGBfile ]]; then
echo -en "Trying to download Accession: ${cpGBnr} ... "
#test if esearch returns Counts=1
estest=$(esearch -db nuccore -query "$cpGBnr" | grep "Count" | awk -F'>|<' '{ print $3 }')
if [[ ${estest} != 1 ]]; then
echo -e "not found.\nExiting...\n" && exit 3
else
efetch -db nuccore -id ${cpGBnr} -format genbank > ${cpGBnr}.gb
${cpGBfile}=${cpGBnr}.gb
echo -e "completed\n"
fi
elif [[ ! -f "$source/${cpGBfile}" ]]; then
echo -e "File: ${cpGBfile} was not found in HybSeqSource.\nExiting...\n" && exit 3
else
cp $source/${cpGBfile} .
fi
#set variables
name=`echo ${cpGBfile} | cut -d'.' -f1`
org=$(grep ORGANISM ${cpGBfile} | awk '{ print $2"-"$3 }')
org2=$(grep ORGANISM ${cpGBfile} | awk '{ print $2" "$3 }')
fastah=$(echo ">${org}_${name}")
echo -e "\nWorking with '${org2} (${name})'..."
#get just the sequence
echo -e "\nExtracting sequence..."
#take everything after 'ORIGIN', remove numbers, remove spaces, remove '//', remove CRs, remove LFs
sed -e '1,/ORIGIN/d' ${cpGBfile} | sed 's/[0-9]*//g' | sed 's/ //g' | sed 's/\/\///' | sed 's/\x0D$//' | tr -d '\n' > ${name}.fasta
#Modify GB file (in order to put join feature information on a single line)
#delete everything between ',' and any number [0-9], i.e. putting the join description of some CDS on a single line
perl -0777 -pe 's/(,)\s*([0-9])/$1$2/g' ${cpGBfile} > ${cpGBfile}.modif #'perl -0777' = do not any splitting
# Modify GB file in case there are no '/gene' descriptors for CDS
if [ "$(grep "/gene" ${cpGBfile} | wc -l)" -lt 20 ]; then
echo "Converting '/products' to '/genes'..."
# modify long names of '/product' (the case of 'ribulose 1,5-bisphosphate carboxylase/oxygenase large subunit')
perl -0777 -pe 's/(oxygenase)\s*(large)/$1 $2/g' ${cpGBfile}.modif > tmp && mv tmp ${cpGBfile}.modif
# change '/product' to gene name according to product.txt
cat product.txt | while read -r a b; do
c=$(echo "$b" | sed 's/\//\\\//g') #change '/' to '\/' before sed on the next line
sed -i "s/\"$c\"/\"$a\"/" ${cpGBfile}.modif
done
# delete EOL on lines with 'note'
sed -i '/note/ { N; s/\n// }' ${cpGBfile}.modif
# remove unnecessary lines
sed -i '/\/note/ d' ${cpGBfile}.modif
sed -i '/\/trans_splicing/ d' ${cpGBfile}.modif
sed -i '/\/codon_start/ d' ${cpGBfile}.modif
sed -i '/\/transl/ d' ${cpGBfile}.modif
sed -i '/locus_tag/ d' ${cpGBfile}.modif
sed -i '/\/exception/ d' ${cpGBfile}.modif
# change '/product' to '/gene'
sed -i 's/\/product/\/gene/' ${cpGBfile}.modif
fi
rm product.txt
echo "Extracting features..."
#select lines containing CDS, tRNA and one following line (the line with gene name)
#CDS is surrounded by spaces ( CDS ) to avoid capturing the string from AA translation part
grep -A1 -E " CDS |tRNA " ${cpGBfile}.modif > resultCDS.txt
#add '--'' as last line (to mimic grep output after later file merging)
#select lines containing rRNA and two following lines (the line with gene name)
echo "--" >> resultCDS.txt
#remove spaces in gene names if any
sed -i '/gene/s/ //g' resultCDS.txt
#replace two or more spaces by ',', remove lines starting with ',/locus' or ',/gene' or '/old_locus_tag' and select lines containing rRNA and one following line
#(this is to remove other descriptors and get product' descriptor as the next line to rRNA)
sed 's/ \{2,\}/,/g' ${cpGBfile}.modif | sed '/^,\/locus/ d' | sed '/^,\/gene/ d' | sed '/^,\/old_locus_tag/ d' | grep -A1 -E ",rRNA" --no-group-separator | sed "s/,rRNA/--\n,rRNA/" | sed 1d > resultR.txt
rm ${cpGBfile}.modif
#grep -A2 -E "rRNA " ${cpGBfile} > resultR.txt
sed -i 's/ ribosomal RNA//' resultR.txt
#replace spaces by ','
sed -i 's/ \{1,\}/,/g' resultCDS.txt
sed -i 's/ \{1,\}/,/g' resultR.txt
#delete first ',' at each line
sed -i 's/,//' resultCDS.txt
sed -i 's/,//' resultR.txt
#delete first '/' at each line
sed -i 's/\///' resultCDS.txt
sed -i 's/\///' resultR.txt
#select correct name by deleting other lines
sed -i '/^locus/ d' resultR.txt
#merge both files
cat resultCDS.txt resultR.txt > result.txt
rm resultCDS.txt resultR.txt
#replace EOLs by ','
cat result.txt | tr "\n" "," > tmp && mv tmp result.txt
#delete "(" and ")"
sed -i 's/(//g' result.txt
sed -i 's/)//g' result.txt
#delete 'complement'
sed -i 's/complement//g' result.txt
#add ',' after 'join'
sed -i 's/join/join,/g' result.txt
#replace ',--,' by EOL
sed -i "s/,--,/\n/g" result.txt
#replace '..' by ','
sed -i 's/\.\./,/g' result.txt
#delete 'gene='
sed -i 's/gene=//' result.txt
#delete 'product='
sed -i 's/product=//' result.txt
#remove double quotes
sed -i 's/\"//g' result.txt
#delete last character on last line (if it is an excessive ',')
sed -i '$s/,$//' result.txt
#add newline at the end of the file
sed -i '$a\' result.txt
#replace '-' by 'x'
sed -i 's/-/x/' result.txt
echo "Treating joined features..."
#extract lines containing join
grep join result.txt > join.txt
#calculates number of regions to join
awk -F "," '{ print ( NF - 3 ) / 2 }' join.txt > nrRegions.txt
#change ',' to TABs
cat join.txt | tr "," "\t" > tmp && mv tmp join.txt
#merge two files
paste nrRegions.txt join.txt > nrRegionsJoin.txt
#loop over genes with more parts and separates parts to one line each
cat nrRegionsJoin.txt | while read a b c d e f g h i j; do
if [ "$a" -eq 2 ]; then
echo -e "$b,$d,$e,${h}_1" >> joinResult.txt
echo -e "$b,$f,$g,${h}_2" >> joinResult.txt
elif [ "$a" -eq 3 ]; then
echo -e "$b,$d,$e,${j}_1" >> joinResult.txt
echo -e "$b,$f,$g,${j}_2" >> joinResult.txt
echo -e "$b,$h,$i,${j}_3" >> joinResult.txt
fi
done
rm join.txt nrRegions.txt nrRegionsJoin.txt
#remove lines with 'join' from results.txt
sed -i '/join/d' result.txt
#combine result.txt and joinResult.txt
cat result.txt joinResult.txt > finalResult.txt
rm result.txt joinResult.txt
#change ',' to TABs
cat finalResult.txt | tr "," "\t" > tmp && mv tmp finalResult.txt
#sort according 2nd field (i.e., starting position)
sort -n -k2 finalResult.txt > finalResultSorted.txt
rm finalResult.txt
#add intergenic regions
echo "Adding intergenic regions..."
#generate file starting positions of intergenic regions
awk '{ print $3 + 1 "\t" $4}' finalResultSorted.txt > startIntergenic.txt
#get name of the last gene
last=$(tail -1 finalResultSorted.txt | cut -f4)
#add as first line
sed -i "1i 1\t$last" startIntergenic.txt
#merge sorted file and
paste finalResultSorted.txt startIntergenic.txt > finalPlusIntergenic.txt
rm startIntergenic.txt
#write noncoding regions
awk '{ print "non\t" $5 "\t" $2 - 1 "\t" $6 "-" $4}' finalPlusIntergenic.txt > intergenic.txt
rm finalPlusIntergenic.txt
#get the length of the plastome
length=$(head -n1 ${cpGBfile} | sed 's/ \{1,\}/,/g' | cut -d',' -f3)
#get the end of last coding gene and increase by one (start position of last non-coding region)
lastcodpos=$(tail -n1 finalResultSorted.txt | cut -f3)
lastcodpos=$((lastcodpos+1))
#delete last line
head -n -1 intergenic.txt > tmp && mv tmp intergenic.txt
#create last line
echo -e "non\t$lastcodpos\t$length\t$last-end" > lastline.txt
cat intergenic.txt lastline.txt > tmp && mv tmp intergenic.txt
rm lastline.txt
#merge with coding and non-coding and sort
cat finalResultSorted.txt intergenic.txt | sort -n -k2 > tmp && mv tmp finalResultSorted.txt
rm intergenic.txt
#treat negative-length non-coding regions (established by the script when genes are overlapping)
echo "Treating negative-lengths regions..."
#calculate region lengths and add it as forth column
awk '{ print $1 "\t" $2 "\t" $3 "\t" $3 - $2 + 1 "\t" $4 }' finalResultSorted.txt > tmp && mv tmp finalResultSorted.txt
#print only lines with non-negative region length (4th column > 0)
awk '{ if ($4 > 0) print $0 }' finalResultSorted.txt > tmp && mv tmp finalResultSorted.txt
#create reference
echo "Creating reference..."
x=1
cat finalResultSorted.txt | while read a b c d e; do
echo -e ">${x}_${x}_${e}_${a}" >> ${name}_reference.fasta
cut -c $b-$c ${name}.fasta >> ${name}_reference.fasta
x=$((x+1))
done
#Add organism name to fasta record
sed -i "1i ${fastah}" file${name}.fasta
echo "Removing empty records..."
#RS declares records separated by '>', if the record contains a second line $2 (i.e. is not an empty fasta record), print the record (add a > in front).
awk 'BEGIN {RS = ">" ; FS = "\n" ; ORS = ""} $2 {print ">"$0}' ${name}_reference.fasta > tmp && mv tmp ${name}_reference.fasta
echo -e "feature\tstart\tstop\tlength\tname" > header.txt
cat header.txt finalResultSorted.txt > ${name}_plastomeFeaturePositions.txt
rm header.txt finalResultSorted.txt
#modifying reference for the use with HybPhyloMaker
echo "Modifying reference for HybPhyloMaker..."
#1. separate to types (tRNA, rRNA, CDS, non, CDS+non)
grep "tRNA$" -A1 --no-group-separator ${name}_reference.fasta > ${name}_reference_tRNA.fasta
grep "rRNA$" -A1 --no-group-separator ${name}_reference.fasta > ${name}_reference_rRNA.fasta
grep "CDS$" -A1 --no-group-separator ${name}_reference.fasta > ${name}_reference_CDS.fasta
grep "non$" -A1 --no-group-separator ${name}_reference.fasta > ${name}_reference_non.fasta
grep -e "CDS$" -e "non$" -A1 --no-group-separator ${name}_reference.fasta > ${name}_reference_CDSnon.fasta
#2. modify names
#first sed command replaces last occurrence of '_' by 'zzz'
sed 's/\(.*\)_/\1zzz/; s/_/exon/3; s/_/exon/3; s/-/XXX/; s/\./yy/' ${name}_reference_tRNA.fasta > ${name}_reference_tRNAHPM.fasta
sed 's/\(.*\)_/\1zzz/; s/_/exon/3; s/_/exon/3; s/-/XXX/; s/\./yy/' ${name}_reference_rRNA.fasta > ${name}_reference_rRNAHPM.fasta
sed 's/\(.*\)_/\1zzz/; s/_/exon/3; s/_/exon/3; s/-/XXX/; s/\./yy/' ${name}_reference_CDS.fasta > ${name}_reference_CDSHPM.fasta
sed 's/\(.*\)_/\1zzz/; s/_/exon/3; s/_/exon/3; s/-/XXX/; s/\./yy/' ${name}_reference_non.fasta > ${name}_reference_nonHPM.fasta
sed 's/\(.*\)_/\1zzz/; s/_/exon/3; s/_/exon/3; s/-/XXX/; s/\./yy/' ${name}_reference_CDSnon.fasta > ${name}_reference_CDSnonHPM.fasta
#3. remove regions shorter than 200 bp
grep "^[^>].\{$mincplength\}" -B1 --no-group-separator ${name}_reference_tRNAHPM.fasta > ${name}_reference_tRNAHPM${mincplength}.fasta
grep "^[^>].\{$mincplength\}" -B1 --no-group-separator ${name}_reference_rRNAHPM.fasta > ${name}_reference_rRNAHPM${mincplength}.fasta
grep "^[^>].\{$mincplength\}" -B1 --no-group-separator ${name}_reference_CDSHPM.fasta > ${name}_reference_CDSHPM${mincplength}.fasta
grep "^[^>].\{$mincplength\}" -B1 --no-group-separator ${name}_reference_nonHPM.fasta > ${name}_reference_nonHPM${mincplength}.fasta
grep "^[^>].\{$mincplength\}" -B1 --no-group-separator ${name}_reference_CDSnonHPM.fasta > ${name}_reference_CDSnonHPM${mincplength}.fasta
#statistics
echo "Calculating nr of regions..."
#full reference
echo -e "CDS\nnon\ntRNA\nrRNA" > ${name}_statHeader.txt
grep "CDS$" ${name}_reference.fasta | wc -l > ${name}_statD.txt
grep "non$" ${name}_reference.fasta | wc -l >> ${name}_statD.txt
grep "tRNA$" ${name}_reference.fasta | wc -l >> ${name}_statD.txt
grep "rRNA$" ${name}_reference.fasta | wc -l >> ${name}_statD.txt
paste ${name}_statHeader.txt ${name}_statD.txt > ${name}_statT.txt
rm ${name}_statHeader.txt ${name}_statD.txt
#after short removal
grep "CDS$" ${name}_reference_CDSHPM${mincplength}.fasta | wc -l > ${name}_statD.txt
grep "non$" ${name}_reference_nonHPM${mincplength}.fasta | wc -l >> ${name}_statD.txt
grep "tRNA$" ${name}_reference_tRNAHPM${mincplength}.fasta | wc -l >> ${name}_statD.txt
grep "rRNA$" ${name}_reference_rRNAHPM${mincplength}.fasta | wc -l >> ${name}_statD.txt
paste ${name}_statT.txt ${name}_statD.txt > ${name}_statF.txt
rm ${name}_statT.txt ${name}_statD.txt
echo -e "type\tfull\tlonger${mincplength}" > header.txt
cat header.txt ${name}_statF.txt > ${name}_stat.txt
rm header.txt ${name}_statF.txt
#Copy results to home
cp *.{gb,fasta,txt} $path/cp/00reference
#Copy HPM references to HybSeqSource
cp *HPM*.fasta $source
cp ${cpGBfile} $source
#Copy log to home
echo -e "\nEnd:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
cp ${logname}.log $path/cp/00reference
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
#delete scratch
if [[ ! $SCRATCHDIR == "" ]]; then
rm -rf $SCRATCHDIR/*
fi
else
cd ..
rm -r workdir0f
fi
echo -e "\nScript HybPhyloMaker0f finished...\n"