-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathncbi_download.py
248 lines (166 loc) · 9.36 KB
/
ncbi_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import os, sys, subprocess, argparse, datetime
from re import sub
from core import SBATCH, ezSub, CAPTURE
parser = argparse.ArgumentParser(
description='Download taxon library'
,prog=os.path.basename(__file__)
,usage='%(prog)s [options]'
,epilog='see readme for further details.'
)
taxons = [
'bacteria', 'fungi', 'protozoa', 'archaea', 'viral', 'plant', 'invertebrate'
]
databases = [
'genbank', 'refseq'
]
current_date = datetime.datetime.now().strftime("%Y-%m-%d")
parser.add_argument('-p', metavar='</path/to/directory>', type=str, default=f'x.DOWNLOADS/NCBI/{current_date}', help='specify path to working directory')
parser.add_argument('-t', metavar='<taxon>', type=str, default='all', nargs='+', choices=['all',*taxons, 'custom'], help='specify taxon to download')
parser.add_argument('-d', metavar='<database>', type=str, default='all', nargs='+', choices=['all',*databases], help='specify database source')
parser.add_argument('-q', metavar='<quality>', type=str, default='good', choices=['good','best'], help='select only Complete/Chromosome (good) or Representative (best) genomes')
parser.add_argument('-c', metavar='</path/to/custom_list>', type=str, required=False, help='specify path to custom list')
modes = parser.add_mutually_exclusive_group(required=True) # run modes
modes.add_argument( '-download', action='store_true', help='download files' )
modes.add_argument( '-review', action='store_true', help='review file status' )
PATH, SELECTED_TAXONS, SELECTED_DATABASES, QUALITY, CUSTOM_LIST, downloading, reviewing = vars(parser.parse_args()).values() # define user inputs
if 'custom' in SELECTED_TAXONS:
if not CUSTOM_LIST:
print("ERROR!")
sys.exit(0)
else:
if CUSTOM_LIST:
custom_accessions = [ ]
for line in open(CUSTOM_LIST,'r').readlines():
if line.startswith('#'): continue
accession, *_ = info = line.strip().split('\t')
*_, accession = accession.split(':')
custom_accessions.append(accession)
taxons = taxons if 'all' in SELECTED_TAXONS else SELECTED_TAXONS
databases = ['custom'] if 'custom' in SELECTED_TAXONS else databases if 'all' in SELECTED_DATABASES else SELECTED_DATABASES
WRK_DIR = f'/{PATH.strip("/")}'
os.makedirs(WRK_DIR, exist_ok=True)
for SOURCE in databases:
# format path as per rsync_from_ncbi.pl
RSYNC_PATH = 'https://ftp.ncbi.nlm.nih.gov/genomes'
ASSEMBLY_SUFFIX='_genomic.fna.gz'
BASE_URL = f'{RSYNC_PATH}/{SOURCE}'
SOURCE_DIR = f'{WRK_DIR}/{SOURCE}/{QUALITY}'
os.makedirs(SOURCE_DIR, exist_ok=True)
for TAXON in taxons:
TAXON_DIR = f'{SOURCE_DIR}/{TAXON}'
GEN_DIR = f'{TAXON_DIR}/all'
[ os.makedirs(subdir, exist_ok=True) for subdir in [TAXON_DIR, GEN_DIR] ]
MANIFEST_FILE = f'{TAXON_DIR}/manifest.txt'
SELECTED_FILE = f'{TAXON_DIR}/selected.txt'
ASSEMBLY_FILE = CUSTOM_LIST if TAXON == 'custom' else f'{TAXON_DIR}/assembly_summary.txt'
if downloading:
# specify assembly summary to download
if TAXON != 'custom':
ASSEMBLY_URL = f'{BASE_URL}/{TAXON}/assembly_summary.txt'
# download assembly file
subprocess.run(f'wget -O {ASSEMBLY_FILE} {ASSEMBLY_URL}', shell=True) # linux
#subprocess.run(f'cURL -o {ASSEMBLY_FILE} {ASSEMBLY_URL}', shell=True) # unix
with open(MANIFEST_FILE, 'w') as MANIFEST, open(SELECTED_FILE, 'w') as SELECTED:
for i, line in enumerate(open(ASSEMBLY_FILE, 'r').readlines()):
if i == 1:
line = line.strip('\n')
print(line, file=SELECTED)
# extract relevant column headers
assembly_header = line.strip('# ').split('\t')
header_idx = [
assembly_header.index(name)
for name in [
'assembly_accession'
,'taxid'
,'organism_name'
,'assembly_level'
,'refseq_category'
,'ftp_path'
]
]
# ignore other comments
if line.startswith('#'): continue
# extract relevant line info
info = line.strip().split('\t')
assembly_info = [
info[idx]
for idx in header_idx
]
assembly_accession, taxid, organism_name, assembly_level, refseq_category, ftp_path = assembly_info
# specify filter criteria; any listed assembly (custom) or only best quality (taxon)
assembly_level_criteria = ('') if TAXON == 'custom' or QUALITY == 'best' else ('chromosome', 'complete genome')
refseq_category_criteria = ('') if TAXON == 'custom' or QUALITY == 'good' else ('reference', 'representative')
if assembly_level.lower().startswith(assembly_level_criteria):
if refseq_category.lower().startswith(refseq_category_criteria):
# ignore assembly if also part of custom set
if TAXON != 'custom' and CUSTOM_LIST:
if assembly_accession in custom_accessions:
print(f'skipping {organism_name}; {assembly_accession} (included in custom list)')
continue
# path recorded in assembly file
if ftp_path != 'na':
print(*info, sep='\t', file=SELECTED)
# remove server from path
ftp_path = ftp_path.replace(f'{RSYNC_PATH}/','')
*_, basename = os.path.split(ftp_path)
ftp_file = f'{basename}{ASSEMBLY_SUFFIX}'
manifest_path = f'{ftp_path}/{ftp_file}'
print(manifest_path, file=MANIFEST)
scripts = []
batch_dirs = [ f'{TAXON_DIR}/{description}' for description in ['id','sh','oe'] ]
id_dir, sh_dir, oe_dir = batch_dirs
[
os.makedirs(path, exist_ok=True)
for path in batch_dirs
]
id_file = f'{id_dir}/{TAXON}.id'
if reviewing:
print('REVIEWING...')
GEN_FILES = [
contents.name
for contents in os.scandir(GEN_DIR)
]
for line in open(MANIFEST_FILE, 'r').readlines():
URL = line.strip()
*_, TARGET_FILE = os.path.split(URL)
SAMPLE = TARGET_FILE.replace(ASSEMBLY_SUFFIX,'')
DESCRIPTION = f'{TAXON}_{SAMPLE}'
if reviewing:
if not TARGET_FILE in GEN_FILES:
print(f'{TARGET_FILE} missing!')
err_file = f'{oe_dir}/{DESCRIPTION}.err'
status = CAPTURE(f'cat {err_file}')
print(status)
# write slurm file
if downloading:
sh_file = f'{sh_dir}/{DESCRIPTION}.sh'
with open(sh_file, 'w') as sh:
# specify hpcc directives (slurm)
hpcc_directives = SBATCH(
job_id=DESCRIPTION
,partition='defq'
,nodes=1
,ntasks=1
,memory='5gb'
,walltime='02:00:00'
,out_err=oe_dir
)
sh.write(
hpcc_directives
+'echo STARTED `date`\n'
+f'wget --spider {RSYNC_PATH}/{URL}\n'
+f'wget -O {GEN_DIR}/{TARGET_FILE} {RSYNC_PATH}/{URL}\n'
#+f'mv {sh_file} {downloaded_dir}/\n'
+'echo FINISHED `date`\n'
)
scripts.append(sh_file)
# submit tasks
if downloading:
print('\nSUBMITTING TASKS...')
with open(id_file, 'a+') as id_output:
for i,script in enumerate(scripts,1):
ezSub(i=i, check=120, user='mbzmch', limit=500) # maintain tasks below parellel task limit
sub_id = CAPTURE(f'sbatch -d singleton {script}') # submit task
if os.path.basename(script).startswith('buddy'): continue
else: print(sub_id, script, sep='\t', file=id_output, flush=True) # record task job id & shell script
print('\tALL TASKS SUBMITTED\n')