-
Notifications
You must be signed in to change notification settings - Fork 2
/
run_cellranger_pipeline.py
115 lines (90 loc) · 3.29 KB
/
run_cellranger_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# # Single Sample Pipeline
# ###########################################
# from common_utils.s3_utils import download_file, upload_file, download_folder, upload_folder
# from common_utils.job_utils import generate_working_dir, delete_working_dir
import re
import yaml
import pandas as pd
import glob
import json
import sys
import boto3
import botocore
import os
import shlex
import subprocess
from common_utils.s3_utils import download_file, upload_file, download_folder, upload_folder
from common_utils.job_utils import generate_working_dir, delete_working_dir
###############################################################################
# parse arguments
###############################################################################
# sys.argv[0] is the script name; sys.argv[1] is the first command-line argument.
# That argument is a JSON string whose value is specified in the CloudFormation JSON.
print('----------- CELLRANGER PIPELINE --------------\n\n')
print('parsing argument data\n------------------------------')

# Read the optional JSON argument.  A missing argument is handled the same
# way as an unparseable one (reported, then ignored) rather than crashing
# with an IndexError before the error handling below is ever reached.
arg_string = sys.argv[1] if len(sys.argv) > 1 else None
print('sys.argv[1]')
print(arg_string)

# arg_dict holds the decoded argument, or None when decoding failed.
# NOTE: the commented-out code this replaces read arg_dict['bucket'] and
# arg_dict['inst_fcs']; the bucket and folders are currently hard-wired in
# the statements that follow instead.
arg_dict = None
try:
    arg_dict = json.loads(arg_string)
except (TypeError, ValueError) as err:
    # TypeError: no argument was supplied (arg_string is None).
    # ValueError: the argument is not valid JSON (json.JSONDecodeError is a
    # ValueError subclass).
    print('------------------------------')
    print('Unable to parse argument json')
    print('------------------------------')
    # Echo the raw argument and the failure reason to aid debugging; the
    # pipeline deliberately continues, as it did before.
    print('arg_string = ' + repr(arg_string))
    print('reason: ' + str(err))
else:
    print('------------------------------')
    print('Able parse argument json')
    print(arg_dict)
    print('------------------------------')
# Report available disk space in the job log before any data is downloaded.
# check_call raises CalledProcessError if `df` exits with a non-zero status.
subprocess.check_call(['df', '-h'])

# Create the scratch working directory (relative to the current directory)
# and move into it.  exist_ok=True makes creation idempotent and avoids the
# check-then-create race of testing os.path.exists() first.
directory = 'scratch/tenant_1'
os.makedirs(directory, exist_ok=True)

# All relative paths used from here on are resolved inside the scratch
# directory.
os.chdir(directory)
# Copy input data from S3
###########################################
# Both inputs live in the same bucket and are downloaded into a local folder
# named after the S3 folder: first the reference data, then the tiny-bcl run.
inst_bucket = 'cellranger-tiny-bucket'
for s3_folder in ('refdata-cellranger-GRCh38-1.2.0', 'tiny-bcl'):
    s3_path = 's3://' + inst_bucket + '/' + s3_folder
    download_folder(s3_path, s3_folder)

# List the downloaded reference data in the job log; check_call raises
# CalledProcessError if `ls` fails (e.g. the folder is missing).
cmd = "ls -l refdata-cellranger-GRCh38-1.2.0"
subprocess.check_call(shlex.split(cmd))
# Run Cellranger MKFASTQ and COUNT
############################################
# Step 1: cellranger mkfastq on the downloaded tiny-bcl run.
mkfastq_cmd = 'cellranger mkfastq --id=tiny-bcl-output --run=tiny-bcl/cellranger-tiny-bcl-1.2.0/ --csv=tiny-bcl/cellranger-tiny-bcl-samplesheet-1.2.0.csv'

# Step 2: cellranger count on the FASTQs written by step 1.
# NOTE(review): an earlier comment here said to use a full path for the
# reference transcriptome, but the command passes a relative path -- confirm
# which one is intended.
count_cmd = 'cellranger count --id=test_sample --fastqs=tiny-bcl-output/outs/fastq_path/p1/s1 --sample=test_sample --expect-cells=1000 --localmem=3 --chemistry=SC3Pv2 --transcriptome=refdata-cellranger-GRCh38-1.2.0'

# Run the steps in order; check_call raises CalledProcessError on a non-zero
# exit status, so a failed mkfastq prevents count from starting.
for cmd in (mkfastq_cmd, count_cmd):
    subprocess.check_call(shlex.split(cmd))
# Copy results back to S3
###########################
# Each entry pairs the S3 key suffix with the local output folder to upload:
# first the mkfastq outputs, then the count outputs.  Both go to the bucket
# named by inst_bucket, which is set earlier in this script.
uploads = (
    ('/tiny-bcl-output', 'tiny-bcl-output'),
    ('/test_sample', 'test_sample'),
)
for s3_suffix, fcs_files_path in uploads:
    s3_path = 's3://' + inst_bucket + s3_suffix
    upload_folder(s3_path, fcs_files_path)