-
Notifications
You must be signed in to change notification settings - Fork 0
/
check-and-create-bsgenome.sh
56 lines (48 loc) · 1.9 KB
/
check-and-create-bsgenome.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash
#
# Submit BSgenome package builds for the genomes listed in a config file.
#
# Reads config/target_genomes.tsv (relative to the current working
# directory), skips its header row, and for every remaining row:
#   * rows that already carry a package name in column 9 are skipped;
#   * otherwise a name "BSgenome.<Gspecies>.Custom.<genome>" is derived from
#     the species (column 3) and genome (column 2);
#   * if that package is not already installed in R, a job running
#     scripts/make-bsgenome.sh is submitted with sbatch.
#
# All top-level logic lives in main() and uses `return` instead of `exit`,
# so the file behaves the same whether it is executed or sourced.
#
PROJECT_DIR=$HOME/repos/custom_ArchR_genomes_and_annotations
SCRATCH_DIR=/scratch/$USER
GENOME_DIR=${PROJECT_DIR}/output/genomes

#######################################
# Print one field of a tab-separated row.
# awk is used instead of `IFS=$'\t' read` because read treats tab as
# whitespace and collapses consecutive tabs, which would drop empty columns.
# Arguments: $1 - the row text, $2 - 1-based field number
# Outputs:   the field value on stdout (empty if the field is missing)
#######################################
tsv_field() {
  printf '%s\n' "$1" | awk -F'\t' -v n="$2" '{print $n}'
}

#######################################
# Build the custom BSgenome package name for a species/genome pair.
# The species is split on single spaces; the first letter of the first word
# is prepended to the second word and the first character is upper-cased,
# e.g. "Homo sapiens" + "hg38" -> "BSgenome.Hsapiens.Custom.hg38".
# Arguments: $1 - species, $2 - genome
# Outputs:   the package name on stdout
#######################################
make_pkg_name() {
  local species=$1 genome=$2
  local genus epithet formatted
  genus=$(printf '%s\n' "$species" | cut -d' ' -f1)
  epithet=$(printf '%s\n' "$species" | cut -d' ' -f2)
  # \U is a GNU sed extension (upper-cases the matched first character).
  formatted=$(printf '%s\n' "${genus:0:1}${epithet}" | sed 's/^./\U&/g; s/ //g')
  printf 'BSgenome.%s.Custom.%s\n' "$formatted" "$genome"
}

#######################################
# Check whether an R package is installed.
# Arguments: $1 - package name
# Returns:   0 if R reports the package as installed, non-zero otherwise
#            (including when Rscript itself cannot be run).
#######################################
is_r_package_installed() {
  Rscript -e "if('$1' %in% rownames(installed.packages())) quit(status = 0) else quit(status = 1)"
}

#######################################
# Walk the config file and submit one build job per genome that needs it.
# Globals:   PROJECT_DIR, SCRATCH_DIR, GENOME_DIR (read)
# Returns:   0 after processing all rows, 1 if the config file is unreadable
#######################################
main() {
  local config_file="config/target_genomes.tsv"
  local row genome species pkg_name fasta_path
  local line_no=0

  # Best effort: a failed activation is reported but does not abort the run.
  # NOTE(review): `conda activate` needs conda's shell hook to be loaded in
  # this shell - confirm how the script is launched.
  conda activate custom_genes \
    || echo "Warning: could not activate conda env 'custom_genes'; continuing with the current environment" >&2

  echo "Processing genomes..."

  if [[ ! -r "$config_file" ]]; then
    echo "Config file not found or not readable: $config_file" >&2
    return 1
  fi

  # The file is read on fd 3 so commands inside the loop cannot consume it
  # through stdin. `|| [[ -n "$row" ]]` keeps a last line that has no
  # trailing newline.
  while IFS= read -r -u 3 row || [[ -n "$row" ]]; do
    line_no=$((line_no + 1))
    # Line 1 is the header.
    if ((line_no == 1)); then
      continue
    fi

    # Tolerate CRLF line endings; a stray \r would otherwise end up in the
    # last column and make an empty package name look non-empty.
    row=${row%$'\r'}

    genome=$(tsv_field "$row" 2)
    species=$(tsv_field "$row" 3)
    pkg_name=$(tsv_field "$row" 9)

    if [[ -z "$genome" ]]; then
      # Blank lines are ignored silently; malformed rows are reported.
      if [[ -n "${row//[[:space:]]/}" ]]; then
        echo "Skipping line ${line_no}: no genome name in column 2" >&2
      fi
      continue
    fi

    # Rows that already name a package are not built by this script.
    if [[ -n "$pkg_name" ]]; then
      continue
    fi

    pkg_name=$(make_pkg_name "$species" "$genome")
    fasta_path=${GENOME_DIR}/${genome}/${genome}.fa.gz

    # Safety net: only ever build packages following the Custom naming scheme.
    if [[ "$pkg_name" != *"Custom"* ]]; then
      continue
    fi

    if is_r_package_installed "$pkg_name"; then
      echo "Package ${pkg_name} already installed, skipping..."
      continue
    fi

    echo "Submitting job for genome: $genome"
    if ! sbatch --job-name="bsg_${genome}" scripts/make-bsgenome.sh \
      -s "$species" \
      -g "$genome" \
      -f "$fasta_path" \
      -n "$pkg_name" \
      -w "$SCRATCH_DIR" \
      -o "${GENOME_DIR}/${genome}" \
      -p "$PROJECT_DIR"; then
      echo "Warning: sbatch failed for genome: $genome" >&2
    fi
  done 3< "$config_file"

  echo "All BSgenome build jobs submitted"
}

main "$@"