Skip to content

Commit

Permalink
Merge pull request #37 from MRCIEU/revert_fastgwa
Browse files Browse the repository at this point in the history
Revert fastgwa
  • Loading branch information
explodecomputer authored Nov 17, 2024
2 parents f545afb + 6ba3668 commit cbaf286
Show file tree
Hide file tree
Showing 19 changed files with 57,060 additions and 391 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.DS_Store

# History files
.Rhistory
.Rapp.history
Expand Down Expand Up @@ -50,3 +52,5 @@ rsconnect/

config.env
config.json

bin/liftOver
36 changes: 15 additions & 21 deletions 00-extract-pruned-variants.sh → 00a-genotype-organisation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@ source config.env
# create results directory
mkdir -p ${results_dir}/00

# log everything from this script to a logfile in the results director
# log everything from this script to a logfile in the results directory
exec &> >(tee ${results_dir}/00/logfile)

mkdir -p ${genotype_processed_dir}/scratch

echo "Organise samples"
Rscript resources/genotypes/organise_samples.r ${genotype_input_list} ${genotype_processed_dir}/sample_inclusion.txt ${sample_inclusion_list}


echo "Get list of pruned SNPs"
if test -f "resources/genotypes/hm3_prune_th_${build}.bed.gz"; then
echo "Found prune file"
Expand Down Expand Up @@ -71,25 +75,15 @@ for i in $(seq 1 $nchr)
do
bgen=$(awk -v i=$i 'NR==i { print $1 }' ${genotype_input_list})
sample=$(awk -v i=$i 'NR==i { print $2 }' ${genotype_input_list})
# check if $sample is empty - this would mean it's a pgen fileset
if [ -z "$sample" ]; then
./bin/plink2 \
--bgen ${bgen} \
--sample ${sample} \
--extract range ${prunefile} \
--make-bed \
--out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \
--threads ${env_threads}
echo "${genotype_processed_dir}/bgen_extract/$(basename ${bgen})" >> ${genotype_processed_dir}/bgen_extract/mergelist
else
./bin/plink2 \
--bgen ${bgen} ref-first \
--sample ${sample} \
--extract range ${prunefile} \
--make-bed \
--out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \
--threads ${env_threads}
echo "${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen)" >> ${genotype_processed_dir}/bgen_extract/mergelist
./bin/plink2 \
--bgen ${bgen} ref-first \
--sample ${sample} \
--extract range ${prunefile} \
--keep ${genotype_processed_dir}/sample_inclusion.txt \
--make-bed \
--out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \
--threads ${env_threads}
echo "${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen)" >> ${genotype_processed_dir}/bgen_extract/mergelist
done

./bin/plink2 \
Expand All @@ -103,4 +97,4 @@ done

Rscript resources/genotypes/variant_ids_bim.r ${genotype_processed_dir}/scratch/indep

echo "Successfully extracted pruned variants from bgen files"
echo "Successfully extracted pruned genotypes"
39 changes: 39 additions & 0 deletions 00b-genotype-organisation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/bash

# strict stop if there are any errors
set -e

# get environmental variables
source config.env

# create results directory
mkdir -p ${results_dir}/00

# log everything from this script to a logfile in the results directory
exec &> >(tee ${results_dir}/00/logfile)


echo "Get MAF and INFO scores"
Rscript resources/genotypes/rand.r ${genotype_processed_dir}/sample_inclusion.txt ${genotype_processed_dir}/scratch/phenrand.txt

samplefile=$(head -n 1 ${genotype_input_list} | awk '{print $2}')

./bin/gcta-1.94.1 \
--mbgen ${genotype_input_list} \
--sample ${samplefile} \
--pheno ${genotype_processed_dir}/scratch/phenrand.txt \
--fastGWA-lr \
--keep ${genotype_processed_dir}/sample_inclusion.txt \
--thread-num ${env_threads} \
--maf 0 \
--geno 1 \
--out ${genotype_processed_dir}/scratch/phenrand

Rscript resources/genotypes/organise_variants.r \
${genotype_processed_dir}/scratch/phenrand.fastGWA \
${results_dir}/00 \
${genotype_processed_dir}/variant_inclusion.txt \
${genotype_processed_dir}/build_mapping.txt


echo "Successfully summarised and filtered variants"
56 changes: 53 additions & 3 deletions 01-ancestry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@ exec &> >(tee ${results_dir}/01/logfile)
# - PCs for each ancestry
# - mbfile for fastGWA




containsElement () {
local e
for e in "${@:2}"; do [[ "$e" == "$1" ]] && return 0; done
Expand Down Expand Up @@ -226,4 +223,57 @@ then
fi


echo "Generate sparse GRM"

if [ "$arg" = "grm" ] || [ "$arg" = "all" ]
then

section_message "grm"

if [ "${env_family_data}" = "true" ]
then
bin/king \
-b ${genotype_processed_dir}/scratch/indep.bed \
--related \
--degree 3 \
--cpus ${env_threads} \
--prefix ${genotype_processed_dir}/scratch/king

awk '{ print $1, $3, $14 }' ${genotype_processed_dir}/scratch/king.kin0 | grep -v "4th" | sed 1d > ${genotype_processed_dir}/scratch/king.kin0.formatted

Rscript resources/genotypes/pedFAM.R \
${genotype_processed_dir}/scratch/indep.fam \
${genotype_processed_dir}/scratch/king.kin0.formatted \
${genotype_processed_dir}/sparsegrm
fi
fi

echo "Final keep lists"
# Unrelateds
# +kingunrelated.txt
# -genetic_outliers.txt

cat ${genotype_processed_dir}/kingunrelated.txt | \
grep -vw -f ${genotype_processed_dir}/genetic_outliers.txt > \
${genotype_processed_dir}/unrelated_keep.txt

nunrelated=$(cat ${genotype_processed_dir}/unrelated_keep.txt | wc -l)

echo "N Unrelated: ${nunrelated}"

# Relateds
# +fam file (everyone)
# -genetic_outliers.txt

if [ "${env_family_data}" = "true" ]
then
awk '{ print $1"\t"$2 }' ${genotype_processed_dir}/scratch/indep.fam | \
grep -vw -f ${genotype_processed_dir}/genetic_outliers.txt > \
${genotype_processed_dir}/related_keep.txt
fi

nrelated=$(cat ${genotype_processed_dir}/related_keep.txt | wc -l)

echo "N Related: ${nrelated}"

echo "Successfully generated PCs etc!"
3 changes: 0 additions & 3 deletions 02-phenotype-organisation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,4 @@ ls ${phenotype_processed_dir}/*.phen > ${phenotype_processed_dir}/phenolist
nphen=`cat ${phenotype_processed_dir}/phenolist | wc -l`
echo "Generated ${nphen} phenotype subsets"

echo "Formatting for regenie"
Rscript resources/phenotypes/regenie.r ${phenotype_processed_dir}/phenolist

echo "Successfully organised phenotypes!"
138 changes: 138 additions & 0 deletions 04-gwas.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/bin/bash

# strict stop if there are any errors
set -e

# get environmental variables
source config.env

# create results directory
mkdir -p ${results_dir}/04

# log everything from this script to a logfile in the results director
exec &> >(tee ${results_dir}/04/logfile_step1)

# Inputs:

# - sparse GRM - 01
# - PCs - 01
# - genotype data - 00
# - phenotypes - 02
# - covariates - 00

# Processes:

# - FastGWA per phen x age x ancestry

# Output:

# - GWAS summary stats per phen x age x ancestry
# - results/04/phen_<phencode>_<ancestry>_<age>.*

phenolist=( $(cat ${phenotype_processed_dir}/phenolist) )

# Allow specific analysis to be run
# Can take any number between 1:ngwas where ngwas is the number of rows in ${phenotype_processed_dir}/phenolist
index=$1
nphen=`cat ${phenotype_processed_dir}/phenolist | wc -l`

if [ -z $index ]
then
echo "Running all $nphen GWASs"
elif [ ! -z $index ]; then
re='^[0-9]+$'
if ! [[ $index =~ $re ]] ; then
# check if $index is in the phenolist array
if [[ " ${phenolist[@]} " =~ " ${index} " ]]; then
echo "Running GWAS for phenotype $index"
else
echo "error: Index is not a number or a valid phenotype"
echo "Usage: ${0} [index number]"
exit 1
fi
else
if [ "$index" -gt "$nphen" ] ; then
echo "error: Index is larger than number of phenotypes"
echo "Usage: ${0} [index number]"
exit 1
fi
echo "Running $index of $nphen GWASs"
fi
fi

echo $index

## TODO
# copy bim files over to results/04
# Do GWAS for each phenotype
i=1
samplefile=$(head -n 1 ${genotype_input_list} | awk '{print $2}')
for phen in ${phenolist[@]}
do
if [ -z $index ] || [[ "$index" == "$phen" ]] || [[ "$index" == "$i" ]] ; then
filename=$(basename -- ${phen})
filename="${filename%.*}"
echo $filename
covs=$(echo $phen | sed 's/.phen$/.covs/1')
echo $covs
echo "0" > ${phen}.flag
if [ "$env_family_data" == "true" ]
then
echo "family"
(
./bin/gcta-1.94.1 \
--mbgen ${genotype_input_list} \
--sample ${samplefile} \
--fastGWA-mlm \
--grm-sparse ${genotype_processed_dir}/sparsegrm \
--extract ${genotype_processed_dir}/variant_inclusion.txt \
--keep ${genotype_processed_dir}/sample_inclusion.txt \
--pheno ${phen} \
--qcovar ${covs} \
--thread-num ${env_threads} \
--maf 0 \
--geno 1 \
--out ${results_dir}/04/${filename}
) || ( echo "1" > ${phen}.flag )
flag=`cat ${phen}.flag`
echo $flag
if [ "$flag" -eq "1" ] ; then
echo "LMM failed. Trying linear model using unrelateds only"
./bin/gcta-1.94.1 \
--mbgen ${genotype_input_list} \
--sample ${samplefile} \
--fastGWA-mlm \
--grm-sparse ${genotype_processed_dir}/sparsegrm \
--extract ${genotype_processed_dir}/variant_inclusion.txt \
--keep ${genotype_processed_dir}/sample_inclusion.txt \
--pheno ${phen} \
--qcovar ${covs} \
--thread-num ${env_threads} \
--maf 0 \
--geno 1 \
--out ${results_dir}/04/${filename}
fi
else
echo "not family"
./bin/gcta-1.94.1 \
--mbgen ${genotype_input_list} \
--sample ${samplefile} \
--fastGWA-lr \
--pheno ${phen} \
--extract ${genotype_processed_dir}/variant_inclusion.txt \
--keep ${genotype_processed_dir}/sample_inclusion.txt \
--qcovar ${covs} \
--thread-num ${env_threads} \
--maf 0 \
--geno 1 \
--out ${results_dir}/04/${filename}
fi

# compress GWAS
# keep only b, se because all other info is constant across GWASs
echo "Compressing output..."
Rscript resources/genotypes/compress_gwas.r ${results_dir}/04/${filename}.fastGWA ${genotype_processed_dir}/build_mapping.txt
rm ${results_dir}/04/${filename}.fastGWA
fi
i=$((i+1))
done
53 changes: 0 additions & 53 deletions 04a-gwas.sh

This file was deleted.

Loading

0 comments on commit cbaf286

Please sign in to comment.