Skip to content

Commit

Permalink
1.10: Separate hmm_mincov params for domains and proteins
Browse files Browse the repository at this point in the history
  • Loading branch information
erikrikarddaniel committed Dec 16, 2021
1 parent effc5af commit 3f8502c
Show file tree
Hide file tree
Showing 6 changed files with 769 additions and 31 deletions.
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set version = "1.9.15" %}
{% set version = "1.10" %}

package:
name: pfitmap-db
Expand Down
47 changes: 26 additions & 21 deletions src/R-test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@ SQLITE_SELECT_CLASSIFICATION = sqlite3 $@.sqlite3 "select '--> accessions <--';
SQLITE_SELECT_GTDB_CLASSIFICATION = sqlite3 $@.sqlite3 "SELECT '--> accessions <--'; SELECT accno, genome_accno, db FROM accessions ORDER BY accno, genome_accno, db; SELECT '--> proteins <--'; SELECT accno, profile, score, evalue, tlen, qlen, hmmlen, alilen, envlen, hmm_FROM, hmm_to, ali_FROM, ali_to, env_FROM, env_to FROM proteins ORDER BY accno, profile; SELECT '--> hmm_profiles <--'; SELECT profile, psuperfamily, pfamily, pclass, psubclass, pgroup, prank, version, plen FROM hmm_profiles ORDER BY profile; SELECT '--> taxa <--'; SELECT genome_accno, ncbi_taxon_id, tdomain, tphylum, tclass, torder, tfamily, tgenus, tspecies, trank FROM taxa ORDER BY genome_accno; SELECT '--> dbsources <--'; SELECT source, name, version FROM dbsources; SELECT '--> domains <--'; SELECT accno, profile, score, evalue FROM domains ORDER BY accno, profile, score; SELECT '--> tblout <--'; SELECT * FROM tblout ORDER BY accno, profile; SELECT '--> domtblout <--'; SELECT * FROM domtblout ORDER BY accno, profile, i;" > $@.out
SQLITE_SELECT_CLASSIFY_SEQUENCES = sqlite3 $@.sqlite3 "SELECT '--> sequences <--'; SELECT accno, sequence FROM sequences ORDER BY accno;" >> $@.out
SQLITE_SELECT_FETCHSEQS = sqlite3 $@.sqlite3 "SELECT accno, sequence FROM sequences ORDER BY accno;" > $@.out
#SQLITE_SELECT_CLASSIFICATION = sqlite3 $@.sqlite3 "select accno, accto, taxon, db from accessions order by accno, accto, taxon, db; select accno, profile, score, evalue, tlen, qlen, hmmlen, alilen, envlen, hmm_from, hmm_to, ali_from, ali_to, env_from, env_to from proteins order by accno, profile; select accno, profile, score, evalue, tlen, qlen, hmmlen, alilen, envlen, hmm_from, hmm_to, ali_from, ali_to, env_from, env_to from dupfree_proteins order by accno, profile; select accno_from, accno_to from dupfree_accessions order by accno, accto; select profile, psuperfamily, pfamily, pclass, psubclass, pgroup, prank, version, plen from hmm_profiles order by profile; select ncbi_taxon_id, tdomain, tkingdom, tphylum, tclass, torder, tfamily, tgenus, tspecies, taxon, trank from taxa order by ncbi_taxon_id; SELECT source, name, version FROM dbsources; SELECT accno, profile, i, n, dom_c_evalue, dom_i_evalue, dom_score, hmm_from, hmm_to, ali_from, ali_to, env_from, env_to FROM domains ORDER BY accno, profile, i; SELECT * FROM tblout ORDER BY accno, profile; SELECT * FROM domtblout ORDER BY accno, profile, i;" > $@.out

DB2FEATHER_OUT = for f in $@.*.feather; do Rscript --default-packages=dplyr,feather -e "print(paste0('$$f', ': ', read_feather('$$f') %>% nrow(), ' rows'))"; done > $@.out
#DB2FEATHER_CONTENT = for f in $@.*.feather; do echo "--> $$f <--"; Rscript --default-packages=dplyr,feather -e "options(width = 1e4); print(arrange(read_feather('$$f')), n = 10000, width = Inf)"; done > $@.out
DB2FEATHER_CONTENT = for f in $@.*.feather; do echo "--> $$f <--"; Rscript --default-packages=dplyr,feather -e "options(width = 1e4); as.data.frame(arrange(read_feather('$$f')))"; done > $@.out

all: pf-classify.gtdb pf-classify pf-db2feather pf-fetchseqs
Expand All @@ -19,7 +17,7 @@ clean:

all.sqlite: pf-classify.02 pf-classify.03 pf-classify.04

pf-classify.gtdb: pf-classify.gtdb.00 pf-classify.gtdb.01 pf-classify.gtdb.02 pf-classify.gtdb.03 pf-classify.gtdb.04 pf-classify.gtdb.05 pf-classify.gtdb.06 pf-classify.gtdb.07 pf-classify.gtdb.08
pf-classify.gtdb: pf-classify.gtdb.00 pf-classify.gtdb.01 pf-classify.gtdb.02 pf-classify.gtdb.03 pf-classify.gtdb.04 pf-classify.gtdb.05 pf-classify.gtdb.06 pf-classify.gtdb.07 pf-classify.gtdb.08 pf-classify.gtdb.09

pf-classify.ncbi: pf-classify.00 pf-classify.01 pf-classify.02 pf-classify.03 pf-classify.04 pf-classify.05 pf-classify.06 pf-classify.07 pf-classify.08 pf-classify.09

Expand All @@ -38,85 +36,92 @@ Nrd.test.tar.gz:

pf-classify.gtdb.00:
@rm -f $@.sqlite3
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --singletable=$@.out --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --singletable=$@.out --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
@$(CHECK)

pf-classify.gtdb.01:
@rm -f $@.sqlite3
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_GTDB_CLASSIFICATION)
@$(CHECK)

pf-classify.gtdb.02:
@rm -f $@.sqlite3
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_GTDB_CLASSIFICATION)
@$(CHECK)

pf-classify.gtdb.03:
@rm -f $@.sqlite3
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_GTDB_CLASSIFICATION)
@$(SQLITE_SELECT_CLASSIFY_SEQUENCES)
@$(CHECK)

pf-classify.gtdb.04:
@rm -f $@.*.feather
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
@$(DB2FEATHER_CONTENT)
@$(CHECK)

pf-classify.gtdb.05:
@rm -f $@.*.feather
../R/pf-classify.r --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa.gz $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa.gz $@.d/*.tblout $@.d/*.domtblout
@$(DB2FEATHER_CONTENT)
@$(CHECK)

pf-classify.gtdb.06:
@rm -f $@.*.feather
../R/pf-classify.r --hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
@$(DB2FEATHER_CONTENT)
@$(CHECK)

# Feather file output with missing genomes; the file written via --missing is appended to $@.out
pf-classify.gtdb.07:
@rm -f $@.*.feather
../R/pf-classify.r --hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa --missing=$@.missing $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.8 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa --missing=$@.missing $@.d/*.tblout $@.d/*.domtblout
@$(DB2FEATHER_CONTENT)
@cat $@.missing >> $@.out
@$(CHECK)

# Test handling of short and otherwise problematic proteins
pf-classify.gtdb.08:
@rm -f $@.*.feather
../R/pf-classify.r --hmm_mincov=0.1 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.1 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv $@.d/*.tblout $@.d/*.domtblout
@$(DB2FEATHER_CONTENT)
@$(CHECK)

# Test the domain_hmm_mincov parameter (domain_hmm_mincov=0.9 with protein_hmm_mincov=0.0)
pf-classify.gtdb.09:
@rm -f $@.*.feather
../R/pf-classify.r --domain_hmm_mincov=0.9 --protein_hmm_mincov=0.0 --dbsource GTDB:GTDB:RS89 --profilehierarchies=$@.phier.tsv --featherprefix=$@ --gtdbmetadata=$@.d/gtdb_metadata.tsv --seqfaa=$@.d/genomes.faa $@.d/*.tblout $@.d/*.domtblout
@$(DB2FEATHER_CONTENT)
@$(CHECK)

pf-classify.00:
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(CHECK)

pf-classify.01:
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(CHECK)

pf-classify.02:
@rm -f $@.sqlite3
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_CLASSIFICATION)
@$(CHECK)

pf-classify.03:
@rm -f $@.sqlite3
../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_CLASSIFICATION)
@$(CHECK)

# Test the fuzzy_factor option to produce a taxon-reduced protein list
pf-classify.04:
@rm -f $@.sqlite3
../R/pf-classify.r --fuzzy_factor=30 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --fuzzy_factor=30 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_CLASSIFICATION)
@$(CHECK)

Expand All @@ -126,25 +131,25 @@ pf-classify.05:
@$(CHECK)

pf-classify.06:
( ../R/pf-classify.r --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'dbsource is required' > $@.out )
( ../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'dbsource is required' > $@.out )
@$(CHECK)

pf-classify.07:
@rm -f $@.sqlite3
../R/pf-classify.r --hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.95 --protein_hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_CLASSIFICATION)
@$(CHECK)

# Test case added because of suspected problems with the 4CON structure
pf-classify.08:
@rm -f $@.sqlite3
../R/pf-classify.r --hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
../R/pf-classify.r --domain_hmm_mincov=0.95 --protein_hmm_mincov=0.95 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --sqlitedb=$@.sqlite3 --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout
@$(SQLITE_SELECT_CLASSIFICATION)
@$(CHECK)

# Check that the script fails when the hmm_profiles table is not unique on profile
pf-classify.09:
( ../R/pf-classify.r --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'The profile column in the hmm_profiles table .* needs to be unique' > $@.out )
( ../R/pf-classify.r --domain_hmm_mincov=0.0 --protein_hmm_mincov=0.0 --dbsource NCBI:NR:20180109 --profilehierarchies=$@.phier.tsv --singletable=$@.out --taxflat=pf-classify.taxflat.tsv $@.d/*.tblout $@.d/*.domtblout 2>&1 | grep -o 'The profile column in the hmm_profiles table .* needs to be unique' > $@.out )
@$(CHECK)

pf-db2feather.00:
Expand Down
1 change: 1 addition & 0 deletions src/R-test/pf-classify.gtdb.09.d
Loading

0 comments on commit 3f8502c

Please sign in to comment.