Skip to content

Commit

Permalink
Incorporate PHROG db to improve the annotation of phage-borne genes o…
Browse files Browse the repository at this point in the history
…schwengers#43 (oschwengers#189)

* introduce PHROG PSC annotations oschwengers#43
* constrain PHROG annotation to hypotheticals oschwengers#43
* fix indentation in CLI output [skip-cli]
  • Loading branch information
oschwengers committed Feb 7, 2023
1 parent 032bdf1 commit 2a14a73
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 34 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -491,8 +491,9 @@ IPS & PSC have been comprehensively pre-annotated integrating annotations & data
- NCBI COG db (PSC: 3,424,142)
- KEGG Kofams (PSC: 17,787,347)
- SwissProt EC/GO terms (PSC: 336,030)
- NCBI AMRFinderPlus (IPS: 7,009)
- NCBI NCBIfams (PSC: 13,466,827)
- PHROG (PSC: 0)
- NCBI AMRFinderPlus (IPS: 7,009)
- ISFinder db (IPS: 53,341, PSC: 11,412)
- Pfam families (PSC: 3,917,555)
Expand Down Expand Up @@ -737,6 +738,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof
- RefSeq: <https://doi.org/10.1093/nar/gkx1068>
- COG: <https://doi.org/10.1093/bib/bbx117>
- KEGG: <https://doi.org/10.1093/bioinformatics/btz859>
- PHROG: <https://doi.org/10.1093/nargab/lqab067>
- AMRFinder: <https://doi.org/10.1128/AAC.00483-19>
- ISFinder: <https://doi.org/10.1093/nar/gkj014>
- Pfam: <https://doi.org/10.1093/nar/gky995>
Expand Down
6 changes: 3 additions & 3 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,9 +490,9 @@ def main():
print(f"\tCRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}")
cdss = [f for f in features if f['type'] == bc.FEATURE_CDS]
print(f"\tCDSs: {len(cdss)}")
print(f"\t hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
print(f"\t pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}")
print(f"\t signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
print(f"\t\thypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
print(f"\t\tpseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}")
print(f"\t\tsignal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
print(f"\tsORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}")
print(f"\tgaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}")
print(f"\toriCs/oriVs: {len([f for f in features if (f['type'] == bc.FEATURE_ORIC or f['type'] == bc.FEATURE_ORIV)])}")
Expand Down
74 changes: 74 additions & 0 deletions db-scripts/annotate-phrogs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import logging
import sqlite3

from pathlib import Path

from alive_progress import alive_bar

parser = argparse.ArgumentParser(
description='Annotate PSCs by PHROG protein sequences.'
)
parser.add_argument('--db', action='store', help='Path to Bakta db file.')
parser.add_argument('--annotation', action='store', help='Path to PHROG annotation file.')
parser.add_argument('--psc-alignments', action='store', dest='psc_alignments', help='Path to PSC diamond alignment file.')
args = parser.parse_args()


db_path = Path(args.db).resolve()
annotation_path = Path(args.annotation).resolve()
psc_alignments_path = Path(args.psc_alignments).resolve()


logging.basicConfig(
filename='bakta.db.log',
filemode='a',
format='%(name)s - PHROG - %(levelname)s - %(message)s',
level=logging.DEBUG
)
log = logging.getLogger('PSC')


phrogs = {}
with annotation_path.open() as fh_in:
for line in fh_in:
if not line.startswith('phrog'):
(id, color, product, category) = line.strip().split('\t')
if product != 'NA':
phrogs[f'phrog_{id}'] = product


print('parse PSC alignments and add PHROG annotations...')
psc_processed = 0
psc_updated = 0
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn:
conn.execute('PRAGMA page_size = 4096;')
conn.execute('PRAGMA cache_size = 100000;')
conn.execute('PRAGMA locking_mode = EXCLUSIVE;')
conn.execute(f'PRAGMA mmap_size = {20 * 1024 * 1024 * 1024};')
conn.execute('PRAGMA synchronous = OFF;')
conn.execute('PRAGMA journal_mode = OFF')
conn.execute('PRAGMA threads = 2;')
conn.commit()

with psc_alignments_path.open() as fh, alive_bar() as bar:
for line in fh:
(uniref90_id, sseqid, stitle, length, pident, qlen, slen, evalue) = line.strip().split('\t')
length = int(length)
qcov = length / int(qlen)
scov = length / int(slen)
if(qcov >= 0.80 and scov >= 0.80 and float(pident) >= 90. and float(evalue) < 1e-6):
if sseqid in phrogs:
product = phrogs[sseqid]
conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (product, uniref90_id))
log.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', product, uniref90_id)
psc_updated += 1
psc_processed += 1
if((psc_processed % 10000) == 0):
conn.commit()
bar()
conn.commit()
print(f'PSCs processed: {psc_processed}')
print(f'PSCs annotated by PHROGs: {psc_updated}')
log.debug('summary: PSC annotated=%i', psc_updated)

80 changes: 50 additions & 30 deletions db-scripts/buid-db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ cd db
printf "Create Bakta database\n"

# download rRNA covariance models from Rfam
printf "\n1/18: download rRNA covariance models from Rfam ...\n"
printf "\n1/19: download rRNA covariance models from Rfam ...\n"
wget https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz
pigz -d Rfam.cm.gz
cmfetch Rfam.cm RF00001 > rRNA
Expand All @@ -19,7 +19,7 @@ rm rRNA


# download and extract ncRNA gene covariance models from Rfam
printf "\n2/18: download ncRNA gene covariance models from Rfam ...\n"
printf "\n2/19: download ncRNA gene covariance models from Rfam ...\n"
mysql --user rfamro --host mysql-rfam-public.ebi.ac.uk --port 4497 --database Rfam < ${BAKTA_DB_SCRIPTS}/ncRNA-genes.sql | tail -n +2 > rfam-genes.raw.txt
grep "antitoxin;" rfam-genes.raw.txt >> rfam-genes.txt
grep "antisense;" rfam-genes.raw.txt >> rfam-genes.txt
Expand All @@ -35,7 +35,7 @@ rm rfam-genes.raw.txt rfam-genes.txt ncRNA-genes.blocklist ncRNA-genes rfam2go


# download and extract ncRNA regions (cis reg elements) covariance models from Rfam
printf "\n3/18: download ncRNA region covariance models from Rfam ...\n"
printf "\n3/19: download ncRNA region covariance models from Rfam ...\n"
mysql --user rfamro --host mysql-rfam-public.ebi.ac.uk --port 4497 --database Rfam < ${BAKTA_DB_SCRIPTS}/ncRNA-regions.sql | tail -n +2 > rfam-regions.raw.txt
grep "riboswitch;" rfam-regions.raw.txt >> rfam-regions.txt
grep "thermoregulator;" rfam-regions.raw.txt >> rfam-regions.txt
Expand All @@ -49,7 +49,7 @@ rm rfam-regions.raw.txt rfam-regions.txt ncRNA-regions.blocklist ncRNA-regions R


# download and extract spurious ORF HMMs from AntiFam
printf "\n4/18: download and extract spurious ORF HMMs from AntiFam ...\n"
printf "\n4/19: download and extract spurious ORF HMMs from AntiFam ...\n"
mkdir antifam-dir
cd antifam-dir
wget https://ftp.ebi.ac.uk/pub/databases/Pfam/AntiFam/current/Antifam.tar.gz
Expand All @@ -61,12 +61,12 @@ rm -r antifam antifam-dir/


# download & extract oriT sequences
printf "\n5/18: download and extract oriT sequences from Mob-suite ...\n"
printf "\n5/19: download and extract oriT sequences from Mob-suite ...\n"
wget https://zenodo.org/record/3786915/files/data.tar.gz
tar -xvzf data.tar.gz
mv data/orit.fas ./orit.fna
rm -r data/ data.tar.gz
printf "\n5/18: download and extract oriC sequences from DoriC ...\n"
printf "\n5/19: download and extract oriC sequences from DoriC ...\n"
wget http://tubic.org/doric/public/static/download/doric10.rar
unrar e doric10.rar
python3 ${BAKTA_DB_SCRIPTS}/extract-ori.py --doric tubic_bacteria.csv --fasta oric.chromosome.fna
Expand All @@ -77,7 +77,7 @@ rm doric10.rar tubic* oric.plasmid.fna


# download NCBI Taxonomy DB
printf "\n6/18: download NCBI Taxonomy DB ...\n"
printf "\n6/19: download NCBI Taxonomy DB ...\n"
mkdir taxonomy
cd taxonomy
wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
Expand All @@ -90,7 +90,7 @@ rm -rf taxonomy
############################################################################
# Setup SQLite Bakta db
############################################################################
printf "\n7/18: setup SQLite Bakta db ...\n"
printf "\n7/19: setup SQLite Bakta db ...\n"
python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db


Expand All @@ -100,13 +100,13 @@ python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db
# - read and transform UniRef90 XML file to DB and Fasta file
# - build PSC Diamond db
############################################################################
printf "\n8/18: download UniProt UniRef90 ...\n"
printf "\n8/19: download UniProt UniRef90 ...\n"
wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref90/uniref90.xml.gz
wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.xml.gz
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniparc/uniparc_active.fasta.gz
printf "\n8/18: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
printf "\n8/19: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --psc psc.faa --sorf sorf.faa
printf "\n8/18: build PSC Diamond db ...\n"
printf "\n8/19: build PSC Diamond db ...\n"
diamond makedb --in psc.faa --db psc
diamond makedb --in sorf.faa --db sorf
rm uniref90.xml.gz
Expand All @@ -117,9 +117,9 @@ rm uniref90.xml.gz
# - download UniProt UniRef100
# - read, filter and transform UniRef100 entries and store to ips.db
############################################################################
printf "\n9/18: download UniProt UniRef100 ...\n"
printf "\n9/19: download UniProt UniRef100 ...\n"
wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref100/uniref100.xml.gz
printf "\n9/18: read, filter and store UniRef100 entries ...:\n"
printf "\n9/19: read, filter and store UniRef100 entries ...:\n"
python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --ips ips.faa
rm uniref100.xml.gz uniparc_active.fasta.gz

Expand All @@ -130,7 +130,7 @@ rm uniref100.xml.gz uniparc_active.fasta.gz
# - align UniRef90 proteins to COG protein sequences
# - annotate PSCs with COG info
############################################################################
printf "\n10/18: download COG db ...\n"
printf "\n10/19: download COG db ...\n"
wget https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/cog-20.def.tab # COG IDs and functional class
wget https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/cog-20.cog.csv # Mapping GenBank IDs -> COG IDs
for i in $(seq -f "%04g" 1 5950)
Expand All @@ -139,7 +139,7 @@ do
pigz -dc COG${i}.fa.gz | seqtk seq -CU >> cog.faa
rm COG${i}.fa.gz
done
printf "\n10/18: annotate PSCs ...\n"
printf "\n10/19: annotate PSCs ...\n"
diamond makedb --in cog.faa --db cog.dmnd
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db cog.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.cog.psc.tsv
python3 ${BAKTA_DB_SCRIPTS}/annotate-cog.py --db bakta.db --alignments diamond.cog.psc.tsv --cog-ids cog-20.def.tab --gi-cog-mapping cog-20.cog.csv
Expand All @@ -154,7 +154,7 @@ rm cognames2003-2015.tab cog2003-2015.csv prot2003-2015.fa.gz diamond.cog.tsv co
# - select eligible HMMs (f measure>0.77)
# - annotate PSCs
############################################################################
printf "\n11/18: download KEGG kofams HMM models...\n"
printf "\n11/19: download KEGG kofams HMM models...\n"
wget https://www.genome.jp/ftp/db/kofam/ko_list.gz
wget https://www.genome.jp/ftp/db/kofam/profiles.tar.gz
zcat ko_list.gz | grep full | awk '{ if($5>=0.77) print $0}' > hmms.selected.tsv
Expand All @@ -163,7 +163,7 @@ tar -I pigz -xf profiles.tar.gz
for kofam in `cat profiles/prokaryote.hal`; do cat profiles/$kofam >> kofam-prok; done
hmmfetch -f -o kofams kofam-prok hmms.ids.txt
hmmpress kofams
printf "\n11/18: annotate PSCs...\n"
printf "\n11/19: annotate PSCs...\n"
mkdir -p work/tblout work/domtblout
nextflow run ${BAKTA_DB_SCRIPTS}/hmmsearch.nf --in psc.faa --db kofams --no_tc --out hmmsearch.kofam.tblout
python3 ${BAKTA_DB_SCRIPTS}/annotate-kofams.py --db bakta.db --hmms hmms.selected.tsv --hmm-results hmmsearch.kofam.tblout
Expand All @@ -176,15 +176,15 @@ rm -rf profiles ko_list.gz kofam* hmmsearch.kofam.* hmms*
# - annotate UPSs with NCBI nrp IDs (WP_*)
# - annotate IPSs/PSCs with NCBI gene names (WP_* -> hash -> UniRef100 -> UniRef90 -> PSC)
############################################################################
printf "\n12/18: download RefSeq nonredundant proteins and clusters ...\n"
printf "\n12/19: download RefSeq nonredundant proteins and clusters ...\n"
wget https://ftp.ncbi.nlm.nih.gov/genomes/CLUSTERS/PCLA_proteins.txt
wget https://ftp.ncbi.nlm.nih.gov/genomes/CLUSTERS/PCLA_clusters.txt
for i in {1..1573}; do
wget https://ftp.ncbi.nlm.nih.gov/refseq/release/bacteria/bacteria.nonredundant_protein.${i}.protein.faa.gz
pigz -dc bacteria.nonredundant_protein.${i}.protein.faa.gz | seqtk seq -CU >> refseq-bacteria-nrp.trimmed.faa
rm bacteria.nonredundant_protein.${i}.protein.faa.gz
done
printf "\n12/18: annotate IPSs and PSCs ...\n"
printf "\n12/19: annotate IPSs and PSCs ...\n"
python3 ${BAKTA_DB_SCRIPTS}/annotate-ncbi-nrp.py --db bakta.db --nrp refseq-bacteria-nrp.trimmed.faa --pcla-proteins PCLA_proteins.txt --pcla-clusters PCLA_clusters.txt
rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt

Expand All @@ -195,9 +195,9 @@ rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt
# - annotate PSCs if IPS have PSC UniRef90 identifier (seq -> hash -> UPS -> IPS -> PSC)
# - annotate IPSs if IPS have no PSC UniRef90 identifier (seq -> hash -> UPS -> IPS)
############################################################################
printf "\n13/18: download UniProt/SwissProt ...\n"
printf "\n13/19: download UniProt/SwissProt ...\n"
wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
printf "\n13/18: annotate IPSs and PSCs ...\n"
printf "\n13/19: annotate IPSs and PSCs ...\n"
python3 ${BAKTA_DB_SCRIPTS}/annotate-swissprot.py --taxonomy nodes.dmp --xml uniprot_sprot.xml.gz --db bakta.db
rm uniprot_sprot.xml.gz

Expand All @@ -207,7 +207,7 @@ rm uniprot_sprot.xml.gz
# - download NCBIfams HMM models
# - annotate PSCs
############################################################################
printf "\n14/18: download NCBIfams HMM models...\n"
printf "\n14/19: download NCBIfams HMM models...\n"
wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.LIB
wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv
grep -v "(Provisional)" hmm_PGAP.tsv > hmms.non-prov.tsv
Expand All @@ -216,21 +216,41 @@ grep equivalog hmms.non-prov.tsv >> hmms.selected.tsv
cut -f1 hmms.selected.tsv > hmms.ids.txt
hmmfetch -f -o ncbifams hmm_PGAP.LIB hmms.ids.txt
hmmpress ncbifams
printf "\n14/18: annotate PSCs...\n"
printf "\n14/19: annotate PSCs...\n"
mkdir -p work/tblout work/domtblout
nextflow run ${BAKTA_DB_SCRIPTS}/hmmsearch.nf --in psc.faa --db ncbifams --out hmmsearch.ncbifams.tblout
python3 ${BAKTA_DB_SCRIPTS}/annotate-ncbi-fams.py --db bakta.db --hmms hmms.selected.tsv --hmm-results hmmsearch.ncbifams.tblout
rm ncbifams* hmms.* hmm_PGAP.* hmmsearch.ncbifams.tblout


############################################################################
# Integrate PHROG DB of phage orthologous
# - download PHROG protein sequences
# - filter unannotated PHROGs
# - annotate PSCs
############################################################################
printf "\n15/19: download PHROGs ...\n"
wget https://phrogs.lmge.uca.fr/downloads_from_website/FAA_phrog.tar.gz
wget https://phrogs.lmge.uca.fr/downloads_from_website/phrog_annot_v4.tsv
tar -xzf FAA_phrog.tar.gz
cat FAA_phrog/*.faa >> phrogs-raw.faa
python3 ${BAKTA_DB_SCRIPTS}/extract-phrogs.py --annotation phrog_annot_v4.tsv --proteins phrogs-raw.faa --filtered-proteins phrogs.faa
diamond makedb --in phrogs.faa --db phrog
printf "\n15/19: annotate PSCs...\n"
python3 ${BAKTA_DB_SCRIPTS}/extract-hypotheticals.py --psc psc.faa --db bakta.db --hypotheticals hypotheticals.faa
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in hypotheticals.faa --db phrog.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.phrog.psc.tsv
python3 ${BAKTA_DB_SCRIPTS}/annotate-phrogs.py --db bakta.db --psc-alignments diamond.phrog.psc.tsv
rm -r FAA_phrog.tar.gz phrog_annot_v4.tsv FAA_phrog phrogs-raw.faa phrogs.faa phrog.dmnd hypotheticals.faa


############################################################################
# Integrate NCBI Pathogen AMR db
# - download AMR gene WP_* annotations from NCBI Pathogen ReferenceGeneCatalog
# - annotate IPSs with AMR info
############################################################################
printf "\n15/18: download AMR gene WP_* annotations from NCBI Pathogen AMR db ...\n"
printf "\n16/19: download AMR gene WP_* annotations from NCBI Pathogen AMR db ...\n"
wget https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneCatalog.txt
printf "\n15/18: annotate PSCs...\n"
printf "\n16/19: annotate PSCs...\n"
python3 ${BAKTA_DB_SCRIPTS}/annotate-ncbi-amr.py --db bakta.db --genes ReferenceGeneCatalog.txt
rm ReferenceGeneCatalog.txt

Expand All @@ -241,10 +261,10 @@ rm ReferenceGeneCatalog.txt
# - extract IS transposase sequences and mark ORF A/B transposases
# - annotate IPSs/PCSs with IS info
############################################################################
printf "\n16/18: download & extract ISfinder protein sequences ...\n"
printf "\n17/19: download ISfinder protein sequences ...\n"
wget https://github.com/oschwengers/ISfinder-sequences/raw/2e9162bd5e3448c86ec1549a55315e498bef72fc/IS.faa
python3 ${BAKTA_DB_SCRIPTS}/extract-is.py --input IS.faa --output is.transposase.faa
printf "\n16/18: annotate IPSs/PCSs ...\n"
printf "\n17/19: annotate IPSs ...\n"
grep -A 1 ~~~Transposase~~~ IS.faa | tr -d - | tr -s "\n" > is.transposase.faa
diamond makedb --in is.transposase.faa --db is
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 95 --qcov 90 --scov 90 --out diamond.is.ips.tsv
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db is.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.is.psc.tsv
Expand All @@ -259,7 +279,7 @@ rm is.transposase.faa is.dmnd diamond.is.ips.tsv diamond.is.psc.tsv
# - compress HMM models
# - annotate hypothetical PSC via Pfam families
############################################################################
printf "\n17/18: download HMM models from Pfam ...\n"
printf "\n18/19: download HMM models from Pfam ...\n"
wget https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz
python3 ${BAKTA_DB_SCRIPTS}/extract-pfam.py --pfam Pfam-A.hmm.dat.gz --family pfam.families.tsv --non-family pfam.non-families.tsv
wget https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz
Expand All @@ -281,7 +301,7 @@ rm pfam-families* pfam *.tsv Pfam* hmmsearch.pfam-families.tblout
# - import NCBI BlastRules models
# - import VFDB sequences
############################################################################
printf "\n18/18: download AA sequences for expert annotation system ...\n"
printf "\n19/19: download AA sequences for expert annotation system ...\n"
wget https://ftp.ncbi.nlm.nih.gov/pub/blastrules/4.2.2.tgz
tar -xzf 4.2.2.tgz
gunzip VFDB_setA_pro.fas.gz
Expand Down
Loading

0 comments on commit 2a14a73

Please sign in to comment.