Incorporate PHROG db to improve the annotation of phage-borne genes o…

…schwengers#43 (oschwengers#189) * introduce PHROG PSC annotations oschwengers#43 * constrain PHROG annotation to hypotheticals oschwengers#43 * fix indentation in CLI output [skip-cli]
JeanMainguy · Feb 7, 2023 · 2a14a73 · 2a14a73
1 parent 032bdf1
commit 2a14a73
Show file tree

Hide file tree

Showing 5 changed files with 177 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -491,8 +491,9 @@ IPS & PSC have been comprehensively pre-annotated integrating annotations & data
 - NCBI COG db (PSC: 3,424,142)
 - KEGG Kofams (PSC: 17,787,347)
 - SwissProt EC/GO terms (PSC: 336,030)
-- NCBI AMRFinderPlus (IPS: 7,009)
 - NCBI NCBIfams (PSC: 13,466,827)
+- PHROG (PSC: 0)
+- NCBI AMRFinderPlus (IPS: 7,009)
 - ISFinder db (IPS: 53,341, PSC: 11,412)
 - Pfam families (PSC: 3,917,555)
 
@@ -737,6 +738,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof
 - RefSeq: <https://doi.org/10.1093/nar/gkx1068>
 - COG: <https://doi.org/10.1093/bib/bbx117>
 - KEGG: <https://doi.org/10.1093/bioinformatics/btz859>
+- PHROG: <https://doi.org/10.1093/nargab/lqab067>
 - AMRFinder: <https://doi.org/10.1128/AAC.00483-19>
 - ISFinder: <https://doi.org/10.1093/nar/gkj014>
 - Pfam: <https://doi.org/10.1093/nar/gky995>

diff --git a/bakta/main.py b/bakta/main.py
@@ -490,9 +490,9 @@ def main():
     print(f"\tCRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}")
     cdss = [f for f in features if f['type'] == bc.FEATURE_CDS]
     print(f"\tCDSs: {len(cdss)}")
-    print(f"\t  hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
-    print(f"\t  pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}")
-    print(f"\t  signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
+    print(f"\t\thypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
+    print(f"\t\tpseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}")
+    print(f"\t\tsignal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
     print(f"\tsORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}")
     print(f"\tgaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}")
     print(f"\toriCs/oriVs: {len([f for f in features if (f['type'] == bc.FEATURE_ORIC or f['type'] == bc.FEATURE_ORIV)])}")

diff --git a/db-scripts/annotate-phrogs.py b/db-scripts/annotate-phrogs.py
@@ -0,0 +1,74 @@
+import argparse
+import logging
+import sqlite3
+
+from pathlib import Path
+
+from alive_progress import alive_bar
+
+parser = argparse.ArgumentParser(
+    description='Annotate PSCs by PHROG protein sequences.'
+)
+parser.add_argument('--db', action='store', help='Path to Bakta db file.')
+parser.add_argument('--annotation', action='store', help='Path to PHROG annotation file.')
+parser.add_argument('--psc-alignments', action='store', dest='psc_alignments', help='Path to PSC diamond alignment file.')
+args = parser.parse_args()
+
+
+db_path = Path(args.db).resolve()
+annotation_path = Path(args.annotation).resolve()
+psc_alignments_path = Path(args.psc_alignments).resolve()
+
+
+logging.basicConfig(
+    filename='bakta.db.log',
+    filemode='a',
+    format='%(name)s - PHROG - %(levelname)s - %(message)s',
+    level=logging.DEBUG
+)
+log = logging.getLogger('PSC')
+
+
+phrogs = {}
+with annotation_path.open() as fh_in:
+    for line in fh_in:
+        if not line.startswith('phrog'):
+            (id, color, product, category) = line.strip().split('\t')
+            if product != 'NA':
+                phrogs[f'phrog_{id}'] = product
+
+
+print('parse PSC alignments and add PHROG annotations...')
+psc_processed = 0
+psc_updated = 0
+with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn:
+    conn.execute('PRAGMA page_size = 4096;')
+    conn.execute('PRAGMA cache_size = 100000;')
+    conn.execute('PRAGMA locking_mode = EXCLUSIVE;')
+    conn.execute(f'PRAGMA mmap_size = {20 * 1024 * 1024 * 1024};')
+    conn.execute('PRAGMA synchronous = OFF;')
+    conn.execute('PRAGMA journal_mode = OFF')
+    conn.execute('PRAGMA threads = 2;')
+    conn.commit()
+
+    with psc_alignments_path.open() as fh, alive_bar() as bar:
+        for line in fh:
+            (uniref90_id, sseqid, stitle, length, pident, qlen, slen, evalue) = line.strip().split('\t')
+            length = int(length)
+            qcov = length / int(qlen)
+            scov = length / int(slen)
+            if(qcov >= 0.80 and scov >= 0.80 and float(pident) >= 90. and float(evalue) < 1e-6):
+                if sseqid in phrogs:
+                    product = phrogs[sseqid]
+                    conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (product, uniref90_id))
+                    log.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', product, uniref90_id)
+                    psc_updated += 1
+            psc_processed += 1
+            if((psc_processed % 10000) == 0):
+                conn.commit()
+            bar()
+        conn.commit()
+    print(f'PSCs processed: {psc_processed}')
+    print(f'PSCs annotated by PHROGs: {psc_updated}')
+    log.debug('summary: PSC annotated=%i', psc_updated)
+
diff --git a/db-scripts/buid-db.sh b/db-scripts/buid-db.sh
@@ -8,7 +8,7 @@ cd db
 printf "Create Bakta database\n"
 
 # download rRNA covariance models from Rfam
-printf "\n1/18: download rRNA covariance models from Rfam ...\n"
+printf "\n1/19: download rRNA covariance models from Rfam ...\n"
 wget https://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz
 pigz -d Rfam.cm.gz
 cmfetch Rfam.cm RF00001 >  rRNA
@@ -19,7 +19,7 @@ rm rRNA
 
 
 # download and extract ncRNA gene covariance models from Rfam
-printf "\n2/18: download ncRNA gene covariance models from Rfam ...\n"
+printf "\n2/19: download ncRNA gene covariance models from Rfam ...\n"
 mysql --user rfamro --host mysql-rfam-public.ebi.ac.uk --port 4497 --database Rfam < ${BAKTA_DB_SCRIPTS}/ncRNA-genes.sql | tail -n +2 > rfam-genes.raw.txt
 grep "antitoxin;" rfam-genes.raw.txt >> rfam-genes.txt
 grep "antisense;" rfam-genes.raw.txt >> rfam-genes.txt
@@ -35,7 +35,7 @@ rm rfam-genes.raw.txt rfam-genes.txt ncRNA-genes.blocklist ncRNA-genes rfam2go
 
 
 # download and extract ncRNA regions (cis reg elements) covariance models from Rfam
-printf "\n3/18: download ncRNA region covariance models from Rfam ...\n"
+printf "\n3/19: download ncRNA region covariance models from Rfam ...\n"
 mysql --user rfamro --host mysql-rfam-public.ebi.ac.uk --port 4497 --database Rfam < ${BAKTA_DB_SCRIPTS}/ncRNA-regions.sql | tail -n +2 > rfam-regions.raw.txt
 grep "riboswitch;" rfam-regions.raw.txt >> rfam-regions.txt
 grep "thermoregulator;" rfam-regions.raw.txt >> rfam-regions.txt
@@ -49,7 +49,7 @@ rm rfam-regions.raw.txt rfam-regions.txt ncRNA-regions.blocklist ncRNA-regions R
 
 
 # download and extract spurious ORF HMMs from AntiFam
-printf "\n4/18: download and extract spurious ORF HMMs from AntiFam ...\n"
+printf "\n4/19: download and extract spurious ORF HMMs from AntiFam ...\n"
 mkdir antifam-dir
 cd antifam-dir
 wget https://ftp.ebi.ac.uk/pub/databases/Pfam/AntiFam/current/Antifam.tar.gz
@@ -61,12 +61,12 @@ rm -r antifam antifam-dir/
 
 
 # download & extract oriT sequences
-printf "\n5/18: download and extract oriT sequences from Mob-suite ...\n"
+printf "\n5/19: download and extract oriT sequences from Mob-suite ...\n"
 wget https://zenodo.org/record/3786915/files/data.tar.gz
 tar -xvzf data.tar.gz
 mv data/orit.fas ./orit.fna
 rm -r data/ data.tar.gz
-printf "\n5/18: download and extract oriC sequences from DoriC ...\n"
+printf "\n5/19: download and extract oriC sequences from DoriC ...\n"
 wget http://tubic.org/doric/public/static/download/doric10.rar
 unrar e doric10.rar
 python3 ${BAKTA_DB_SCRIPTS}/extract-ori.py --doric tubic_bacteria.csv --fasta oric.chromosome.fna
@@ -77,7 +77,7 @@ rm doric10.rar tubic* oric.plasmid.fna
 
 
 # download NCBI Taxonomy DB
-printf "\n6/18: download NCBI Taxonomy DB ...\n"
+printf "\n6/19: download NCBI Taxonomy DB ...\n"
 mkdir taxonomy
 cd taxonomy
 wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
@@ -90,7 +90,7 @@ rm -rf taxonomy
 ############################################################################
 # Setup SQLite Bakta db
 ############################################################################
-printf "\n7/18: setup SQLite Bakta db ...\n"
+printf "\n7/19: setup SQLite Bakta db ...\n"
 python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db
 
 
@@ -100,13 +100,13 @@ python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db
 # - read and transform UniRef90 XML file to DB and Fasta file
 # - build PSC Diamond db
 ############################################################################
-printf "\n8/18: download UniProt UniRef90 ...\n"
+printf "\n8/19: download UniProt UniRef90 ...\n"
 wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref90/uniref90.xml.gz
 wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.xml.gz
 wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniparc/uniparc_active.fasta.gz
-printf "\n8/18: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
+printf "\n8/19: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
 python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --psc psc.faa --sorf sorf.faa
-printf "\n8/18: build PSC Diamond db ...\n"
+printf "\n8/19: build PSC Diamond db ...\n"
 diamond makedb --in psc.faa --db psc
 diamond makedb --in sorf.faa --db sorf
 rm uniref90.xml.gz
@@ -117,9 +117,9 @@ rm uniref90.xml.gz
 # - download UniProt UniRef100
 # - read, filter and transform UniRef100 entries and store to ips.db
 ############################################################################
-printf "\n9/18: download UniProt UniRef100 ...\n"
+printf "\n9/19: download UniProt UniRef100 ...\n"
 wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref100/uniref100.xml.gz
-printf "\n9/18: read, filter and store UniRef100 entries ...:\n"
+printf "\n9/19: read, filter and store UniRef100 entries ...:\n"
 python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --ips ips.faa
 rm uniref100.xml.gz uniparc_active.fasta.gz
 
@@ -130,7 +130,7 @@ rm uniref100.xml.gz uniparc_active.fasta.gz
 # - align UniRef90 proteins to COG protein sequences
 # - annotate PSCs with COG info
 ############################################################################
-printf "\n10/18: download COG db ...\n"
+printf "\n10/19: download COG db ...\n"
 wget https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/cog-20.def.tab  # COG IDs and functional class
 wget https://ftp.ncbi.nih.gov/pub/COG/COG2020/data/cog-20.cog.csv # Mapping GenBank IDs -> COG IDs
 for i in $(seq -f "%04g" 1 5950)
@@ -139,7 +139,7 @@ do
     pigz -dc COG${i}.fa.gz | seqtk seq -CU >> cog.faa
     rm COG${i}.fa.gz
 done
-printf "\n10/18: annotate PSCs ...\n"
+printf "\n10/19: annotate PSCs ...\n"
 diamond makedb --in cog.faa --db cog.dmnd
 nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db cog.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.cog.psc.tsv
 python3 ${BAKTA_DB_SCRIPTS}/annotate-cog.py --db bakta.db --alignments diamond.cog.psc.tsv --cog-ids cog-20.def.tab --gi-cog-mapping cog-20.cog.csv
@@ -154,7 +154,7 @@ rm cognames2003-2015.tab cog2003-2015.csv prot2003-2015.fa.gz diamond.cog.tsv co
 # - select eligible HMMs (f measure>0.77)
 # - annotate PSCs
 ############################################################################
-printf "\n11/18: download KEGG kofams HMM models...\n"
+printf "\n11/19: download KEGG kofams HMM models...\n"
 wget https://www.genome.jp/ftp/db/kofam/ko_list.gz
 wget https://www.genome.jp/ftp/db/kofam/profiles.tar.gz
 zcat ko_list.gz | grep full | awk '{ if($5>=0.77) print $0}' > hmms.selected.tsv
@@ -163,7 +163,7 @@ tar -I pigz -xf profiles.tar.gz
 for kofam in `cat profiles/prokaryote.hal`; do cat profiles/$kofam >> kofam-prok; done
 hmmfetch -f -o kofams kofam-prok hmms.ids.txt
 hmmpress kofams
-printf "\n11/18: annotate PSCs...\n"
+printf "\n11/19: annotate PSCs...\n"
 mkdir -p work/tblout work/domtblout
 nextflow run ${BAKTA_DB_SCRIPTS}/hmmsearch.nf --in psc.faa --db kofams --no_tc --out hmmsearch.kofam.tblout
 python3 ${BAKTA_DB_SCRIPTS}/annotate-kofams.py --db bakta.db --hmms hmms.selected.tsv --hmm-results hmmsearch.kofam.tblout
@@ -176,15 +176,15 @@ rm -rf profiles ko_list.gz kofam* hmmsearch.kofam.* hmms*
 # - annotate UPSs with NCBI nrp IDs (WP_*)
 # - annotate IPSs/PSCs with NCBI gene names (WP_* -> hash -> UniRef100 -> UniRef90 -> PSC)
 ############################################################################
-printf "\n12/18: download RefSeq nonredundant proteins and clusters ...\n"
+printf "\n12/19: download RefSeq nonredundant proteins and clusters ...\n"
 wget https://ftp.ncbi.nlm.nih.gov/genomes/CLUSTERS/PCLA_proteins.txt
 wget https://ftp.ncbi.nlm.nih.gov/genomes/CLUSTERS/PCLA_clusters.txt
 for i in {1..1573}; do
     wget https://ftp.ncbi.nlm.nih.gov/refseq/release/bacteria/bacteria.nonredundant_protein.${i}.protein.faa.gz
     pigz -dc bacteria.nonredundant_protein.${i}.protein.faa.gz | seqtk seq -CU >> refseq-bacteria-nrp.trimmed.faa
     rm bacteria.nonredundant_protein.${i}.protein.faa.gz
 done
-printf "\n12/18: annotate IPSs and PSCs ...\n"
+printf "\n12/19: annotate IPSs and PSCs ...\n"
 python3 ${BAKTA_DB_SCRIPTS}/annotate-ncbi-nrp.py --db bakta.db --nrp refseq-bacteria-nrp.trimmed.faa --pcla-proteins PCLA_proteins.txt --pcla-clusters PCLA_clusters.txt
 rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt
 
@@ -195,9 +195,9 @@ rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt
 # - annotate PSCs if IPS have PSC UniRef90 identifier (seq -> hash -> UPS -> IPS -> PSC)
 # - annotate IPSs if IPS have no PSC UniRef90 identifier (seq -> hash -> UPS -> IPS)
 ############################################################################
-printf "\n13/18: download UniProt/SwissProt ...\n"
+printf "\n13/19: download UniProt/SwissProt ...\n"
 wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
-printf "\n13/18: annotate IPSs and PSCs ...\n"
+printf "\n13/19: annotate IPSs and PSCs ...\n"
 python3 ${BAKTA_DB_SCRIPTS}/annotate-swissprot.py --taxonomy nodes.dmp --xml uniprot_sprot.xml.gz --db bakta.db
 rm uniprot_sprot.xml.gz
 
@@ -207,7 +207,7 @@ rm uniprot_sprot.xml.gz
 # - download NCBIfams HMM models
 # - annotate PSCs
 ############################################################################
-printf "\n14/18: download NCBIfams HMM models...\n"
+printf "\n14/19: download NCBIfams HMM models...\n"
 wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.LIB
 wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv
 grep -v "(Provisional)" hmm_PGAP.tsv > hmms.non-prov.tsv
@@ -216,21 +216,41 @@ grep equivalog hmms.non-prov.tsv >> hmms.selected.tsv
 cut -f1 hmms.selected.tsv > hmms.ids.txt
 hmmfetch -f -o ncbifams hmm_PGAP.LIB hmms.ids.txt
 hmmpress ncbifams
-printf "\n14/18: annotate PSCs...\n"
+printf "\n14/19: annotate PSCs...\n"
 mkdir -p work/tblout work/domtblout
 nextflow run ${BAKTA_DB_SCRIPTS}/hmmsearch.nf --in psc.faa --db ncbifams --out hmmsearch.ncbifams.tblout
 python3 ${BAKTA_DB_SCRIPTS}/annotate-ncbi-fams.py --db bakta.db --hmms hmms.selected.tsv --hmm-results hmmsearch.ncbifams.tblout
 rm ncbifams* hmms.* hmm_PGAP.* hmmsearch.ncbifams.tblout
 
 
+############################################################################
+# Integrate PHROG DB of phage orthologous
+# - download PHROG protein sequences
+# - filter unannotated PHROGs
+# - annotate PSCs
+############################################################################
+printf "\n15/19: download PHROGs ...\n"
+wget https://phrogs.lmge.uca.fr/downloads_from_website/FAA_phrog.tar.gz
+wget https://phrogs.lmge.uca.fr/downloads_from_website/phrog_annot_v4.tsv
+tar -xzf FAA_phrog.tar.gz
+cat FAA_phrog/*.faa >> phrogs-raw.faa
+python3 ${BAKTA_DB_SCRIPTS}/extract-phrogs.py --annotation phrog_annot_v4.tsv --proteins phrogs-raw.faa --filtered-proteins phrogs.faa
+diamond makedb --in phrogs.faa --db phrog
+printf "\n15/19: annotate PSCs...\n"
+python3 ${BAKTA_DB_SCRIPTS}/extract-hypotheticals.py --psc psc.faa --db bakta.db --hypotheticals hypotheticals.faa
+nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in hypotheticals.faa --db phrog.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.phrog.psc.tsv
+python3 ${BAKTA_DB_SCRIPTS}/annotate-phrogs.py --db bakta.db --psc-alignments diamond.phrog.psc.tsv
+rm -r FAA_phrog.tar.gz phrog_annot_v4.tsv FAA_phrog phrogs-raw.faa phrogs.faa phrog.dmnd hypotheticals.faa
+
+
 ############################################################################
 # Integrate NCBI Pathogen AMR db
 # - download AMR gene WP_* annotations from NCBI Pathogen ReferenceGeneCatalog
 # - annotate IPSs with AMR info
 ############################################################################
-printf "\n15/18: download AMR gene WP_* annotations from NCBI Pathogen AMR db ...\n"
+printf "\n16/19: download AMR gene WP_* annotations from NCBI Pathogen AMR db ...\n"
 wget https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneCatalog.txt
-printf "\n15/18: annotate PSCs...\n"
+printf "\n16/19: annotate PSCs...\n"
 python3 ${BAKTA_DB_SCRIPTS}/annotate-ncbi-amr.py --db bakta.db --genes ReferenceGeneCatalog.txt
 rm ReferenceGeneCatalog.txt
 
@@ -241,10 +261,10 @@ rm ReferenceGeneCatalog.txt
 # - extract IS transposase sequences and mark ORF A/B transposases
 # - annotate IPSs/PCSs with IS info
 ############################################################################
-printf "\n16/18: download & extract ISfinder protein sequences ...\n"
+printf "\n17/19: download ISfinder protein sequences ...\n"
 wget https://github.com/oschwengers/ISfinder-sequences/raw/2e9162bd5e3448c86ec1549a55315e498bef72fc/IS.faa
-python3 ${BAKTA_DB_SCRIPTS}/extract-is.py --input IS.faa --output is.transposase.faa
-printf "\n16/18: annotate IPSs/PCSs ...\n"
+printf "\n17/19: annotate IPSs ...\n"
+grep -A 1 ~~~Transposase~~~ IS.faa | tr -d - | tr -s "\n" > is.transposase.faa
 diamond makedb --in is.transposase.faa --db is
 nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 95 --qcov 90 --scov 90 --out diamond.is.ips.tsv
 nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db is.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.is.psc.tsv
@@ -259,7 +279,7 @@ rm is.transposase.faa is.dmnd diamond.is.ips.tsv diamond.is.psc.tsv
 # - compress HMM models
 # - annotate hypothetical PSC via Pfam families
 ############################################################################
-printf "\n17/18: download HMM models from Pfam ...\n"
+printf "\n18/19: download HMM models from Pfam ...\n"
 wget https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz
 python3 ${BAKTA_DB_SCRIPTS}/extract-pfam.py --pfam Pfam-A.hmm.dat.gz --family pfam.families.tsv --non-family pfam.non-families.tsv
 wget https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz
@@ -281,7 +301,7 @@ rm pfam-families* pfam *.tsv Pfam* hmmsearch.pfam-families.tblout
 # - import NCBI BlastRules models
 # - import VFDB sequences
 ############################################################################
-printf "\n18/18: download AA sequences for expert annotation system ...\n"
+printf "\n19/19: download AA sequences for expert annotation system ...\n"
 wget https://ftp.ncbi.nlm.nih.gov/pub/blastrules/4.2.2.tgz
 tar -xzf 4.2.2.tgz
 gunzip VFDB_setA_pro.fas.gz