kkonganti@11: #!/usr/bin/env perl kkonganti@11: kkonganti@11: use strict; kkonganti@11: use FindBin; kkonganti@11: use Bio::SeqIO; kkonganti@11: use Bio::Seq; kkonganti@11: use Path::Tiny; kkonganti@11: use File::Basename; kkonganti@11: use File::Spec; kkonganti@11: use File::Path qw(make_path remove_tree); kkonganti@11: use List::Util qw(first); kkonganti@11: use Cwd qw(abs_path); kkonganti@11: use Data::Dumper; kkonganti@11: use LWP::Simple; kkonganti@11: use JSON; kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: # Globals kkonganti@11: kkonganti@11: my $EXE = basename($0); kkonganti@11: my $ABX_SEP = ';'; kkonganti@11: kkonganti@11: my %DATABASE = ( kkonganti@11: 'resfinder' => \&get_resfinder, kkonganti@11: 'plasmidfinder' => \&get_plasmidfinder, kkonganti@11: 'megares' => \&get_megares, kkonganti@11: 'argannot' => \&get_argannot, kkonganti@11: 'card' => \&get_card, kkonganti@11: kkonganti@11: # 'ncbibetalactamase' => \&get_ncbibetalactamase, kkonganti@11: 'ncbi' => \&get_ncbi, kkonganti@11: 'vfdb' => \&get_vfdb, kkonganti@11: 'ecoli_vf' => \&get_ecoli_vf, # https://github.com/phac-nml/ecoli_vf kkonganti@11: 'ecoh' => \&get_ecoh, kkonganti@11: 'bacmet2' => \&get_bacmet2, kkonganti@11: 'victors' => \&get_victors, kkonganti@11: kkonganti@11: # 'serotypefinder' => \&get_serotypefinder, kkonganti@11: ); kkonganti@11: my $DATABASES = join( ' ', sort keys %DATABASE ); kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: # Command line options kkonganti@11: kkonganti@11: my ( @Options, $debug, $outdir, $db, $force ); kkonganti@11: setOptions(); kkonganti@11: kkonganti@11: $db or err("Please choose a --db from: $DATABASES"); kkonganti@11: exists $DATABASE{$db} or err("Unknown --db '$db', choose from: $DATABASES "); kkonganti@11: -d $outdir or err("--outdir '$outdir' does not exist"); kkonganti@11: kkonganti@11: my $dir = abs_path( File::Spec->catdir( $outdir, $db ) ); kkonganti@11: make_path($dir); kkonganti@11: msg("Setting up '$db' in '$dir'"); kkonganti@11: kkonganti@11: #my $tmpdir = tempdir("$db-XXXXXXXX", DIR=>$dir, CLEANUP=>0); kkonganti@11: #my $tmpdir = "/home/tseemann/git/abricate/db/resfinder/resfinder-6Kuphtvv"; kkonganti@11: my $tmpdir = "$dir/src"; kkonganti@11: make_path($tmpdir); kkonganti@11: kkonganti@11: # run the specific function from --db kkonganti@11: chdir $tmpdir; kkonganti@11: my $seq = $DATABASE{$db}->(); kkonganti@11: map { is_full_gene($_) } @$seq; # doesn't do anything? kkonganti@11: $seq = dedupe_seq($seq); kkonganti@11: kkonganti@11: #print Dumper($seq); kkonganti@11: msg("Sorting sequences by ID"); kkonganti@11: $seq = [ sort { $a->{ID} cmp $b->{ID} } @$seq ]; kkonganti@11: save_fasta( "$dir/sequences", $seq ); kkonganti@11: kkonganti@11: msg("Formatting BLASTN database: $dir/sequences"); kkonganti@11: my $logfile = "$tmpdir/makeblastdb.log"; kkonganti@11: my $ncbi_title = $db; kkonganti@11: if ( "$db" eq "ncbi" ) { kkonganti@11: $ncbi_title = "ncbiamrplus"; kkonganti@11: } kkonganti@11: my $ec = system( kkonganti@11: "makeblastdb -in '$dir/sequences' -title '$ncbi_title' -dbtype nucl -hash_index -logfile $logfile" kkonganti@11: ); kkonganti@11: if ( $ec != 0 ) { kkonganti@11: system("tail '$logfile'"); kkonganti@11: err("Error with makign BLAST database. See $logfile"); kkonganti@11: } kkonganti@11: kkonganti@11: #msg("Run 'abricate --setupdb' to format the database"); kkonganti@11: kkonganti@11: msg("Done."); kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: kkonganti@11: sub download { kkonganti@11: my ( $url, $dest ) = @_; kkonganti@11: if ( -r $dest and not $force ) { kkonganti@11: msg("Won't re-download existing $dest (use --force)"); kkonganti@11: kkonganti@11: #exit(1); kkonganti@11: } kkonganti@11: else { kkonganti@11: msg("Downloading: $url"); kkonganti@11: my $ec = mirror( $url, $dest ); kkonganti@11: msg("HTTP Result: $ec"); kkonganti@11: ( $ec == 200 or $ec = 304 ) kkonganti@11: or err("HTTP $ec | failed to download $url"); # is HTTP OK ? kkonganti@11: } kkonganti@11: msg("Destination: $dest"); kkonganti@11: msg( "Filesize:", ( -s $dest ), "bytes" ); kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub trim_spaces { kkonganti@11: my ($s) = @_; kkonganti@11: $s =~ s/^\s+//; kkonganti@11: $s =~ s/\s+$//; kkonganti@11: return $s; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_resfinder { kkonganti@11: my $name = "resfinder_db"; kkonganti@11: kkonganti@11: # FIXME - can we just get HEAD.zip like in plasmidfinder? kkonganti@11: my $url = "https://bitbucket.org/genomicepidemiology/$name.git"; kkonganti@11: kkonganti@11: # if (-r $name and not $force) { kkonganti@11: # msg("Won't overwrite existing $name (use --force)"); kkonganti@11: # # exit(1); kkonganti@11: # } kkonganti@11: # else { kkonganti@11: # msg("Nuking existing folder: $name"); kkonganti@11: # remove_tree("./$name"); kkonganti@11: # msg("Cloning $url to $name"); kkonganti@11: # system("git clone --quiet $url $name"); kkonganti@11: # } kkonganti@11: kkonganti@11: #<*.fsa> kkonganti@11: #>aac(6')-Ib_2_M23634 kkonganti@11: #>blaNDM-19_1_MF370080 kkonganti@11: #>mcr-1.1_1_KP347127 kkonganti@11: #>fosB1_1_CP001903 kkonganti@11: #>fusB_1_AY373761 kkonganti@11: #>VanHAX_1_FJ866609 kkonganti@11: #>ere(A)_6_DQ157752 kkonganti@11: #>nimA_1_X71444 kkonganti@11: #>cfr_1_AM408573 kkonganti@11: #>catB3_2_U13880 kkonganti@11: #>qnrA1_1_AY070235 kkonganti@11: #>ARR-2_1_HQ141279 kkonganti@11: #>sul1_2_U12338 kkonganti@11: #>tet_1_M74049 kkonganti@11: #>dfrA19_1_EU855687 kkonganti@11: kkonganti@11: # kkonganti@11: #aac(6')-Iv:Aminoglycoside resistance: kkonganti@11: #aac(6')-Iw:Aminoglycoside resistance:Alternate name; aac(6')-Ix kkonganti@11: #sul3:Sulphonamide resistance: kkonganti@11: ##Tetracycline: kkonganti@11: #ort(B):Tetracycline resistance: kkonganti@11: #blaCMY-59:Beta-lactam resistance: kkonganti@11: kkonganti@11: # kkonganti@11: #Gene_accession no. Class Phenotype PMID Mechanism of resistance Notes Required_gene kkonganti@11: #ant(2'')-Ia_1_X04555 Aminoglycoside Gentamicin, Tobramycin 3024112 Enzymatic modification Alternative name aadB kkonganti@11: #ant(2'')-Ia_2_JF826500 Aminoglycoside Gentamicin, Tobramycin 22271862 Enzymat kkonganti@11: kkonganti@11: $name = "~/apps/bettercallsal/assets/abricate_dbs/$name"; kkonganti@11: my $metafn = "$name/phenotypes.txt"; kkonganti@11: my @meta = path($metafn)->lines( { chomp => 1 } ); kkonganti@11: my %anno; kkonganti@11: foreach (@meta) { kkonganti@11: next if m/^#/; kkonganti@11: my @x = split m/\t/; kkonganti@11: kkonganti@11: #msg("$metafn: @x"); kkonganti@11: my ($gene) = ( $x[0] =~ m/^(.*?)_\w+$/ ); kkonganti@11: $anno{$gene}{ABX} = [ kkonganti@11: map { trim_spaces($_) } kkonganti@11: grep { !m/(unknown|notes|^none)/i } kkonganti@11: split m/,\s*/, kkonganti@11: $x[2] kkonganti@11: ]; kkonganti@11: kkonganti@11: #msg("$metafn: $gene |", $anno{$gene}{ABX}->@*); kkonganti@11: } kkonganti@11: msg( "get_resfinder: $metafn", scalar( keys %anno ), "genes" ); kkonganti@11: kkonganti@11: #print Dumper(\%anno); kkonganti@11: kkonganti@11: my @seq; kkonganti@11: for my $fasta (<$name/*.fsa>) { kkonganti@11: kkonganti@11: # Issue #62 - repair broken fasta files like this: kkonganti@11: # GCTTTAAATTGGAAAAAAGATAGTCAAACTCTTTAA>cmr_1_U43535 kkonganti@11: # inline replacement kkonganti@11: system( 'sed', '-i.bak', 's/\([A-Z]\)>/\1\n>/gi', $fasta ); kkonganti@11: my $args = load_fasta($fasta); kkonganti@11: kkonganti@11: # use name of fasta file as antibiotic name kkonganti@11: #my $abx = basename($fasta, '.fsa'); kkonganti@11: #msg("$fasta: Assigning '$abx' to all genes"); kkonganti@11: #push @{$_->{ABX}}, $abx for (@$args); kkonganti@11: push @seq, @$args; kkonganti@11: } kkonganti@11: kkonganti@11: # https://github.com/tseemann/abricate/issues/92 kkonganti@11: # mcr-9_1_NZ_NAAN01000063.1 kkonganti@11: #>mcr-9_1_NZ_NAAN01000063.1 kkonganti@11: # mcr-9.1:Colistin resistance: kkonganti@11: kkonganti@11: for my $seq (@seq) { kkonganti@11: my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*?)_(\d+)_(\S+)$/; kkonganti@11: kkonganti@11: #msg("resfinder: $1 $2 $3", $anno{$1}); kkonganti@11: $seq->{ID} = "${id}_${copy}"; kkonganti@11: $seq->{ACC} = $acc; kkonganti@11: $seq->{DESC} = $anno{$id}{DESC} || $id; kkonganti@11: push @{ $seq->{ABX} }, @{ $anno{$id}{ABX} } if $anno{$id}{ABX}; kkonganti@11: } kkonganti@11: kkonganti@11: return \@seq; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_serotypefinder { kkonganti@11: my $name = "serotypefinder_db"; kkonganti@11: my $url = "https://bitbucket.org/genomicepidemiology/$name.git"; kkonganti@11: kkonganti@11: if ( -r $name and not $force ) { kkonganti@11: msg("Won't overwrite existing $name (use --force)"); kkonganti@11: kkonganti@11: # exit(1); kkonganti@11: } kkonganti@11: else { kkonganti@11: msg("Nuking existing folder: $name"); kkonganti@11: remove_tree("./$name"); kkonganti@11: msg("Cloning $url to $name"); kkonganti@11: system("git clone --quiet $url $name"); kkonganti@11: } kkonganti@11: kkonganti@11: my @seq; kkonganti@11: for my $fasta (<$name/*.fsa>) { kkonganti@11: push @seq, @{ load_fasta($fasta) }; kkonganti@11: } kkonganti@11: kkonganti@11: # >fliC_44444_AY250028_H52 kkonganti@11: # FIXME - this is already in EcOH database! kkonganti@11: for my $seq (@seq) { kkonganti@11: my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*)_(\d+)_(\w+)$/; kkonganti@11: kkonganti@11: #msg("serotypefinder: $1 $2 $3", $anno{$1}); kkonganti@11: $seq->{ID} = "${id}_${copy}"; kkonganti@11: $seq->{ACC} = $acc; kkonganti@11: kkonganti@11: #$seq->{DESC} = $anno{$id} || ''; kkonganti@11: } kkonganti@11: kkonganti@11: return \@seq; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_tag { kkonganti@11: my ( $f, $tag ) = @_; kkonganti@11: if ( $f->has_tag($tag) ) { kkonganti@11: my ($val) = $f->get_tag_values($tag); kkonganti@11: return $val; kkonganti@11: } kkonganti@11: return ''; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_ncbi { kkonganti@11: my $AFP = kkonganti@11: "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest"; kkonganti@11: kkonganti@11: #my $src = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/AMR_CDS"; kkonganti@11: my $src = "$AFP/AMR_CDS"; kkonganti@11: my $name = "amr_cds.ffn"; kkonganti@11: kkonganti@11: #my $src2 = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/ReferenceGeneCatalog.txt"; kkonganti@11: my $src2 = "$AFP/ReferenceGeneCatalog.txt"; kkonganti@11: my $name2 = "amr_cds.tsv"; kkonganti@11: kkonganti@11: if ( -r $name and -r $name2 and not $force ) { kkonganti@11: msg("Won't overwrite existing $name/$name2 (use --force)"); kkonganti@11: kkonganti@11: # exit(1); kkonganti@11: } kkonganti@11: else { kkonganti@11: download( $src, $name ); kkonganti@11: download( $src2, $name2 ); kkonganti@11: } kkonganti@11: kkonganti@11: #1 allele kkonganti@11: #2 gene_family ble kkonganti@11: #3 whitelisted_taxa kkonganti@11: #4 product_name BLMA family bleomycin binding protein kkonganti@11: #5 scope core kkonganti@11: #6 type AMR kkonganti@11: #7 subtype AMR kkonganti@11: #8 class BLEOMYCIN kkonganti@11: #9 subclass BLEOMYCIN kkonganti@11: #10 refseq_protein_accession WP_063842967.1 kkonganti@11: #11 refseq_nucleotide_accession NG_047554.1 kkonganti@11: #12 curated_refseq_start No kkonganti@11: #13 genbank_protein_accession CAA02068.1 kkonganti@11: #14 genbank_nucleotide_accession A31900.1 kkonganti@11: #15 genbank_strand_orientation + kkonganti@11: #16 genbank_cds_start 6 kkonganti@11: #17 genbank_cds_stop 374 kkonganti@11: #18 pubmed_reference kkonganti@11: #19 blacklisted_taxa kkonganti@11: #20 db_version 2019-08-27.1 kkonganti@11: kkonganti@11: my $tsv = load_tabular( $name2, 10 ); # refseq_nucleotide_accession kkonganti@11: msg( "[$name2] Loaded", scalar keys %$tsv, "records" ); kkonganti@11: kkonganti@11: # https://github.com/ncbi/amr/wiki/AMRFinderPlus-database#amrprot kkonganti@11: # 0 1 2 3 4 5 6 7 kkonganti@11: # >1000909371|WP_061158039.1|NG_050200|1|1|blaTEM-156|blaTEM|class_A_beta-lactamase_TEM-156 NG_050200:101-961 kkonganti@11: my @seq; kkonganti@11: my $in = Bio::SeqIO->new( -file => $name, -format => "fasta" ); kkonganti@11: while ( my $rec = $in->next_seq ) { kkonganti@11: kkonganti@11: # parse ID kkonganti@11: my ( $gi, $pi, $acc, $fp, $fn, $gene, $fam, $prod ) = split m/\|/, kkonganti@11: $rec->id; kkonganti@11: kkonganti@11: # skip fusion genes kkonganti@11: next unless $fp == 1 and $fn == 1; kkonganti@11: kkonganti@11: # only keep true ARGs kkonganti@11: $acc .= ".1" unless $acc =~ m/\.\d+$/; kkonganti@11: my $t = $tsv->{$acc} or next; kkonganti@11: next unless ( $t->{scope} eq 'plus' || $t->{scope} eq 'core' ); kkonganti@11: kkonganti@11: # next unless $t->{type} eq 'VIRULENCE'; kkonganti@11: # next unless $t->{subtype} eq 'VIRULENCE'; kkonganti@11: # construct sequence record kkonganti@11: $prod =~ s/_/ /g; kkonganti@11: err("$pi: gene is empty") unless $gene; kkonganti@11: err("$pi: product is empty") unless $prod; kkonganti@11: my $s = { kkonganti@11: ID => $gene, kkonganti@11: ACC => $acc, kkonganti@11: DESC => $prod, kkonganti@11: SEQ => $rec->seq, kkonganti@11: ABX => [ split m'/', $t->{subclass} ] kkonganti@11: }; kkonganti@11: push @seq, $s; kkonganti@11: msg( "[$name]", 0 + @seq, "|", $s->{ID}, "|", $s->{ACC}, "|", kkonganti@11: $s->{DESC} ); kkonganti@11: kkonganti@11: #msg(Dumper($s)); kkonganti@11: msg( $s->{ID}, " is fusion $fp/$fn" ) if "$fp$fn" ne '11'; kkonganti@11: } kkonganti@11: return \@seq; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_plasmidfinder { kkonganti@11: my $name = "plasmidfinder"; kkonganti@11: my $zip = "$name.zip"; kkonganti@11: kkonganti@11: # download("https://cge.cbs.dtu.dk/cge/download_data.php?folder=$name&filename=$zip&submit=$zip", $zip); kkonganti@11: download( kkonganti@11: "https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/HEAD.zip", kkonganti@11: $zip kkonganti@11: ); kkonganti@11: system("unzip -j -u $zip"); kkonganti@11: kkonganti@11: my @seq; kkonganti@11: for my $fasta (<*.fsa>) { kkonganti@11: push @seq, @{ load_fasta($fasta) }; kkonganti@11: } kkonganti@11: kkonganti@11: for my $seq (@seq) { kkonganti@11: $seq->{DESC} = $seq->{ID}; # no desc, so use ORIGINAL ID as desc kkonganti@11: my ( $id, $acc ) = kkonganti@11: ( $seq->{ID} =~ m/^(.*)_(([A-Z]+|NC_)\d+(\.\d+)?)$/ ); kkonganti@11: $id =~ s/_+$//g; kkonganti@11: $seq->{ID} = $id || $seq->{ID}; kkonganti@11: $seq->{ACC} = $acc || ''; kkonganti@11: wrn( "Parsed empty ID:", kkonganti@11: $seq->{DESC}, kkonganti@11: "=> id='$id' acc='$acc' seq=" . substr( $seq->{SEQ}, 0, 10 ) ) kkonganti@11: if not $id; kkonganti@11: } kkonganti@11: kkonganti@11: return \@seq; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_megares { kkonganti@11: my $zip = "megares.zip"; kkonganti@11: download( 'https://www.meglab.org/downloads/megares_v3.00.zip', $zip ); kkonganti@11: system("unzip -j -u $zip"); kkonganti@11: my $seqs = load_fasta( glob("megares_drugs_*.fasta") ); kkonganti@11: my @okseq; kkonganti@11: kkonganti@11: # >MEG_372|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_MATE_efflux_pumps|ABEM kkonganti@11: # >MEG_411|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_RND_efflux_regulator|ACRR|RequiresSNPConfirmation kkonganti@11: # >MEG_7860|Drugs|betalactams|Class_B_betalactamases|ZOG kkonganti@11: # >MEG_7439|Drugs|Glycopeptides|VanI-type_resistance_protein|VANI kkonganti@11: # >MEG_7245|Drugs|Tetracyclines|Tetracycline_resistance_MFS_efflux_pumps|TETY kkonganti@11: # >MEG_9|Drugs|Aminoglycosides|Aminoglycoside-resistant_16S_ribosomal_subunit_protein|A16S|RequiresSNPConfirmation kkonganti@11: kkonganti@11: for my $s (@$seqs) { kkonganti@11: my ( $id, $type, $class, $mech, $group, $note ) = split m/\|/, $s->{ID}; kkonganti@11: if ($note) { kkonganti@11: kkonganti@11: # "RequiresSNPConfirmation" is the common one; we can't do that kkonganti@11: msg("Skipping $id due to: $note"); kkonganti@11: next; kkonganti@11: } kkonganti@11: $s->{ID} = $group; kkonganti@11: $s->{ACC} = $id; kkonganti@11: $s->{DESC} = join( ':', $type, $class, $mech, $group ); kkonganti@11: push @okseq, $s; kkonganti@11: } kkonganti@11: return [@okseq]; kkonganti@11: kkonganti@11: #return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_argannot { kkonganti@11: my $fasta = 'arg-annot.fa'; kkonganti@11: download( kkonganti@11: # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/1425/argannot-aa-v3-march2017_doc.fasta', kkonganti@11: # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/691/argannot-nt_doc.fasta', kkonganti@11: 'https://www.mediterranee-infection.com/wp-content/uploads/2019/06/ARG_ANNOT_V5_Nt_JUNE2019.txt', kkonganti@11: $fasta kkonganti@11: ); kkonganti@11: kkonganti@11: # fix syntax errors in the FASTA file... kkonganti@11: path($fasta)->edit( sub { s/\\//g; $_ } ); kkonganti@11: kkonganti@11: my $seqs = load_fasta($fasta); kkonganti@11: kkonganti@11: # 0 1 2 3 kkonganti@11: # >(AGly)Aac2-Ie:NC_011896:3039059-3039607:549 kkonganti@11: for my $s (@$seqs) { kkonganti@11: my @x = split m/:/, $s->{ID}; kkonganti@11: $s->{ID} = $x[0]; kkonganti@11: $s->{ACC} = $x[1] . ':' . $x[2]; kkonganti@11: $s->{DESC} = ''; kkonganti@11: } kkonganti@11: kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_bacmet2 { kkonganti@11: my $fasta = 'bacmet2.fa'; kkonganti@11: download( kkonganti@11: 'http://bacmet.biomedicine.gu.se/download/BacMet2_EXP_database.fasta', kkonganti@11: $fasta ); kkonganti@11: kkonganti@11: # This is a PROTEIN file kkonganti@11: my $seqs = load_fasta($fasta); kkonganti@11: kkonganti@11: # 0 1 2 3 4 ^ kkonganti@11: # >BAC0098|ctpC|sp|P0A502|CTPC_MYCTU Probable manganese/zinc-exporting kkonganti@11: for my $s (@$seqs) { kkonganti@11: my @x = split m/\|/, $s->{ID}; kkonganti@11: $s->{ID} = $x[1] . '-' . $x[0]; kkonganti@11: $s->{ACC} = $x[2] . ':' . $x[3]; kkonganti@11: } kkonganti@11: kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_card { kkonganti@11: kkonganti@11: # https://github.com/tseemann/abricate/issues/25 kkonganti@11: my $tarball = 'card.tar.bz2'; kkonganti@11: download( kkonganti@11: #'https://card.mcmaster.ca/download/0/broadstreet-v2.0.2.tar.gz', kkonganti@11: 'https://card.mcmaster.ca/latest/data', kkonganti@11: kkonganti@11: #'https://card.mcmaster.ca/latest/data/card-data.tar.bz2', kkonganti@11: $tarball # yes, it's really BZ2 not GZ ... kkonganti@11: ); kkonganti@11: kkonganti@11: # my $fasta = "./nucleotide_fasta_protein_homolog_model.fasta"; kkonganti@11: my $jsonfile = "./card.json"; kkonganti@11: system( "tar", "xf", $tarball, $jsonfile ) == 0 kkonganti@11: or err("Problem with tar xf $tarball $jsonfile"); kkonganti@11: -r $jsonfile or err("Could not extract $jsonfile from $tarball"); kkonganti@11: kkonganti@11: # JSON kkonganti@11: my $json = path($jsonfile)->slurp_utf8; kkonganti@11: my $card = from_json( $json, { latin1 => 1 } ); kkonganti@11: my @seq; kkonganti@11: for my $g ( values %$card ) { kkonganti@11: next unless ref($g) eq 'HASH'; kkonganti@11: kkonganti@11: # msg(Dumper($g)); kkonganti@11: next kkonganti@11: unless $g->{model_type} eq kkonganti@11: "protein homolog model"; # only 'acquired' genes kkonganti@11: my $id = $g->{model_name}; kkonganti@11: err("$id has {model_param}{snp}") if exists $g->{model_param}{snp}; kkonganti@11: kkonganti@11: # msg("CARD: $id"); kkonganti@11: # print STDERR Dumper($g); kkonganti@11: my $dna = $g->{model_sequences}{sequence} kkonganti@11: or err("$id: no {model_sequences}{sequence} found"); kkonganti@11: my ($key) = sort keys %$dna; # first key kkonganti@11: $dna = $dna->{$key} or err("$id: invalid key '$key'"); kkonganti@11: $dna = $dna->{dna_sequence} or err("$id: no dna_sequence"); kkonganti@11: kkonganti@11: # msg(Dumper($dna)) if $id eq 'OXA-25'; kkonganti@11: kkonganti@11: # ARO_category => { kkonganti@11: # 'category_aro_name' => 'cephalosporin', kkonganti@11: # 'category_aro_class_name' => 'Drug Class', kkonganti@11: my $is_amr_gene = 0; kkonganti@11: my @abx; kkonganti@11: for my $key ( keys $g->{ARO_category}->%* ) { kkonganti@11: my $c = $g->{ARO_category}{$key}; kkonganti@11: if ( $c->{category_aro_class_name} eq 'Drug Class' ) { kkonganti@11: my $abx = $c->{category_aro_name}; kkonganti@11: $abx =~ s/ antibiotic//; kkonganti@11: $abx =~ s/\s/_/g; kkonganti@11: push @abx, $abx; kkonganti@11: } kkonganti@11: if ( $c->{category_aro_class_name} eq 'AMR Gene Family' ) { kkonganti@11: $is_amr_gene++; kkonganti@11: } kkonganti@11: } kkonganti@11: kkonganti@11: #err("CARD | $id | ", Dumper($g->{ARO_category}) ) unless $is_amr_gene; kkonganti@11: #msg("ABX=$_") for @abx; kkonganti@11: kkonganti@11: # put coordinates into normal form kkonganti@11: my ( $start, $stop ) = kkonganti@11: $dna->{strand} eq '-' kkonganti@11: ? ( $dna->{fmax}, $dna->{fmin} ) kkonganti@11: : ( $dna->{fmin}, $dna->{fmax} ); kkonganti@11: kkonganti@11: $id =~ s/\s+/_/g; kkonganti@11: push @seq, kkonganti@11: { kkonganti@11: ID => $id, kkonganti@11: ACC => $dna->{accession} . ":$start-$stop", kkonganti@11: DESC => ( $g->{ARO_description} || $g->{ARO_accession} ), kkonganti@11: SEQ => $dna->{sequence}, kkonganti@11: ABX => [@abx], kkonganti@11: }; kkonganti@11: kkonganti@11: # msg(Dumper($seq[-1])); kkonganti@11: } kkonganti@11: kkonganti@11: return \@seq; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_victors { kkonganti@11: kkonganti@11: # the CDS data is in .ffn and has source GI and coords kkonganti@11: # the PROT data is in .faa and has the protein ref and /product kkonganti@11: kkonganti@11: #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence kkonganti@11: download( 'http://www.phidias.us/victors/downloads/gen_downloads.php', kkonganti@11: 'victors.ffn' ); kkonganti@11: kkonganti@11: #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni] kkonganti@11: download( kkonganti@11: 'http://www.phidias.us/victors/downloads/gen_downloads_protein.php', kkonganti@11: 'victors.faa' ); kkonganti@11: kkonganti@11: my %gi; kkonganti@11: open my $FAA, '<', 'victors.faa'; kkonganti@11: while (<$FAA>) { kkonganti@11: kkonganti@11: #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni] kkonganti@11: next unless m"^>gi.(\d+).ref.([^|]+). ([^[]+)"; kkonganti@11: $gi{$1}{ACC} = $2; kkonganti@11: $gi{$1}{DESC} = $3; kkonganti@11: } kkonganti@11: kkonganti@11: my $seqs = load_fasta("victors.ffn"); kkonganti@11: kkonganti@11: for my $s (@$seqs) { kkonganti@11: kkonganti@11: #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence kkonganti@11: $s->{ID} =~ m/gi.(\d+):(\d+)-(\d+)/; kkonganti@11: $s->{ACC} = $gi{$1}{ACC} || "gi|$1:$2-$3"; kkonganti@11: $s->{DESC} =~ $gi{$1}{DESC} || 'hypothetical protein'; kkonganti@11: } kkonganti@11: kkonganti@11: # print Dumper($seqs); exit; kkonganti@11: kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_vfdb { kkonganti@11: download( 'http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', kkonganti@11: 'vfdb.fa.gz' ); kkonganti@11: system("gzip -f -d -c vfdb.fa.gz > vfdb.fa"); kkonganti@11: my $seqs = load_fasta("vfdb.fa"); kkonganti@11: kkonganti@11: # >VFG000676(gb|AAD32411) (lef) anthrax toxin lethal factor precursor [Anthrax toxin (VF0142)] [Bacillus anthracis str. Sterne] kkonganti@11: for my $s (@$seqs) { kkonganti@11: kkonganti@11: # https://github.com/tseemann/abricate/issues/64#issuecomment-421895159 by @VGalata kkonganti@11: $s->{ID} =~ m/^(\w+)\(\w+\|(\w+)(\.\d+)?\)$/; # kkonganti@11: #$s->{ID} =~ m/^(\w+)\(\w+\|(\w+)\)$/; kkonganti@11: $s->{ACC} = $2 if $2; kkonganti@11: $s->{DESC} =~ m/^\((.*?)\)/; kkonganti@11: $s->{ID} = $1 if $1; kkonganti@11: kkonganti@11: # print STDERR Dumper($s); exit; kkonganti@11: } kkonganti@11: kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_ncbibetalactamase { kkonganti@11: my $fasta = "ncbi.fa"; kkonganti@11: download( kkonganti@11: 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele-dna.fa', kkonganti@11: $fasta ); kkonganti@11: my $tab = "ncbi.tab"; kkonganti@11: download( 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele.tab', kkonganti@11: $tab ); kkonganti@11: kkonganti@11: # >ACD12694.1 EU650653.1:1-1173 kkonganti@11: my $seqs = load_fasta($fasta); kkonganti@11: kkonganti@11: # ACC-1 ACD12694.1 EU650653.1 blaACC-1 1 1173 + cephalosporin-hydrolyzing class C beta-lactamase ACC-1 kkonganti@11: my %anno; kkonganti@11: my @anno = grep { !m/^#/ } path($tab)->lines( { chomp => 1 } ); kkonganti@11: msg( "Read", 0 + @anno, "annotations" ); kkonganti@11: foreach (@anno) { kkonganti@11: my ( $name, $id, $acc, $gene, $begin, $end, undef, $product ) = kkonganti@11: split m/\t/; kkonganti@11: $anno{$id} = { kkonganti@11: ID => $gene, kkonganti@11: DESC => $product, kkonganti@11: ACC => "$acc:$begin-$end", kkonganti@11: }; kkonganti@11: } kkonganti@11: kkonganti@11: # print Dumper(\%anno); kkonganti@11: kkonganti@11: for my $s (@$seqs) { kkonganti@11: my $id = $s->{ID}; kkonganti@11: next unless exists $anno{$id}; kkonganti@11: $s->{ID} = $anno{$id}{ID}; kkonganti@11: $s->{ACC} = $anno{$id}{ACC}; kkonganti@11: $s->{DESC} = $anno{$id}{DESC}; kkonganti@11: } kkonganti@11: kkonganti@11: # print Dumper($seqs); kkonganti@11: kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_ecoh { kkonganti@11: my $fasta = "EcOH.fa"; kkonganti@11: download( kkonganti@11: 'https://raw.githubusercontent.com/katholt/srst2/master/data/EcOH.fasta', kkonganti@11: $fasta kkonganti@11: ); kkonganti@11: kkonganti@11: # https://github.com/katholt/srst2#generating-srst2-compatible-clustered-database-from-raw-sequences kkonganti@11: # [clusterID]__[gene]__[allele]__[seqID] [other stuff] kkonganti@11: # >1__fliC__fliC-H1__1 AB028471.1;flagellin;H1 kkonganti@11: # >8__wzx__wzx-O41__246 AB811617.1;O antigen flippase;O41 kkonganti@11: # >9__wzy__wzy-OgN31__597 LC125932.1;O antigen polyermase;OgN31 kkonganti@11: my $seqs = load_fasta($fasta); kkonganti@11: kkonganti@11: for my $s (@$seqs) { kkonganti@11: my @id = split m/__/, $s->{ID}; kkonganti@11: my @desc = split m';', $s->{DESC}; kkonganti@11: $s->{ID} = $id[2]; kkonganti@11: $s->{ACC} = shift(@desc); kkonganti@11: $s->{DESC} = join( ' ', @desc ); kkonganti@11: } kkonganti@11: kkonganti@11: # print Dumper($seqs); kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub get_ecoli_vf { kkonganti@11: my $fasta = "ecoli_vf.ffn"; kkonganti@11: download( kkonganti@11: 'https://github.com/phac-nml/ecoli_vf/raw/master/data/repaired_ecoli_vfs_shortnames.ffn', kkonganti@11: $fasta kkonganti@11: ); kkonganti@11: my $seqs = load_fasta($fasta); kkonganti@11: kkonganti@11: # >VFG000748(gi:2865308) (espF) EspF [EspF (VF0182)] [Escherichia coli O127:H6 str. E2348/69] kkonganti@11: # >VFG000749(gi:6009379) (bfpA) Bundlin [BFP (VF0174)] [Escherichia coli B171] kkonganti@11: # >SPG000142 (cvac) Escherichia coli cvi cvaC operon. [X57525 434-745] kkonganti@11: # >SPG000143 (iss2) Escherichia coli Iss (iss) gene, complete cds. [AF042279 292-600] kkonganti@11: kkonganti@11: for my $s (@$seqs) { kkonganti@11: kkonganti@11: #print STDERR Dumper("IN", $s); kkonganti@11: $s->{ID} =~ m/ ^ (\w+) (?: \( (.*?) \) )? $ /x kkonganti@11: or die "Can't parse $fasta at " . Dumper($s); kkonganti@11: $s->{ID} = $1 if $1; kkonganti@11: $s->{ACC} = $2 || $1; kkonganti@11: $s->{DESC} =~ s/\s\[.*?\]$//g; # remove strain name kkonganti@11: $s->{DESC} =~ m/^(?:\((.*?)\)\s+)?(.*?)$/; kkonganti@11: $s->{ID} = $1 if $1; kkonganti@11: $s->{DESC} = $2; kkonganti@11: kkonganti@11: #print STDERR Dumper("OUT", $s); kkonganti@11: #print STDERR "="x60, "\n"; kkonganti@11: } kkonganti@11: kkonganti@11: # print Dumper($seqs); kkonganti@11: return $seqs; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub is_full_gene { kkonganti@11: my ($s) = @_; kkonganti@11: my $has_ambig = 0; kkonganti@11: kkonganti@11: my $id = $s->{ID}; kkonganti@11: my $L = length( $s->{SEQ} ); kkonganti@11: if ( $L % 3 != 0 ) { kkonganti@11: wrn("$id - length $L bp is not multiple of 3"); kkonganti@11: return; kkonganti@11: } kkonganti@11: if ( $s->{SEQ} !~ m/^[AGCT]+$/ ) { kkonganti@11: wrn("$id - has non-AGTC bases"); kkonganti@11: return; kkonganti@11: } kkonganti@11: kkonganti@11: my $seq = Bio::Seq->new( -id => $s->{ID}, -seq => $s->{SEQ} ); kkonganti@11: kkonganti@11: my $aa = $seq->translate->seq; kkonganti@11: kkonganti@11: if ( $aa =~ m/\*./ ) { kkonganti@11: wrn("$id - has internal stop codons, trying revcom"); kkonganti@11: $aa = $seq->revcom->translate->seq; kkonganti@11: if ( $aa =~ m/\*./ ) { kkonganti@11: wrn("$id - revcom has internal stop codons too"); kkonganti@11: return; kkonganti@11: } kkonganti@11: else { kkonganti@11: msg("$id - revcom resolves problem, hooray!"); kkonganti@11: $s->{SEQ} = $seq->revcom->seq; kkonganti@11: } kkonganti@11: } kkonganti@11: kkonganti@11: return $L; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub dedupe_seq { kkonganti@11: my ($seq) = @_; kkonganti@11: my %seen; kkonganti@11: my $good = []; kkonganti@11: for my $s (@$seq) { kkonganti@11: if ( $seen{ $s->{SEQ} } ) { kkonganti@11: wrn( "duplicate", length( $s->{SEQ} ), kkonganti@11: "bp sequence:", $s->{ID}, '~', $seen{ $s->{SEQ} } ); kkonganti@11: } kkonganti@11: else { kkonganti@11: push @$good, $s; kkonganti@11: } kkonganti@11: $seen{ $s->{SEQ} } .= ' ' . $s->{ID}; kkonganti@11: } kkonganti@11: msg( "dedupe_seq: read", scalar(@$seq), "/ kept", scalar(@$good) ); kkonganti@11: return $good; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub load_tabular { kkonganti@11: my ( $fname, $keycol, $sep ) = @_; kkonganti@11: $keycol //= 0; kkonganti@11: $sep //= "\t"; kkonganti@11: my $hash = {}; kkonganti@11: my @hdr; kkonganti@11: my $row = 0; kkonganti@11: open my $TSV, '<', $fname or err("Can't read TSV file: $fname"); kkonganti@11: while (<$TSV>) { kkonganti@11: chomp; kkonganti@11: my @col = split m/$sep/; kkonganti@11: $row++; kkonganti@11: if (@hdr) { kkonganti@11: @hdr == @col or err("Header and row $row differ in column count"); kkonganti@11: kkonganti@11: #my $key = $col[$keycol] or wrn("Empty key column $keycol: $_"); kkonganti@11: #exists{$hash->{$col[$key}} and wrn("WARNING: dupe key $key at row: $_"); kkonganti@11: my $key = $col[$keycol]; kkonganti@11: $hash->{$key} ||= kkonganti@11: { map { ( $hdr[$_] => $col[$_] ) } ( 0 .. $#hdr ) }; kkonganti@11: } kkonganti@11: else { kkonganti@11: @hdr = @col; kkonganti@11: } kkonganti@11: } kkonganti@11: close $TSV; kkonganti@11: return $hash; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub load_fasta { kkonganti@11: my ($fasta) = @_; kkonganti@11: my %seen; kkonganti@11: my $list; kkonganti@11: my $dbtype = 'unknown'; kkonganti@11: msg("load_fasta: $fasta"); kkonganti@11: my $in = Bio::SeqIO->new( -file => $fasta, -format => 'fasta' ); kkonganti@11: while ( my $seq = $in->next_seq ) { kkonganti@11: my $id = $seq->id or err("Empty ID in $fasta"); kkonganti@11: if ( $seen{$id} ) { kkonganti@11: wrn("Duplicate ID '$id' in $fasta"); kkonganti@11: $id = $id . '_dupe'; kkonganti@11: } kkonganti@11: $seen{$id}++; kkonganti@11: my $s = uc( $seq->seq ); kkonganti@11: $dbtype = $seq->alphabet eq 'dna' ? 'nucl' : 'prot'; kkonganti@11: $dbtype eq 'nucl' ? $s =~ s/[^AGTC]/N/g : $s =~ s/[^A-Z]/X/g; kkonganti@11: push @$list, kkonganti@11: { kkonganti@11: ID => $id, kkonganti@11: ACC => '', kkonganti@11: DESC => $seq->desc, kkonganti@11: SEQ => $s, kkonganti@11: TYPE => $dbtype, kkonganti@11: }; kkonganti@11: } kkonganti@11: msg( "load_fasta: read", scalar(@$list), "$dbtype sequences" ); kkonganti@11: return $list; kkonganti@11: } kkonganti@11: kkonganti@11: #.............................................................................. kkonganti@11: sub save_fasta { kkonganti@11: my ( $fasta, $seq ) = @_; kkonganti@11: msg("save_fasta: $fasta"); kkonganti@11: my %seen; kkonganti@11: my $out = Bio::SeqIO->new( -file => ">$fasta", -format => 'fasta' ); kkonganti@11: for my $s (@$seq) { kkonganti@11: $seen{ $s->{ID} }++; kkonganti@11: my $freq = $seen{ $s->{ID} }; kkonganti@11: kkonganti@11: #wrn("seen $s->{ID} now $freq times") if $freq > 1; kkonganti@11: # print Dumper($s); kkonganti@11: my $ABX = kkonganti@11: defined( $s->{ABX} ) ? join( $ABX_SEP, sort @{ $s->{ABX} } ) : ''; kkonganti@11: $ABX =~ s/\s+/_/g; # remove spaces! kkonganti@11: my $obj = Bio::Seq->new( kkonganti@11: -id => join( '~~~', $db, $s->{ID}, $s->{ACC}, $ABX ), kkonganti@11: -desc => ( $s->{DESC} || $s->{ID} ), kkonganti@11: -seq => $s->{SEQ}, kkonganti@11: ); kkonganti@11: kkonganti@11: # $obj->desc( hash_encode($s) ); kkonganti@11: $out->write_seq($obj); kkonganti@11: kkonganti@11: # $seen{ $s->{ID} }++; kkonganti@11: } kkonganti@11: msg( "save_fasta: wrote", scalar(@$seq), "sequences" ); kkonganti@11: } kkonganti@11: kkonganti@11: #---------------------------------------------------------------------- kkonganti@11: sub msg { kkonganti@11: print STDERR "@_\n"; kkonganti@11: } kkonganti@11: kkonganti@11: #---------------------------------------------------------------------- kkonganti@11: sub wrn { kkonganti@11: msg( "WARNING:", @_ ) if $debug; kkonganti@11: } kkonganti@11: kkonganti@11: #---------------------------------------------------------------------- kkonganti@11: sub err { kkonganti@11: msg( "ERROR:", @_ ); kkonganti@11: exit(1); kkonganti@11: } kkonganti@11: kkonganti@11: #---------------------------------------------------------------------- kkonganti@11: # Option setting routines kkonganti@11: kkonganti@11: sub setOptions { kkonganti@11: use Getopt::Long; kkonganti@11: kkonganti@11: @Options = ( kkonganti@11: { OPT => "help", VAR => \&usage, DESC => "This help" }, kkonganti@11: { kkonganti@11: OPT => "debug!", kkonganti@11: VAR => \$debug, kkonganti@11: DEFAULT => 0, kkonganti@11: DESC => "Verbose debug output" kkonganti@11: }, kkonganti@11: { kkonganti@11: OPT => "dbdir=s", kkonganti@11: VAR => \$outdir, kkonganti@11: DEFAULT => abs_path("$FindBin::RealBin/../db"), kkonganti@11: DESC => "Parent folder" kkonganti@11: }, kkonganti@11: { kkonganti@11: OPT => "db=s", kkonganti@11: VAR => \$db, kkonganti@11: DEFAULT => "", kkonganti@11: DESC => "Choices: $DATABASES" kkonganti@11: }, kkonganti@11: { kkonganti@11: OPT => "force!", kkonganti@11: VAR => \$force, kkonganti@11: DEFAULT => 0, kkonganti@11: DESC => "Force download even if exists" kkonganti@11: }, kkonganti@11: ); kkonganti@11: kkonganti@11: &GetOptions( map { $_->{OPT}, $_->{VAR} } @Options ) || usage(1); kkonganti@11: kkonganti@11: # Now setup default values. kkonganti@11: foreach (@Options) { kkonganti@11: if ( defined( $_->{DEFAULT} ) && !defined( ${ $_->{VAR} } ) ) { kkonganti@11: ${ $_->{VAR} } = $_->{DEFAULT}; kkonganti@11: } kkonganti@11: } kkonganti@11: } kkonganti@11: kkonganti@11: sub usage { kkonganti@11: my ($exitcode) = @_; kkonganti@11: $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref kkonganti@11: $exitcode ||= 0; kkonganti@11: select STDERR if $exitcode; # write to STDERR if exitcode is error kkonganti@11: kkonganti@11: print "SYNOPIS\n Download databases for abricate to use\n"; kkonganti@11: print "USAGE\n $EXE [options] --db DATABASE\n"; kkonganti@11: print "OPTIONS\n"; kkonganti@11: foreach (@Options) { kkonganti@11: printf " --%-13s %s%s.\n", $_->{OPT}, $_->{DESC}, kkonganti@11: defined( $_->{DEFAULT} ) ? " (default '$_->{DEFAULT}')" : ""; kkonganti@11: } kkonganti@11: exit($exitcode); kkonganti@11: } kkonganti@11: kkonganti@11: #---------------------------------------------------------------------- kkonganti@11: