Mercurial > repos > galaxytrakr > hfp_bettercallsal_awsbatch
comparison 1.0.0/assets/abricate-get_db @ 0:801b85b03a17 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:31:42 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:801b85b03a17 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use FindBin; | |
| 5 use Bio::SeqIO; | |
| 6 use Bio::Seq; | |
| 7 use Path::Tiny; | |
| 8 use File::Basename; | |
| 9 use File::Spec; | |
| 10 use File::Path qw(make_path remove_tree); | |
| 11 use List::Util qw(first); | |
| 12 use Cwd qw(abs_path); | |
| 13 use Data::Dumper; | |
| 14 use LWP::Simple; | |
| 15 use JSON; | |
| 16 | |
| 17 #.............................................................................. | |
| 18 # Globals | |
| 19 | |
| 20 my $EXE = basename($0); | |
| 21 my $ABX_SEP = ';'; | |
| 22 | |
| 23 my %DATABASE = ( | |
| 24 'resfinder' => \&get_resfinder, | |
| 25 'plasmidfinder' => \&get_plasmidfinder, | |
| 26 'megares' => \&get_megares, | |
| 27 'argannot' => \&get_argannot, | |
| 28 'card' => \&get_card, | |
| 29 | |
| 30 # 'ncbibetalactamase' => \&get_ncbibetalactamase, | |
| 31 'ncbi' => \&get_ncbi, | |
| 32 'vfdb' => \&get_vfdb, | |
| 33 'ecoli_vf' => \&get_ecoli_vf, # https://github.com/phac-nml/ecoli_vf | |
| 34 'ecoh' => \&get_ecoh, | |
| 35 'bacmet2' => \&get_bacmet2, | |
| 36 'victors' => \&get_victors, | |
| 37 | |
| 38 # 'serotypefinder' => \&get_serotypefinder, | |
| 39 ); | |
| 40 my $DATABASES = join( ' ', sort keys %DATABASE ); | |
| 41 | |
| 42 #.............................................................................. | |
| 43 # Command line options | |
| 44 | |
| 45 my ( @Options, $debug, $outdir, $db, $force ); | |
| 46 setOptions(); | |
| 47 | |
| 48 $db or err("Please choose a --db from: $DATABASES"); | |
| 49 exists $DATABASE{$db} or err("Unknown --db '$db', choose from: $DATABASES "); | |
| 50 -d $outdir or err("--outdir '$outdir' does not exist"); | |
| 51 | |
| 52 my $dir = abs_path( File::Spec->catdir( $outdir, $db ) ); | |
| 53 make_path($dir); | |
| 54 msg("Setting up '$db' in '$dir'"); | |
| 55 | |
| 56 #my $tmpdir = tempdir("$db-XXXXXXXX", DIR=>$dir, CLEANUP=>0); | |
| 57 #my $tmpdir = "/home/tseemann/git/abricate/db/resfinder/resfinder-6Kuphtvv"; | |
| 58 my $tmpdir = "$dir/src"; | |
| 59 make_path($tmpdir); | |
| 60 | |
| 61 # run the specific function from --db | |
| 62 chdir $tmpdir; | |
| 63 my $seq = $DATABASE{$db}->(); | |
| 64 map { is_full_gene($_) } @$seq; # doesn't do anything? | |
| 65 $seq = dedupe_seq($seq); | |
| 66 | |
| 67 #print Dumper($seq); | |
| 68 msg("Sorting sequences by ID"); | |
| 69 $seq = [ sort { $a->{ID} cmp $b->{ID} } @$seq ]; | |
| 70 save_fasta( "$dir/sequences", $seq ); | |
| 71 | |
| 72 msg("Formatting BLASTN database: $dir/sequences"); | |
| 73 my $logfile = "$tmpdir/makeblastdb.log"; | |
| 74 my $ncbi_title = $db; | |
| 75 if ( "$db" eq "ncbi" ) { | |
| 76 $ncbi_title = "ncbiamrplus"; | |
| 77 } | |
| 78 my $ec = system( | |
| 79 "makeblastdb -in '$dir/sequences' -title '$ncbi_title' -dbtype nucl -hash_index -logfile $logfile" | |
| 80 ); | |
| 81 if ( $ec != 0 ) { | |
| 82 system("tail '$logfile'"); | |
| 83 err("Error with makign BLAST database. See $logfile"); | |
| 84 } | |
| 85 | |
| 86 #msg("Run 'abricate --setupdb' to format the database"); | |
| 87 | |
| 88 msg("Done."); | |
| 89 | |
| 90 #.............................................................................. | |
| 91 | |
| 92 sub download { | |
| 93 my ( $url, $dest ) = @_; | |
| 94 if ( -r $dest and not $force ) { | |
| 95 msg("Won't re-download existing $dest (use --force)"); | |
| 96 | |
| 97 #exit(1); | |
| 98 } | |
| 99 else { | |
| 100 msg("Downloading: $url"); | |
| 101 my $ec = mirror( $url, $dest ); | |
| 102 msg("HTTP Result: $ec"); | |
| 103 ( $ec == 200 or $ec = 304 ) | |
| 104 or err("HTTP $ec | failed to download $url"); # is HTTP OK ? | |
| 105 } | |
| 106 msg("Destination: $dest"); | |
| 107 msg( "Filesize:", ( -s $dest ), "bytes" ); | |
| 108 } | |
| 109 | |
| 110 #.............................................................................. | |
| 111 sub trim_spaces { | |
| 112 my ($s) = @_; | |
| 113 $s =~ s/^\s+//; | |
| 114 $s =~ s/\s+$//; | |
| 115 return $s; | |
| 116 } | |
| 117 | |
| 118 #.............................................................................. | |
| 119 sub get_resfinder { | |
| 120 my $name = "resfinder_db"; | |
| 121 | |
| 122 # FIXME - can we just get HEAD.zip like in plasmidfinder? | |
| 123 my $url = "https://bitbucket.org/genomicepidemiology/$name.git"; | |
| 124 | |
| 125 if (-r $name and not $force) { | |
| 126 msg("Won't overwrite existing $name (use --force)"); | |
| 127 exit(1); | |
| 128 } | |
| 129 else { | |
| 130 msg("Nuking existing folder: $name"); | |
| 131 remove_tree("./$name"); | |
| 132 msg("Cloning $url to $name"); | |
| 133 system("git clone --quiet $url $name"); | |
| 134 } | |
| 135 | |
| 136 #<*.fsa> | |
| 137 #>aac(6')-Ib_2_M23634 | |
| 138 #>blaNDM-19_1_MF370080 | |
| 139 #>mcr-1.1_1_KP347127 | |
| 140 #>fosB1_1_CP001903 | |
| 141 #>fusB_1_AY373761 | |
| 142 #>VanHAX_1_FJ866609 | |
| 143 #>ere(A)_6_DQ157752 | |
| 144 #>nimA_1_X71444 | |
| 145 #>cfr_1_AM408573 | |
| 146 #>catB3_2_U13880 | |
| 147 #>qnrA1_1_AY070235 | |
| 148 #>ARR-2_1_HQ141279 | |
| 149 #>sul1_2_U12338 | |
| 150 #>tet_1_M74049 | |
| 151 #>dfrA19_1_EU855687 | |
| 152 | |
| 153 #<notes.txt> | |
| 154 #aac(6')-Iv:Aminoglycoside resistance: | |
| 155 #aac(6')-Iw:Aminoglycoside resistance:Alternate name; aac(6')-Ix | |
| 156 #sul3:Sulphonamide resistance: | |
| 157 ##Tetracycline: | |
| 158 #ort(B):Tetracycline resistance: | |
| 159 #blaCMY-59:Beta-lactam resistance: | |
| 160 | |
| 161 #<phenotypes.txt> | |
| 162 #Gene_accession no. Class Phenotype PMID Mechanism of resistance Notes Required_gene | |
| 163 #ant(2'')-Ia_1_X04555 Aminoglycoside Gentamicin, Tobramycin 3024112 Enzymatic modification Alternative name aadB | |
| 164 #ant(2'')-Ia_2_JF826500 Aminoglycoside Gentamicin, Tobramycin 22271862 Enzymat | |
| 165 | |
| 166 # $name = "~/apps/bettercallsal/assets/abricate_dbs/$name"; | |
| 167 my $metafn = "$name/phenotypes.txt"; | |
| 168 my @meta = path($metafn)->lines( { chomp => 1 } ); | |
| 169 my %anno; | |
| 170 foreach (@meta) { | |
| 171 next if m/^#/; | |
| 172 my @x = split m/\t/; | |
| 173 | |
| 174 #msg("$metafn: @x"); | |
| 175 my ($gene) = ( $x[0] =~ m/^(.*?)_\w+$/ ); | |
| 176 $anno{$gene}{ABX} = [ | |
| 177 map { trim_spaces($_) } | |
| 178 grep { !m/(unknown|notes|^none)/i } | |
| 179 split m/,\s*/, | |
| 180 $x[2] | |
| 181 ]; | |
| 182 | |
| 183 #msg("$metafn: $gene |", $anno{$gene}{ABX}->@*); | |
| 184 } | |
| 185 msg( "get_resfinder: $metafn", scalar( keys %anno ), "genes" ); | |
| 186 | |
| 187 #print Dumper(\%anno); | |
| 188 | |
| 189 my @seq; | |
| 190 for my $fasta (<$name/*.fsa>) { | |
| 191 | |
| 192 # Issue #62 - repair broken fasta files like this: | |
| 193 # GCTTTAAATTGGAAAAAAGATAGTCAAACTCTTTAA>cmr_1_U43535 | |
| 194 # inline replacement | |
| 195 system( 'sed', '-i.bak', 's/\([A-Z]\)>/\1\n>/gi', $fasta ); | |
| 196 my $args = load_fasta($fasta); | |
| 197 | |
| 198 # use name of fasta file as antibiotic name | |
| 199 #my $abx = basename($fasta, '.fsa'); | |
| 200 #msg("$fasta: Assigning '$abx' to all genes"); | |
| 201 #push @{$_->{ABX}}, $abx for (@$args); | |
| 202 push @seq, @$args; | |
| 203 } | |
| 204 | |
| 205 # https://github.com/tseemann/abricate/issues/92 | |
| 206 # mcr-9_1_NZ_NAAN01000063.1 | |
| 207 #>mcr-9_1_NZ_NAAN01000063.1 | |
| 208 # mcr-9.1:Colistin resistance: | |
| 209 | |
| 210 for my $seq (@seq) { | |
| 211 my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*?)_(\d+)_(\S+)$/; | |
| 212 | |
| 213 #msg("resfinder: $1 $2 $3", $anno{$1}); | |
| 214 $seq->{ID} = "${id}_${copy}"; | |
| 215 $seq->{ACC} = $acc; | |
| 216 $seq->{DESC} = $anno{$id}{DESC} || $id; | |
| 217 push @{ $seq->{ABX} }, @{ $anno{$id}{ABX} } if $anno{$id}{ABX}; | |
| 218 } | |
| 219 | |
| 220 return \@seq; | |
| 221 } | |
| 222 | |
| 223 #.............................................................................. | |
| 224 sub get_serotypefinder { | |
| 225 my $name = "serotypefinder_db"; | |
| 226 my $url = "https://bitbucket.org/genomicepidemiology/$name.git"; | |
| 227 | |
| 228 if ( -r $name and not $force ) { | |
| 229 msg("Won't overwrite existing $name (use --force)"); | |
| 230 | |
| 231 # exit(1); | |
| 232 } | |
| 233 else { | |
| 234 msg("Nuking existing folder: $name"); | |
| 235 remove_tree("./$name"); | |
| 236 msg("Cloning $url to $name"); | |
| 237 system("git clone --quiet $url $name"); | |
| 238 } | |
| 239 | |
| 240 my @seq; | |
| 241 for my $fasta (<$name/*.fsa>) { | |
| 242 push @seq, @{ load_fasta($fasta) }; | |
| 243 } | |
| 244 | |
| 245 # >fliC_44444_AY250028_H52 | |
| 246 # FIXME - this is already in EcOH database! | |
| 247 for my $seq (@seq) { | |
| 248 my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*)_(\d+)_(\w+)$/; | |
| 249 | |
| 250 #msg("serotypefinder: $1 $2 $3", $anno{$1}); | |
| 251 $seq->{ID} = "${id}_${copy}"; | |
| 252 $seq->{ACC} = $acc; | |
| 253 | |
| 254 #$seq->{DESC} = $anno{$id} || ''; | |
| 255 } | |
| 256 | |
| 257 return \@seq; | |
| 258 } | |
| 259 | |
| 260 #.............................................................................. | |
| 261 sub get_tag { | |
| 262 my ( $f, $tag ) = @_; | |
| 263 if ( $f->has_tag($tag) ) { | |
| 264 my ($val) = $f->get_tag_values($tag); | |
| 265 return $val; | |
| 266 } | |
| 267 return ''; | |
| 268 } | |
| 269 | |
| 270 #.............................................................................. | |
| 271 sub get_ncbi { | |
| 272 my $AFP = | |
| 273 "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest"; | |
| 274 | |
| 275 #my $src = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/AMR_CDS"; | |
| 276 my $src = "$AFP/AMR_CDS.fa"; | |
| 277 my $name = "amr_cds.ffn"; | |
| 278 | |
| 279 #my $src2 = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/ReferenceGeneCatalog.txt"; | |
| 280 my $src2 = "$AFP/ReferenceGeneCatalog.txt"; | |
| 281 my $name2 = "amr_cds.tsv"; | |
| 282 | |
| 283 if ( -r $name and -r $name2 and not $force ) { | |
| 284 msg("Won't overwrite existing $name/$name2 (use --force)"); | |
| 285 | |
| 286 # exit(1); | |
| 287 } | |
| 288 else { | |
| 289 download( $src, $name ); | |
| 290 download( $src2, $name2 ); | |
| 291 } | |
| 292 | |
| 293 #1 allele | |
| 294 #2 gene_family ble | |
| 295 #3 whitelisted_taxa | |
| 296 #4 product_name BLMA family bleomycin binding protein | |
| 297 #5 scope core | |
| 298 #6 type AMR | |
| 299 #7 subtype AMR | |
| 300 #8 class BLEOMYCIN | |
| 301 #9 subclass BLEOMYCIN | |
| 302 #10 refseq_protein_accession WP_063842967.1 | |
| 303 #11 refseq_nucleotide_accession NG_047554.1 | |
| 304 #12 curated_refseq_start No | |
| 305 #13 genbank_protein_accession CAA02068.1 | |
| 306 #14 genbank_nucleotide_accession A31900.1 | |
| 307 #15 genbank_strand_orientation + | |
| 308 #16 genbank_cds_start 6 | |
| 309 #17 genbank_cds_stop 374 | |
| 310 #18 pubmed_reference | |
| 311 #19 blacklisted_taxa | |
| 312 #20 db_version 2019-08-27.1 | |
| 313 | |
| 314 my $tsv = load_tabular( $name2, 10 ); # refseq_nucleotide_accession | |
| 315 my $tsv2 = load_tabular( $name2, 12 ); # refseq_nucleotide_accession | |
| 316 | |
| 317 msg( "[$name2] Loaded", scalar keys %$tsv, "records" ); | |
| 318 msg( "[$name2] Loaded", scalar keys %$tsv2, "records" ); | |
| 319 # print keys %$tsv; | |
| 320 # exit; | |
| 321 | |
| 322 # https://github.com/ncbi/amr/wiki/AMRFinderPlus-database#amrprot | |
| 323 # 0 1 2 3 4 5 6 7 | |
| 324 # >1000909371|WP_061158039.1|NG_050200|1|1|blaTEM-156|blaTEM|class_A_beta-lactamase_TEM-156 NG_050200:101-961 | |
| 325 my @seq; | |
| 326 my $in = Bio::SeqIO->new( -file => $name, -format => "fasta" ); | |
| 327 while ( my $rec = $in->next_seq ) { | |
| 328 | |
| 329 # parse ID | |
| 330 my ( $gi, $pi, $acc, $fp, $fn, $gene, $fam, $prod ) = split m/\|/, | |
| 331 $rec->id; | |
| 332 | |
| 333 # skip fusion genes | |
| 334 next unless $fp == 1 and $fn == 1; | |
| 335 | |
| 336 # only keep true ARGs | |
| 337 $acc .= ".1" unless $acc =~ m/\.\d+$/; | |
| 338 #my $t = $tsv->{$acc} or next; | |
| 339 my $t; | |
| 340 if ($tsv->{$acc}) { | |
| 341 $t = $tsv->{$acc} | |
| 342 } elsif ($tsv2->{$pi}) { | |
| 343 # print $pi, ": pi\n"; | |
| 344 # print $acc, ": acc\n"; | |
| 345 # print "$pi | scope: ", $t->{scope}, "\n"; | |
| 346 # exit; | |
| 347 $t = $tsv->{$pi} | |
| 348 } else { | |
| 349 next; | |
| 350 } | |
| 351 #next unless ( $t->{scope} eq 'plus' || $t->{scope} eq 'core' ); | |
| 352 | |
| 353 # next unless $t->{type} eq 'VIRULENCE'; | |
| 354 # next unless $t->{subtype} eq 'VIRULENCE'; | |
| 355 # construct sequence record | |
| 356 $prod =~ s/_/ /g; | |
| 357 err("$pi: gene is empty") unless $gene; | |
| 358 err("$pi: product is empty") unless $prod; | |
| 359 my $s = { | |
| 360 ID => $gene, | |
| 361 ACC => $acc, | |
| 362 DESC => $prod, | |
| 363 SEQ => $rec->seq, | |
| 364 ABX => [ split m'/', $t->{subclass} ] | |
| 365 }; | |
| 366 push @seq, $s; | |
| 367 msg( "[$name]", 0 + @seq, "|", $s->{ID}, "|", $s->{ACC}, "|", | |
| 368 $s->{DESC} ); | |
| 369 | |
| 370 #msg(Dumper($s)); | |
| 371 msg( $s->{ID}, " is fusion $fp/$fn" ) if "$fp$fn" ne '11'; | |
| 372 } | |
| 373 return \@seq; | |
| 374 } | |
| 375 | |
| 376 #.............................................................................. | |
| 377 sub get_plasmidfinder { | |
| 378 my $name = "plasmidfinder"; | |
| 379 my $zip = "$name.zip"; | |
| 380 | |
| 381 # download("https://cge.cbs.dtu.dk/cge/download_data.php?folder=$name&filename=$zip&submit=$zip", $zip); | |
| 382 download( | |
| 383 "https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/HEAD.zip", | |
| 384 $zip | |
| 385 ); | |
| 386 system("unzip -j -u $zip"); | |
| 387 | |
| 388 my @seq; | |
| 389 for my $fasta (<*.fsa>) { | |
| 390 push @seq, @{ load_fasta($fasta) }; | |
| 391 } | |
| 392 | |
| 393 for my $seq (@seq) { | |
| 394 $seq->{DESC} = $seq->{ID}; # no desc, so use ORIGINAL ID as desc | |
| 395 my ( $id, $acc ) = | |
| 396 ( $seq->{ID} =~ m/^(.*)_(([A-Z]+|NC_)\d+(\.\d+)?)$/ ); | |
| 397 $id =~ s/_+$//g; | |
| 398 $seq->{ID} = $id || $seq->{ID}; | |
| 399 $seq->{ACC} = $acc || ''; | |
| 400 wrn( "Parsed empty ID:", | |
| 401 $seq->{DESC}, | |
| 402 "=> id='$id' acc='$acc' seq=" . substr( $seq->{SEQ}, 0, 10 ) ) | |
| 403 if not $id; | |
| 404 } | |
| 405 | |
| 406 return \@seq; | |
| 407 } | |
| 408 | |
| 409 #.............................................................................. | |
| 410 sub get_megares { | |
| 411 my $zip = "megares.zip"; | |
| 412 download( 'https://www.meglab.org/downloads/megares_v3.00.zip', $zip ); | |
| 413 system("unzip -j -u $zip"); | |
| 414 my $seqs = load_fasta( glob("megares_drugs_*.fasta") ); | |
| 415 my @okseq; | |
| 416 | |
| 417 # >MEG_372|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_MATE_efflux_pumps|ABEM | |
| 418 # >MEG_411|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_RND_efflux_regulator|ACRR|RequiresSNPConfirmation | |
| 419 # >MEG_7860|Drugs|betalactams|Class_B_betalactamases|ZOG | |
| 420 # >MEG_7439|Drugs|Glycopeptides|VanI-type_resistance_protein|VANI | |
| 421 # >MEG_7245|Drugs|Tetracyclines|Tetracycline_resistance_MFS_efflux_pumps|TETY | |
| 422 # >MEG_9|Drugs|Aminoglycosides|Aminoglycoside-resistant_16S_ribosomal_subunit_protein|A16S|RequiresSNPConfirmation | |
| 423 | |
| 424 for my $s (@$seqs) { | |
| 425 my ( $id, $type, $class, $mech, $group, $note ) = split m/\|/, $s->{ID}; | |
| 426 if ($note) { | |
| 427 | |
| 428 # "RequiresSNPConfirmation" is the common one; we can't do that | |
| 429 msg("Skipping $id due to: $note"); | |
| 430 next; | |
| 431 } | |
| 432 $s->{ID} = $group; | |
| 433 $s->{ACC} = $id; | |
| 434 $s->{DESC} = join( ':', $type, $class, $mech, $group ); | |
| 435 push @okseq, $s; | |
| 436 } | |
| 437 return [@okseq]; | |
| 438 | |
| 439 #return $seqs; | |
| 440 } | |
| 441 | |
| 442 #.............................................................................. | |
| 443 sub get_argannot { | |
| 444 my $fasta = 'arg-annot.fa'; | |
| 445 download( | |
| 446 # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/1425/argannot-aa-v3-march2017_doc.fasta', | |
| 447 # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/691/argannot-nt_doc.fasta', | |
| 448 'https://www.mediterranee-infection.com/wp-content/uploads/2019/09/ARG-ANNOT_NT_V6_July2019.txt', | |
| 449 $fasta | |
| 450 ); | |
| 451 | |
| 452 # fix syntax errors in the FASTA file... | |
| 453 path($fasta)->edit( sub { s/\\//g; $_ } ); | |
| 454 | |
| 455 my $seqs = load_fasta($fasta); | |
| 456 | |
| 457 # 0 1 2 3 | |
| 458 # >(AGly)Aac2-Ie:NC_011896:3039059-3039607:549 | |
| 459 for my $s (@$seqs) { | |
| 460 my @x = split m/:/, $s->{ID}; | |
| 461 $s->{ID} = $x[0]; | |
| 462 $s->{ACC} = $x[1] . ':' . $x[2]; | |
| 463 $s->{DESC} = ''; | |
| 464 } | |
| 465 | |
| 466 return $seqs; | |
| 467 } | |
| 468 | |
| 469 #.............................................................................. | |
| 470 sub get_bacmet2 { | |
| 471 my $fasta = 'bacmet2.fa'; | |
| 472 download( | |
| 473 'http://bacmet.biomedicine.gu.se/download/BacMet2_EXP_database.fasta', | |
| 474 $fasta ); | |
| 475 | |
| 476 # This is a PROTEIN file | |
| 477 my $seqs = load_fasta($fasta); | |
| 478 | |
| 479 # 0 1 2 3 4 ^ | |
| 480 # >BAC0098|ctpC|sp|P0A502|CTPC_MYCTU Probable manganese/zinc-exporting | |
| 481 for my $s (@$seqs) { | |
| 482 my @x = split m/\|/, $s->{ID}; | |
| 483 $s->{ID} = $x[1] . '-' . $x[0]; | |
| 484 $s->{ACC} = $x[2] . ':' . $x[3]; | |
| 485 } | |
| 486 | |
| 487 return $seqs; | |
| 488 } | |
| 489 | |
| 490 #.............................................................................. | |
| 491 sub get_card { | |
| 492 | |
| 493 # https://github.com/tseemann/abricate/issues/25 | |
| 494 my $tarball = 'card.tar.bz2'; | |
| 495 download( | |
| 496 #'https://card.mcmaster.ca/download/0/broadstreet-v2.0.2.tar.gz', | |
| 497 'https://card.mcmaster.ca/latest/data', | |
| 498 | |
| 499 #'https://card.mcmaster.ca/latest/data/card-data.tar.bz2', | |
| 500 $tarball # yes, it's really BZ2 not GZ ... | |
| 501 ); | |
| 502 | |
| 503 # my $fasta = "./nucleotide_fasta_protein_homolog_model.fasta"; | |
| 504 my $jsonfile = "./card.json"; | |
| 505 system( "tar", "xf", $tarball, $jsonfile ) == 0 | |
| 506 or err("Problem with tar xf $tarball $jsonfile"); | |
| 507 -r $jsonfile or err("Could not extract $jsonfile from $tarball"); | |
| 508 | |
| 509 # JSON | |
| 510 my $json = path($jsonfile)->slurp_utf8; | |
| 511 my $card = from_json( $json, { latin1 => 1 } ); | |
| 512 my @seq; | |
| 513 for my $g ( values %$card ) { | |
| 514 next unless ref($g) eq 'HASH'; | |
| 515 | |
| 516 # msg(Dumper($g)); | |
| 517 next | |
| 518 unless $g->{model_type} eq | |
| 519 "protein homolog model"; # only 'acquired' genes | |
| 520 my $id = $g->{model_name}; | |
| 521 err("$id has {model_param}{snp}") if exists $g->{model_param}{snp}; | |
| 522 | |
| 523 # msg("CARD: $id"); | |
| 524 # print STDERR Dumper($g); | |
| 525 my $dna = $g->{model_sequences}{sequence} | |
| 526 or err("$id: no {model_sequences}{sequence} found"); | |
| 527 my ($key) = sort keys %$dna; # first key | |
| 528 $dna = $dna->{$key} or err("$id: invalid key '$key'"); | |
| 529 $dna = $dna->{dna_sequence} or err("$id: no dna_sequence"); | |
| 530 | |
| 531 # msg(Dumper($dna)) if $id eq 'OXA-25'; | |
| 532 | |
| 533 # ARO_category => { | |
| 534 # 'category_aro_name' => 'cephalosporin', | |
| 535 # 'category_aro_class_name' => 'Drug Class', | |
| 536 my $is_amr_gene = 0; | |
| 537 my @abx; | |
| 538 for my $key ( keys $g->{ARO_category}->%* ) { | |
| 539 my $c = $g->{ARO_category}{$key}; | |
| 540 if ( $c->{category_aro_class_name} eq 'Drug Class' ) { | |
| 541 my $abx = $c->{category_aro_name}; | |
| 542 $abx =~ s/ antibiotic//; | |
| 543 $abx =~ s/\s/_/g; | |
| 544 push @abx, $abx; | |
| 545 } | |
| 546 if ( $c->{category_aro_class_name} eq 'AMR Gene Family' ) { | |
| 547 $is_amr_gene++; | |
| 548 } | |
| 549 } | |
| 550 | |
| 551 #err("CARD | $id | ", Dumper($g->{ARO_category}) ) unless $is_amr_gene; | |
| 552 #msg("ABX=$_") for @abx; | |
| 553 | |
| 554 # put coordinates into normal form | |
| 555 my ( $start, $stop ) = | |
| 556 $dna->{strand} eq '-' | |
| 557 ? ( $dna->{fmax}, $dna->{fmin} ) | |
| 558 : ( $dna->{fmin}, $dna->{fmax} ); | |
| 559 | |
| 560 $id =~ s/\s+/_/g; | |
| 561 push @seq, | |
| 562 { | |
| 563 ID => $id, | |
| 564 ACC => $dna->{accession} . ":$start-$stop", | |
| 565 DESC => ( $g->{ARO_description} || $g->{ARO_accession} ), | |
| 566 SEQ => $dna->{sequence}, | |
| 567 ABX => [@abx], | |
| 568 }; | |
| 569 | |
| 570 # msg(Dumper($seq[-1])); | |
| 571 } | |
| 572 | |
| 573 return \@seq; | |
| 574 } | |
| 575 | |
| 576 #.............................................................................. | |
| 577 sub get_victors { | |
| 578 | |
| 579 # the CDS data is in .ffn and has source GI and coords | |
| 580 # the PROT data is in .faa and has the protein ref and /product | |
| 581 | |
| 582 #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence | |
| 583 download( 'http://www.phidias.us/victors/downloads/gen_downloads.php', | |
| 584 'victors.ffn' ); | |
| 585 | |
| 586 #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni] | |
| 587 download( | |
| 588 'http://www.phidias.us/victors/downloads/gen_downloads_protein.php', | |
| 589 'victors.faa' ); | |
| 590 | |
| 591 my %gi; | |
| 592 open my $FAA, '<', 'victors.faa'; | |
| 593 while (<$FAA>) { | |
| 594 | |
| 595 #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni] | |
| 596 next unless m"^>gi.(\d+).ref.([^|]+). ([^[]+)"; | |
| 597 $gi{$1}{ACC} = $2; | |
| 598 $gi{$1}{DESC} = $3; | |
| 599 } | |
| 600 | |
| 601 my $seqs = load_fasta("victors.ffn"); | |
| 602 | |
| 603 for my $s (@$seqs) { | |
| 604 | |
| 605 #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence | |
| 606 $s->{ID} =~ m/gi.(\d+):(\d+)-(\d+)/; | |
| 607 $s->{ACC} = $gi{$1}{ACC} || "gi|$1:$2-$3"; | |
| 608 $s->{DESC} =~ $gi{$1}{DESC} || 'hypothetical protein'; | |
| 609 } | |
| 610 | |
| 611 # print Dumper($seqs); exit; | |
| 612 | |
| 613 return $seqs; | |
| 614 } | |
| 615 | |
| 616 #.............................................................................. | |
| 617 sub get_vfdb { | |
| 618 download( 'http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', | |
| 619 'vfdb.fa.gz' ); | |
| 620 system("gzip -f -d -c vfdb.fa.gz > vfdb.fa"); | |
| 621 my $seqs = load_fasta("vfdb.fa"); | |
| 622 | |
| 623 # >VFG000676(gb|AAD32411) (lef) anthrax toxin lethal factor precursor [Anthrax toxin (VF0142)] [Bacillus anthracis str. Sterne] | |
| 624 for my $s (@$seqs) { | |
| 625 | |
| 626 # https://github.com/tseemann/abricate/issues/64#issuecomment-421895159 by @VGalata | |
| 627 $s->{ID} =~ m/^(\w+)\(\w+\|(\w+)(\.\d+)?\)$/; # | |
| 628 #$s->{ID} =~ m/^(\w+)\(\w+\|(\w+)\)$/; | |
| 629 $s->{ACC} = $2 if $2; | |
| 630 $s->{DESC} =~ m/^\((.*?)\)/; | |
| 631 $s->{ID} = $1 if $1; | |
| 632 | |
| 633 # print STDERR Dumper($s); exit; | |
| 634 } | |
| 635 | |
| 636 return $seqs; | |
| 637 } | |
| 638 | |
| 639 #.............................................................................. | |
| 640 sub get_ncbibetalactamase { | |
| 641 my $fasta = "ncbi.fa"; | |
| 642 download( | |
| 643 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele-dna.fa', | |
| 644 $fasta ); | |
| 645 my $tab = "ncbi.tab"; | |
| 646 download( 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele.tab', | |
| 647 $tab ); | |
| 648 | |
| 649 # >ACD12694.1 EU650653.1:1-1173 | |
| 650 my $seqs = load_fasta($fasta); | |
| 651 | |
| 652 # ACC-1 ACD12694.1 EU650653.1 blaACC-1 1 1173 + cephalosporin-hydrolyzing class C beta-lactamase ACC-1 | |
| 653 my %anno; | |
| 654 my @anno = grep { !m/^#/ } path($tab)->lines( { chomp => 1 } ); | |
| 655 msg( "Read", 0 + @anno, "annotations" ); | |
| 656 foreach (@anno) { | |
| 657 my ( $name, $id, $acc, $gene, $begin, $end, undef, $product ) = | |
| 658 split m/\t/; | |
| 659 $anno{$id} = { | |
| 660 ID => $gene, | |
| 661 DESC => $product, | |
| 662 ACC => "$acc:$begin-$end", | |
| 663 }; | |
| 664 } | |
| 665 | |
| 666 # print Dumper(\%anno); | |
| 667 | |
| 668 for my $s (@$seqs) { | |
| 669 my $id = $s->{ID}; | |
| 670 next unless exists $anno{$id}; | |
| 671 $s->{ID} = $anno{$id}{ID}; | |
| 672 $s->{ACC} = $anno{$id}{ACC}; | |
| 673 $s->{DESC} = $anno{$id}{DESC}; | |
| 674 } | |
| 675 | |
| 676 # print Dumper($seqs); | |
| 677 | |
| 678 return $seqs; | |
| 679 } | |
| 680 | |
| 681 #.............................................................................. | |
| 682 sub get_ecoh { | |
| 683 my $fasta = "EcOH.fa"; | |
| 684 download( | |
| 685 'https://raw.githubusercontent.com/katholt/srst2/master/data/EcOH.fasta', | |
| 686 $fasta | |
| 687 ); | |
| 688 | |
| 689 # https://github.com/katholt/srst2#generating-srst2-compatible-clustered-database-from-raw-sequences | |
| 690 # [clusterID]__[gene]__[allele]__[seqID] [other stuff] | |
| 691 # >1__fliC__fliC-H1__1 AB028471.1;flagellin;H1 | |
| 692 # >8__wzx__wzx-O41__246 AB811617.1;O antigen flippase;O41 | |
| 693 # >9__wzy__wzy-OgN31__597 LC125932.1;O antigen polyermase;OgN31 | |
| 694 my $seqs = load_fasta($fasta); | |
| 695 | |
| 696 for my $s (@$seqs) { | |
| 697 my @id = split m/__/, $s->{ID}; | |
| 698 my @desc = split m';', $s->{DESC}; | |
| 699 $s->{ID} = $id[2]; | |
| 700 $s->{ACC} = shift(@desc); | |
| 701 $s->{DESC} = join( ' ', @desc ); | |
| 702 } | |
| 703 | |
| 704 # print Dumper($seqs); | |
| 705 return $seqs; | |
| 706 } | |
| 707 | |
| 708 #.............................................................................. | |
| 709 sub get_ecoli_vf { | |
| 710 my $fasta = "ecoli_vf.ffn"; | |
| 711 download( | |
| 712 'https://github.com/phac-nml/ecoli_vf/raw/master/data/repaired_ecoli_vfs_shortnames.ffn', | |
| 713 $fasta | |
| 714 ); | |
| 715 my $seqs = load_fasta($fasta); | |
| 716 | |
| 717 # >VFG000748(gi:2865308) (espF) EspF [EspF (VF0182)] [Escherichia coli O127:H6 str. E2348/69] | |
| 718 # >VFG000749(gi:6009379) (bfpA) Bundlin [BFP (VF0174)] [Escherichia coli B171] | |
| 719 # >SPG000142 (cvac) Escherichia coli cvi cvaC operon. [X57525 434-745] | |
| 720 # >SPG000143 (iss2) Escherichia coli Iss (iss) gene, complete cds. [AF042279 292-600] | |
| 721 | |
| 722 for my $s (@$seqs) { | |
| 723 | |
| 724 #print STDERR Dumper("IN", $s); | |
| 725 $s->{ID} =~ m/ ^ (\w+) (?: \( (.*?) \) )? $ /x | |
| 726 or die "Can't parse $fasta at " . Dumper($s); | |
| 727 $s->{ID} = $1 if $1; | |
| 728 $s->{ACC} = $2 || $1; | |
| 729 $s->{DESC} =~ s/\s\[.*?\]$//g; # remove strain name | |
| 730 $s->{DESC} =~ m/^(?:\((.*?)\)\s+)?(.*?)$/; | |
| 731 $s->{ID} = $1 if $1; | |
| 732 $s->{DESC} = $2; | |
| 733 | |
| 734 #print STDERR Dumper("OUT", $s); | |
| 735 #print STDERR "="x60, "\n"; | |
| 736 } | |
| 737 | |
| 738 # print Dumper($seqs); | |
| 739 return $seqs; | |
| 740 } | |
| 741 | |
| 742 #.............................................................................. | |
| 743 sub is_full_gene { | |
| 744 my ($s) = @_; | |
| 745 my $has_ambig = 0; | |
| 746 | |
| 747 my $id = $s->{ID}; | |
| 748 my $L = length( $s->{SEQ} ); | |
| 749 if ( $L % 3 != 0 ) { | |
| 750 wrn("$id - length $L bp is not multiple of 3"); | |
| 751 return; | |
| 752 } | |
| 753 if ( $s->{SEQ} !~ m/^[AGCT]+$/ ) { | |
| 754 wrn("$id - has non-AGTC bases"); | |
| 755 return; | |
| 756 } | |
| 757 | |
| 758 my $seq = Bio::Seq->new( -id => $s->{ID}, -seq => $s->{SEQ} ); | |
| 759 | |
| 760 my $aa = $seq->translate->seq; | |
| 761 | |
| 762 if ( $aa =~ m/\*./ ) { | |
| 763 wrn("$id - has internal stop codons, trying revcom"); | |
| 764 $aa = $seq->revcom->translate->seq; | |
| 765 if ( $aa =~ m/\*./ ) { | |
| 766 wrn("$id - revcom has internal stop codons too"); | |
| 767 return; | |
| 768 } | |
| 769 else { | |
| 770 msg("$id - revcom resolves problem, hooray!"); | |
| 771 $s->{SEQ} = $seq->revcom->seq; | |
| 772 } | |
| 773 } | |
| 774 | |
| 775 return $L; | |
| 776 } | |
| 777 | |
| 778 #.............................................................................. | |
| 779 sub dedupe_seq { | |
| 780 my ($seq) = @_; | |
| 781 my %seen; | |
| 782 my $good = []; | |
| 783 for my $s (@$seq) { | |
| 784 if ( $seen{ $s->{SEQ} } ) { | |
| 785 wrn( "duplicate", length( $s->{SEQ} ), | |
| 786 "bp sequence:", $s->{ID}, '~', $seen{ $s->{SEQ} } ); | |
| 787 } | |
| 788 else { | |
| 789 push @$good, $s; | |
| 790 } | |
| 791 $seen{ $s->{SEQ} } .= ' ' . $s->{ID}; | |
| 792 } | |
| 793 msg( "dedupe_seq: read", scalar(@$seq), "/ kept", scalar(@$good) ); | |
| 794 return $good; | |
| 795 } | |
| 796 | |
| 797 #.............................................................................. | |
| 798 sub load_tabular { | |
| 799 my ( $fname, $keycol, $sep ) = @_; | |
| 800 $keycol //= 0; | |
| 801 $sep //= "\t"; | |
| 802 my $hash = {}; | |
| 803 my @hdr; | |
| 804 my $row = 0; | |
| 805 open my $TSV, '<', $fname or err("Can't read TSV file: $fname"); | |
| 806 while (<$TSV>) { | |
| 807 chomp; | |
| 808 my @col = split m/$sep/; | |
| 809 $row++; | |
| 810 if (@hdr) { | |
| 811 @hdr == @col or err("Header and row $row differ in column count"); | |
| 812 | |
| 813 #my $key = $col[$keycol] or wrn("Empty key column $keycol: $_"); | |
| 814 #exists{$hash->{$col[$key}} and wrn("WARNING: dupe key $key at row: $_"); | |
| 815 my $key = $col[$keycol]; | |
| 816 $hash->{$key} ||= | |
| 817 { map { ( $hdr[$_] => $col[$_] ) } ( 0 .. $#hdr ) }; | |
| 818 } | |
| 819 else { | |
| 820 @hdr = @col; | |
| 821 } | |
| 822 } | |
| 823 close $TSV; | |
| 824 return $hash; | |
| 825 } | |
| 826 | |
| 827 #.............................................................................. | |
| 828 sub load_fasta { | |
| 829 my ($fasta) = @_; | |
| 830 my %seen; | |
| 831 my $list; | |
| 832 my $dbtype = 'unknown'; | |
| 833 msg("load_fasta: $fasta"); | |
| 834 my $in = Bio::SeqIO->new( -file => $fasta, -format => 'fasta' ); | |
| 835 while ( my $seq = $in->next_seq ) { | |
| 836 my $id = $seq->id or err("Empty ID in $fasta"); | |
| 837 if ( $seen{$id} ) { | |
| 838 wrn("Duplicate ID '$id' in $fasta"); | |
| 839 $id = $id . '_dupe'; | |
| 840 } | |
| 841 $seen{$id}++; | |
| 842 my $s = uc( $seq->seq ); | |
| 843 $dbtype = $seq->alphabet eq 'dna' ? 'nucl' : 'prot'; | |
| 844 $dbtype eq 'nucl' ? $s =~ s/[^AGTC]/N/g : $s =~ s/[^A-Z]/X/g; | |
| 845 push @$list, | |
| 846 { | |
| 847 ID => $id, | |
| 848 ACC => '', | |
| 849 DESC => $seq->desc, | |
| 850 SEQ => $s, | |
| 851 TYPE => $dbtype, | |
| 852 }; | |
| 853 } | |
| 854 msg( "load_fasta: read", scalar(@$list), "$dbtype sequences" ); | |
| 855 return $list; | |
| 856 } | |
| 857 | |
| 858 #.............................................................................. | |
| 859 sub save_fasta { | |
| 860 my ( $fasta, $seq ) = @_; | |
| 861 msg("save_fasta: $fasta"); | |
| 862 my %seen; | |
| 863 my $out = Bio::SeqIO->new( -file => ">$fasta", -format => 'fasta' ); | |
| 864 for my $s (@$seq) { | |
| 865 $seen{ $s->{ID} }++; | |
| 866 my $freq = $seen{ $s->{ID} }; | |
| 867 | |
| 868 #wrn("seen $s->{ID} now $freq times") if $freq > 1; | |
| 869 # print Dumper($s); | |
| 870 my $ABX = | |
| 871 defined( $s->{ABX} ) ? join( $ABX_SEP, sort @{ $s->{ABX} } ) : ''; | |
| 872 $ABX =~ s/\s+/_/g; # remove spaces! | |
| 873 my $obj = Bio::Seq->new( | |
| 874 -id => join( '~~~', $db, $s->{ID}, $s->{ACC}, $ABX ), | |
| 875 -desc => ( $s->{DESC} || $s->{ID} ), | |
| 876 -seq => $s->{SEQ}, | |
| 877 ); | |
| 878 | |
| 879 # $obj->desc( hash_encode($s) ); | |
| 880 $out->write_seq($obj); | |
| 881 | |
| 882 # $seen{ $s->{ID} }++; | |
| 883 } | |
| 884 msg( "save_fasta: wrote", scalar(@$seq), "sequences" ); | |
| 885 } | |
| 886 | |
| 887 #---------------------------------------------------------------------- | |
| 888 sub msg { | |
| 889 print STDERR "@_\n"; | |
| 890 } | |
| 891 | |
| 892 #---------------------------------------------------------------------- | |
| 893 sub wrn { | |
| 894 msg( "WARNING:", @_ ) if $debug; | |
| 895 } | |
| 896 | |
| 897 #---------------------------------------------------------------------- | |
| 898 sub err { | |
| 899 msg( "ERROR:", @_ ); | |
| 900 exit(1); | |
| 901 } | |
| 902 | |
| 903 #---------------------------------------------------------------------- | |
| 904 # Option setting routines | |
| 905 | |
| 906 sub setOptions { | |
| 907 use Getopt::Long; | |
| 908 | |
| 909 @Options = ( | |
| 910 { OPT => "help", VAR => \&usage, DESC => "This help" }, | |
| 911 { | |
| 912 OPT => "debug!", | |
| 913 VAR => \$debug, | |
| 914 DEFAULT => 0, | |
| 915 DESC => "Verbose debug output" | |
| 916 }, | |
| 917 { | |
| 918 OPT => "dbdir=s", | |
| 919 VAR => \$outdir, | |
| 920 DEFAULT => abs_path("$FindBin::RealBin/../db"), | |
| 921 DESC => "Parent folder" | |
| 922 }, | |
| 923 { | |
| 924 OPT => "db=s", | |
| 925 VAR => \$db, | |
| 926 DEFAULT => "", | |
| 927 DESC => "Choices: $DATABASES" | |
| 928 }, | |
| 929 { | |
| 930 OPT => "force!", | |
| 931 VAR => \$force, | |
| 932 DEFAULT => 0, | |
| 933 DESC => "Force download even if exists" | |
| 934 }, | |
| 935 ); | |
| 936 | |
| 937 &GetOptions( map { $_->{OPT}, $_->{VAR} } @Options ) || usage(1); | |
| 938 | |
| 939 # Now setup default values. | |
| 940 foreach (@Options) { | |
| 941 if ( defined( $_->{DEFAULT} ) && !defined( ${ $_->{VAR} } ) ) { | |
| 942 ${ $_->{VAR} } = $_->{DEFAULT}; | |
| 943 } | |
| 944 } | |
| 945 } | |
| 946 | |
| 947 sub usage { | |
| 948 my ($exitcode) = @_; | |
| 949 $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref | |
| 950 $exitcode ||= 0; | |
| 951 select STDERR if $exitcode; # write to STDERR if exitcode is error | |
| 952 | |
| 953 print "SYNOPIS\n Download databases for abricate to use\n"; | |
| 954 print "USAGE\n $EXE [options] --db DATABASE\n"; | |
| 955 print "OPTIONS\n"; | |
| 956 foreach (@Options) { | |
| 957 printf " --%-13s %s%s.\n", $_->{OPT}, $_->{DESC}, | |
| 958 defined( $_->{DEFAULT} ) ? " (default '$_->{DEFAULT}')" : ""; | |
| 959 } | |
| 960 exit($exitcode); | |
| 961 } | |
| 962 | |
| 963 #---------------------------------------------------------------------- | |
| 964 |
