Mercurial > repos > kkonganti > cfsan_bettercallsal
comparison 0.7.0/assets/abricate-get_db @ 17:0e7a0053e4a6
planemo upload
author | kkonganti |
---|---|
date | Mon, 15 Jul 2024 10:42:02 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
16:b90e5a7a3d4f | 17:0e7a0053e4a6 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use FindBin; | |
5 use Bio::SeqIO; | |
6 use Bio::Seq; | |
7 use Path::Tiny; | |
8 use File::Basename; | |
9 use File::Spec; | |
10 use File::Path qw(make_path remove_tree); | |
11 use List::Util qw(first); | |
12 use Cwd qw(abs_path); | |
13 use Data::Dumper; | |
14 use LWP::Simple; | |
15 use JSON; | |
16 | |
17 #.............................................................................. | |
18 # Globals | |
19 | |
20 my $EXE = basename($0); | |
21 my $ABX_SEP = ';'; | |
22 | |
23 my %DATABASE = ( | |
24 'resfinder' => \&get_resfinder, | |
25 'plasmidfinder' => \&get_plasmidfinder, | |
26 'megares' => \&get_megares, | |
27 'argannot' => \&get_argannot, | |
28 'card' => \&get_card, | |
29 | |
30 # 'ncbibetalactamase' => \&get_ncbibetalactamase, | |
31 'ncbi' => \&get_ncbi, | |
32 'vfdb' => \&get_vfdb, | |
33 'ecoli_vf' => \&get_ecoli_vf, # https://github.com/phac-nml/ecoli_vf | |
34 'ecoh' => \&get_ecoh, | |
35 'bacmet2' => \&get_bacmet2, | |
36 'victors' => \&get_victors, | |
37 | |
38 # 'serotypefinder' => \&get_serotypefinder, | |
39 ); | |
40 my $DATABASES = join( ' ', sort keys %DATABASE ); | |
41 | |
42 #.............................................................................. | |
43 # Command line options | |
44 | |
45 my ( @Options, $debug, $outdir, $db, $force ); | |
46 setOptions(); | |
47 | |
48 $db or err("Please choose a --db from: $DATABASES"); | |
49 exists $DATABASE{$db} or err("Unknown --db '$db', choose from: $DATABASES "); | |
50 -d $outdir or err("--outdir '$outdir' does not exist"); | |
51 | |
52 my $dir = abs_path( File::Spec->catdir( $outdir, $db ) ); | |
53 make_path($dir); | |
54 msg("Setting up '$db' in '$dir'"); | |
55 | |
56 #my $tmpdir = tempdir("$db-XXXXXXXX", DIR=>$dir, CLEANUP=>0); | |
57 #my $tmpdir = "/home/tseemann/git/abricate/db/resfinder/resfinder-6Kuphtvv"; | |
58 my $tmpdir = "$dir/src"; | |
59 make_path($tmpdir); | |
60 | |
61 # run the specific function from --db | |
62 chdir $tmpdir; | |
63 my $seq = $DATABASE{$db}->(); | |
64 map { is_full_gene($_) } @$seq; # doesn't do anything? | |
65 $seq = dedupe_seq($seq); | |
66 | |
67 #print Dumper($seq); | |
68 msg("Sorting sequences by ID"); | |
69 $seq = [ sort { $a->{ID} cmp $b->{ID} } @$seq ]; | |
70 save_fasta( "$dir/sequences", $seq ); | |
71 | |
72 msg("Formatting BLASTN database: $dir/sequences"); | |
73 my $logfile = "$tmpdir/makeblastdb.log"; | |
74 my $ncbi_title = $db; | |
75 if ( "$db" eq "ncbi" ) { | |
76 $ncbi_title = "ncbiamrplus"; | |
77 } | |
78 my $ec = system( | |
79 "makeblastdb -in '$dir/sequences' -title '$ncbi_title' -dbtype nucl -hash_index -logfile $logfile" | |
80 ); | |
81 if ( $ec != 0 ) { | |
82 system("tail '$logfile'"); | |
83 err("Error with makign BLAST database. See $logfile"); | |
84 } | |
85 | |
86 #msg("Run 'abricate --setupdb' to format the database"); | |
87 | |
88 msg("Done."); | |
89 | |
90 #.............................................................................. | |
91 | |
92 sub download { | |
93 my ( $url, $dest ) = @_; | |
94 if ( -r $dest and not $force ) { | |
95 msg("Won't re-download existing $dest (use --force)"); | |
96 | |
97 #exit(1); | |
98 } | |
99 else { | |
100 msg("Downloading: $url"); | |
101 my $ec = mirror( $url, $dest ); | |
102 msg("HTTP Result: $ec"); | |
103 ( $ec == 200 or $ec = 304 ) | |
104 or err("HTTP $ec | failed to download $url"); # is HTTP OK ? | |
105 } | |
106 msg("Destination: $dest"); | |
107 msg( "Filesize:", ( -s $dest ), "bytes" ); | |
108 } | |
109 | |
110 #.............................................................................. | |
111 sub trim_spaces { | |
112 my ($s) = @_; | |
113 $s =~ s/^\s+//; | |
114 $s =~ s/\s+$//; | |
115 return $s; | |
116 } | |
117 | |
118 #.............................................................................. | |
119 sub get_resfinder { | |
120 my $name = "resfinder_db"; | |
121 | |
122 # FIXME - can we just get HEAD.zip like in plasmidfinder? | |
123 my $url = "https://bitbucket.org/genomicepidemiology/$name.git"; | |
124 | |
125 # if (-r $name and not $force) { | |
126 # msg("Won't overwrite existing $name (use --force)"); | |
127 # # exit(1); | |
128 # } | |
129 # else { | |
130 # msg("Nuking existing folder: $name"); | |
131 # remove_tree("./$name"); | |
132 # msg("Cloning $url to $name"); | |
133 # system("git clone --quiet $url $name"); | |
134 # } | |
135 | |
136 #<*.fsa> | |
137 #>aac(6')-Ib_2_M23634 | |
138 #>blaNDM-19_1_MF370080 | |
139 #>mcr-1.1_1_KP347127 | |
140 #>fosB1_1_CP001903 | |
141 #>fusB_1_AY373761 | |
142 #>VanHAX_1_FJ866609 | |
143 #>ere(A)_6_DQ157752 | |
144 #>nimA_1_X71444 | |
145 #>cfr_1_AM408573 | |
146 #>catB3_2_U13880 | |
147 #>qnrA1_1_AY070235 | |
148 #>ARR-2_1_HQ141279 | |
149 #>sul1_2_U12338 | |
150 #>tet_1_M74049 | |
151 #>dfrA19_1_EU855687 | |
152 | |
153 #<notes.txt> | |
154 #aac(6')-Iv:Aminoglycoside resistance: | |
155 #aac(6')-Iw:Aminoglycoside resistance:Alternate name; aac(6')-Ix | |
156 #sul3:Sulphonamide resistance: | |
157 ##Tetracycline: | |
158 #ort(B):Tetracycline resistance: | |
159 #blaCMY-59:Beta-lactam resistance: | |
160 | |
161 #<phenotypes.txt> | |
162 #Gene_accession no. Class Phenotype PMID Mechanism of resistance Notes Required_gene | |
163 #ant(2'')-Ia_1_X04555 Aminoglycoside Gentamicin, Tobramycin 3024112 Enzymatic modification Alternative name aadB | |
164 #ant(2'')-Ia_2_JF826500 Aminoglycoside Gentamicin, Tobramycin 22271862 Enzymat | |
165 | |
166 $name = "~/apps/bettercallsal/assets/abricate_dbs/$name"; | |
167 my $metafn = "$name/phenotypes.txt"; | |
168 my @meta = path($metafn)->lines( { chomp => 1 } ); | |
169 my %anno; | |
170 foreach (@meta) { | |
171 next if m/^#/; | |
172 my @x = split m/\t/; | |
173 | |
174 #msg("$metafn: @x"); | |
175 my ($gene) = ( $x[0] =~ m/^(.*?)_\w+$/ ); | |
176 $anno{$gene}{ABX} = [ | |
177 map { trim_spaces($_) } | |
178 grep { !m/(unknown|notes|^none)/i } | |
179 split m/,\s*/, | |
180 $x[2] | |
181 ]; | |
182 | |
183 #msg("$metafn: $gene |", $anno{$gene}{ABX}->@*); | |
184 } | |
185 msg( "get_resfinder: $metafn", scalar( keys %anno ), "genes" ); | |
186 | |
187 #print Dumper(\%anno); | |
188 | |
189 my @seq; | |
190 for my $fasta (<$name/*.fsa>) { | |
191 | |
192 # Issue #62 - repair broken fasta files like this: | |
193 # GCTTTAAATTGGAAAAAAGATAGTCAAACTCTTTAA>cmr_1_U43535 | |
194 # inline replacement | |
195 system( 'sed', '-i.bak', 's/\([A-Z]\)>/\1\n>/gi', $fasta ); | |
196 my $args = load_fasta($fasta); | |
197 | |
198 # use name of fasta file as antibiotic name | |
199 #my $abx = basename($fasta, '.fsa'); | |
200 #msg("$fasta: Assigning '$abx' to all genes"); | |
201 #push @{$_->{ABX}}, $abx for (@$args); | |
202 push @seq, @$args; | |
203 } | |
204 | |
205 # https://github.com/tseemann/abricate/issues/92 | |
206 # mcr-9_1_NZ_NAAN01000063.1 | |
207 #>mcr-9_1_NZ_NAAN01000063.1 | |
208 # mcr-9.1:Colistin resistance: | |
209 | |
210 for my $seq (@seq) { | |
211 my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*?)_(\d+)_(\S+)$/; | |
212 | |
213 #msg("resfinder: $1 $2 $3", $anno{$1}); | |
214 $seq->{ID} = "${id}_${copy}"; | |
215 $seq->{ACC} = $acc; | |
216 $seq->{DESC} = $anno{$id}{DESC} || $id; | |
217 push @{ $seq->{ABX} }, @{ $anno{$id}{ABX} } if $anno{$id}{ABX}; | |
218 } | |
219 | |
220 return \@seq; | |
221 } | |
222 | |
223 #.............................................................................. | |
224 sub get_serotypefinder { | |
225 my $name = "serotypefinder_db"; | |
226 my $url = "https://bitbucket.org/genomicepidemiology/$name.git"; | |
227 | |
228 if ( -r $name and not $force ) { | |
229 msg("Won't overwrite existing $name (use --force)"); | |
230 | |
231 # exit(1); | |
232 } | |
233 else { | |
234 msg("Nuking existing folder: $name"); | |
235 remove_tree("./$name"); | |
236 msg("Cloning $url to $name"); | |
237 system("git clone --quiet $url $name"); | |
238 } | |
239 | |
240 my @seq; | |
241 for my $fasta (<$name/*.fsa>) { | |
242 push @seq, @{ load_fasta($fasta) }; | |
243 } | |
244 | |
245 # >fliC_44444_AY250028_H52 | |
246 # FIXME - this is already in EcOH database! | |
247 for my $seq (@seq) { | |
248 my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*)_(\d+)_(\w+)$/; | |
249 | |
250 #msg("serotypefinder: $1 $2 $3", $anno{$1}); | |
251 $seq->{ID} = "${id}_${copy}"; | |
252 $seq->{ACC} = $acc; | |
253 | |
254 #$seq->{DESC} = $anno{$id} || ''; | |
255 } | |
256 | |
257 return \@seq; | |
258 } | |
259 | |
260 #.............................................................................. | |
261 sub get_tag { | |
262 my ( $f, $tag ) = @_; | |
263 if ( $f->has_tag($tag) ) { | |
264 my ($val) = $f->get_tag_values($tag); | |
265 return $val; | |
266 } | |
267 return ''; | |
268 } | |
269 | |
270 #.............................................................................. | |
271 sub get_ncbi { | |
272 my $AFP = | |
273 "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest"; | |
274 | |
275 #my $src = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/AMR_CDS"; | |
276 my $src = "$AFP/AMR_CDS"; | |
277 my $name = "amr_cds.ffn"; | |
278 | |
279 #my $src2 = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/ReferenceGeneCatalog.txt"; | |
280 my $src2 = "$AFP/ReferenceGeneCatalog.txt"; | |
281 my $name2 = "amr_cds.tsv"; | |
282 | |
283 if ( -r $name and -r $name2 and not $force ) { | |
284 msg("Won't overwrite existing $name/$name2 (use --force)"); | |
285 | |
286 # exit(1); | |
287 } | |
288 else { | |
289 download( $src, $name ); | |
290 download( $src2, $name2 ); | |
291 } | |
292 | |
293 #1 allele | |
294 #2 gene_family ble | |
295 #3 whitelisted_taxa | |
296 #4 product_name BLMA family bleomycin binding protein | |
297 #5 scope core | |
298 #6 type AMR | |
299 #7 subtype AMR | |
300 #8 class BLEOMYCIN | |
301 #9 subclass BLEOMYCIN | |
302 #10 refseq_protein_accession WP_063842967.1 | |
303 #11 refseq_nucleotide_accession NG_047554.1 | |
304 #12 curated_refseq_start No | |
305 #13 genbank_protein_accession CAA02068.1 | |
306 #14 genbank_nucleotide_accession A31900.1 | |
307 #15 genbank_strand_orientation + | |
308 #16 genbank_cds_start 6 | |
309 #17 genbank_cds_stop 374 | |
310 #18 pubmed_reference | |
311 #19 blacklisted_taxa | |
312 #20 db_version 2019-08-27.1 | |
313 | |
314 my $tsv = load_tabular( $name2, 10 ); # refseq_nucleotide_accession | |
315 msg( "[$name2] Loaded", scalar keys %$tsv, "records" ); | |
316 | |
317 # https://github.com/ncbi/amr/wiki/AMRFinderPlus-database#amrprot | |
318 # 0 1 2 3 4 5 6 7 | |
319 # >1000909371|WP_061158039.1|NG_050200|1|1|blaTEM-156|blaTEM|class_A_beta-lactamase_TEM-156 NG_050200:101-961 | |
320 my @seq; | |
321 my $in = Bio::SeqIO->new( -file => $name, -format => "fasta" ); | |
322 while ( my $rec = $in->next_seq ) { | |
323 | |
324 # parse ID | |
325 my ( $gi, $pi, $acc, $fp, $fn, $gene, $fam, $prod ) = split m/\|/, | |
326 $rec->id; | |
327 | |
328 # skip fusion genes | |
329 next unless $fp == 1 and $fn == 1; | |
330 | |
331 # only keep true ARGs | |
332 $acc .= ".1" unless $acc =~ m/\.\d+$/; | |
333 my $t = $tsv->{$acc} or next; | |
334 next unless ( $t->{scope} eq 'plus' || $t->{scope} eq 'core' ); | |
335 | |
336 # next unless $t->{type} eq 'VIRULENCE'; | |
337 # next unless $t->{subtype} eq 'VIRULENCE'; | |
338 # construct sequence record | |
339 $prod =~ s/_/ /g; | |
340 err("$pi: gene is empty") unless $gene; | |
341 err("$pi: product is empty") unless $prod; | |
342 my $s = { | |
343 ID => $gene, | |
344 ACC => $acc, | |
345 DESC => $prod, | |
346 SEQ => $rec->seq, | |
347 ABX => [ split m'/', $t->{subclass} ] | |
348 }; | |
349 push @seq, $s; | |
350 msg( "[$name]", 0 + @seq, "|", $s->{ID}, "|", $s->{ACC}, "|", | |
351 $s->{DESC} ); | |
352 | |
353 #msg(Dumper($s)); | |
354 msg( $s->{ID}, " is fusion $fp/$fn" ) if "$fp$fn" ne '11'; | |
355 } | |
356 return \@seq; | |
357 } | |
358 | |
359 #.............................................................................. | |
360 sub get_plasmidfinder { | |
361 my $name = "plasmidfinder"; | |
362 my $zip = "$name.zip"; | |
363 | |
364 # download("https://cge.cbs.dtu.dk/cge/download_data.php?folder=$name&filename=$zip&submit=$zip", $zip); | |
365 download( | |
366 "https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/HEAD.zip", | |
367 $zip | |
368 ); | |
369 system("unzip -j -u $zip"); | |
370 | |
371 my @seq; | |
372 for my $fasta (<*.fsa>) { | |
373 push @seq, @{ load_fasta($fasta) }; | |
374 } | |
375 | |
376 for my $seq (@seq) { | |
377 $seq->{DESC} = $seq->{ID}; # no desc, so use ORIGINAL ID as desc | |
378 my ( $id, $acc ) = | |
379 ( $seq->{ID} =~ m/^(.*)_(([A-Z]+|NC_)\d+(\.\d+)?)$/ ); | |
380 $id =~ s/_+$//g; | |
381 $seq->{ID} = $id || $seq->{ID}; | |
382 $seq->{ACC} = $acc || ''; | |
383 wrn( "Parsed empty ID:", | |
384 $seq->{DESC}, | |
385 "=> id='$id' acc='$acc' seq=" . substr( $seq->{SEQ}, 0, 10 ) ) | |
386 if not $id; | |
387 } | |
388 | |
389 return \@seq; | |
390 } | |
391 | |
392 #.............................................................................. | |
393 sub get_megares { | |
394 my $zip = "megares.zip"; | |
395 download( 'https://www.meglab.org/downloads/megares_v3.00.zip', $zip ); | |
396 system("unzip -j -u $zip"); | |
397 my $seqs = load_fasta( glob("megares_drugs_*.fasta") ); | |
398 my @okseq; | |
399 | |
400 # >MEG_372|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_MATE_efflux_pumps|ABEM | |
401 # >MEG_411|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_RND_efflux_regulator|ACRR|RequiresSNPConfirmation | |
402 # >MEG_7860|Drugs|betalactams|Class_B_betalactamases|ZOG | |
403 # >MEG_7439|Drugs|Glycopeptides|VanI-type_resistance_protein|VANI | |
404 # >MEG_7245|Drugs|Tetracyclines|Tetracycline_resistance_MFS_efflux_pumps|TETY | |
405 # >MEG_9|Drugs|Aminoglycosides|Aminoglycoside-resistant_16S_ribosomal_subunit_protein|A16S|RequiresSNPConfirmation | |
406 | |
407 for my $s (@$seqs) { | |
408 my ( $id, $type, $class, $mech, $group, $note ) = split m/\|/, $s->{ID}; | |
409 if ($note) { | |
410 | |
411 # "RequiresSNPConfirmation" is the common one; we can't do that | |
412 msg("Skipping $id due to: $note"); | |
413 next; | |
414 } | |
415 $s->{ID} = $group; | |
416 $s->{ACC} = $id; | |
417 $s->{DESC} = join( ':', $type, $class, $mech, $group ); | |
418 push @okseq, $s; | |
419 } | |
420 return [@okseq]; | |
421 | |
422 #return $seqs; | |
423 } | |
424 | |
425 #.............................................................................. | |
426 sub get_argannot { | |
427 my $fasta = 'arg-annot.fa'; | |
428 download( | |
429 # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/1425/argannot-aa-v3-march2017_doc.fasta', | |
430 # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/691/argannot-nt_doc.fasta', | |
431 'https://www.mediterranee-infection.com/wp-content/uploads/2019/06/ARG_ANNOT_V5_Nt_JUNE2019.txt', | |
432 $fasta | |
433 ); | |
434 | |
435 # fix syntax errors in the FASTA file... | |
436 path($fasta)->edit( sub { s/\\//g; $_ } ); | |
437 | |
438 my $seqs = load_fasta($fasta); | |
439 | |
440 # 0 1 2 3 | |
441 # >(AGly)Aac2-Ie:NC_011896:3039059-3039607:549 | |
442 for my $s (@$seqs) { | |
443 my @x = split m/:/, $s->{ID}; | |
444 $s->{ID} = $x[0]; | |
445 $s->{ACC} = $x[1] . ':' . $x[2]; | |
446 $s->{DESC} = ''; | |
447 } | |
448 | |
449 return $seqs; | |
450 } | |
451 | |
452 #.............................................................................. | |
453 sub get_bacmet2 { | |
454 my $fasta = 'bacmet2.fa'; | |
455 download( | |
456 'http://bacmet.biomedicine.gu.se/download/BacMet2_EXP_database.fasta', | |
457 $fasta ); | |
458 | |
459 # This is a PROTEIN file | |
460 my $seqs = load_fasta($fasta); | |
461 | |
462 # 0 1 2 3 4 ^ | |
463 # >BAC0098|ctpC|sp|P0A502|CTPC_MYCTU Probable manganese/zinc-exporting | |
464 for my $s (@$seqs) { | |
465 my @x = split m/\|/, $s->{ID}; | |
466 $s->{ID} = $x[1] . '-' . $x[0]; | |
467 $s->{ACC} = $x[2] . ':' . $x[3]; | |
468 } | |
469 | |
470 return $seqs; | |
471 } | |
472 | |
473 #.............................................................................. | |
474 sub get_card { | |
475 | |
476 # https://github.com/tseemann/abricate/issues/25 | |
477 my $tarball = 'card.tar.bz2'; | |
478 download( | |
479 #'https://card.mcmaster.ca/download/0/broadstreet-v2.0.2.tar.gz', | |
480 'https://card.mcmaster.ca/latest/data', | |
481 | |
482 #'https://card.mcmaster.ca/latest/data/card-data.tar.bz2', | |
483 $tarball # yes, it's really BZ2 not GZ ... | |
484 ); | |
485 | |
486 # my $fasta = "./nucleotide_fasta_protein_homolog_model.fasta"; | |
487 my $jsonfile = "./card.json"; | |
488 system( "tar", "xf", $tarball, $jsonfile ) == 0 | |
489 or err("Problem with tar xf $tarball $jsonfile"); | |
490 -r $jsonfile or err("Could not extract $jsonfile from $tarball"); | |
491 | |
492 # JSON | |
493 my $json = path($jsonfile)->slurp_utf8; | |
494 my $card = from_json( $json, { latin1 => 1 } ); | |
495 my @seq; | |
496 for my $g ( values %$card ) { | |
497 next unless ref($g) eq 'HASH'; | |
498 | |
499 # msg(Dumper($g)); | |
500 next | |
501 unless $g->{model_type} eq | |
502 "protein homolog model"; # only 'acquired' genes | |
503 my $id = $g->{model_name}; | |
504 err("$id has {model_param}{snp}") if exists $g->{model_param}{snp}; | |
505 | |
506 # msg("CARD: $id"); | |
507 # print STDERR Dumper($g); | |
508 my $dna = $g->{model_sequences}{sequence} | |
509 or err("$id: no {model_sequences}{sequence} found"); | |
510 my ($key) = sort keys %$dna; # first key | |
511 $dna = $dna->{$key} or err("$id: invalid key '$key'"); | |
512 $dna = $dna->{dna_sequence} or err("$id: no dna_sequence"); | |
513 | |
514 # msg(Dumper($dna)) if $id eq 'OXA-25'; | |
515 | |
516 # ARO_category => { | |
517 # 'category_aro_name' => 'cephalosporin', | |
518 # 'category_aro_class_name' => 'Drug Class', | |
519 my $is_amr_gene = 0; | |
520 my @abx; | |
521 for my $key ( keys $g->{ARO_category}->%* ) { | |
522 my $c = $g->{ARO_category}{$key}; | |
523 if ( $c->{category_aro_class_name} eq 'Drug Class' ) { | |
524 my $abx = $c->{category_aro_name}; | |
525 $abx =~ s/ antibiotic//; | |
526 $abx =~ s/\s/_/g; | |
527 push @abx, $abx; | |
528 } | |
529 if ( $c->{category_aro_class_name} eq 'AMR Gene Family' ) { | |
530 $is_amr_gene++; | |
531 } | |
532 } | |
533 | |
534 #err("CARD | $id | ", Dumper($g->{ARO_category}) ) unless $is_amr_gene; | |
535 #msg("ABX=$_") for @abx; | |
536 | |
537 # put coordinates into normal form | |
538 my ( $start, $stop ) = | |
539 $dna->{strand} eq '-' | |
540 ? ( $dna->{fmax}, $dna->{fmin} ) | |
541 : ( $dna->{fmin}, $dna->{fmax} ); | |
542 | |
543 $id =~ s/\s+/_/g; | |
544 push @seq, | |
545 { | |
546 ID => $id, | |
547 ACC => $dna->{accession} . ":$start-$stop", | |
548 DESC => ( $g->{ARO_description} || $g->{ARO_accession} ), | |
549 SEQ => $dna->{sequence}, | |
550 ABX => [@abx], | |
551 }; | |
552 | |
553 # msg(Dumper($seq[-1])); | |
554 } | |
555 | |
556 return \@seq; | |
557 } | |
558 | |
559 #.............................................................................. | |
560 sub get_victors { | |
561 | |
562 # the CDS data is in .ffn and has source GI and coords | |
563 # the PROT data is in .faa and has the protein ref and /product | |
564 | |
565 #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence | |
566 download( 'http://www.phidias.us/victors/downloads/gen_downloads.php', | |
567 'victors.ffn' ); | |
568 | |
569 #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni] | |
570 download( | |
571 'http://www.phidias.us/victors/downloads/gen_downloads_protein.php', | |
572 'victors.faa' ); | |
573 | |
574 my %gi; | |
575 open my $FAA, '<', 'victors.faa'; | |
576 while (<$FAA>) { | |
577 | |
578 #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni] | |
579 next unless m"^>gi.(\d+).ref.([^|]+). ([^[]+)"; | |
580 $gi{$1}{ACC} = $2; | |
581 $gi{$1}{DESC} = $3; | |
582 } | |
583 | |
584 my $seqs = load_fasta("victors.ffn"); | |
585 | |
586 for my $s (@$seqs) { | |
587 | |
588 #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence | |
589 $s->{ID} =~ m/gi.(\d+):(\d+)-(\d+)/; | |
590 $s->{ACC} = $gi{$1}{ACC} || "gi|$1:$2-$3"; | |
591 $s->{DESC} =~ $gi{$1}{DESC} || 'hypothetical protein'; | |
592 } | |
593 | |
594 # print Dumper($seqs); exit; | |
595 | |
596 return $seqs; | |
597 } | |
598 | |
599 #.............................................................................. | |
600 sub get_vfdb { | |
601 download( 'http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz', | |
602 'vfdb.fa.gz' ); | |
603 system("gzip -f -d -c vfdb.fa.gz > vfdb.fa"); | |
604 my $seqs = load_fasta("vfdb.fa"); | |
605 | |
606 # >VFG000676(gb|AAD32411) (lef) anthrax toxin lethal factor precursor [Anthrax toxin (VF0142)] [Bacillus anthracis str. Sterne] | |
607 for my $s (@$seqs) { | |
608 | |
609 # https://github.com/tseemann/abricate/issues/64#issuecomment-421895159 by @VGalata | |
610 $s->{ID} =~ m/^(\w+)\(\w+\|(\w+)(\.\d+)?\)$/; # | |
611 #$s->{ID} =~ m/^(\w+)\(\w+\|(\w+)\)$/; | |
612 $s->{ACC} = $2 if $2; | |
613 $s->{DESC} =~ m/^\((.*?)\)/; | |
614 $s->{ID} = $1 if $1; | |
615 | |
616 # print STDERR Dumper($s); exit; | |
617 } | |
618 | |
619 return $seqs; | |
620 } | |
621 | |
622 #.............................................................................. | |
623 sub get_ncbibetalactamase { | |
624 my $fasta = "ncbi.fa"; | |
625 download( | |
626 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele-dna.fa', | |
627 $fasta ); | |
628 my $tab = "ncbi.tab"; | |
629 download( 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele.tab', | |
630 $tab ); | |
631 | |
632 # >ACD12694.1 EU650653.1:1-1173 | |
633 my $seqs = load_fasta($fasta); | |
634 | |
635 # ACC-1 ACD12694.1 EU650653.1 blaACC-1 1 1173 + cephalosporin-hydrolyzing class C beta-lactamase ACC-1 | |
636 my %anno; | |
637 my @anno = grep { !m/^#/ } path($tab)->lines( { chomp => 1 } ); | |
638 msg( "Read", 0 + @anno, "annotations" ); | |
639 foreach (@anno) { | |
640 my ( $name, $id, $acc, $gene, $begin, $end, undef, $product ) = | |
641 split m/\t/; | |
642 $anno{$id} = { | |
643 ID => $gene, | |
644 DESC => $product, | |
645 ACC => "$acc:$begin-$end", | |
646 }; | |
647 } | |
648 | |
649 # print Dumper(\%anno); | |
650 | |
651 for my $s (@$seqs) { | |
652 my $id = $s->{ID}; | |
653 next unless exists $anno{$id}; | |
654 $s->{ID} = $anno{$id}{ID}; | |
655 $s->{ACC} = $anno{$id}{ACC}; | |
656 $s->{DESC} = $anno{$id}{DESC}; | |
657 } | |
658 | |
659 # print Dumper($seqs); | |
660 | |
661 return $seqs; | |
662 } | |
663 | |
664 #.............................................................................. | |
665 sub get_ecoh { | |
666 my $fasta = "EcOH.fa"; | |
667 download( | |
668 'https://raw.githubusercontent.com/katholt/srst2/master/data/EcOH.fasta', | |
669 $fasta | |
670 ); | |
671 | |
672 # https://github.com/katholt/srst2#generating-srst2-compatible-clustered-database-from-raw-sequences | |
673 # [clusterID]__[gene]__[allele]__[seqID] [other stuff] | |
674 # >1__fliC__fliC-H1__1 AB028471.1;flagellin;H1 | |
675 # >8__wzx__wzx-O41__246 AB811617.1;O antigen flippase;O41 | |
676 # >9__wzy__wzy-OgN31__597 LC125932.1;O antigen polyermase;OgN31 | |
677 my $seqs = load_fasta($fasta); | |
678 | |
679 for my $s (@$seqs) { | |
680 my @id = split m/__/, $s->{ID}; | |
681 my @desc = split m';', $s->{DESC}; | |
682 $s->{ID} = $id[2]; | |
683 $s->{ACC} = shift(@desc); | |
684 $s->{DESC} = join( ' ', @desc ); | |
685 } | |
686 | |
687 # print Dumper($seqs); | |
688 return $seqs; | |
689 } | |
690 | |
691 #.............................................................................. | |
692 sub get_ecoli_vf { | |
693 my $fasta = "ecoli_vf.ffn"; | |
694 download( | |
695 'https://github.com/phac-nml/ecoli_vf/raw/master/data/repaired_ecoli_vfs_shortnames.ffn', | |
696 $fasta | |
697 ); | |
698 my $seqs = load_fasta($fasta); | |
699 | |
700 # >VFG000748(gi:2865308) (espF) EspF [EspF (VF0182)] [Escherichia coli O127:H6 str. E2348/69] | |
701 # >VFG000749(gi:6009379) (bfpA) Bundlin [BFP (VF0174)] [Escherichia coli B171] | |
702 # >SPG000142 (cvac) Escherichia coli cvi cvaC operon. [X57525 434-745] | |
703 # >SPG000143 (iss2) Escherichia coli Iss (iss) gene, complete cds. [AF042279 292-600] | |
704 | |
705 for my $s (@$seqs) { | |
706 | |
707 #print STDERR Dumper("IN", $s); | |
708 $s->{ID} =~ m/ ^ (\w+) (?: \( (.*?) \) )? $ /x | |
709 or die "Can't parse $fasta at " . Dumper($s); | |
710 $s->{ID} = $1 if $1; | |
711 $s->{ACC} = $2 || $1; | |
712 $s->{DESC} =~ s/\s\[.*?\]$//g; # remove strain name | |
713 $s->{DESC} =~ m/^(?:\((.*?)\)\s+)?(.*?)$/; | |
714 $s->{ID} = $1 if $1; | |
715 $s->{DESC} = $2; | |
716 | |
717 #print STDERR Dumper("OUT", $s); | |
718 #print STDERR "="x60, "\n"; | |
719 } | |
720 | |
721 # print Dumper($seqs); | |
722 return $seqs; | |
723 } | |
724 | |
725 #.............................................................................. | |
726 sub is_full_gene { | |
727 my ($s) = @_; | |
728 my $has_ambig = 0; | |
729 | |
730 my $id = $s->{ID}; | |
731 my $L = length( $s->{SEQ} ); | |
732 if ( $L % 3 != 0 ) { | |
733 wrn("$id - length $L bp is not multiple of 3"); | |
734 return; | |
735 } | |
736 if ( $s->{SEQ} !~ m/^[AGCT]+$/ ) { | |
737 wrn("$id - has non-AGTC bases"); | |
738 return; | |
739 } | |
740 | |
741 my $seq = Bio::Seq->new( -id => $s->{ID}, -seq => $s->{SEQ} ); | |
742 | |
743 my $aa = $seq->translate->seq; | |
744 | |
745 if ( $aa =~ m/\*./ ) { | |
746 wrn("$id - has internal stop codons, trying revcom"); | |
747 $aa = $seq->revcom->translate->seq; | |
748 if ( $aa =~ m/\*./ ) { | |
749 wrn("$id - revcom has internal stop codons too"); | |
750 return; | |
751 } | |
752 else { | |
753 msg("$id - revcom resolves problem, hooray!"); | |
754 $s->{SEQ} = $seq->revcom->seq; | |
755 } | |
756 } | |
757 | |
758 return $L; | |
759 } | |
760 | |
761 #.............................................................................. | |
762 sub dedupe_seq { | |
763 my ($seq) = @_; | |
764 my %seen; | |
765 my $good = []; | |
766 for my $s (@$seq) { | |
767 if ( $seen{ $s->{SEQ} } ) { | |
768 wrn( "duplicate", length( $s->{SEQ} ), | |
769 "bp sequence:", $s->{ID}, '~', $seen{ $s->{SEQ} } ); | |
770 } | |
771 else { | |
772 push @$good, $s; | |
773 } | |
774 $seen{ $s->{SEQ} } .= ' ' . $s->{ID}; | |
775 } | |
776 msg( "dedupe_seq: read", scalar(@$seq), "/ kept", scalar(@$good) ); | |
777 return $good; | |
778 } | |
779 | |
780 #.............................................................................. | |
781 sub load_tabular { | |
782 my ( $fname, $keycol, $sep ) = @_; | |
783 $keycol //= 0; | |
784 $sep //= "\t"; | |
785 my $hash = {}; | |
786 my @hdr; | |
787 my $row = 0; | |
788 open my $TSV, '<', $fname or err("Can't read TSV file: $fname"); | |
789 while (<$TSV>) { | |
790 chomp; | |
791 my @col = split m/$sep/; | |
792 $row++; | |
793 if (@hdr) { | |
794 @hdr == @col or err("Header and row $row differ in column count"); | |
795 | |
796 #my $key = $col[$keycol] or wrn("Empty key column $keycol: $_"); | |
797 #exists{$hash->{$col[$key}} and wrn("WARNING: dupe key $key at row: $_"); | |
798 my $key = $col[$keycol]; | |
799 $hash->{$key} ||= | |
800 { map { ( $hdr[$_] => $col[$_] ) } ( 0 .. $#hdr ) }; | |
801 } | |
802 else { | |
803 @hdr = @col; | |
804 } | |
805 } | |
806 close $TSV; | |
807 return $hash; | |
808 } | |
809 | |
810 #.............................................................................. | |
811 sub load_fasta { | |
812 my ($fasta) = @_; | |
813 my %seen; | |
814 my $list; | |
815 my $dbtype = 'unknown'; | |
816 msg("load_fasta: $fasta"); | |
817 my $in = Bio::SeqIO->new( -file => $fasta, -format => 'fasta' ); | |
818 while ( my $seq = $in->next_seq ) { | |
819 my $id = $seq->id or err("Empty ID in $fasta"); | |
820 if ( $seen{$id} ) { | |
821 wrn("Duplicate ID '$id' in $fasta"); | |
822 $id = $id . '_dupe'; | |
823 } | |
824 $seen{$id}++; | |
825 my $s = uc( $seq->seq ); | |
826 $dbtype = $seq->alphabet eq 'dna' ? 'nucl' : 'prot'; | |
827 $dbtype eq 'nucl' ? $s =~ s/[^AGTC]/N/g : $s =~ s/[^A-Z]/X/g; | |
828 push @$list, | |
829 { | |
830 ID => $id, | |
831 ACC => '', | |
832 DESC => $seq->desc, | |
833 SEQ => $s, | |
834 TYPE => $dbtype, | |
835 }; | |
836 } | |
837 msg( "load_fasta: read", scalar(@$list), "$dbtype sequences" ); | |
838 return $list; | |
839 } | |
840 | |
841 #.............................................................................. | |
842 sub save_fasta { | |
843 my ( $fasta, $seq ) = @_; | |
844 msg("save_fasta: $fasta"); | |
845 my %seen; | |
846 my $out = Bio::SeqIO->new( -file => ">$fasta", -format => 'fasta' ); | |
847 for my $s (@$seq) { | |
848 $seen{ $s->{ID} }++; | |
849 my $freq = $seen{ $s->{ID} }; | |
850 | |
851 #wrn("seen $s->{ID} now $freq times") if $freq > 1; | |
852 # print Dumper($s); | |
853 my $ABX = | |
854 defined( $s->{ABX} ) ? join( $ABX_SEP, sort @{ $s->{ABX} } ) : ''; | |
855 $ABX =~ s/\s+/_/g; # remove spaces! | |
856 my $obj = Bio::Seq->new( | |
857 -id => join( '~~~', $db, $s->{ID}, $s->{ACC}, $ABX ), | |
858 -desc => ( $s->{DESC} || $s->{ID} ), | |
859 -seq => $s->{SEQ}, | |
860 ); | |
861 | |
862 # $obj->desc( hash_encode($s) ); | |
863 $out->write_seq($obj); | |
864 | |
865 # $seen{ $s->{ID} }++; | |
866 } | |
867 msg( "save_fasta: wrote", scalar(@$seq), "sequences" ); | |
868 } | |
869 | |
870 #---------------------------------------------------------------------- | |
871 sub msg { | |
872 print STDERR "@_\n"; | |
873 } | |
874 | |
875 #---------------------------------------------------------------------- | |
876 sub wrn { | |
877 msg( "WARNING:", @_ ) if $debug; | |
878 } | |
879 | |
880 #---------------------------------------------------------------------- | |
881 sub err { | |
882 msg( "ERROR:", @_ ); | |
883 exit(1); | |
884 } | |
885 | |
886 #---------------------------------------------------------------------- | |
887 # Option setting routines | |
888 | |
889 sub setOptions { | |
890 use Getopt::Long; | |
891 | |
892 @Options = ( | |
893 { OPT => "help", VAR => \&usage, DESC => "This help" }, | |
894 { | |
895 OPT => "debug!", | |
896 VAR => \$debug, | |
897 DEFAULT => 0, | |
898 DESC => "Verbose debug output" | |
899 }, | |
900 { | |
901 OPT => "dbdir=s", | |
902 VAR => \$outdir, | |
903 DEFAULT => abs_path("$FindBin::RealBin/../db"), | |
904 DESC => "Parent folder" | |
905 }, | |
906 { | |
907 OPT => "db=s", | |
908 VAR => \$db, | |
909 DEFAULT => "", | |
910 DESC => "Choices: $DATABASES" | |
911 }, | |
912 { | |
913 OPT => "force!", | |
914 VAR => \$force, | |
915 DEFAULT => 0, | |
916 DESC => "Force download even if exists" | |
917 }, | |
918 ); | |
919 | |
920 &GetOptions( map { $_->{OPT}, $_->{VAR} } @Options ) || usage(1); | |
921 | |
922 # Now setup default values. | |
923 foreach (@Options) { | |
924 if ( defined( $_->{DEFAULT} ) && !defined( ${ $_->{VAR} } ) ) { | |
925 ${ $_->{VAR} } = $_->{DEFAULT}; | |
926 } | |
927 } | |
928 } | |
929 | |
930 sub usage { | |
931 my ($exitcode) = @_; | |
932 $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref | |
933 $exitcode ||= 0; | |
934 select STDERR if $exitcode; # write to STDERR if exitcode is error | |
935 | |
936 print "SYNOPIS\n Download databases for abricate to use\n"; | |
937 print "USAGE\n $EXE [options] --db DATABASE\n"; | |
938 print "OPTIONS\n"; | |
939 foreach (@Options) { | |
940 printf " --%-13s %s%s.\n", $_->{OPT}, $_->{DESC}, | |
941 defined( $_->{DEFAULT} ) ? " (default '$_->{DEFAULT}')" : ""; | |
942 } | |
943 exit($exitcode); | |
944 } | |
945 | |
946 #---------------------------------------------------------------------- | |
947 |