annotate 0.1.0/assets/abricate-get_db @ 6:a3c1cba6f773

"planemo upload"
author kkonganti
date Mon, 27 Nov 2023 19:52:20 -0500
parents c8597e9e1a97
children
rev   line source
kkonganti@0 1 #!/usr/bin/env perl
kkonganti@0 2
kkonganti@0 3 use strict;
kkonganti@0 4 use FindBin;
kkonganti@0 5 use Bio::SeqIO;
kkonganti@0 6 use Bio::Seq;
kkonganti@0 7 use Path::Tiny;
kkonganti@0 8 use File::Basename;
kkonganti@0 9 use File::Spec;
kkonganti@0 10 use File::Path qw(make_path remove_tree);
kkonganti@0 11 use List::Util qw(first);
kkonganti@0 12 use Cwd qw(abs_path);
kkonganti@0 13 use Data::Dumper;
kkonganti@0 14 use LWP::Simple;
kkonganti@0 15 use JSON;
kkonganti@0 16
kkonganti@0 17 #..............................................................................
kkonganti@0 18 # Globals
kkonganti@0 19
kkonganti@0 20 my $EXE = basename($0);
kkonganti@0 21 my $ABX_SEP = ';';
kkonganti@0 22
kkonganti@0 23 my %DATABASE = (
kkonganti@0 24 'resfinder' => \&get_resfinder,
kkonganti@0 25 'plasmidfinder' => \&get_plasmidfinder,
kkonganti@0 26 'megares' => \&get_megares,
kkonganti@0 27 'argannot' => \&get_argannot,
kkonganti@0 28 'card' => \&get_card,
kkonganti@0 29
kkonganti@0 30 # 'ncbibetalactamase' => \&get_ncbibetalactamase,
kkonganti@0 31 'ncbi' => \&get_ncbi,
kkonganti@0 32 'vfdb' => \&get_vfdb,
kkonganti@0 33 'ecoli_vf' => \&get_ecoli_vf, # https://github.com/phac-nml/ecoli_vf
kkonganti@0 34 'ecoh' => \&get_ecoh,
kkonganti@0 35 'bacmet2' => \&get_bacmet2,
kkonganti@0 36 'victors' => \&get_victors,
kkonganti@0 37
kkonganti@0 38 # 'serotypefinder' => \&get_serotypefinder,
kkonganti@0 39 );
kkonganti@0 40 my $DATABASES = join( ' ', sort keys %DATABASE );
kkonganti@0 41
kkonganti@0 42 #..............................................................................
kkonganti@0 43 # Command line options
kkonganti@0 44
kkonganti@0 45 my ( @Options, $debug, $outdir, $db, $force );
kkonganti@0 46 setOptions();
kkonganti@0 47
kkonganti@0 48 $db or err("Please choose a --db from: $DATABASES");
kkonganti@0 49 exists $DATABASE{$db} or err("Unknown --db '$db', choose from: $DATABASES ");
kkonganti@0 50 -d $outdir or err("--outdir '$outdir' does not exist");
kkonganti@0 51
kkonganti@0 52 my $dir = abs_path( File::Spec->catdir( $outdir, $db ) );
kkonganti@0 53 make_path($dir);
kkonganti@0 54 msg("Setting up '$db' in '$dir'");
kkonganti@0 55
kkonganti@0 56 #my $tmpdir = tempdir("$db-XXXXXXXX", DIR=>$dir, CLEANUP=>0);
kkonganti@0 57 #my $tmpdir = "/home/tseemann/git/abricate/db/resfinder/resfinder-6Kuphtvv";
kkonganti@0 58 my $tmpdir = "$dir/src";
kkonganti@0 59 make_path($tmpdir);
kkonganti@0 60
kkonganti@0 61 # run the specific function from --db
kkonganti@0 62 chdir $tmpdir;
kkonganti@0 63 my $seq = $DATABASE{$db}->();
kkonganti@0 64 map { is_full_gene($_) } @$seq; # doesn't do anything?
kkonganti@0 65 $seq = dedupe_seq($seq);
kkonganti@0 66
kkonganti@0 67 #print Dumper($seq);
kkonganti@0 68 msg("Sorting sequences by ID");
kkonganti@0 69 $seq = [ sort { $a->{ID} cmp $b->{ID} } @$seq ];
kkonganti@0 70 save_fasta( "$dir/sequences", $seq );
kkonganti@0 71
kkonganti@0 72 msg("Formatting BLASTN database: $dir/sequences");
kkonganti@0 73 my $logfile = "$tmpdir/makeblastdb.log";
kkonganti@0 74 my $ncbi_title = $db;
kkonganti@0 75 if ( "$db" eq "ncbi" ) {
kkonganti@0 76 $ncbi_title = "ncbiamrplus";
kkonganti@0 77 }
kkonganti@0 78 my $ec = system(
kkonganti@0 79 "makeblastdb -in '$dir/sequences' -title '$ncbi_title' -dbtype nucl -hash_index -logfile $logfile"
kkonganti@0 80 );
kkonganti@0 81 if ( $ec != 0 ) {
kkonganti@0 82 system("tail '$logfile'");
kkonganti@0 83 err("Error with makign BLAST database. See $logfile");
kkonganti@0 84 }
kkonganti@0 85
kkonganti@0 86 #msg("Run 'abricate --setupdb' to format the database");
kkonganti@0 87
kkonganti@0 88 msg("Done.");
kkonganti@0 89
kkonganti@0 90 #..............................................................................
kkonganti@0 91
kkonganti@0 92 sub download {
kkonganti@0 93 my ( $url, $dest ) = @_;
kkonganti@0 94 if ( -r $dest and not $force ) {
kkonganti@0 95 msg("Won't re-download existing $dest (use --force)");
kkonganti@0 96
kkonganti@0 97 #exit(1);
kkonganti@0 98 }
kkonganti@0 99 else {
kkonganti@0 100 msg("Downloading: $url");
kkonganti@0 101 my $ec = mirror( $url, $dest );
kkonganti@0 102 msg("HTTP Result: $ec");
kkonganti@0 103 ( $ec == 200 or $ec = 304 )
kkonganti@0 104 or err("HTTP $ec | failed to download $url"); # is HTTP OK ?
kkonganti@0 105 }
kkonganti@0 106 msg("Destination: $dest");
kkonganti@0 107 msg( "Filesize:", ( -s $dest ), "bytes" );
kkonganti@0 108 }
kkonganti@0 109
kkonganti@0 110 #..............................................................................
kkonganti@0 111 sub trim_spaces {
kkonganti@0 112 my ($s) = @_;
kkonganti@0 113 $s =~ s/^\s+//;
kkonganti@0 114 $s =~ s/\s+$//;
kkonganti@0 115 return $s;
kkonganti@0 116 }
kkonganti@0 117
kkonganti@0 118 #..............................................................................
kkonganti@0 119 sub get_resfinder {
kkonganti@0 120 my $name = "resfinder_db";
kkonganti@0 121
kkonganti@0 122 # FIXME - can we just get HEAD.zip like in plasmidfinder?
kkonganti@0 123 my $url = "https://bitbucket.org/genomicepidemiology/$name.git";
kkonganti@0 124
kkonganti@0 125 # if (-r $name and not $force) {
kkonganti@0 126 # msg("Won't overwrite existing $name (use --force)");
kkonganti@0 127 # # exit(1);
kkonganti@0 128 # }
kkonganti@0 129 # else {
kkonganti@0 130 # msg("Nuking existing folder: $name");
kkonganti@0 131 # remove_tree("./$name");
kkonganti@0 132 # msg("Cloning $url to $name");
kkonganti@0 133 # system("git clone --quiet $url $name");
kkonganti@0 134 # }
kkonganti@0 135
kkonganti@0 136 #<*.fsa>
kkonganti@0 137 #>aac(6')-Ib_2_M23634
kkonganti@0 138 #>blaNDM-19_1_MF370080
kkonganti@0 139 #>mcr-1.1_1_KP347127
kkonganti@0 140 #>fosB1_1_CP001903
kkonganti@0 141 #>fusB_1_AY373761
kkonganti@0 142 #>VanHAX_1_FJ866609
kkonganti@0 143 #>ere(A)_6_DQ157752
kkonganti@0 144 #>nimA_1_X71444
kkonganti@0 145 #>cfr_1_AM408573
kkonganti@0 146 #>catB3_2_U13880
kkonganti@0 147 #>qnrA1_1_AY070235
kkonganti@0 148 #>ARR-2_1_HQ141279
kkonganti@0 149 #>sul1_2_U12338
kkonganti@0 150 #>tet_1_M74049
kkonganti@0 151 #>dfrA19_1_EU855687
kkonganti@0 152
kkonganti@0 153 #<notes.txt>
kkonganti@0 154 #aac(6')-Iv:Aminoglycoside resistance:
kkonganti@0 155 #aac(6')-Iw:Aminoglycoside resistance:Alternate name; aac(6')-Ix
kkonganti@0 156 #sul3:Sulphonamide resistance:
kkonganti@0 157 ##Tetracycline:
kkonganti@0 158 #ort(B):Tetracycline resistance:
kkonganti@0 159 #blaCMY-59:Beta-lactam resistance:
kkonganti@0 160
kkonganti@0 161 #<phenotypes.txt>
kkonganti@0 162 #Gene_accession no. Class Phenotype PMID Mechanism of resistance Notes Required_gene
kkonganti@0 163 #ant(2'')-Ia_1_X04555 Aminoglycoside Gentamicin, Tobramycin 3024112 Enzymatic modification Alternative name aadB
kkonganti@0 164 #ant(2'')-Ia_2_JF826500 Aminoglycoside Gentamicin, Tobramycin 22271862 Enzymat
kkonganti@0 165
kkonganti@0 166 $name = "~/apps/bettercallsal/assets/abricate_dbs/$name";
kkonganti@0 167 my $metafn = "$name/phenotypes.txt";
kkonganti@0 168 my @meta = path($metafn)->lines( { chomp => 1 } );
kkonganti@0 169 my %anno;
kkonganti@0 170 foreach (@meta) {
kkonganti@0 171 next if m/^#/;
kkonganti@0 172 my @x = split m/\t/;
kkonganti@0 173
kkonganti@0 174 #msg("$metafn: @x");
kkonganti@0 175 my ($gene) = ( $x[0] =~ m/^(.*?)_\w+$/ );
kkonganti@0 176 $anno{$gene}{ABX} = [
kkonganti@0 177 map { trim_spaces($_) }
kkonganti@0 178 grep { !m/(unknown|notes|^none)/i }
kkonganti@0 179 split m/,\s*/,
kkonganti@0 180 $x[2]
kkonganti@0 181 ];
kkonganti@0 182
kkonganti@0 183 #msg("$metafn: $gene |", $anno{$gene}{ABX}->@*);
kkonganti@0 184 }
kkonganti@0 185 msg( "get_resfinder: $metafn", scalar( keys %anno ), "genes" );
kkonganti@0 186
kkonganti@0 187 #print Dumper(\%anno);
kkonganti@0 188
kkonganti@0 189 my @seq;
kkonganti@0 190 for my $fasta (<$name/*.fsa>) {
kkonganti@0 191
kkonganti@0 192 # Issue #62 - repair broken fasta files like this:
kkonganti@0 193 # GCTTTAAATTGGAAAAAAGATAGTCAAACTCTTTAA>cmr_1_U43535
kkonganti@0 194 # inline replacement
kkonganti@0 195 system( 'sed', '-i.bak', 's/\([A-Z]\)>/\1\n>/gi', $fasta );
kkonganti@0 196 my $args = load_fasta($fasta);
kkonganti@0 197
kkonganti@0 198 # use name of fasta file as antibiotic name
kkonganti@0 199 #my $abx = basename($fasta, '.fsa');
kkonganti@0 200 #msg("$fasta: Assigning '$abx' to all genes");
kkonganti@0 201 #push @{$_->{ABX}}, $abx for (@$args);
kkonganti@0 202 push @seq, @$args;
kkonganti@0 203 }
kkonganti@0 204
kkonganti@0 205 # https://github.com/tseemann/abricate/issues/92
kkonganti@0 206 # mcr-9_1_NZ_NAAN01000063.1
kkonganti@0 207 #>mcr-9_1_NZ_NAAN01000063.1
kkonganti@0 208 # mcr-9.1:Colistin resistance:
kkonganti@0 209
kkonganti@0 210 for my $seq (@seq) {
kkonganti@0 211 my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*?)_(\d+)_(\S+)$/;
kkonganti@0 212
kkonganti@0 213 #msg("resfinder: $1 $2 $3", $anno{$1});
kkonganti@0 214 $seq->{ID} = "${id}_${copy}";
kkonganti@0 215 $seq->{ACC} = $acc;
kkonganti@0 216 $seq->{DESC} = $anno{$id}{DESC} || $id;
kkonganti@0 217 push @{ $seq->{ABX} }, @{ $anno{$id}{ABX} } if $anno{$id}{ABX};
kkonganti@0 218 }
kkonganti@0 219
kkonganti@0 220 return \@seq;
kkonganti@0 221 }
kkonganti@0 222
kkonganti@0 223 #..............................................................................
kkonganti@0 224 sub get_serotypefinder {
kkonganti@0 225 my $name = "serotypefinder_db";
kkonganti@0 226 my $url = "https://bitbucket.org/genomicepidemiology/$name.git";
kkonganti@0 227
kkonganti@0 228 if ( -r $name and not $force ) {
kkonganti@0 229 msg("Won't overwrite existing $name (use --force)");
kkonganti@0 230
kkonganti@0 231 # exit(1);
kkonganti@0 232 }
kkonganti@0 233 else {
kkonganti@0 234 msg("Nuking existing folder: $name");
kkonganti@0 235 remove_tree("./$name");
kkonganti@0 236 msg("Cloning $url to $name");
kkonganti@0 237 system("git clone --quiet $url $name");
kkonganti@0 238 }
kkonganti@0 239
kkonganti@0 240 my @seq;
kkonganti@0 241 for my $fasta (<$name/*.fsa>) {
kkonganti@0 242 push @seq, @{ load_fasta($fasta) };
kkonganti@0 243 }
kkonganti@0 244
kkonganti@0 245 # >fliC_44444_AY250028_H52
kkonganti@0 246 # FIXME - this is already in EcOH database!
kkonganti@0 247 for my $seq (@seq) {
kkonganti@0 248 my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*)_(\d+)_(\w+)$/;
kkonganti@0 249
kkonganti@0 250 #msg("serotypefinder: $1 $2 $3", $anno{$1});
kkonganti@0 251 $seq->{ID} = "${id}_${copy}";
kkonganti@0 252 $seq->{ACC} = $acc;
kkonganti@0 253
kkonganti@0 254 #$seq->{DESC} = $anno{$id} || '';
kkonganti@0 255 }
kkonganti@0 256
kkonganti@0 257 return \@seq;
kkonganti@0 258 }
kkonganti@0 259
kkonganti@0 260 #..............................................................................
kkonganti@0 261 sub get_tag {
kkonganti@0 262 my ( $f, $tag ) = @_;
kkonganti@0 263 if ( $f->has_tag($tag) ) {
kkonganti@0 264 my ($val) = $f->get_tag_values($tag);
kkonganti@0 265 return $val;
kkonganti@0 266 }
kkonganti@0 267 return '';
kkonganti@0 268 }
kkonganti@0 269
kkonganti@0 270 #..............................................................................
kkonganti@0 271 sub get_ncbi {
kkonganti@0 272 my $AFP =
kkonganti@0 273 "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest";
kkonganti@0 274
kkonganti@0 275 #my $src = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/AMR_CDS";
kkonganti@0 276 my $src = "$AFP/AMR_CDS";
kkonganti@0 277 my $name = "amr_cds.ffn";
kkonganti@0 278
kkonganti@0 279 #my $src2 = "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/data/latest/ReferenceGeneCatalog.txt";
kkonganti@0 280 my $src2 = "$AFP/ReferenceGeneCatalog.txt";
kkonganti@0 281 my $name2 = "amr_cds.tsv";
kkonganti@0 282
kkonganti@0 283 if ( -r $name and -r $name2 and not $force ) {
kkonganti@0 284 msg("Won't overwrite existing $name/$name2 (use --force)");
kkonganti@0 285
kkonganti@0 286 # exit(1);
kkonganti@0 287 }
kkonganti@0 288 else {
kkonganti@0 289 download( $src, $name );
kkonganti@0 290 download( $src2, $name2 );
kkonganti@0 291 }
kkonganti@0 292
kkonganti@0 293 #1 allele
kkonganti@0 294 #2 gene_family ble
kkonganti@0 295 #3 whitelisted_taxa
kkonganti@0 296 #4 product_name BLMA family bleomycin binding protein
kkonganti@0 297 #5 scope core
kkonganti@0 298 #6 type AMR
kkonganti@0 299 #7 subtype AMR
kkonganti@0 300 #8 class BLEOMYCIN
kkonganti@0 301 #9 subclass BLEOMYCIN
kkonganti@0 302 #10 refseq_protein_accession WP_063842967.1
kkonganti@0 303 #11 refseq_nucleotide_accession NG_047554.1
kkonganti@0 304 #12 curated_refseq_start No
kkonganti@0 305 #13 genbank_protein_accession CAA02068.1
kkonganti@0 306 #14 genbank_nucleotide_accession A31900.1
kkonganti@0 307 #15 genbank_strand_orientation +
kkonganti@0 308 #16 genbank_cds_start 6
kkonganti@0 309 #17 genbank_cds_stop 374
kkonganti@0 310 #18 pubmed_reference
kkonganti@0 311 #19 blacklisted_taxa
kkonganti@0 312 #20 db_version 2019-08-27.1
kkonganti@0 313
kkonganti@0 314 my $tsv = load_tabular( $name2, 10 ); # refseq_nucleotide_accession
kkonganti@0 315 msg( "[$name2] Loaded", scalar keys %$tsv, "records" );
kkonganti@0 316
kkonganti@0 317 # https://github.com/ncbi/amr/wiki/AMRFinderPlus-database#amrprot
kkonganti@0 318 # 0 1 2 3 4 5 6 7
kkonganti@0 319 # >1000909371|WP_061158039.1|NG_050200|1|1|blaTEM-156|blaTEM|class_A_beta-lactamase_TEM-156 NG_050200:101-961
kkonganti@0 320 my @seq;
kkonganti@0 321 my $in = Bio::SeqIO->new( -file => $name, -format => "fasta" );
kkonganti@0 322 while ( my $rec = $in->next_seq ) {
kkonganti@0 323
kkonganti@0 324 # parse ID
kkonganti@0 325 my ( $gi, $pi, $acc, $fp, $fn, $gene, $fam, $prod ) = split m/\|/,
kkonganti@0 326 $rec->id;
kkonganti@0 327
kkonganti@0 328 # skip fusion genes
kkonganti@0 329 next unless $fp == 1 and $fn == 1;
kkonganti@0 330
kkonganti@0 331 # only keep true ARGs
kkonganti@0 332 $acc .= ".1" unless $acc =~ m/\.\d+$/;
kkonganti@0 333 my $t = $tsv->{$acc} or next;
kkonganti@0 334 next unless ( $t->{scope} eq 'plus' || $t->{scope} eq 'core' );
kkonganti@0 335
kkonganti@0 336 # next unless $t->{type} eq 'VIRULENCE';
kkonganti@0 337 # next unless $t->{subtype} eq 'VIRULENCE';
kkonganti@0 338 # construct sequence record
kkonganti@0 339 $prod =~ s/_/ /g;
kkonganti@0 340 err("$pi: gene is empty") unless $gene;
kkonganti@0 341 err("$pi: product is empty") unless $prod;
kkonganti@0 342 my $s = {
kkonganti@0 343 ID => $gene,
kkonganti@0 344 ACC => $acc,
kkonganti@0 345 DESC => $prod,
kkonganti@0 346 SEQ => $rec->seq,
kkonganti@0 347 ABX => [ split m'/', $t->{subclass} ]
kkonganti@0 348 };
kkonganti@0 349 push @seq, $s;
kkonganti@0 350 msg( "[$name]", 0 + @seq, "|", $s->{ID}, "|", $s->{ACC}, "|",
kkonganti@0 351 $s->{DESC} );
kkonganti@0 352
kkonganti@0 353 #msg(Dumper($s));
kkonganti@0 354 msg( $s->{ID}, " is fusion $fp/$fn" ) if "$fp$fn" ne '11';
kkonganti@0 355 }
kkonganti@0 356 return \@seq;
kkonganti@0 357 }
kkonganti@0 358
kkonganti@0 359 #..............................................................................
kkonganti@0 360 sub get_plasmidfinder {
kkonganti@0 361 my $name = "plasmidfinder";
kkonganti@0 362 my $zip = "$name.zip";
kkonganti@0 363
kkonganti@0 364 # download("https://cge.cbs.dtu.dk/cge/download_data.php?folder=$name&filename=$zip&submit=$zip", $zip);
kkonganti@0 365 download(
kkonganti@0 366 "https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/HEAD.zip",
kkonganti@0 367 $zip
kkonganti@0 368 );
kkonganti@0 369 system("unzip -j -u $zip");
kkonganti@0 370
kkonganti@0 371 my @seq;
kkonganti@0 372 for my $fasta (<*.fsa>) {
kkonganti@0 373 push @seq, @{ load_fasta($fasta) };
kkonganti@0 374 }
kkonganti@0 375
kkonganti@0 376 for my $seq (@seq) {
kkonganti@0 377 $seq->{DESC} = $seq->{ID}; # no desc, so use ORIGINAL ID as desc
kkonganti@0 378 my ( $id, $acc ) =
kkonganti@0 379 ( $seq->{ID} =~ m/^(.*)_(([A-Z]+|NC_)\d+(\.\d+)?)$/ );
kkonganti@0 380 $id =~ s/_+$//g;
kkonganti@0 381 $seq->{ID} = $id || $seq->{ID};
kkonganti@0 382 $seq->{ACC} = $acc || '';
kkonganti@0 383 wrn( "Parsed empty ID:",
kkonganti@0 384 $seq->{DESC},
kkonganti@0 385 "=> id='$id' acc='$acc' seq=" . substr( $seq->{SEQ}, 0, 10 ) )
kkonganti@0 386 if not $id;
kkonganti@0 387 }
kkonganti@0 388
kkonganti@0 389 return \@seq;
kkonganti@0 390 }
kkonganti@0 391
kkonganti@0 392 #..............................................................................
kkonganti@0 393 sub get_megares {
kkonganti@0 394 my $zip = "megares.zip";
kkonganti@0 395 download( 'https://www.meglab.org/downloads/megares_v3.00.zip', $zip );
kkonganti@0 396 system("unzip -j -u $zip");
kkonganti@0 397 my $seqs = load_fasta( glob("megares_drugs_*.fasta") );
kkonganti@0 398 my @okseq;
kkonganti@0 399
kkonganti@0 400 # >MEG_372|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_MATE_efflux_pumps|ABEM
kkonganti@0 401 # >MEG_411|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_RND_efflux_regulator|ACRR|RequiresSNPConfirmation
kkonganti@0 402 # >MEG_7860|Drugs|betalactams|Class_B_betalactamases|ZOG
kkonganti@0 403 # >MEG_7439|Drugs|Glycopeptides|VanI-type_resistance_protein|VANI
kkonganti@0 404 # >MEG_7245|Drugs|Tetracyclines|Tetracycline_resistance_MFS_efflux_pumps|TETY
kkonganti@0 405 # >MEG_9|Drugs|Aminoglycosides|Aminoglycoside-resistant_16S_ribosomal_subunit_protein|A16S|RequiresSNPConfirmation
kkonganti@0 406
kkonganti@0 407 for my $s (@$seqs) {
kkonganti@0 408 my ( $id, $type, $class, $mech, $group, $note ) = split m/\|/, $s->{ID};
kkonganti@0 409 if ($note) {
kkonganti@0 410
kkonganti@0 411 # "RequiresSNPConfirmation" is the common one; we can't do that
kkonganti@0 412 msg("Skipping $id due to: $note");
kkonganti@0 413 next;
kkonganti@0 414 }
kkonganti@0 415 $s->{ID} = $group;
kkonganti@0 416 $s->{ACC} = $id;
kkonganti@0 417 $s->{DESC} = join( ':', $type, $class, $mech, $group );
kkonganti@0 418 push @okseq, $s;
kkonganti@0 419 }
kkonganti@0 420 return [@okseq];
kkonganti@0 421
kkonganti@0 422 #return $seqs;
kkonganti@0 423 }
kkonganti@0 424
kkonganti@0 425 #..............................................................................
kkonganti@0 426 sub get_argannot {
kkonganti@0 427 my $fasta = 'arg-annot.fa';
kkonganti@0 428 download(
kkonganti@0 429 # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/1425/argannot-aa-v3-march2017_doc.fasta',
kkonganti@0 430 # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/691/argannot-nt_doc.fasta',
kkonganti@0 431 'https://www.mediterranee-infection.com/wp-content/uploads/2019/06/ARG_ANNOT_V5_Nt_JUNE2019.txt',
kkonganti@0 432 $fasta
kkonganti@0 433 );
kkonganti@0 434
kkonganti@0 435 # fix syntax errors in the FASTA file...
kkonganti@0 436 path($fasta)->edit( sub { s/\\//g; $_ } );
kkonganti@0 437
kkonganti@0 438 my $seqs = load_fasta($fasta);
kkonganti@0 439
kkonganti@0 440 # 0 1 2 3
kkonganti@0 441 # >(AGly)Aac2-Ie:NC_011896:3039059-3039607:549
kkonganti@0 442 for my $s (@$seqs) {
kkonganti@0 443 my @x = split m/:/, $s->{ID};
kkonganti@0 444 $s->{ID} = $x[0];
kkonganti@0 445 $s->{ACC} = $x[1] . ':' . $x[2];
kkonganti@0 446 $s->{DESC} = '';
kkonganti@0 447 }
kkonganti@0 448
kkonganti@0 449 return $seqs;
kkonganti@0 450 }
kkonganti@0 451
kkonganti@0 452 #..............................................................................
kkonganti@0 453 sub get_bacmet2 {
kkonganti@0 454 my $fasta = 'bacmet2.fa';
kkonganti@0 455 download(
kkonganti@0 456 'http://bacmet.biomedicine.gu.se/download/BacMet2_EXP_database.fasta',
kkonganti@0 457 $fasta );
kkonganti@0 458
kkonganti@0 459 # This is a PROTEIN file
kkonganti@0 460 my $seqs = load_fasta($fasta);
kkonganti@0 461
kkonganti@0 462 # 0 1 2 3 4 ^
kkonganti@0 463 # >BAC0098|ctpC|sp|P0A502|CTPC_MYCTU Probable manganese/zinc-exporting
kkonganti@0 464 for my $s (@$seqs) {
kkonganti@0 465 my @x = split m/\|/, $s->{ID};
kkonganti@0 466 $s->{ID} = $x[1] . '-' . $x[0];
kkonganti@0 467 $s->{ACC} = $x[2] . ':' . $x[3];
kkonganti@0 468 }
kkonganti@0 469
kkonganti@0 470 return $seqs;
kkonganti@0 471 }
kkonganti@0 472
kkonganti@0 473 #..............................................................................
kkonganti@0 474 sub get_card {
kkonganti@0 475
kkonganti@0 476 # https://github.com/tseemann/abricate/issues/25
kkonganti@0 477 my $tarball = 'card.tar.bz2';
kkonganti@0 478 download(
kkonganti@0 479 #'https://card.mcmaster.ca/download/0/broadstreet-v2.0.2.tar.gz',
kkonganti@0 480 'https://card.mcmaster.ca/latest/data',
kkonganti@0 481
kkonganti@0 482 #'https://card.mcmaster.ca/latest/data/card-data.tar.bz2',
kkonganti@0 483 $tarball # yes, it's really BZ2 not GZ ...
kkonganti@0 484 );
kkonganti@0 485
kkonganti@0 486 # my $fasta = "./nucleotide_fasta_protein_homolog_model.fasta";
kkonganti@0 487 my $jsonfile = "./card.json";
kkonganti@0 488 system( "tar", "xf", $tarball, $jsonfile ) == 0
kkonganti@0 489 or err("Problem with tar xf $tarball $jsonfile");
kkonganti@0 490 -r $jsonfile or err("Could not extract $jsonfile from $tarball");
kkonganti@0 491
kkonganti@0 492 # JSON
kkonganti@0 493 my $json = path($jsonfile)->slurp_utf8;
kkonganti@0 494 my $card = from_json( $json, { latin1 => 1 } );
kkonganti@0 495 my @seq;
kkonganti@0 496 for my $g ( values %$card ) {
kkonganti@0 497 next unless ref($g) eq 'HASH';
kkonganti@0 498
kkonganti@0 499 # msg(Dumper($g));
kkonganti@0 500 next
kkonganti@0 501 unless $g->{model_type} eq
kkonganti@0 502 "protein homolog model"; # only 'acquired' genes
kkonganti@0 503 my $id = $g->{model_name};
kkonganti@0 504 err("$id has {model_param}{snp}") if exists $g->{model_param}{snp};
kkonganti@0 505
kkonganti@0 506 # msg("CARD: $id");
kkonganti@0 507 # print STDERR Dumper($g);
kkonganti@0 508 my $dna = $g->{model_sequences}{sequence}
kkonganti@0 509 or err("$id: no {model_sequences}{sequence} found");
kkonganti@0 510 my ($key) = sort keys %$dna; # first key
kkonganti@0 511 $dna = $dna->{$key} or err("$id: invalid key '$key'");
kkonganti@0 512 $dna = $dna->{dna_sequence} or err("$id: no dna_sequence");
kkonganti@0 513
kkonganti@0 514 # msg(Dumper($dna)) if $id eq 'OXA-25';
kkonganti@0 515
kkonganti@0 516 # ARO_category => {
kkonganti@0 517 # 'category_aro_name' => 'cephalosporin',
kkonganti@0 518 # 'category_aro_class_name' => 'Drug Class',
kkonganti@0 519 my $is_amr_gene = 0;
kkonganti@0 520 my @abx;
kkonganti@0 521 for my $key ( keys $g->{ARO_category}->%* ) {
kkonganti@0 522 my $c = $g->{ARO_category}{$key};
kkonganti@0 523 if ( $c->{category_aro_class_name} eq 'Drug Class' ) {
kkonganti@0 524 my $abx = $c->{category_aro_name};
kkonganti@0 525 $abx =~ s/ antibiotic//;
kkonganti@0 526 $abx =~ s/\s/_/g;
kkonganti@0 527 push @abx, $abx;
kkonganti@0 528 }
kkonganti@0 529 if ( $c->{category_aro_class_name} eq 'AMR Gene Family' ) {
kkonganti@0 530 $is_amr_gene++;
kkonganti@0 531 }
kkonganti@0 532 }
kkonganti@0 533
kkonganti@0 534 #err("CARD | $id | ", Dumper($g->{ARO_category}) ) unless $is_amr_gene;
kkonganti@0 535 #msg("ABX=$_") for @abx;
kkonganti@0 536
kkonganti@0 537 # put coordinates into normal form
kkonganti@0 538 my ( $start, $stop ) =
kkonganti@0 539 $dna->{strand} eq '-'
kkonganti@0 540 ? ( $dna->{fmax}, $dna->{fmin} )
kkonganti@0 541 : ( $dna->{fmin}, $dna->{fmax} );
kkonganti@0 542
kkonganti@0 543 $id =~ s/\s+/_/g;
kkonganti@0 544 push @seq,
kkonganti@0 545 {
kkonganti@0 546 ID => $id,
kkonganti@0 547 ACC => $dna->{accession} . ":$start-$stop",
kkonganti@0 548 DESC => ( $g->{ARO_description} || $g->{ARO_accession} ),
kkonganti@0 549 SEQ => $dna->{sequence},
kkonganti@0 550 ABX => [@abx],
kkonganti@0 551 };
kkonganti@0 552
kkonganti@0 553 # msg(Dumper($seq[-1]));
kkonganti@0 554 }
kkonganti@0 555
kkonganti@0 556 return \@seq;
kkonganti@0 557 }
kkonganti@0 558
kkonganti@0 559 #..............................................................................
kkonganti@0 560 sub get_victors {
kkonganti@0 561
kkonganti@0 562 # the CDS data is in .ffn and has source GI and coords
kkonganti@0 563 # the PROT data is in .faa and has the protein ref and /product
kkonganti@0 564
kkonganti@0 565 #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence
kkonganti@0 566 download( 'http://www.phidias.us/victors/downloads/gen_downloads.php',
kkonganti@0 567 'victors.ffn' );
kkonganti@0 568
kkonganti@0 569 #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni]
kkonganti@0 570 download(
kkonganti@0 571 'http://www.phidias.us/victors/downloads/gen_downloads_protein.php',
kkonganti@0 572 'victors.faa' );
kkonganti@0 573
kkonganti@0 574 my %gi;
kkonganti@0 575 open my $FAA, '<', 'victors.faa';
kkonganti@0 576 while (<$FAA>) {
kkonganti@0 577
kkonganti@0 578 #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni]
kkonganti@0 579 next unless m"^>gi.(\d+).ref.([^|]+). ([^[]+)";
kkonganti@0 580 $gi{$1}{ACC} = $2;
kkonganti@0 581 $gi{$1}{DESC} = $3;
kkonganti@0 582 }
kkonganti@0 583
kkonganti@0 584 my $seqs = load_fasta("victors.ffn");
kkonganti@0 585
kkonganti@0 586 for my $s (@$seqs) {
kkonganti@0 587
kkonganti@0 588 #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence
kkonganti@0 589 $s->{ID} =~ m/gi.(\d+):(\d+)-(\d+)/;
kkonganti@0 590 $s->{ACC} = $gi{$1}{ACC} || "gi|$1:$2-$3";
kkonganti@0 591 $s->{DESC} =~ $gi{$1}{DESC} || 'hypothetical protein';
kkonganti@0 592 }
kkonganti@0 593
kkonganti@0 594 # print Dumper($seqs); exit;
kkonganti@0 595
kkonganti@0 596 return $seqs;
kkonganti@0 597 }
kkonganti@0 598
kkonganti@0 599 #..............................................................................
kkonganti@0 600 sub get_vfdb {
kkonganti@0 601 download( 'http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz',
kkonganti@0 602 'vfdb.fa.gz' );
kkonganti@0 603 system("gzip -f -d -c vfdb.fa.gz > vfdb.fa");
kkonganti@0 604 my $seqs = load_fasta("vfdb.fa");
kkonganti@0 605
kkonganti@0 606 # >VFG000676(gb|AAD32411) (lef) anthrax toxin lethal factor precursor [Anthrax toxin (VF0142)] [Bacillus anthracis str. Sterne]
kkonganti@0 607 for my $s (@$seqs) {
kkonganti@0 608
kkonganti@0 609 # https://github.com/tseemann/abricate/issues/64#issuecomment-421895159 by @VGalata
kkonganti@0 610 $s->{ID} =~ m/^(\w+)\(\w+\|(\w+)(\.\d+)?\)$/; #
kkonganti@0 611 #$s->{ID} =~ m/^(\w+)\(\w+\|(\w+)\)$/;
kkonganti@0 612 $s->{ACC} = $2 if $2;
kkonganti@0 613 $s->{DESC} =~ m/^\((.*?)\)/;
kkonganti@0 614 $s->{ID} = $1 if $1;
kkonganti@0 615
kkonganti@0 616 # print STDERR Dumper($s); exit;
kkonganti@0 617 }
kkonganti@0 618
kkonganti@0 619 return $seqs;
kkonganti@0 620 }
kkonganti@0 621
kkonganti@0 622 #..............................................................................
kkonganti@0 623 sub get_ncbibetalactamase {
kkonganti@0 624 my $fasta = "ncbi.fa";
kkonganti@0 625 download(
kkonganti@0 626 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele-dna.fa',
kkonganti@0 627 $fasta );
kkonganti@0 628 my $tab = "ncbi.tab";
kkonganti@0 629 download( 'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele.tab',
kkonganti@0 630 $tab );
kkonganti@0 631
kkonganti@0 632 # >ACD12694.1 EU650653.1:1-1173
kkonganti@0 633 my $seqs = load_fasta($fasta);
kkonganti@0 634
kkonganti@0 635 # ACC-1 ACD12694.1 EU650653.1 blaACC-1 1 1173 + cephalosporin-hydrolyzing class C beta-lactamase ACC-1
kkonganti@0 636 my %anno;
kkonganti@0 637 my @anno = grep { !m/^#/ } path($tab)->lines( { chomp => 1 } );
kkonganti@0 638 msg( "Read", 0 + @anno, "annotations" );
kkonganti@0 639 foreach (@anno) {
kkonganti@0 640 my ( $name, $id, $acc, $gene, $begin, $end, undef, $product ) =
kkonganti@0 641 split m/\t/;
kkonganti@0 642 $anno{$id} = {
kkonganti@0 643 ID => $gene,
kkonganti@0 644 DESC => $product,
kkonganti@0 645 ACC => "$acc:$begin-$end",
kkonganti@0 646 };
kkonganti@0 647 }
kkonganti@0 648
kkonganti@0 649 # print Dumper(\%anno);
kkonganti@0 650
kkonganti@0 651 for my $s (@$seqs) {
kkonganti@0 652 my $id = $s->{ID};
kkonganti@0 653 next unless exists $anno{$id};
kkonganti@0 654 $s->{ID} = $anno{$id}{ID};
kkonganti@0 655 $s->{ACC} = $anno{$id}{ACC};
kkonganti@0 656 $s->{DESC} = $anno{$id}{DESC};
kkonganti@0 657 }
kkonganti@0 658
kkonganti@0 659 # print Dumper($seqs);
kkonganti@0 660
kkonganti@0 661 return $seqs;
kkonganti@0 662 }
kkonganti@0 663
kkonganti@0 664 #..............................................................................
kkonganti@0 665 sub get_ecoh {
kkonganti@0 666 my $fasta = "EcOH.fa";
kkonganti@0 667 download(
kkonganti@0 668 'https://raw.githubusercontent.com/katholt/srst2/master/data/EcOH.fasta',
kkonganti@0 669 $fasta
kkonganti@0 670 );
kkonganti@0 671
kkonganti@0 672 # https://github.com/katholt/srst2#generating-srst2-compatible-clustered-database-from-raw-sequences
kkonganti@0 673 # [clusterID]__[gene]__[allele]__[seqID] [other stuff]
kkonganti@0 674 # >1__fliC__fliC-H1__1 AB028471.1;flagellin;H1
kkonganti@0 675 # >8__wzx__wzx-O41__246 AB811617.1;O antigen flippase;O41
kkonganti@0 676 # >9__wzy__wzy-OgN31__597 LC125932.1;O antigen polyermase;OgN31
kkonganti@0 677 my $seqs = load_fasta($fasta);
kkonganti@0 678
kkonganti@0 679 for my $s (@$seqs) {
kkonganti@0 680 my @id = split m/__/, $s->{ID};
kkonganti@0 681 my @desc = split m';', $s->{DESC};
kkonganti@0 682 $s->{ID} = $id[2];
kkonganti@0 683 $s->{ACC} = shift(@desc);
kkonganti@0 684 $s->{DESC} = join( ' ', @desc );
kkonganti@0 685 }
kkonganti@0 686
kkonganti@0 687 # print Dumper($seqs);
kkonganti@0 688 return $seqs;
kkonganti@0 689 }
kkonganti@0 690
kkonganti@0 691 #..............................................................................
kkonganti@0 692 sub get_ecoli_vf {
kkonganti@0 693 my $fasta = "ecoli_vf.ffn";
kkonganti@0 694 download(
kkonganti@0 695 'https://github.com/phac-nml/ecoli_vf/raw/master/data/repaired_ecoli_vfs_shortnames.ffn',
kkonganti@0 696 $fasta
kkonganti@0 697 );
kkonganti@0 698 my $seqs = load_fasta($fasta);
kkonganti@0 699
kkonganti@0 700 # >VFG000748(gi:2865308) (espF) EspF [EspF (VF0182)] [Escherichia coli O127:H6 str. E2348/69]
kkonganti@0 701 # >VFG000749(gi:6009379) (bfpA) Bundlin [BFP (VF0174)] [Escherichia coli B171]
kkonganti@0 702 # >SPG000142 (cvac) Escherichia coli cvi cvaC operon. [X57525 434-745]
kkonganti@0 703 # >SPG000143 (iss2) Escherichia coli Iss (iss) gene, complete cds. [AF042279 292-600]
kkonganti@0 704
kkonganti@0 705 for my $s (@$seqs) {
kkonganti@0 706
kkonganti@0 707 #print STDERR Dumper("IN", $s);
kkonganti@0 708 $s->{ID} =~ m/ ^ (\w+) (?: \( (.*?) \) )? $ /x
kkonganti@0 709 or die "Can't parse $fasta at " . Dumper($s);
kkonganti@0 710 $s->{ID} = $1 if $1;
kkonganti@0 711 $s->{ACC} = $2 || $1;
kkonganti@0 712 $s->{DESC} =~ s/\s\[.*?\]$//g; # remove strain name
kkonganti@0 713 $s->{DESC} =~ m/^(?:\((.*?)\)\s+)?(.*?)$/;
kkonganti@0 714 $s->{ID} = $1 if $1;
kkonganti@0 715 $s->{DESC} = $2;
kkonganti@0 716
kkonganti@0 717 #print STDERR Dumper("OUT", $s);
kkonganti@0 718 #print STDERR "="x60, "\n";
kkonganti@0 719 }
kkonganti@0 720
kkonganti@0 721 # print Dumper($seqs);
kkonganti@0 722 return $seqs;
kkonganti@0 723 }
kkonganti@0 724
kkonganti@0 725 #..............................................................................
kkonganti@0 726 sub is_full_gene {
kkonganti@0 727 my ($s) = @_;
kkonganti@0 728 my $has_ambig = 0;
kkonganti@0 729
kkonganti@0 730 my $id = $s->{ID};
kkonganti@0 731 my $L = length( $s->{SEQ} );
kkonganti@0 732 if ( $L % 3 != 0 ) {
kkonganti@0 733 wrn("$id - length $L bp is not multiple of 3");
kkonganti@0 734 return;
kkonganti@0 735 }
kkonganti@0 736 if ( $s->{SEQ} !~ m/^[AGCT]+$/ ) {
kkonganti@0 737 wrn("$id - has non-AGTC bases");
kkonganti@0 738 return;
kkonganti@0 739 }
kkonganti@0 740
kkonganti@0 741 my $seq = Bio::Seq->new( -id => $s->{ID}, -seq => $s->{SEQ} );
kkonganti@0 742
kkonganti@0 743 my $aa = $seq->translate->seq;
kkonganti@0 744
kkonganti@0 745 if ( $aa =~ m/\*./ ) {
kkonganti@0 746 wrn("$id - has internal stop codons, trying revcom");
kkonganti@0 747 $aa = $seq->revcom->translate->seq;
kkonganti@0 748 if ( $aa =~ m/\*./ ) {
kkonganti@0 749 wrn("$id - revcom has internal stop codons too");
kkonganti@0 750 return;
kkonganti@0 751 }
kkonganti@0 752 else {
kkonganti@0 753 msg("$id - revcom resolves problem, hooray!");
kkonganti@0 754 $s->{SEQ} = $seq->revcom->seq;
kkonganti@0 755 }
kkonganti@0 756 }
kkonganti@0 757
kkonganti@0 758 return $L;
kkonganti@0 759 }
kkonganti@0 760
kkonganti@0 761 #..............................................................................
kkonganti@0 762 sub dedupe_seq {
kkonganti@0 763 my ($seq) = @_;
kkonganti@0 764 my %seen;
kkonganti@0 765 my $good = [];
kkonganti@0 766 for my $s (@$seq) {
kkonganti@0 767 if ( $seen{ $s->{SEQ} } ) {
kkonganti@0 768 wrn( "duplicate", length( $s->{SEQ} ),
kkonganti@0 769 "bp sequence:", $s->{ID}, '~', $seen{ $s->{SEQ} } );
kkonganti@0 770 }
kkonganti@0 771 else {
kkonganti@0 772 push @$good, $s;
kkonganti@0 773 }
kkonganti@0 774 $seen{ $s->{SEQ} } .= ' ' . $s->{ID};
kkonganti@0 775 }
kkonganti@0 776 msg( "dedupe_seq: read", scalar(@$seq), "/ kept", scalar(@$good) );
kkonganti@0 777 return $good;
kkonganti@0 778 }
kkonganti@0 779
kkonganti@0 780 #..............................................................................
kkonganti@0 781 sub load_tabular {
kkonganti@0 782 my ( $fname, $keycol, $sep ) = @_;
kkonganti@0 783 $keycol //= 0;
kkonganti@0 784 $sep //= "\t";
kkonganti@0 785 my $hash = {};
kkonganti@0 786 my @hdr;
kkonganti@0 787 my $row = 0;
kkonganti@0 788 open my $TSV, '<', $fname or err("Can't read TSV file: $fname");
kkonganti@0 789 while (<$TSV>) {
kkonganti@0 790 chomp;
kkonganti@0 791 my @col = split m/$sep/;
kkonganti@0 792 $row++;
kkonganti@0 793 if (@hdr) {
kkonganti@0 794 @hdr == @col or err("Header and row $row differ in column count");
kkonganti@0 795
kkonganti@0 796 #my $key = $col[$keycol] or wrn("Empty key column $keycol: $_");
kkonganti@0 797 #exists{$hash->{$col[$key}} and wrn("WARNING: dupe key $key at row: $_");
kkonganti@0 798 my $key = $col[$keycol];
kkonganti@0 799 $hash->{$key} ||=
kkonganti@0 800 { map { ( $hdr[$_] => $col[$_] ) } ( 0 .. $#hdr ) };
kkonganti@0 801 }
kkonganti@0 802 else {
kkonganti@0 803 @hdr = @col;
kkonganti@0 804 }
kkonganti@0 805 }
kkonganti@0 806 close $TSV;
kkonganti@0 807 return $hash;
kkonganti@0 808 }
kkonganti@0 809
kkonganti@0 810 #..............................................................................
kkonganti@0 811 sub load_fasta {
kkonganti@0 812 my ($fasta) = @_;
kkonganti@0 813 my %seen;
kkonganti@0 814 my $list;
kkonganti@0 815 my $dbtype = 'unknown';
kkonganti@0 816 msg("load_fasta: $fasta");
kkonganti@0 817 my $in = Bio::SeqIO->new( -file => $fasta, -format => 'fasta' );
kkonganti@0 818 while ( my $seq = $in->next_seq ) {
kkonganti@0 819 my $id = $seq->id or err("Empty ID in $fasta");
kkonganti@0 820 if ( $seen{$id} ) {
kkonganti@0 821 wrn("Duplicate ID '$id' in $fasta");
kkonganti@0 822 $id = $id . '_dupe';
kkonganti@0 823 }
kkonganti@0 824 $seen{$id}++;
kkonganti@0 825 my $s = uc( $seq->seq );
kkonganti@0 826 $dbtype = $seq->alphabet eq 'dna' ? 'nucl' : 'prot';
kkonganti@0 827 $dbtype eq 'nucl' ? $s =~ s/[^AGTC]/N/g : $s =~ s/[^A-Z]/X/g;
kkonganti@0 828 push @$list,
kkonganti@0 829 {
kkonganti@0 830 ID => $id,
kkonganti@0 831 ACC => '',
kkonganti@0 832 DESC => $seq->desc,
kkonganti@0 833 SEQ => $s,
kkonganti@0 834 TYPE => $dbtype,
kkonganti@0 835 };
kkonganti@0 836 }
kkonganti@0 837 msg( "load_fasta: read", scalar(@$list), "$dbtype sequences" );
kkonganti@0 838 return $list;
kkonganti@0 839 }
kkonganti@0 840
kkonganti@0 841 #..............................................................................
kkonganti@0 842 sub save_fasta {
kkonganti@0 843 my ( $fasta, $seq ) = @_;
kkonganti@0 844 msg("save_fasta: $fasta");
kkonganti@0 845 my %seen;
kkonganti@0 846 my $out = Bio::SeqIO->new( -file => ">$fasta", -format => 'fasta' );
kkonganti@0 847 for my $s (@$seq) {
kkonganti@0 848 $seen{ $s->{ID} }++;
kkonganti@0 849 my $freq = $seen{ $s->{ID} };
kkonganti@0 850
kkonganti@0 851 #wrn("seen $s->{ID} now $freq times") if $freq > 1;
kkonganti@0 852 # print Dumper($s);
kkonganti@0 853 my $ABX =
kkonganti@0 854 defined( $s->{ABX} ) ? join( $ABX_SEP, sort @{ $s->{ABX} } ) : '';
kkonganti@0 855 $ABX =~ s/\s+/_/g; # remove spaces!
kkonganti@0 856 my $obj = Bio::Seq->new(
kkonganti@0 857 -id => join( '~~~', $db, $s->{ID}, $s->{ACC}, $ABX ),
kkonganti@0 858 -desc => ( $s->{DESC} || $s->{ID} ),
kkonganti@0 859 -seq => $s->{SEQ},
kkonganti@0 860 );
kkonganti@0 861
kkonganti@0 862 # $obj->desc( hash_encode($s) );
kkonganti@0 863 $out->write_seq($obj);
kkonganti@0 864
kkonganti@0 865 # $seen{ $s->{ID} }++;
kkonganti@0 866 }
kkonganti@0 867 msg( "save_fasta: wrote", scalar(@$seq), "sequences" );
kkonganti@0 868 }
kkonganti@0 869
kkonganti@0 870 #----------------------------------------------------------------------
kkonganti@0 871 sub msg {
kkonganti@0 872 print STDERR "@_\n";
kkonganti@0 873 }
kkonganti@0 874
kkonganti@0 875 #----------------------------------------------------------------------
kkonganti@0 876 sub wrn {
kkonganti@0 877 msg( "WARNING:", @_ ) if $debug;
kkonganti@0 878 }
kkonganti@0 879
kkonganti@0 880 #----------------------------------------------------------------------
kkonganti@0 881 sub err {
kkonganti@0 882 msg( "ERROR:", @_ );
kkonganti@0 883 exit(1);
kkonganti@0 884 }
kkonganti@0 885
kkonganti@0 886 #----------------------------------------------------------------------
kkonganti@0 887 # Option setting routines
kkonganti@0 888
kkonganti@0 889 sub setOptions {
kkonganti@0 890 use Getopt::Long;
kkonganti@0 891
kkonganti@0 892 @Options = (
kkonganti@0 893 { OPT => "help", VAR => \&usage, DESC => "This help" },
kkonganti@0 894 {
kkonganti@0 895 OPT => "debug!",
kkonganti@0 896 VAR => \$debug,
kkonganti@0 897 DEFAULT => 0,
kkonganti@0 898 DESC => "Verbose debug output"
kkonganti@0 899 },
kkonganti@0 900 {
kkonganti@0 901 OPT => "dbdir=s",
kkonganti@0 902 VAR => \$outdir,
kkonganti@0 903 DEFAULT => abs_path("$FindBin::RealBin/../db"),
kkonganti@0 904 DESC => "Parent folder"
kkonganti@0 905 },
kkonganti@0 906 {
kkonganti@0 907 OPT => "db=s",
kkonganti@0 908 VAR => \$db,
kkonganti@0 909 DEFAULT => "",
kkonganti@0 910 DESC => "Choices: $DATABASES"
kkonganti@0 911 },
kkonganti@0 912 {
kkonganti@0 913 OPT => "force!",
kkonganti@0 914 VAR => \$force,
kkonganti@0 915 DEFAULT => 0,
kkonganti@0 916 DESC => "Force download even if exists"
kkonganti@0 917 },
kkonganti@0 918 );
kkonganti@0 919
kkonganti@0 920 &GetOptions( map { $_->{OPT}, $_->{VAR} } @Options ) || usage(1);
kkonganti@0 921
kkonganti@0 922 # Now setup default values.
kkonganti@0 923 foreach (@Options) {
kkonganti@0 924 if ( defined( $_->{DEFAULT} ) && !defined( ${ $_->{VAR} } ) ) {
kkonganti@0 925 ${ $_->{VAR} } = $_->{DEFAULT};
kkonganti@0 926 }
kkonganti@0 927 }
kkonganti@0 928 }
kkonganti@0 929
kkonganti@0 930 sub usage {
kkonganti@0 931 my ($exitcode) = @_;
kkonganti@0 932 $exitcode = 0 if $exitcode eq 'help'; # what gets passed by getopt func ref
kkonganti@0 933 $exitcode ||= 0;
kkonganti@0 934 select STDERR if $exitcode; # write to STDERR if exitcode is error
kkonganti@0 935
kkonganti@0 936 print "SYNOPIS\n Download databases for abricate to use\n";
kkonganti@0 937 print "USAGE\n $EXE [options] --db DATABASE\n";
kkonganti@0 938 print "OPTIONS\n";
kkonganti@0 939 foreach (@Options) {
kkonganti@0 940 printf " --%-13s %s%s.\n", $_->{OPT}, $_->{DESC},
kkonganti@0 941 defined( $_->{DEFAULT} ) ? " (default '$_->{DEFAULT}')" : "";
kkonganti@0 942 }
kkonganti@0 943 exit($exitcode);
kkonganti@0 944 }
kkonganti@0 945
kkonganti@0 946 #----------------------------------------------------------------------
kkonganti@0 947