annotate 1.0.0/assets/abricate-get_db @ 0:0a8dda29956e draft default tip

planemo upload
author galaxytrakr
date Thu, 28 May 2026 20:41:10 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env perl
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
2
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
3 use strict;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
4 use FindBin;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
5 use Bio::SeqIO;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
6 use Bio::Seq;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
7 use Path::Tiny;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
8 use File::Basename;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
9 use File::Spec;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
10 use File::Path qw(make_path remove_tree);
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
11 use List::Util qw(first);
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
12 use Cwd qw(abs_path);
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
13 use Data::Dumper;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
14 use LWP::Simple;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
15 use JSON;
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
16
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
17 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
18 # Globals
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
19
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Name this script was invoked as.
my $EXE = basename($0);

# Separator for antibiotic (ABX) names.
# NOTE(review): not used in this section - presumably consumed by the FASTA
# writer further down the file; confirm.
my $ABX_SEP = ';';

# Dispatch table: --db name => function that fetches and parses that database.
# Each function is called with no arguments (after chdir into the work folder)
# and must return an array ref of sequence records (hash refs with an ID key).
my %DATABASE = (
    'resfinder'     => \&get_resfinder,
    'plasmidfinder' => \&get_plasmidfinder,
    'megares'       => \&get_megares,
    'argannot'      => \&get_argannot,
    'card'          => \&get_card,

    # 'ncbibetalactamase' => \&get_ncbibetalactamase,
    'ncbi'     => \&get_ncbi,
    'vfdb'     => \&get_vfdb,
    'ecoli_vf' => \&get_ecoli_vf,    # https://github.com/phac-nml/ecoli_vf
    'ecoh'     => \&get_ecoh,
    'bacmet2'  => \&get_bacmet2,
    'victors'  => \&get_victors,

    # 'serotypefinder' => \&get_serotypefinder,
);
my $DATABASES = join( ' ', sort keys %DATABASE );

#..............................................................................
# Command line options

# These file-scoped lexicals are filled in by setOptions() (defined elsewhere).
my ( @Options, $debug, $outdir, $db, $force );
setOptions();

$db or err("Please choose a --db from: $DATABASES");
exists $DATABASE{$db} or err("Unknown --db '$db', choose from: $DATABASES ");
-d $outdir or err("--outdir '$outdir' does not exist");

my $dir = abs_path( File::Spec->catdir( $outdir, $db ) );
make_path($dir);
msg("Setting up '$db' in '$dir'");

# Downloads and intermediate files live in a fixed 'src' sub-folder so that
# they can be reused on a later run (see --force handling in the fetchers).
my $tmpdir = "$dir/src";
make_path($tmpdir);

# run the specific function from --db
# The fetchers use relative paths, so a failed chdir must be fatal: otherwise
# they would download into (and remove_tree inside) the caller's directory.
chdir $tmpdir or err("Could not change directory to '$tmpdir': $!");
my $seq = $DATABASE{$db}->();

# Called for each record; the return value is deliberately discarded.
# NOTE(review): is_full_gene is defined elsewhere - presumably it only logs
# or annotates; the original author also questioned whether it does anything.
is_full_gene($_) for @$seq;
$seq = dedupe_seq($seq);

msg("Sorting sequences by ID");
$seq = [ sort { $a->{ID} cmp $b->{ID} } @$seq ];
save_fasta( "$dir/sequences", $seq );

msg("Formatting BLASTN database: $dir/sequences");
my $logfile = "$tmpdir/makeblastdb.log";

# The 'ncbi' database is given a more specific BLAST title.
my $ncbi_title = $db eq 'ncbi' ? 'ncbiamrplus' : $db;

# List-form system() bypasses the shell, so paths containing spaces or quotes
# are passed to makeblastdb intact.
my $ec = system(
    'makeblastdb',
    '-in',      "$dir/sequences",
    '-title',   $ncbi_title,
    '-dbtype',  'nucl',
    '-hash_index',
    '-logfile', $logfile,
);
if ( $ec != 0 ) {
    system( 'tail', $logfile );
    err("Error with making BLAST database. See $logfile");
}

#msg("Run 'abricate --setupdb' to format the database");

msg("Done.");
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
89
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
90 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
91
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# download($url, $dest)
#
# Fetch $url into the local file $dest with LWP::Simple's mirror(), unless a
# readable $dest already exists and --force ($force) was not given, in which
# case the existing file is kept.  Afterwards the destination and its size
# are logged via msg().
#
# A mirror() result other than 200 or 304 is reported through err().
# NOTE(review): err() is defined elsewhere - presumably it terminates the
# script; confirm.
sub download {
    my ( $url, $dest ) = @_;
    if ( -r $dest and not $force ) {
        msg("Won't re-download existing $dest (use --force)");
    }
    else {
        msg("Downloading: $url");
        my $ec = mirror( $url, $dest );
        msg("HTTP Result: $ec");

        # 200 = file fetched, 304 = local copy is already up to date.
        # (The comparison with 304 used to be an assignment, which made this
        # test always true and silently hid every failed download.)
        ( $ec == 200 or $ec == 304 )
          or err("HTTP $ec | failed to download $url");
    }
    msg("Destination: $dest");
    msg( "Filesize:", ( -s $dest ), "bytes" );
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
109
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
110 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# trim_spaces($string)
#
# Return a copy of $string with leading and trailing whitespace removed.
# The caller's value is not modified.
sub trim_spaces {
    my $text = shift;

    # 'for' aliases $_ to the private copy so both edits apply to it.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
117
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
118 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# get_resfinder()
#
# Clone the ResFinder database from Bitbucket into ./resfinder_db (relative to
# the current directory) and load every *.fsa file in it.
#
# Returns an array ref of sequence records.  Each record comes from
# load_fasta() (defined elsewhere; assumed to yield hash refs with an ID key)
# and is then updated here:
#   ID   - "<gene>_<copy>"
#   ACC  - the accession that followed the copy number in the FASTA ID
#   DESC - the gene name (or an annotation DESC, if one exists)
#   ABX  - phenotype names from phenotypes.txt for that gene, when available
#
# Exits with status 1 if ./resfinder_db already exists and --force was not
# given.  Reports a failed 'git clone' through err().
#
# Input formats, for reference:
#   <*.fsa> headers        >aac(6')-Ib_2_M23634   >blaNDM-19_1_MF370080
#                          >mcr-1.1_1_KP347127    >mcr-9_1_NZ_NAAN01000063.1
#   <phenotypes.txt> (tab-separated)
#     Gene_accession no.  Class  Phenotype  PMID  Mechanism of resistance  Notes  Required_gene
#     ant(2'')-Ia_1_X04555  Aminoglycoside  Gentamicin, Tobramycin  3024112  ...
sub get_resfinder {
    my $name = "resfinder_db";

    # FIXME - can we just get HEAD.zip like in plasmidfinder?
    my $url = "https://bitbucket.org/genomicepidemiology/$name.git";

    if ( -r $name and not $force ) {
        msg("Won't overwrite existing $name (use --force)");
        exit(1);
    }

    msg("Nuking existing folder: $name");
    remove_tree("./$name");
    msg("Cloning $url to $name");

    # List form avoids the shell; a failed clone is reported here instead of
    # surfacing later as a confusing "file not found" on phenotypes.txt.
    system( 'git', 'clone', '--quiet', $url, $name ) == 0
      or err("Failed to clone $url to $name (status $?)");

    # Gene => { ABX => [ phenotype, ... ] } taken from phenotypes.txt
    my $metafn = "$name/phenotypes.txt";
    my @meta   = path($metafn)->lines( { chomp => 1 } );
    my %anno;
    foreach my $line (@meta) {
        next if $line =~ m/^#/;
        my @x = split m/\t/, $line;

        # Column 0 is "<gene>_<copy>_<accession>".  Rows that do not fit
        # (the header row, blank lines) are skipped rather than being filed
        # under an empty gene name.
        next unless defined $x[0];
        my ($gene) = ( $x[0] =~ m/^(.*?)_\w+$/ );
        next unless defined $gene;

        # Column 2 is a comma-separated phenotype list; drop placeholders.
        $anno{$gene}{ABX} = [
            map { trim_spaces($_) }
            grep { !m/(unknown|notes|^none)/i }
            split m/,\s*/,
            $x[2]
        ];
    }
    msg( "get_resfinder: $metafn", scalar( keys %anno ), "genes" );

    my @seq;
    for my $fasta ( glob("$name/*.fsa") ) {

        # Issue #62 - repair broken fasta files like this:
        #   GCTTTAAATTGGAAAAAAGATAGTCAAACTCTTTAA>cmr_1_U43535
        # by editing the file in place (a .bak copy is kept).
        # NOTE(review): '\n' in the replacement relies on GNU sed.
        system( 'sed', '-i.bak', 's/\([A-Z]\)>/\1\n>/gi', $fasta );
        push @seq, @{ load_fasta($fasta) };
    }

    # https://github.com/tseemann/abricate/issues/92
    #   >mcr-9_1_NZ_NAAN01000063.1
    for my $seq (@seq) {
        my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*?)_(\d+)_(\S+)$/;

        # An ID that does not follow gene_copy_accession is left as loaded;
        # rewriting it would produce the meaningless ID "_".
        unless ( defined $id ) {
            msg("get_resfinder: leaving unparseable ID '$seq->{ID}' unchanged");
            next;
        }

        # Look up without autovivifying an entry for unannotated genes.
        my $meta = $anno{$id} || {};

        $seq->{ID}   = "${id}_${copy}";
        $seq->{ACC}  = $acc;
        $seq->{DESC} = $meta->{DESC} || $id;
        push @{ $seq->{ABX} }, @{ $meta->{ABX} } if $meta->{ABX};
    }

    return \@seq;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
222
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
223 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# get_serotypefinder()
#
# Clone the SerotypeFinder database from Bitbucket into ./serotypefinder_db
# (relative to the current directory) and load every *.fsa file in it.  If the
# folder already exists and --force was not given, the existing copy is used.
#
# Returns an array ref of sequence records from load_fasta() (defined
# elsewhere; assumed to yield hash refs with an ID key).  Each ID of the form
# <gene>_<digits>_<rest> is split: ID becomes "<gene>_<digits>" and ACC becomes
# <rest>.
#
# Reports a failed 'git clone' through err().
#
# FIXME - this is already in EcOH database!
sub get_serotypefinder {
    my $name = "serotypefinder_db";
    my $url  = "https://bitbucket.org/genomicepidemiology/$name.git";

    if ( -r $name and not $force ) {
        msg("Won't overwrite existing $name (use --force)");
    }
    else {
        msg("Nuking existing folder: $name");
        remove_tree("./$name");
        msg("Cloning $url to $name");

        # List form avoids the shell; a failed clone is reported instead of
        # silently yielding an empty sequence list.
        system( 'git', 'clone', '--quiet', $url, $name ) == 0
          or err("Failed to clone $url to $name (status $?)");
    }

    my @seq;
    for my $fasta ( glob("$name/*.fsa") ) {
        push @seq, @{ load_fasta($fasta) };
    }

    # Example header: >fliC_44444_AY250028_H52
    for my $seq (@seq) {
        my ( $id, $copy, $acc ) = $seq->{ID} =~ m/^(.*)_(\d+)_(\w+)$/;

        # An ID that does not follow the expected shape is left as loaded;
        # rewriting it would produce the meaningless ID "_".
        unless ( defined $id ) {
            msg("get_serotypefinder: leaving unparseable ID '$seq->{ID}' unchanged");
            next;
        }

        $seq->{ID}  = "${id}_${copy}";
        $seq->{ACC} = $acc;
    }

    return \@seq;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
259
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
260 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# get_tag($feature, $tag)
#
# Return the first value of $tag on $feature, or the empty string when the
# feature does not carry that tag.  $feature must provide has_tag() and
# get_tag_values().
sub get_tag {
    my ( $feature, $tag_name ) = @_;

    return '' unless $feature->has_tag($tag_name);

    # Only the first value is of interest; any further values are ignored.
    my @values = $feature->get_tag_values($tag_name);
    return $values[0];
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
269
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
270 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# get_ncbi()
#
# Download the AMRFinderPlus nucleotide FASTA (AMR_CDS.fa) and its reference
# gene catalog (ReferenceGeneCatalog.txt) into the current directory, then
# build one sequence record per non-fusion FASTA entry whose accession is in
# the catalog with scope 'plus' or 'core'.
#
# Returns an array ref of hash refs with keys ID (gene symbol), ACC
# (nucleotide accession), DESC (product name, underscores turned into
# spaces), SEQ (the sequence) and ABX (the catalog 'subclass' split on '/').
#
# Catalog columns, as seen in a 2019-08-27.1 release:
#    1 allele                        11 refseq_nucleotide_accession
#    2 gene_family                   12 curated_refseq_start
#    3 whitelisted_taxa              13 genbank_protein_accession
#    4 product_name                  14 genbank_nucleotide_accession
#    5 scope (core/plus)             15 genbank_strand_orientation
#    6 type                          16 genbank_cds_start
#    7 subtype                       17 genbank_cds_stop
#    8 class                         18 pubmed_reference
#    9 subclass                      19 blacklisted_taxa
#   10 refseq_protein_accession      20 db_version
sub get_ncbi {
    my $base_url =
      "https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest";

    my $fasta_url    = "$base_url/AMR_CDS.fa";
    my $fasta_file   = "amr_cds.ffn";
    my $catalog_url  = "$base_url/ReferenceGeneCatalog.txt";
    my $catalog_file = "amr_cds.tsv";

    # Fetch unless both files are already present and --force was not given.
    if ( $force or not( -r $fasta_file and -r $catalog_file ) ) {
        download( $fasta_url,   $fasta_file );
        download( $catalog_url, $catalog_file );
    }
    else {
        msg("Won't overwrite existing $fasta_file/$catalog_file (use --force)");
    }

    # Catalog rows keyed on column index 10 (0-based), which the original
    # author labels refseq_nucleotide_accession.
    # NOTE(review): load_tabular is defined elsewhere; the keying is assumed.
    my $catalog = load_tabular( $catalog_file, 10 );
    msg( "[$catalog_file] Loaded", scalar keys %$catalog, "records" );

    # https://github.com/ncbi/amr/wiki/AMRFinderPlus-database#amrprot
    # FASTA ID fields, '|'-separated:
    #   0          1              2        3 4 5          6      7
    #   1000909371|WP_061158039.1|NG_050200|1|1|blaTEM-156|blaTEM|class_A_beta-lactamase_TEM-156
    my @records;
    my $reader = Bio::SeqIO->new( -file => $fasta_file, -format => "fasta" );

  ENTRY:
    while ( my $entry = $reader->next_seq ) {
        my ( $gi, $protein_acc, $acc, $fp, $fn, $gene, $family, $product ) =
          split m/\|/, $entry->id;

        # skip fusion genes
        next ENTRY unless $fp == 1 and $fn == 1;

        # Catalog keys carry a version suffix; add ".1" when the ID has none.
        $acc .= ".1" unless $acc =~ m/\.\d+$/;

        # only keep true ARGs: must be in the catalog with a wanted scope
        my $row = $catalog->{$acc};
        next ENTRY unless $row;
        next ENTRY if $row->{scope} ne 'plus' and $row->{scope} ne 'core';

        $product =~ s/_/ /g;
        err("$protein_acc: gene is empty")    unless $gene;
        err("$protein_acc: product is empty") unless $product;

        my $record = {
            ID   => $gene,
            ACC  => $acc,
            DESC => $product,
            SEQ  => $entry->seq,
            ABX  => [ split m{/}, $row->{subclass} ],
        };
        push @records, $record;
        msg(
            "[$fasta_file]", scalar(@records), "|", $record->{ID}, "|",
            $record->{ACC}, "|", $record->{DESC}
        );

        # Retained from the original; after the fusion filter above this can
        # only fire for values numerically equal to 1 but not spelled "1".
        msg( $record->{ID}, " is fusion $fp/$fn" ) if "$fp$fn" ne '11';
    }
    return \@records;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
358
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
359 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the PlasmidFinder database and return its sequences.
#
# Downloads a snapshot of the plasmidfinder_db repository, unpacks it flat
# into the current directory and loads every *.fsa file via load_fasta().
# Each record's original ID is kept as DESC; when the ID ends in
# "_<accession>" it is split into a shortened ID and an ACC.
#
# Returns an array ref of sequence record hashes.
sub get_plasmidfinder {
    my $name = "plasmidfinder";
    my $zip  = "$name.zip";

    # download("https://cge.cbs.dtu.dk/cge/download_data.php?folder=$name&filename=$zip&submit=$zip", $zip);
    download(
        "https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/HEAD.zip",
        $zip
    );
    system("unzip -j -u $zip");

    # Gather the records of every FASTA member of the archive.
    my @seq;
    foreach my $file ( glob("*.fsa") ) {
        my $records = load_fasta($file);
        push @seq, @$records;
    }

    foreach my $rec (@seq) {

        # no desc, so use ORIGINAL ID as desc
        my $original = $rec->{ID};
        $rec->{DESC} = $original;

        # "<name>_<accession>" => ( name, accession )
        my ( $id, $acc ) =
          ( $original =~ m/^(.*)_(([A-Z]+|NC_)\d+(\.\d+)?)$/ );
        $id =~ s/_+$//g;

        # Keep the original ID when nothing usable was parsed.
        $rec->{ID}  = $id if $id;
        $rec->{ACC} = $acc || '';

        unless ($id) {
            wrn( "Parsed empty ID:",
                $rec->{DESC},
                "=> id='$id' acc='$acc' seq=" . substr( $rec->{SEQ}, 0, 10 ) );
        }
    }

    return \@seq;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
391
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
392 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the MEGARes database and return the usable drug-resistance sequences.
#
# Downloads and unpacks the MEGARes zip, loads the "megares_drugs_*.fasta"
# file via load_fasta() and rewrites each record from its pipe-separated
# header:
#   ID   <- group (5th field)
#   ACC  <- MEG identifier (1st field)
#   DESC <- "type:class:mechanism:group"
# Records carrying a 6th "note" field (e.g. RequiresSNPConfirmation) are
# skipped.
#
# Returns an array ref of the retained record hashes.
sub get_megares {
    my $zip = "megares.zip";
    download( 'https://www.meglab.org/downloads/megares_v3.00.zip', $zip );
    system("unzip -j -u $zip");

    # glob() in list context returns every match (or nothing at all), so take
    # the first match explicitly instead of splicing the whole list into
    # load_fasta()'s argument list.
    my ($fasta) = glob("megares_drugs_*.fasta");
    err("Could not find megares_drugs_*.fasta after unpacking $zip")
      unless defined $fasta;
    my $seqs = load_fasta($fasta);
    my @okseq;

    # >MEG_372|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_MATE_efflux_pumps|ABEM
    # >MEG_411|Multi-compound|Drug_and_biocide_resistance|Drug_and_biocide_RND_efflux_regulator|ACRR|RequiresSNPConfirmation
    # >MEG_7860|Drugs|betalactams|Class_B_betalactamases|ZOG
    # >MEG_7439|Drugs|Glycopeptides|VanI-type_resistance_protein|VANI
    # >MEG_7245|Drugs|Tetracyclines|Tetracycline_resistance_MFS_efflux_pumps|TETY
    # >MEG_9|Drugs|Aminoglycosides|Aminoglycoside-resistant_16S_ribosomal_subunit_protein|A16S|RequiresSNPConfirmation

    for my $s (@$seqs) {
        my ( $id, $type, $class, $mech, $group, $note ) = split m/\|/, $s->{ID};
        if ($note) {

            # "RequiresSNPConfirmation" is the common one; we can't do that
            msg("Skipping $id due to: $note");
            next;
        }
        $s->{ID}   = $group;
        $s->{ACC}  = $id;
        $s->{DESC} = join( ':', $type, $class, $mech, $group );
        push @okseq, $s;
    }

    return \@okseq;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
424
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
425 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the ARG-ANNOT nucleotide database and return its sequences.
#
# The downloaded file has stray backslashes removed in place before it is
# loaded via load_fasta(). Each colon-separated header is then split into
# ID (1st field) and ACC ("<2nd>:<3rd>"); DESC is emptied.
#
# Returns the array ref produced by load_fasta(), modified in place.
sub get_argannot {
    my $fasta = 'arg-annot.fa';
    download(
        # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/1425/argannot-aa-v3-march2017_doc.fasta',
        # 'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/691/argannot-nt_doc.fasta',
        'https://www.mediterranee-infection.com/wp-content/uploads/2019/09/ARG-ANNOT_NT_V6_July2019.txt',
        $fasta
    );

    # fix syntax errors in the FASTA file...
    path($fasta)->edit( sub { s/\\//g; $_ } );

    my $seqs = load_fasta($fasta);

    #   0              1         2               3
    # >(AGly)Aac2-Ie:NC_011896:3039059-3039607:549
    foreach my $rec ( @{$seqs} ) {
        my @field = split m/:/, $rec->{ID};
        my $location = $field[1] . ':' . $field[2];
        @{$rec}{qw(ID ACC DESC)} = ( $field[0], $location, '' );
    }

    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
451
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
452 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the BacMet2 experimentally-confirmed database and return its records.
#
# Each pipe-separated header is rewritten so that ID becomes "<gene>-<BAC id>"
# and ACC becomes "<source db>:<source accession>"; DESC is left as
# load_fasta() produced it.
#
# Returns the array ref produced by load_fasta(), modified in place.
sub get_bacmet2 {
    my $fasta = 'bacmet2.fa';
    download(
        'http://bacmet.biomedicine.gu.se/download/BacMet2_EXP_database.fasta',
        $fasta
    );

    # This is a PROTEIN file
    my $seqs = load_fasta($fasta);

    #   0       1    2  3      4
    # >BAC0098|ctpC|sp|P0A502|CTPC_MYCTU Probable manganese/zinc-exporting
    foreach my $rec ( @{$seqs} ) {
        my @field = split m/\|/, $rec->{ID};
        my $new_id  = $field[1] . '-' . $field[0];
        my $new_acc = $field[2] . ':' . $field[3];
        @{$rec}{qw(ID ACC)} = ( $new_id, $new_acc );
    }

    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
472
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
473 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Build the CARD sequence set from the "protein homolog model" entries of
# card.json.
#
# Downloads the CARD data tarball, extracts ./card.json from it and turns
# each qualifying model into a record hash:
#   ID   model name with whitespace runs replaced by "_"
#   ACC  "<accession>:<start>-<stop>" (start/stop swapped on the '-' strand)
#   DESC ARO description, or the ARO accession when that is empty
#   SEQ  the model's DNA sequence
#   ABX  array ref of "Drug Class" category names
#
# Models and their ARO categories are visited in sorted key order so that
# the record order and every ABX list are reproducible between runs; Perl's
# hash iteration order is otherwise randomised per process.
#
# Returns an array ref of record hashes. Reports via err() when the tarball
# cannot be unpacked, when a model carries {model_param}{snp}, or when a
# model lacks the expected sequence fields.
sub get_card {

    # https://github.com/tseemann/abricate/issues/25
    my $tarball = 'card.tar.bz2';
    download(
        #'https://card.mcmaster.ca/download/0/broadstreet-v2.0.2.tar.gz',
        'https://card.mcmaster.ca/latest/data',

        #'https://card.mcmaster.ca/latest/data/card-data.tar.bz2',
        $tarball    # yes, it's really BZ2 not GZ ...
    );

    # my $fasta = "./nucleotide_fasta_protein_homolog_model.fasta";
    my $jsonfile = "./card.json";
    system( "tar", "xf", $tarball, $jsonfile ) == 0
      or err("Problem with tar xf $tarball $jsonfile");
    -r $jsonfile or err("Could not extract $jsonfile from $tarball");

    # JSON
    my $json = path($jsonfile)->slurp_utf8;
    my $card = from_json( $json, { latin1 => 1 } );
    my @seq;

    # Sorted keys => stable record order (see header comment).
    for my $model_key ( sort keys %$card ) {
        my $g = $card->{$model_key};

        # Top-level entries that are not hashes are not models.
        next unless ref($g) eq 'HASH';

        # msg(Dumper($g));
        next
          unless $g->{model_type} eq
          "protein homolog model";    # only 'acquired' genes
        my $id = $g->{model_name};
        err("$id has {model_param}{snp}") if exists $g->{model_param}{snp};

        # msg("CARD: $id");
        # print STDERR Dumper($g);
        my $dna = $g->{model_sequences}{sequence}
          or err("$id: no {model_sequences}{sequence} found");
        my ($key) = sort keys %$dna;    # first key
        $dna = $dna->{$key}         or err("$id: invalid key '$key'");
        $dna = $dna->{dna_sequence} or err("$id: no dna_sequence");

        # msg(Dumper($dna)) if $id eq 'OXA-25';

        # ARO_category => {
        #   'category_aro_name' => 'cephalosporin',
        #   'category_aro_class_name' => 'Drug Class',
        my $is_amr_gene = 0;    # only consulted by the disabled check below
        my @abx;
        for my $cat_key ( sort keys $g->{ARO_category}->%* ) {
            my $c = $g->{ARO_category}{$cat_key};
            if ( $c->{category_aro_class_name} eq 'Drug Class' ) {
                my $abx = $c->{category_aro_name};
                $abx =~ s/ antibiotic//;
                $abx =~ s/\s/_/g;
                push @abx, $abx;
            }
            if ( $c->{category_aro_class_name} eq 'AMR Gene Family' ) {
                $is_amr_gene++;
            }
        }

        #err("CARD | $id | ", Dumper($g->{ARO_category}) ) unless $is_amr_gene;
        #msg("ABX=$_") for @abx;

        # put coordinates into normal form
        my ( $start, $stop ) =
            $dna->{strand} eq '-'
          ? ( $dna->{fmax}, $dna->{fmin} )
          : ( $dna->{fmin}, $dna->{fmax} );

        $id =~ s/\s+/_/g;
        push @seq,
          {
            ID   => $id,
            ACC  => $dna->{accession} . ":$start-$stop",
            DESC => ( $g->{ARO_description} || $g->{ARO_accession} ),
            SEQ  => $dna->{sequence},
            ABX  => [@abx],
          };

        # msg(Dumper($seq[-1]));
    }

    return \@seq;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
558
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
559 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the Victors virulence-factor database and return its CDS records.
#
# Two files are downloaded: victors.ffn (CDS, headers carry a GI and
# coordinates) and victors.faa (proteins, headers carry a GI, a RefSeq
# protein accession and the product). The protein headers are indexed by GI
# and used to annotate the CDS records:
#   ACC  <- protein accession for the GI, else "gi|<gi>:<start>-<end>"
#   DESC <- protein product for the GI, else the description load_fasta()
#           already supplied, else 'hypothetical protein'
#
# The DESC line previously used the binding operator "=~" where an
# assignment was intended, so the product was never applied (and a product
# containing regex metacharacters could abort the run). An existing
# description is deliberately kept in preference to the generic placeholder.
#
# Returns the array ref produced by load_fasta(), modified in place.
# Reports via err() if victors.faa cannot be opened.
sub get_victors {

    # the CDS data is in .ffn and has source GI and coords
    # the PROT data is in .faa and has the protein ref and /product

    #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence
    download( 'http://www.phidias.us/victors/downloads/gen_downloads.php',
        'victors.ffn' );

    #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni]
    download(
        'http://www.phidias.us/victors/downloads/gen_downloads_protein.php',
        'victors.faa' );

    # Index protein accession and product by GI.
    my %gi;
    open my $FAA, '<', 'victors.faa'
      or err("Could not open victors.faa: $!");
    while ( my $line = <$FAA> ) {

        #>gi|115534244|ref|YP_783826.1| hypothetical protein pCJ01p4 [Campylobacter jejuni]
        next unless $line =~ m"^>gi.(\d+).ref.([^|]+). ([^[]+)";
        my ( $gi, $acc, $desc ) = ( $1, $2, $3 );

        # The product capture runs up to the "[organism]" part (or the end
        # of the line), so drop the trailing blank/newline it picks up.
        $desc =~ s/\s+$//;
        $gi{$gi} = { ACC => $acc, DESC => $desc };
    }
    close $FAA;

    my $seqs = load_fasta("victors.ffn");

    for my $s (@$seqs) {

        #>gi|115534241:2616-3152 Campylobacter jejuni plasmid pCJ01, complete sequence
        # Only use the captures after a successful match; a failed match
        # would otherwise leave stale values in $1..$3.
        unless ( $s->{ID} =~ m/gi.(\d+):(\d+)-(\d+)/ ) {
            wrn( "Could not parse victors ID:", $s->{ID} );
            next;
        }
        my ( $gi, $start, $end ) = ( $1, $2, $3 );

        # Look up without autovivifying an entry for unknown GIs.
        my $prot = exists $gi{$gi} ? $gi{$gi} : {};
        $s->{ACC}  = $prot->{ACC} || "gi|$gi:$start-$end";
        $s->{DESC} = $prot->{DESC} || $s->{DESC} || 'hypothetical protein';
    }

    # print Dumper($seqs); exit;

    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
598
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
599 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the VFDB core nucleotide set ("setA") and return its records.
#
# The gzipped FASTA is downloaded, decompressed to vfdb.fa and loaded via
# load_fasta(). For each record:
#   ACC <- accession inside "VFGnnn(db|ACCESSION[.version])", when the ID
#          has that shape
#   ID  <- gene name from the leading "(gene)" of the description; when the
#          description has no such prefix, the bare VFG identifier
#
# Captures are taken from each match explicitly. The earlier code read $1/$2
# after unchecked matches, so the VFG fallback only worked through a stale
# $1, and values left over from an unrelated earlier match could leak in.
#
# Returns the array ref produced by load_fasta(), modified in place.
sub get_vfdb {
    download( 'http://www.mgc.ac.cn/VFs/Down/VFDB_setA_nt.fas.gz',
        'vfdb.fa.gz' );
    system("gzip -f -d -c vfdb.fa.gz > vfdb.fa");
    my $seqs = load_fasta("vfdb.fa");

    # >VFG000676(gb|AAD32411) (lef) anthrax toxin lethal factor precursor [Anthrax toxin (VF0142)] [Bacillus anthracis str. Sterne]
    for my $s (@$seqs) {

        # https://github.com/tseemann/abricate/issues/64#issuecomment-421895159 by @VGalata
        # List-context match: empty list (both undef) when the ID does not fit.
        my ( $vfg, $acc ) = $s->{ID} =~ m/^(\w+)\(\w+\|(\w+)(\.\d+)?\)$/;

        #$s->{ID} =~ m/^(\w+)\(\w+\|(\w+)\)$/;
        $s->{ACC} = $acc if $acc;

        if ( $s->{DESC} =~ m/^\((.*?)\)/ ) {

            # Gene name present; an empty "()" leaves the ID untouched.
            $s->{ID} = $1 if $1;
        }
        elsif ($vfg) {

            # No "(gene)" prefix: fall back to the VFG identifier.
            $s->{ID} = $vfg;
        }

        # print STDERR Dumper($s); exit;
    }

    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
621
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
622 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch NCBI's beta-lactamase allele sequences and annotate them from the
# accompanying tab-separated table.
#
# Allele-dna.fa supplies the sequences (keyed by protein accession) and
# Allele.tab supplies gene name, product and nucleotide coordinates. Records
# whose ID has no row in the table are returned unchanged.
#
# Returns the array ref produced by load_fasta(), modified in place.
sub get_ncbibetalactamase {
    my $fasta = "ncbi.fa";
    download(
        'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele-dna.fa',
        $fasta
    );
    my $tab = "ncbi.tab";
    download(
        'ftp://ftp.ncbi.nlm.nih.gov/pathogen/betalactamases/Allele.tab',
        $tab
    );

    # >ACD12694.1 EU650653.1:1-1173
    my $seqs = load_fasta($fasta);

    # ACC-1   ACD12694.1      EU650653.1      blaACC-1        1       1173    +       cephalosporin-hydrolyzing class C beta-lactamase ACC-1
    # Comment lines (leading '#') are not annotation rows.
    my @rows = grep { !m/^#/ } path($tab)->lines( { chomp => 1 } );
    msg( "Read", 0 + @rows, "annotations" );

    # Index the table by protein accession (2nd column).
    my %anno;
    for my $row (@rows) {
        my ( undef, $protein, $nucl, $gene, $from, $to, undef, $product ) =
          split m/\t/, $row;
        $anno{$protein} = {
            ID   => $gene,
            DESC => $product,
            ACC  => "$nucl:$from-$to",
        };
    }

    # print Dumper(\%anno);

    for my $rec ( @{$seqs} ) {
        my $hit = $anno{ $rec->{ID} } or next;
        @{$rec}{qw(ID ACC DESC)} = @{$hit}{qw(ID ACC DESC)};
    }

    # print Dumper($seqs);

    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
663
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
664 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the SRST2 EcOH (E. coli O/H serotyping) database and return its
# records.
#
# Headers follow the SRST2 layout "cluster__gene__allele__seqID" with a
# semicolon-separated description. ID becomes the allele (3rd "__" field),
# ACC the first description field, and DESC the remaining description
# fields joined by single spaces.
#
# Returns the array ref produced by load_fasta(), modified in place.
sub get_ecoh {
    my $fasta = "EcOH.fa";
    download(
        'https://raw.githubusercontent.com/katholt/srst2/master/data/EcOH.fasta',
        $fasta
    );

    # https://github.com/katholt/srst2#generating-srst2-compatible-clustered-database-from-raw-sequences
    # [clusterID]__[gene]__[allele]__[seqID] [other stuff]
    # >1__fliC__fliC-H1__1 AB028471.1;flagellin;H1
    # >8__wzx__wzx-O41__246 AB811617.1;O antigen flippase;O41
    # >9__wzy__wzy-OgN31__597 LC125932.1;O antigen polyermase;OgN31
    my $seqs = load_fasta($fasta);

    foreach my $rec ( @{$seqs} ) {
        my @part = split m/__/, $rec->{ID};
        my ( $acc, @rest ) = split m';', $rec->{DESC};
        @{$rec}{qw(ID ACC DESC)} = ( $part[2], $acc, join( ' ', @rest ) );
    }

    # print Dumper($seqs);
    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
690
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
691 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Fetch the phac-nml E. coli virulence-factor FASTA and normalise its headers.
#
# The file is downloaded to "ecoli_vf.ffn" and read with load_fasta(). Each
# record is then rewritten in place:
#   ID   - the leading word of the original ID, replaced by the gene name when
#          the description starts with "(gene)".
#   ACC  - the parenthesised part of the original ID, or the leading word when
#          there is none.
#   DESC - the description without the leading "(gene)" and without the
#          trailing bracketed annotation.
# Returns the array ref from load_fasta(). Dies if an ID is neither "WORD" nor
# "WORD(something)".
sub get_ecoli_vf {
    my $fasta = "ecoli_vf.ffn";
    download(
        'https://github.com/phac-nml/ecoli_vf/raw/master/data/repaired_ecoli_vfs_shortnames.ffn',
        $fasta
    );
    my $seqs = load_fasta($fasta);

    # Example headers:
    # >VFG000748(gi:2865308) (espF) EspF [EspF (VF0182)] [Escherichia coli O127:H6 str. E2348/69]
    # >VFG000749(gi:6009379) (bfpA) Bundlin [BFP (VF0174)] [Escherichia coli B171]
    # >SPG000142 (cvac) Escherichia coli cvi cvaC operon. [X57525 434-745]
    # >SPG000143 (iss2) Escherichia coli Iss (iss) gene, complete cds. [AF042279 292-600]

    for my $s (@$seqs) {

        # Captures are copied into lexicals straight away so that later
        # matches cannot silently change what is stored.
        my ( $name, $acc ) = $s->{ID} =~ m/ ^ (\w+) (?: \( (.*?) \) )? $ /x
          or die "Can't parse $fasta at " . Dumper($s);
        $s->{ID}  = $name if $name;
        $s->{ACC} = $acc || $name;

        # The pattern is anchored at the end of the string, so the match
        # starts at the first " [" and removes everything from there onwards.
        $s->{DESC} =~ s/\s\[.*?\]$//g;

        # Only use the captures when the match actually succeeded; otherwise
        # leave ID and DESC as they are instead of reusing stale values.
        if ( $s->{DESC} =~ m/^(?:\((.*?)\)\s+)?(.*?)$/ ) {
            my ( $gene, $rest ) = ( $1, $2 );
            $s->{ID}   = $gene if $gene;
            $s->{DESC} = $rest;
        }
    }

    return $seqs;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
724
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
725 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Decide whether a record looks like a complete coding sequence.
#
# $s is a hash ref; ID and SEQ are read. Returns the sequence length when the
# length is a multiple of 3, the sequence consists only of A/G/C/T, and the
# translation has no stop codon before its last residue on either the given
# strand or the reverse complement. Returns nothing (bare return) otherwise.
#
# Side effect: when only the reverse complement translates cleanly,
# $s->{SEQ} is replaced by the reverse-complemented sequence.
sub is_full_gene {
    my ($s) = @_;

    my $id = $s->{ID};
    my $L  = length( $s->{SEQ} );

    # A whole number of codons is required.
    if ( $L % 3 != 0 ) {
        wrn("$id - length $L bp is not multiple of 3");
        return;
    }

    # Reject anything containing characters other than the four bases.
    if ( $s->{SEQ} !~ m/^[AGCT]+$/ ) {
        wrn("$id - has non-AGTC bases");
        return;
    }

    my $seq = Bio::Seq->new( -id => $s->{ID}, -seq => $s->{SEQ} );
    my $aa  = $seq->translate->seq;

    # "*" followed by any character means a stop before the final residue.
    if ( $aa =~ m/\*./ ) {
        wrn("$id - has internal stop codons, trying revcom");

        # Build the reverse complement once and reuse it below.
        my $rc = $seq->revcom;
        $aa = $rc->translate->seq;
        if ( $aa =~ m/\*./ ) {
            wrn("$id - revcom has internal stop codons too");
            return;
        }

        msg("$id - revcom resolves problem, hooray!");
        $s->{SEQ} = $rc->seq;
    }

    return $L;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
760
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
761 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Drop records whose SEQ string has already been seen.
#
# $seq is an array ref of record hash refs. The first record for each distinct
# SEQ is kept, in input order; later ones are reported through wrn() together
# with the IDs already recorded for that sequence. Returns a new array ref of
# the kept records.
sub dedupe_seq {
    my ($seq) = @_;

    my %ids_for;    # SEQ string => space-prefixed list of IDs seen so far
    my @kept;

    foreach my $rec ( @{$seq} ) {
        my $bases    = $rec->{SEQ};
        my $earlier  = $ids_for{$bases};

        if ($earlier) {
            wrn( "duplicate", length($bases), "bp sequence:", $rec->{ID}, '~', $earlier );
        }
        else {
            push @kept, $rec;
        }

        # Recorded for kept and dropped records alike.
        $ids_for{$bases} .= ' ' . $rec->{ID};
    }

    msg( "dedupe_seq: read", scalar( @{$seq} ), "/ kept", scalar(@kept) );
    return \@kept;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
779
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
780 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Load a delimited text file into a hash of row hashes.
#
#   $fname  - path of the file to read
#   $keycol - zero-based index of the column used as hash key (default 0)
#   $sep    - field separator, interpolated into a regex for split()
#             (default: tab)
#
# The first line that yields at least one field is taken as the header. Every
# later line must have the same number of fields as the header, otherwise
# err() is called. Each row becomes { header_name => value, ... } stored under
# the value of its key column; when a key repeats, the first row is kept.
# Returns the hash ref. Calls err() if the file cannot be opened.
sub load_tabular {
    my ( $fname, $keycol, $sep ) = @_;
    $keycol //= 0;
    $sep    //= "\t";

    my %row_for;
    my @hdr;
    my $row = 0;

    open my $TSV, '<', $fname or err("Can't read TSV file: $fname");

    # Read into a lexical: a bare while (<$TSV>) would overwrite the caller's
    # global $_ (or an element aliased to it by an enclosing loop).
    while ( my $line = <$TSV> ) {
        chomp $line;
        my @col = split m/$sep/, $line;
        $row++;

        if ( !@hdr ) {
            @hdr = @col;
            next;
        }

        @hdr == @col or err("Header and row $row differ in column count");

        my $key = $col[$keycol];

        # ||= keeps the first row seen for a given key.
        $row_for{$key} ||= { map { ( $hdr[$_] => $col[$_] ) } ( 0 .. $#hdr ) };
    }
    close $TSV;

    return \%row_for;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
809
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
810 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Read a FASTA file into an array ref of record hashes.
#
# Each record is { ID, ACC, DESC, SEQ, TYPE }:
#   ID   - the sequence id; a repeated id gets "_dupe" appended
#   ACC  - always the empty string here
#   DESC - the description reported by the sequence object
#   SEQ  - upper-cased sequence; for 'nucl' every character other than
#          A/G/T/C becomes N, otherwise every character outside A-Z becomes X
#   TYPE - 'nucl' when the sequence alphabet is 'dna', else 'prot'
#
# Returns the array ref, which is empty when the file holds no sequences.
# Calls err() when a sequence has an empty id.
sub load_fasta {
    my ($fasta) = @_;
    my %seen;

    # Start with an empty list so that a file without records gives a valid,
    # empty array ref for the count below and for the caller.
    my $list   = [];
    my $dbtype = 'unknown';

    msg("load_fasta: $fasta");
    my $in = Bio::SeqIO->new( -file => $fasta, -format => 'fasta' );

    while ( my $seq = $in->next_seq ) {
        my $id = $seq->id or err("Empty ID in $fasta");
        if ( $seen{$id} ) {
            wrn("Duplicate ID '$id' in $fasta");
            $id = $id . '_dupe';
        }
        $seen{$id}++;

        my $s = uc( $seq->seq );
        $dbtype = $seq->alphabet eq 'dna' ? 'nucl' : 'prot';

        # Mask unexpected characters with the placeholder for this type.
        if ( $dbtype eq 'nucl' ) {
            $s =~ s/[^AGTC]/N/g;
        }
        else {
            $s =~ s/[^A-Z]/X/g;
        }

        push @$list,
          {
            ID   => $id,
            ACC  => '',
            DESC => $seq->desc,
            SEQ  => $s,
            TYPE => $dbtype,
          };
    }

    # $dbtype here is the type of the last record read ('unknown' if none).
    msg( "load_fasta: read", scalar(@$list), "$dbtype sequences" );
    return $list;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
840
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
841 #..............................................................................
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Write records to a FASTA file in the "~~~"-delimited header layout.
#
#   $fasta - output path (opened for writing through Bio::SeqIO)
#   $seq   - array ref of record hash refs (ID, ACC, DESC, SEQ, optional ABX)
#
# Each header id is "$db~~~ID~~~ACC~~~ABX", where ABX is the sorted ABX list
# joined with $ABX_SEP and with whitespace runs turned into "_". The
# description is DESC, falling back to ID when DESC is false.
sub save_fasta {
    my ( $fasta, $seq ) = @_;
    msg("save_fasta: $fasta");

    my $out = Bio::SeqIO->new( -file => ">$fasta", -format => 'fasta' );

    for my $s (@$seq) {
        my $ABX =
          defined( $s->{ABX} ) ? join( $ABX_SEP, sort @{ $s->{ABX} } ) : '';

        # Keep the id free of whitespace.
        $ABX =~ s/\s+/_/g;

        my $obj = Bio::Seq->new(
            -id   => join( '~~~', $db, $s->{ID}, $s->{ACC}, $ABX ),
            -desc => ( $s->{DESC} || $s->{ID} ),
            -seq  => $s->{SEQ},
        );

        $out->write_seq($obj);
    }

    msg( "save_fasta: wrote", scalar(@$seq), "sequences" );
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
869
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
870 #----------------------------------------------------------------------
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Print the arguments to STDERR on one line, joined by the list separator
# ($", a single space unless changed) and followed by a newline.
sub msg {
    my $text = join( $", @_ );
    print STDERR $text . "\n";
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
874
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
875 #----------------------------------------------------------------------
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Emit a "WARNING:"-prefixed message through msg(), but only when $debug is
# true; otherwise nothing is printed.
sub wrn {
    if ($debug) {
        msg( "WARNING:", @_ );
    }
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
879
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
880 #----------------------------------------------------------------------
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Emit an "ERROR:"-prefixed message through msg() and terminate the program
# with exit status 1.
sub err {
    my @complaint = ( "ERROR:", @_ );
    msg(@complaint);
    exit 1;
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
885
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
886 #----------------------------------------------------------------------
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
887 # Option setting routines
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
888
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Define the command-line options, parse the command line, then fill in
# defaults.
#
# @Options is (re)populated with one hash per option: OPT is the Getopt::Long
# specification, VAR the reference handed to GetOptions, DEFAULT an optional
# fallback value and DESC the help text printed by usage(). If GetOptions
# reports failure, usage(1) is called. Afterwards every option that has a
# defined DEFAULT and whose variable is still undefined receives that default.
sub setOptions {
    use Getopt::Long;

    @Options = (
        { OPT => "help", VAR => \&usage, DESC => "This help" },
        {
            OPT     => "debug!",
            VAR     => \$debug,
            DEFAULT => 0,
            DESC    => "Verbose debug output"
        },
        {
            OPT     => "dbdir=s",
            VAR     => \$outdir,
            DEFAULT => abs_path("$FindBin::RealBin/../db"),
            DESC    => "Parent folder"
        },
        {
            OPT     => "db=s",
            VAR     => \$db,
            DEFAULT => "",
            DESC    => "Choices: $DATABASES"
        },
        {
            OPT     => "force!",
            VAR     => \$force,
            DEFAULT => 0,
            DESC    => "Force download even if exists"
        },
    );

    # Flatten the table into the (spec, destination) pairs GetOptions expects.
    my @getopt_args = map { ( $_->{OPT}, $_->{VAR} ) } @Options;
    GetOptions(@getopt_args) or usage(1);

    # Now setup default values.
    for my $opt (@Options) {
        # Entries without DEFAULT (such as the help callback) are skipped
        # before their VAR is ever dereferenced as a scalar.
        next unless defined $opt->{DEFAULT};

        my $target = $opt->{VAR};
        ${$target} = $opt->{DEFAULT} unless defined ${$target};
    }
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
929
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
# Print the help text built from @Options and exit.
#
# $exitcode is the process exit status. When this sub is invoked as the
# Getopt::Long handler for --help the first argument is the option name
# ('help'), which is treated as status 0; a missing argument also means 0.
# Output goes to STDOUT for status 0 and to STDERR for any other status.
sub usage {
    my ($exitcode) = @_;

    # what gets passed by getopt func ref
    $exitcode = 0 if defined $exitcode && $exitcode eq 'help';
    $exitcode ||= 0;

    # write to STDERR if exitcode is error
    select STDERR if $exitcode;

    print "SYNOPSIS\n Download databases for abricate to use\n";
    print "USAGE\n $EXE [options] --db DATABASE\n";
    print "OPTIONS\n";
    foreach (@Options) {
        printf " --%-13s %s%s.\n", $_->{OPT}, $_->{DESC},
          defined( $_->{DEFAULT} ) ? " (default '$_->{DEFAULT}')" : "";
    }
    exit($exitcode);
}
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
945
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
946 #----------------------------------------------------------------------
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
947