comparison 0.6.1/bin/waterfall_per_snp_cluster.pl @ 11:749faef1caa9

"planemo upload"
author kkonganti
date Tue, 05 Sep 2023 11:51:40 -0400
parents
children
comparison
equal deleted inserted replaced
10:1b9de878b04a 11:749faef1caa9
1 #!/usr/bin/env perl
2
3 # Kranti Konganti
4 # 08/23/2023
5
6 use strict;
7 use warnings;
8 use Getopt::Long;
9 use Data::Dumper;
10 use Pod::Usage;
11 use File::Basename;
12 use File::Spec::Functions;
13
14 my $tbl = {};
15 my $snp_2_serovar = {};
16 my $acc_2_serovar = {};
17 my $acc_2_target = {};
18 my $snp_count = {};
19 my $snp_2_acc = {};
20 my $acc_2_snp = {};
21 my $multi_cluster_acc = {};
22 my (
23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
24 $complete_serotype_name, $PDG_file, $table_file,
25 $not_null_pdg_serovar, $snp_cluster, $help,
26 $out_prefix
27 );
28 my @custom_serovars;
29
30 GetOptions(
31 'help' => \$help,
32 'pdg=s' => \$PDG_file,
33 'tbl=s' => \$table_file,
34 'snp=s' => \$snp_cluster,
35 'min_contig_size=i' => \$min_asm_size,
36 'complete_serotype_name' => \$complete_serotype_name,
37 'serocol:i' => \$serovar_or_type_col,
38 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
39 'num_serotypes_per_serotype:i' => \$serovar_limit,
40 'include_serovar=s' => \@custom_serovars,
41 'op=s' => \$out_prefix
42 ) or pod2usage( -verbose => 2 );
43
44 if ( defined $help ) {
45 pod2usage( -verbose => 2 );
46 }
47
48 if ( !defined $serovar_limit ) {
49 $serovar_limit = 1;
50 }
51
52 if ( !defined $serovar_or_type_col ) {
53 $serovar_or_type_col = 49;
54 }
55
56 if ( !defined $min_asm_size ) {
57 $min_asm_size = 0;
58 }
59
60 if ( defined $out_prefix ) {
61 $out_prefix .= '_';
62 }
63 else {
64 $out_prefix = '';
65 }
66
67 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
68
69 open( my $pdg_file, '<', $PDG_file )
70 || die "\nCannot open PDG file $PDG_file: $!\n\n";
71 open( my $tbl_file, '<', $table_file )
72 || die "\nCannot open tbl file $table_file: $!\n\n";
73 open( my $snp_cluster_file, '<', $snp_cluster )
74 || die "\nCannot open $snp_cluster: $!\n\n";
75 open( my $acc_fh, '>', 'acc2serovar.txt' )
76 || die "\nCannot open acc2serovar.txt: $!\n\n";
77 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
78 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
79 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
80 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
81 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
82 || die "\nCannnot open "
83 . $out_prefix
84 . "mash_snp_genome_list.txt for writing: $!\n\n";
85
86 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
87
88 while ( my $line = <$pdg_file> ) {
89 chomp $line;
90 next if ( $line =~ m/^\#/ );
91
92 # Relevent columns (Perl index):
93 # 9: asm_acc
94 # 33: serovar
95 # 48: computed serotype
96
97 my @cols = split( /\t/, $line );
98 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
99 my $acc = $cols[9];
100 my $serovar = $cols[33];
101 my $target_acc = $cols[41];
102
103 $serovar_or_type =~ s/\"//g;
104
105 my $skip = 1;
106 foreach my $ser (@custom_serovars) {
107 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
108 }
109
110 if ( defined $complete_serotype_name ) {
111 next
112 if ( $skip
113 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
114 );
115 }
116
117 next
118 if (
119 $skip
120 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
121 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
122 );
123
124 # next
125 # if (
126 # (
127 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
128 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
129 # )
130 # );
131
132 if ( defined $not_null_pdg_serovar ) {
133 $acc_2_serovar->{$acc} = $serovar_or_type,
134 $acc_2_target->{$acc} = $target_acc,
135 print $acc_fh "$acc\t$serovar_or_type\n"
136 if ( $acc !~ m/NULL/
137 && $serovar !~ m/NULL/
138 && $serovar_or_type !~ m/NULL/ );
139 }
140 else {
141 $acc_2_serovar->{$acc} = $serovar_or_type,
142 $acc_2_target->{$acc} = $target_acc,
143 print $acc_fh "$acc\t$serovar_or_type\n"
144 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
145 }
146
147 # $snp_count->{$serovar_or_type} = 0;
148 }
149
150 #
151 # SNP to ACC
152 #
153
154 while ( my $line = <$snp_cluster_file> ) {
155 chomp $line;
156 my @cols = split( /\t/, $line );
157
158 # Relevant columns
159 # 0: SNP Cluster ID
160 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
161 my $snp_clus_id = $cols[0];
162 my $acc = $cols[3];
163
164 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
165 next if ( !exists $acc_2_serovar->{$acc} );
166 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
167 if ( exists $acc_2_snp->{$acc} ) {
168 print $Stderr
169 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
170 exit 1;
171 }
172 $acc_2_snp->{$acc} = $snp_clus_id;
173 $snp_count->{$snp_clus_id} = 0;
174 }
175
176 while ( my $line = <$tbl_file> ) {
177 chomp $line;
178
179 my @cols = split( /\t/, $line );
180
181 # .tbl file columns (Perl index):
182 #
183 # 0: Accession
184 # 1: AssemblyLevel
185 # 2: ScaffoldN50
186 # 3: ContigN50
187
188 my $acc = $cols[0];
189 my $asm_lvl = $cols[1];
190 my $scaffold_n50 = $cols[2];
191 my $contig_n50 = $cols[3];
192
193 # my $idx0 = $acc_2_serovar->{$cols[0]};
194 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
195
196 if ( not_empty($acc) && defined $idx0 ) {
197 my $fna_rel_loc =
198 "$pdg_release/ncbi_dataset/data/$acc/"
199 . $acc
200 . '_scaffolded_genomic.fna.gz';
201
202 if ( not_empty($scaffold_n50) ) {
203 next if ( $scaffold_n50 <= $min_asm_size );
204 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
205 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
206 }
207 elsif ( not_empty($contig_n50) ) {
208 next if ( $contig_n50 <= $min_asm_size );
209 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
210 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
211 }
212 }
213 }
214
215 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
216 my $count = $snp_count->{$snp_cluster_id};
217 foreach my $asm_lvl (
218 sort { $a cmp $b }
219 keys %{ $snp_2_serovar->{$snp_cluster_id} }
220 )
221 {
222 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
223 $count =
224 print_dl_metadata( $asm_lvl,
225 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
226 $count, $snp_cluster_id );
227 }
228 if ( $asm_lvl =~ m/Chromosome/i ) {
229 $count =
230 print_dl_metadata( $asm_lvl,
231 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
232 $count, $snp_cluster_id );
233 }
234 if ( $asm_lvl =~ m/Scaffold/i ) {
235 $count =
236 print_dl_metadata( $asm_lvl,
237 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
238 $count, $snp_cluster_id );
239 }
240 if ( $asm_lvl =~ m/Contig/i ) {
241 $count =
242 print_dl_metadata( $asm_lvl,
243 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
244 $count, $snp_cluster_id );
245 }
246 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
247 if ( $count > 0 );
248 last if ( $count >= $serovar_limit );
249 }
250 }
251
252 close $pdg_file;
253 close $tbl_file;
254 close $snp_cluster_file;
255 close $acc_fh;
256 close $accs_snp_fh;
257
258 #-------------------------------------------
259 # Main ends
260 #-------------------------------------------
261 # Routines begin
262 #-------------------------------------------
263
264 sub print_dl_metadata {
265 my $asm_lvl = shift;
266 my $acc_sizes = shift;
267 my $curr_count = shift;
268 my $snp_cluster_id = shift;
269
270 $asm_lvl =~ s/.+?\_(.+)/$1/;
271
272 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
273 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
274 my ( $serovar, $url ) = split( /\|/, $serovar_url );
275 return $curr_count if ( exists $multi_cluster_acc->{$url} );
276 $multi_cluster_acc->{$url} = 1;
277 $curr_count++;
278 my ( $final_acc, $genome_header ) =
279 ( split( /\//, $url ) )[ 3 .. 4 ];
280 print $accs_snp_fh "$final_acc\n";
281 print $genome_headers_fh catfile( 'scaffold_genomes',
282 $genome_header )
283 . "\n";
284 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
285 if ( $curr_count > 0 );
286 }
287 last if ( $curr_count >= $serovar_limit );
288 }
289 return $curr_count;
290 }
291
292 sub sort_asm_level {
293 my $level = shift;
294
295 $level =~ s/(Complete\s+Genome)/a\_$1/
296 if ( $level =~ m/Complete\s+Genome/i );
297 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
298 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
299 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
300
301 return $level;
302 }
303
304 sub not_empty {
305 my $col = shift;
306
307 if ( $col !~ m/^$/ ) {
308 return 1;
309 }
310 else {
311 return 0;
312 }
313 }
314
315 __END__
316
317 =head1 SYNOPSIS
318
319 This script will take in a PDG metadata file, a C<.tbl> file and generate
320 the final list by B<I<waterfall>> priority.
321
322 See complete description:
323
324 perldoc waterfall_per_snp_cluster.pl
325
326 or
327
328 waterfall_per_snp_cluster.pl --help
329
330 Examples:
331
332 waterfall_per_snp_cluster.pl
333
334 =head1 DESCRIPTION
335
336 We will retain up to N number of genome accessions per SNP cluster.
337 It prioritizes SNP Cluster participation over serotype coverage.
338 Which N genomes are selected depends on (in order):
339
340 1. Genome assembly level, whose priority is
341
342 a: Complete Genome
343 b: Chromosome
344 c: Scaffold
345 d: Contig
346
347 2. If the genomes are of same assembly level, then
348 scaffold N50 followed by contig N50 is chosen.
349
350 3. If the scaffold or contig N50 is same, then all
351 of them are included
352
353 =head1 OPTIONS
354
355 =over 3
356
357 =item -p PDGXXXXX.XXXX.metadata.tsv
358
359 Absolute UNIX path pointing to the PDG metadata file.
360 Example: PDG000000002.2505.metadata.tsv
361
362 =item -t asm.tbl
363
364 Absolute UNIX path pointing to the file from the result
365 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
366 file.
367
368 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
369
370 Absolute UNIX path pointing to the SNP Cluster metadata file.
371 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
372
373
374 =item --serocol <int> (Optional)
375
376 Column number (non 0-based index) of the PDG metadata file
377 by which the serotypes are collected. Default: 49
378
379 =item --complete_serotype_name (Optional)
380
381 Skip indexing serotypes when the serotype name in the column
382 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
383 an accession has a I<B<serotype=>> string as such in column
384 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
385 then, the indexing of that accession is skipped.
386 Default: False
387
388 =item --not_null_pdg_serovar (Optional)
389
390 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
391 if the B<I<serovar>> column is not C<NULL>.
392
393 =item -i <serotype name> (Optional)
394
395 Make sure the following serotype is included. Mention C<-i> multiple
396 times to include multiple serotypes.
397
398 =item -num <int> (Optional)
399
400 Number of genome accessions per SNP Cluster. Default: 1
401
402 =back
403
404 =head1 AUTHOR
405
406 Kranti Konganti
407
408 =cut