comparison 0.7.0/bin/waterfall_per_snp_cluster.pl @ 17:0e7a0053e4a6

planemo upload
author kkonganti
date Mon, 15 Jul 2024 10:42:02 -0400
parents
children
comparison
equal deleted inserted replaced
16:b90e5a7a3d4f 17:0e7a0053e4a6
1 #!/usr/bin/env perl
2
3 # Kranti Konganti
4 # 01/02/2024
5
6 use strict;
7 use warnings;
8 use Getopt::Long;
9 use Data::Dumper;
10 use Pod::Usage;
11 use File::Basename;
12 use File::Spec::Functions;
13
14 my $tbl = {};
15 my $snp_2_serovar = {};
16 my $acc_2_serovar = {};
17 my $acc_2_target = {};
18 my $snp_count = {};
19 my $snp_2_acc = {};
20 my $acc_2_snp = {};
21 my $multi_cluster_acc = {};
22 my (
23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
24 $complete_serotype_name, $PDG_file, $table_file,
25 $not_null_pdg_serovar, $snp_cluster, $help,
26 $out_prefix, $acc_col, $seronamecol,
27 $target_acc_col
28 );
29 my @custom_serovars;
30
31 GetOptions(
32 'help' => \$help,
33 'pdg=s' => \$PDG_file,
34 'tbl=s' => \$table_file,
35 'snp=s' => \$snp_cluster,
36 'min_contig_size=i' => \$min_asm_size,
37 'complete_serotype_name' => \$complete_serotype_name,
38 'serocol:i' => \$serovar_or_type_col,
39 'seronamecol:i' => \$seronamecol,
40 'target_acc_col:i' => \$target_acc_col,
41 'acc_col:i' => \$acc_col,
42 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
43 'num_serotypes_per_serotype:i' => \$serovar_limit,
44 'include_serovar=s' => \@custom_serovars,
45 'op=s' => \$out_prefix
46 ) or pod2usage( -verbose => 2 );
47
48 if ( defined $help ) {
49 pod2usage( -verbose => 2 );
50 }
51
52 if ( !defined $serovar_limit ) {
53 $serovar_limit = 1;
54 }
55
56 if ( !defined $serovar_or_type_col ) {
57 $serovar_or_type_col = 50;
58 }
59
60 if ( !defined $seronamecol ) {
61 $seronamecol = 34;
62 }
63
64 if ( !defined $target_acc_col ) {
65 $target_acc_col = 43;
66 }
67
68 if ( !defined $acc_col ) {
69 $acc_col = 10;
70 }
71
72 if ( !defined $min_asm_size ) {
73 $min_asm_size = 0;
74 }
75
76 if ( defined $out_prefix ) {
77 $out_prefix .= '_';
78 }
79 else {
80 $out_prefix = '';
81 }
82
83 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
84
85 open( my $pdg_file, '<', $PDG_file )
86 || die "\nCannot open PDG file $PDG_file: $!\n\n";
87 open( my $tbl_file, '<', $table_file )
88 || die "\nCannot open tbl file $table_file: $!\n\n";
89 open( my $snp_cluster_file, '<', $snp_cluster )
90 || die "\nCannot open $snp_cluster: $!\n\n";
91 open( my $acc_fh, '>', 'acc2serovar.txt' )
92 || die "\nCannot open acc2serovar.txt: $!\n\n";
93 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
94 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
95 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
96 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
97 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
98 || die "\nCannnot open "
99 . $out_prefix
100 . "mash_snp_genome_list.txt for writing: $!\n\n";
101
102 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
103
104 while ( my $line = <$pdg_file> ) {
105 chomp $line;
106 next if ( $line =~ m/^\#/ );
107
108 # Relevent columns (Perl index):
109 # 10-1 = 9: asm_acc
110 # 34 -1 = 33: serovar
111 # 50 -1 = 49: computed serotype
112
113 my @cols = split( /\t/, $line );
114 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
115 my $acc = $cols[ $acc_col - 1 ];
116 my $serovar = $cols[ $seronamecol - 1 ];
117 my $target_acc = $cols[ $target_acc_col - 1 ];
118
119 $serovar_or_type =~ s/\"//g;
120
121 my $skip = 1;
122 foreach my $ser (@custom_serovars) {
123 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
124 }
125
126 if ( defined $complete_serotype_name ) {
127 next
128 if ( $skip
129 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
130 );
131 }
132
133 next
134 if (
135 $skip
136 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
137 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
138 );
139
140 # next
141 # if (
142 # (
143 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
144 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
145 # )
146 # );
147
148 if ( defined $not_null_pdg_serovar ) {
149 $acc_2_serovar->{$acc} = $serovar_or_type,
150 $acc_2_target->{$acc} = $target_acc,
151 print $acc_fh "$acc\t$serovar_or_type\n"
152 if ( $acc !~ m/NULL/
153 && $serovar !~ m/NULL/
154 && $serovar_or_type !~ m/NULL/ );
155 }
156 else {
157 $acc_2_serovar->{$acc} = $serovar_or_type,
158 $acc_2_target->{$acc} = $target_acc,
159 print $acc_fh "$acc\t$serovar_or_type\n"
160 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
161 }
162
163 # $snp_count->{$serovar_or_type} = 0;
164 }
165
166 #
167 # SNP to ACC
168 #
169
170 while ( my $line = <$snp_cluster_file> ) {
171 chomp $line;
172 my @cols = split( /\t/, $line );
173
174 # Relevant columns
175 # 0: SNP Cluster ID
176 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
177 my $snp_clus_id = $cols[0];
178 my $acc = $cols[3];
179
180 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
181 next if ( !exists $acc_2_serovar->{$acc} );
182 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
183 if ( exists $acc_2_snp->{$acc} ) {
184 print $Stderr
185 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
186 exit 1;
187 }
188 $acc_2_snp->{$acc} = $snp_clus_id;
189 $snp_count->{$snp_clus_id} = 0;
190 }
191
192 while ( my $line = <$tbl_file> ) {
193 chomp $line;
194
195 my @cols = split( /\t/, $line );
196
197 # .tbl file columns (Perl index):
198 #
199 # 0: Accession
200 # 1: AssemblyLevel
201 # 2: ScaffoldN50
202 # 3: ContigN50
203
204 my $acc = $cols[0];
205 my $asm_lvl = $cols[1];
206 my $scaffold_n50 = $cols[2];
207 my $contig_n50 = $cols[3];
208
209 # my $idx0 = $acc_2_serovar->{$cols[0]};
210 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
211
212 if ( not_empty($acc) && defined $idx0 ) {
213 my $fna_rel_loc =
214 "$pdg_release/ncbi_dataset/data/$acc/"
215 . $acc
216 . '_scaffolded_genomic.fna.gz';
217
218 if ( not_empty($scaffold_n50) ) {
219 next if ( $scaffold_n50 <= $min_asm_size );
220 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
221 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
222 }
223 elsif ( not_empty($contig_n50) ) {
224 next if ( $contig_n50 <= $min_asm_size );
225 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
226 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
227 }
228 }
229 }
230
231 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
232 my $count = $snp_count->{$snp_cluster_id};
233 foreach my $asm_lvl (
234 sort { $a cmp $b }
235 keys %{ $snp_2_serovar->{$snp_cluster_id} }
236 )
237 {
238 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
239 $count =
240 print_dl_metadata( $asm_lvl,
241 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
242 $count, $snp_cluster_id );
243 }
244 if ( $asm_lvl =~ m/Chromosome/i ) {
245 $count =
246 print_dl_metadata( $asm_lvl,
247 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
248 $count, $snp_cluster_id );
249 }
250 if ( $asm_lvl =~ m/Scaffold/i ) {
251 $count =
252 print_dl_metadata( $asm_lvl,
253 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
254 $count, $snp_cluster_id );
255 }
256 if ( $asm_lvl =~ m/Contig/i ) {
257 $count =
258 print_dl_metadata( $asm_lvl,
259 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
260 $count, $snp_cluster_id );
261 }
262 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
263 if ( $count > 0 );
264 last if ( $count >= $serovar_limit );
265 }
266 }
267
268 close $pdg_file;
269 close $tbl_file;
270 close $snp_cluster_file;
271 close $acc_fh;
272 close $accs_snp_fh;
273
274 #-------------------------------------------
275 # Main ends
276 #-------------------------------------------
277 # Routines begin
278 #-------------------------------------------
279
280 sub print_dl_metadata {
281 my $asm_lvl = shift;
282 my $acc_sizes = shift;
283 my $curr_count = shift;
284 my $snp_cluster_id = shift;
285
286 $asm_lvl =~ s/.+?\_(.+)/$1/;
287
288 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
289 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
290 my ( $serovar, $url ) = split( /\|/, $serovar_url );
291 return $curr_count if ( exists $multi_cluster_acc->{$url} );
292 $multi_cluster_acc->{$url} = 1;
293 $curr_count++;
294 my ( $final_acc, $genome_header ) =
295 ( split( /\//, $url ) )[ 3 .. 4 ];
296 print $accs_snp_fh "$final_acc\n";
297 print $genome_headers_fh catfile( 'scaffold_genomes',
298 $genome_header )
299 . "\n";
300 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
301 if ( $curr_count > 0 );
302 }
303 last if ( $curr_count >= $serovar_limit );
304 }
305 return $curr_count;
306 }
307
308 sub sort_asm_level {
309 my $level = shift;
310
311 $level =~ s/(Complete\s+Genome)/a\_$1/
312 if ( $level =~ m/Complete\s+Genome/i );
313 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
314 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
315 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
316
317 return $level;
318 }
319
320 sub not_empty {
321 my $col = shift;
322
323 if ( $col !~ m/^$/ ) {
324 return 1;
325 }
326 else {
327 return 0;
328 }
329 }
330
331 __END__
332
333 =head1 SYNOPSIS
334
335 This script will take in a PDG metadata file, a C<.tbl> file and generate
336 the final list by B<I<waterfall>> priority.
337
338 See complete description:
339
340 perldoc waterfall_per_snp_cluster.pl
341
342 or
343
344 waterfall_per_snp_cluster.pl --help
345
346 Examples:
347
348 waterfall_per_snp_cluster.pl
349
350 =head1 DESCRIPTION
351
352 We will retain up to N number of genome accessions per SNP cluster.
353 It prioritizes SNP Cluster participation over serotype coverage.
354 Which N genomes are selected depends on (in order):
355
356 1. Genome assembly level, whose priority is
357
358 a: Complete Genome
359 b: Chromosome
360 c: Scaffold
361 d: Contig
362
363 2. If the genomes are of same assembly level, then
364 scaffold N50 followed by contig N50 is chosen.
365
366 3. If the scaffold or contig N50 is same, then all
367 of them are included
368
369 =head1 OPTIONS
370
371 =over 3
372
373 =item -p PDGXXXXX.XXXX.metadata.tsv
374
375 Absolute UNIX path pointing to the PDG metadata file.
376 Example: PDG000000002.2505.metadata.tsv
377
378 =item -t asm.tbl
379
380 Absolute UNIX path pointing to the file from the result
381 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
382 file.
383
384 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
385
386 Absolute UNIX path pointing to the SNP Cluster metadata file.
387 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
388
389 =item --serocol <int> (Optional)
390
391 Column number (non 0-based index) of the PDG metadata file
392 by which the serotypes are collected
393 (column name: "computed_types"). Default: 50
394
395 =item --seronamecol <int> (Optional)
396
397 Column number (non 0-based index) of the PDG metadata file
398 whose column name is "serovar". Default: 34
399
400 =item --acc_col <int> (Optional)
401
402 Column number (non 0-based index) of the PDG metadata file
403 whose column name is "acc". Default: 10
404
405 =item --target_acc_col <int> (Optional)
406
407 Column number (non 0-based index) of the PDG metadata file
408 whose column name is "target_acc". Default: 43
409
410 =item --complete_serotype_name (Optional)
411
412 Skip indexing serotypes when the serotype name in the column
413 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
414 an accession has a I<B<serotype=>> string as such in column
415 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
416 then, the indexing of that accession is skipped.
417 Default: False
418
419 =item --not_null_pdg_serovar (Optional)
420
421 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
422 if the B<I<serovar>> column is not C<NULL>.
423
424 =item -i <serotype name> (Optional)
425
426 Make sure the following serotype is included. Mention C<-i> multiple
427 times to include multiple serotypes.
428
429 =item -num <int> (Optional)
430
431 Number of genome accessions per SNP Cluster. Default: 1
432
433 =back
434
435 =head1 AUTHOR
436
437 Kranti Konganti
438
439 =cut