Mercurial > repos > kkonganti > cfsan_bettercallsal
comparison 0.7.0/bin/waterfall_per_snp_cluster.pl @ 17:0e7a0053e4a6
planemo upload
author | kkonganti |
---|---|
date | Mon, 15 Jul 2024 10:42:02 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
16:b90e5a7a3d4f | 17:0e7a0053e4a6 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 # Kranti Konganti | |
4 # 01/02/2024 | |
5 | |
6 use strict; | |
7 use warnings; | |
8 use Getopt::Long; | |
9 use Data::Dumper; | |
10 use Pod::Usage; | |
11 use File::Basename; | |
12 use File::Spec::Functions; | |
13 | |
14 my $tbl = {}; | |
15 my $snp_2_serovar = {}; | |
16 my $acc_2_serovar = {}; | |
17 my $acc_2_target = {}; | |
18 my $snp_count = {}; | |
19 my $snp_2_acc = {}; | |
20 my $acc_2_snp = {}; | |
21 my $multi_cluster_acc = {}; | |
22 my ( | |
23 $serovar_limit, $serovar_or_type_col, $min_asm_size, | |
24 $complete_serotype_name, $PDG_file, $table_file, | |
25 $not_null_pdg_serovar, $snp_cluster, $help, | |
26 $out_prefix, $acc_col, $seronamecol, | |
27 $target_acc_col | |
28 ); | |
29 my @custom_serovars; | |
30 | |
31 GetOptions( | |
32 'help' => \$help, | |
33 'pdg=s' => \$PDG_file, | |
34 'tbl=s' => \$table_file, | |
35 'snp=s' => \$snp_cluster, | |
36 'min_contig_size=i' => \$min_asm_size, | |
37 'complete_serotype_name' => \$complete_serotype_name, | |
38 'serocol:i' => \$serovar_or_type_col, | |
39 'seronamecol:i' => \$seronamecol, | |
40 'target_acc_col:i' => \$target_acc_col, | |
41 'acc_col:i' => \$acc_col, | |
42 'not_null_pdg_serovar' => \$not_null_pdg_serovar, | |
43 'num_serotypes_per_serotype:i' => \$serovar_limit, | |
44 'include_serovar=s' => \@custom_serovars, | |
45 'op=s' => \$out_prefix | |
46 ) or pod2usage( -verbose => 2 ); | |
47 | |
48 if ( defined $help ) { | |
49 pod2usage( -verbose => 2 ); | |
50 } | |
51 | |
52 if ( !defined $serovar_limit ) { | |
53 $serovar_limit = 1; | |
54 } | |
55 | |
56 if ( !defined $serovar_or_type_col ) { | |
57 $serovar_or_type_col = 50; | |
58 } | |
59 | |
60 if ( !defined $seronamecol ) { | |
61 $seronamecol = 34; | |
62 } | |
63 | |
64 if ( !defined $target_acc_col ) { | |
65 $target_acc_col = 43; | |
66 } | |
67 | |
68 if ( !defined $acc_col ) { | |
69 $acc_col = 10; | |
70 } | |
71 | |
72 if ( !defined $min_asm_size ) { | |
73 $min_asm_size = 0; | |
74 } | |
75 | |
76 if ( defined $out_prefix ) { | |
77 $out_prefix .= '_'; | |
78 } | |
79 else { | |
80 $out_prefix = ''; | |
81 } | |
82 | |
83 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster ); | |
84 | |
85 open( my $pdg_file, '<', $PDG_file ) | |
86 || die "\nCannot open PDG file $PDG_file: $!\n\n"; | |
87 open( my $tbl_file, '<', $table_file ) | |
88 || die "\nCannot open tbl file $table_file: $!\n\n"; | |
89 open( my $snp_cluster_file, '<', $snp_cluster ) | |
90 || die "\nCannot open $snp_cluster: $!\n\n"; | |
91 open( my $acc_fh, '>', 'acc2serovar.txt' ) | |
92 || die "\nCannot open acc2serovar.txt: $!\n\n"; | |
93 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n"; | |
94 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n"; | |
95 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' ) | |
96 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n"; | |
97 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' ) | |
98 || die "\nCannnot open " | |
99 . $out_prefix | |
100 . "mash_snp_genome_list.txt for writing: $!\n\n"; | |
101 | |
102 my $pdg_release = basename( $PDG_file, ".metadata.tsv" ); | |
103 | |
104 while ( my $line = <$pdg_file> ) { | |
105 chomp $line; | |
106 next if ( $line =~ m/^\#/ ); | |
107 | |
108 # Relevent columns (Perl index): | |
109 # 10-1 = 9: asm_acc | |
110 # 34 -1 = 33: serovar | |
111 # 50 -1 = 49: computed serotype | |
112 | |
113 my @cols = split( /\t/, $line ); | |
114 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; | |
115 my $acc = $cols[ $acc_col - 1 ]; | |
116 my $serovar = $cols[ $seronamecol - 1 ]; | |
117 my $target_acc = $cols[ $target_acc_col - 1 ]; | |
118 | |
119 $serovar_or_type =~ s/\"//g; | |
120 | |
121 my $skip = 1; | |
122 foreach my $ser (@custom_serovars) { | |
123 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ ); | |
124 } | |
125 | |
126 if ( defined $complete_serotype_name ) { | |
127 next | |
128 if ( $skip | |
129 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ ) | |
130 ); | |
131 } | |
132 | |
133 next | |
134 if ( | |
135 $skip | |
136 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ | |
137 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ ) | |
138 ); | |
139 | |
140 # next | |
141 # if ( | |
142 # ( | |
143 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ | |
144 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ | |
145 # ) | |
146 # ); | |
147 | |
148 if ( defined $not_null_pdg_serovar ) { | |
149 $acc_2_serovar->{$acc} = $serovar_or_type, | |
150 $acc_2_target->{$acc} = $target_acc, | |
151 print $acc_fh "$acc\t$serovar_or_type\n" | |
152 if ( $acc !~ m/NULL/ | |
153 && $serovar !~ m/NULL/ | |
154 && $serovar_or_type !~ m/NULL/ ); | |
155 } | |
156 else { | |
157 $acc_2_serovar->{$acc} = $serovar_or_type, | |
158 $acc_2_target->{$acc} = $target_acc, | |
159 print $acc_fh "$acc\t$serovar_or_type\n" | |
160 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ ); | |
161 } | |
162 | |
163 # $snp_count->{$serovar_or_type} = 0; | |
164 } | |
165 | |
166 # | |
167 # SNP to ACC | |
168 # | |
169 | |
170 while ( my $line = <$snp_cluster_file> ) { | |
171 chomp $line; | |
172 my @cols = split( /\t/, $line ); | |
173 | |
174 # Relevant columns | |
175 # 0: SNP Cluster ID | |
176 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank) | |
177 my $snp_clus_id = $cols[0]; | |
178 my $acc = $cols[3]; | |
179 | |
180 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ ); | |
181 next if ( !exists $acc_2_serovar->{$acc} ); | |
182 push @{ $snp_2_acc->{$snp_clus_id} }, $acc; | |
183 if ( exists $acc_2_snp->{$acc} ) { | |
184 print $Stderr | |
185 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n"; | |
186 exit 1; | |
187 } | |
188 $acc_2_snp->{$acc} = $snp_clus_id; | |
189 $snp_count->{$snp_clus_id} = 0; | |
190 } | |
191 | |
192 while ( my $line = <$tbl_file> ) { | |
193 chomp $line; | |
194 | |
195 my @cols = split( /\t/, $line ); | |
196 | |
197 # .tbl file columns (Perl index): | |
198 # | |
199 # 0: Accession | |
200 # 1: AssemblyLevel | |
201 # 2: ScaffoldN50 | |
202 # 3: ContigN50 | |
203 | |
204 my $acc = $cols[0]; | |
205 my $asm_lvl = $cols[1]; | |
206 my $scaffold_n50 = $cols[2]; | |
207 my $contig_n50 = $cols[3]; | |
208 | |
209 # my $idx0 = $acc_2_serovar->{$cols[0]}; | |
210 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } ); | |
211 | |
212 if ( not_empty($acc) && defined $idx0 ) { | |
213 my $fna_rel_loc = | |
214 "$pdg_release/ncbi_dataset/data/$acc/" | |
215 . $acc | |
216 . '_scaffolded_genomic.fna.gz'; | |
217 | |
218 if ( not_empty($scaffold_n50) ) { | |
219 next if ( $scaffold_n50 <= $min_asm_size ); | |
220 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } | |
221 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; | |
222 } | |
223 elsif ( not_empty($contig_n50) ) { | |
224 next if ( $contig_n50 <= $min_asm_size ); | |
225 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } | |
226 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; | |
227 } | |
228 } | |
229 } | |
230 | |
231 foreach my $snp_cluster_id ( keys %$snp_2_acc ) { | |
232 my $count = $snp_count->{$snp_cluster_id}; | |
233 foreach my $asm_lvl ( | |
234 sort { $a cmp $b } | |
235 keys %{ $snp_2_serovar->{$snp_cluster_id} } | |
236 ) | |
237 { | |
238 if ( $asm_lvl =~ m/Complete\s+Genome/i ) { | |
239 $count = | |
240 print_dl_metadata( $asm_lvl, | |
241 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
242 $count, $snp_cluster_id ); | |
243 } | |
244 if ( $asm_lvl =~ m/Chromosome/i ) { | |
245 $count = | |
246 print_dl_metadata( $asm_lvl, | |
247 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
248 $count, $snp_cluster_id ); | |
249 } | |
250 if ( $asm_lvl =~ m/Scaffold/i ) { | |
251 $count = | |
252 print_dl_metadata( $asm_lvl, | |
253 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
254 $count, $snp_cluster_id ); | |
255 } | |
256 if ( $asm_lvl =~ m/Contig/i ) { | |
257 $count = | |
258 print_dl_metadata( $asm_lvl, | |
259 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
260 $count, $snp_cluster_id ); | |
261 } | |
262 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count | |
263 if ( $count > 0 ); | |
264 last if ( $count >= $serovar_limit ); | |
265 } | |
266 } | |
267 | |
268 close $pdg_file; | |
269 close $tbl_file; | |
270 close $snp_cluster_file; | |
271 close $acc_fh; | |
272 close $accs_snp_fh; | |
273 | |
274 #------------------------------------------- | |
275 # Main ends | |
276 #------------------------------------------- | |
277 # Routines begin | |
278 #------------------------------------------- | |
279 | |
280 sub print_dl_metadata { | |
281 my $asm_lvl = shift; | |
282 my $acc_sizes = shift; | |
283 my $curr_count = shift; | |
284 my $snp_cluster_id = shift; | |
285 | |
286 $asm_lvl =~ s/.+?\_(.+)/$1/; | |
287 | |
288 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) { | |
289 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) { | |
290 my ( $serovar, $url ) = split( /\|/, $serovar_url ); | |
291 return $curr_count if ( exists $multi_cluster_acc->{$url} ); | |
292 $multi_cluster_acc->{$url} = 1; | |
293 $curr_count++; | |
294 my ( $final_acc, $genome_header ) = | |
295 ( split( /\//, $url ) )[ 3 .. 4 ]; | |
296 print $accs_snp_fh "$final_acc\n"; | |
297 print $genome_headers_fh catfile( 'scaffold_genomes', | |
298 $genome_header ) | |
299 . "\n"; | |
300 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n" | |
301 if ( $curr_count > 0 ); | |
302 } | |
303 last if ( $curr_count >= $serovar_limit ); | |
304 } | |
305 return $curr_count; | |
306 } | |
307 | |
308 sub sort_asm_level { | |
309 my $level = shift; | |
310 | |
311 $level =~ s/(Complete\s+Genome)/a\_$1/ | |
312 if ( $level =~ m/Complete\s+Genome/i ); | |
313 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i ); | |
314 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i ); | |
315 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i ); | |
316 | |
317 return $level; | |
318 } | |
319 | |
320 sub not_empty { | |
321 my $col = shift; | |
322 | |
323 if ( $col !~ m/^$/ ) { | |
324 return 1; | |
325 } | |
326 else { | |
327 return 0; | |
328 } | |
329 } | |
330 | |
331 __END__ | |
332 | |
333 =head1 SYNOPSIS | |
334 | |
335 This script will take in a PDG metadata file, a C<.tbl> file and generate | |
336 the final list by B<I<waterfall>> priority. | |
337 | |
338 See complete description: | |
339 | |
340 perldoc waterfall_per_snp_cluster.pl | |
341 | |
342 or | |
343 | |
344 waterfall_per_snp_cluster.pl --help | |
345 | |
346 Examples: | |
347 | |
348 waterfall_per_snp_cluster.pl | |
349 | |
350 =head1 DESCRIPTION | |
351 | |
352 We will retain up to N number of genome accessions per SNP cluster. | |
353 It prioritizes SNP Cluster participation over serotype coverage. | |
354 Which N genomes are selected depends on (in order): | |
355 | |
356 1. Genome assembly level, whose priority is | |
357 | |
358 a: Complete Genome | |
359 b: Chromosome | |
360 c: Scaffold | |
361 d: Contig | |
362 | |
363 2. If the genomes are of same assembly level, then | |
364 scaffold N50 followed by contig N50 is chosen. | |
365 | |
366 3. If the scaffold or contig N50 is same, then all | |
367 of them are included | |
368 | |
369 =head1 OPTIONS | |
370 | |
371 =over 3 | |
372 | |
373 =item -p PDGXXXXX.XXXX.metadata.tsv | |
374 | |
375 Absolute UNIX path pointing to the PDG metadata file. | |
376 Example: PDG000000002.2505.metadata.tsv | |
377 | |
378 =item -t asm.tbl | |
379 | |
380 Absolute UNIX path pointing to the file from the result | |
381 of the C<dl_pdg_data.py> script, which is the C<asm.tbl> | |
382 file. | |
383 | |
384 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv | |
385 | |
386 Absolute UNIX path pointing to the SNP Cluster metadata file. | |
387 Examples: PDG000000002.2505.reference_target.cluster_list.tsv | |
388 | |
389 =item --serocol <int> (Optional) | |
390 | |
391 Column number (non 0-based index) of the PDG metadata file | |
392 by which the serotypes are collected | |
393 (column name: "computed_types"). Default: 50 | |
394 | |
395 =item --seronamecol <int> (Optional) | |
396 | |
397 Column number (non 0-based index) of the PDG metadata file | |
398 whose column name is "serovar". Default: 34 | |
399 | |
400 =item --acc_col <int> (Optional) | |
401 | |
402 Column number (non 0-based index) of the PDG metadata file | |
403 whose column name is "acc". Default: 10 | |
404 | |
405 =item --target_acc_col <int> (Optional) | |
406 | |
407 Column number (non 0-based index) of the PDG metadata file | |
408 whose column name is "target_acc". Default: 43 | |
409 | |
410 =item --complete_serotype_name (Optional) | |
411 | |
412 Skip indexing serotypes when the serotype name in the column | |
413 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if | |
414 an accession has a I<B<serotype=>> string as such in column | |
415 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-"> | |
416 then, the indexing of that accession is skipped. | |
417 Default: False | |
418 | |
419 =item --not_null_pdg_serovar (Optional) | |
420 | |
421 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based) | |
422 if the B<I<serovar>> column is not C<NULL>. | |
423 | |
424 =item -i <serotype name> (Optional) | |
425 | |
426 Make sure the following serotype is included. Mention C<-i> multiple | |
427 times to include multiple serotypes. | |
428 | |
429 =item -num <int> (Optional) | |
430 | |
431 Number of genome accessions per SNP Cluster. Default: 1 | |
432 | |
433 =back | |
434 | |
435 =head1 AUTHOR | |
436 | |
437 Kranti Konganti | |
438 | |
439 =cut |