Mercurial > repos > kkonganti > cfsan_bettercallsal
comparison 0.5.0/bin/waterfall_per_snp_cluster.pl @ 1:365849f031fd
"planemo upload"
author | kkonganti |
---|---|
date | Mon, 05 Jun 2023 18:48:51 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:a4b1ee4b68b1 | 1:365849f031fd |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 # Kranti Konganti | |
4 # 10/12/2022 | |
5 | |
6 use strict; | |
7 use warnings; | |
8 use Getopt::Long; | |
9 use Data::Dumper; | |
10 use Pod::Usage; | |
11 use File::Basename; | |
12 use File::Spec::Functions; | |
13 | |
14 my $tbl = {}; | |
15 my $snp_2_serovar = {}; | |
16 my $acc_2_serovar = {}; | |
17 my $acc_2_target = {}; | |
18 my $snp_count = {}; | |
19 my $snp_2_acc = {}; | |
20 my $acc_2_snp = {}; | |
21 my $multi_cluster_acc = {}; | |
22 my ( | |
23 $serovar_limit, $serovar_or_type_col, $min_asm_size, | |
24 $complete_serotype_name, $PDG_file, $table_file, | |
25 $not_null_pdg_serovar, $snp_cluster, $help, | |
26 $out_prefix | |
27 ); | |
28 my @custom_serovars; | |
29 | |
30 GetOptions( | |
31 'help' => \$help, | |
32 'pdg=s' => \$PDG_file, | |
33 'tbl=s' => \$table_file, | |
34 'snp=s' => \$snp_cluster, | |
35 'min_contig_size=i' => \$min_asm_size, | |
36 'complete_serotype_name' => \$complete_serotype_name, | |
37 'serocol:i' => \$serovar_or_type_col, | |
38 'not_null_pdg_serovar' => \$not_null_pdg_serovar, | |
39 'num_serotypes_per_serotype:i' => \$serovar_limit, | |
40 'include_serovar=s' => \@custom_serovars, | |
41 'op=s' => \$out_prefix | |
42 ) or pod2usage( -verbose => 2 ); | |
43 | |
44 if ( defined $help ) { | |
45 pod2usage( -verbose => 2 ); | |
46 } | |
47 | |
48 if ( !defined $serovar_limit ) { | |
49 $serovar_limit = 1; | |
50 } | |
51 | |
52 if ( !defined $serovar_or_type_col ) { | |
53 $serovar_or_type_col = 49; | |
54 } | |
55 | |
56 if ( !defined $min_asm_size ) { | |
57 $min_asm_size = 0; | |
58 } | |
59 | |
60 if ( defined $out_prefix ) { | |
61 $out_prefix .= '_'; | |
62 } | |
63 else { | |
64 $out_prefix = ''; | |
65 } | |
66 | |
67 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster ); | |
68 | |
69 open( my $pdg_file, '<', $PDG_file ) | |
70 || die "\nCannot open PDG file $PDG_file: $!\n\n"; | |
71 open( my $tbl_file, '<', $table_file ) | |
72 || die "\nCannot open tbl file $table_file: $!\n\n"; | |
73 open( my $snp_cluster_file, '<', $snp_cluster ) | |
74 || die "\nCannot open $snp_cluster: $!\n\n"; | |
75 open( my $acc_fh, '>', 'acc2serovar.txt' ) | |
76 || die "\nCannot open acc2serovar.txt: $!\n\n"; | |
77 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n"; | |
78 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n"; | |
79 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' ) | |
80 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n"; | |
81 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' ) | |
82 || die "\nCannnot open " | |
83 . $out_prefix | |
84 . "mash_snp_genome_list.txt for writing: $!\n\n"; | |
85 | |
86 my $pdg_release = basename( $PDG_file, ".metadata.tsv" ); | |
87 | |
88 while ( my $line = <$pdg_file> ) { | |
89 chomp $line; | |
90 next if ( $line =~ m/^\#/ ); | |
91 | |
92 # Relevent columns (Perl index): | |
93 # 9: asm_acc | |
94 # 33: serovar | |
95 # 48: computed serotype | |
96 | |
97 my @cols = split( /\t/, $line ); | |
98 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; | |
99 my $acc = $cols[9]; | |
100 my $serovar = $cols[33]; | |
101 my $target_acc = $cols[41]; | |
102 | |
103 $serovar_or_type =~ s/\"//g; | |
104 | |
105 my $skip = 1; | |
106 foreach my $ser (@custom_serovars) { | |
107 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ ); | |
108 } | |
109 | |
110 if ( defined $complete_serotype_name ) { | |
111 next | |
112 if ( $skip | |
113 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ ) | |
114 ); | |
115 } | |
116 | |
117 next | |
118 if ( | |
119 $skip | |
120 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ | |
121 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ ) | |
122 ); | |
123 | |
124 # next | |
125 # if ( | |
126 # ( | |
127 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ | |
128 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ | |
129 # ) | |
130 # ); | |
131 | |
132 if ( defined $not_null_pdg_serovar ) { | |
133 $acc_2_serovar->{$acc} = $serovar_or_type, | |
134 $acc_2_target->{$acc} = $target_acc, | |
135 print $acc_fh "$acc\t$serovar_or_type\n" | |
136 if ( $acc !~ m/NULL/ | |
137 && $serovar !~ m/NULL/ | |
138 && $serovar_or_type !~ m/NULL/ ); | |
139 } | |
140 else { | |
141 $acc_2_serovar->{$acc} = $serovar_or_type, | |
142 $acc_2_target->{$acc} = $target_acc, | |
143 print $acc_fh "$acc\t$serovar_or_type\n" | |
144 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ ); | |
145 } | |
146 | |
147 # $snp_count->{$serovar_or_type} = 0; | |
148 } | |
149 | |
150 # | |
151 # SNP to ACC | |
152 # | |
153 | |
154 while ( my $line = <$snp_cluster_file> ) { | |
155 chomp $line; | |
156 my @cols = split( /\t/, $line ); | |
157 | |
158 # Relevant columns | |
159 # 0: SNP Cluster ID | |
160 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank) | |
161 my $snp_clus_id = $cols[0]; | |
162 my $acc = $cols[3]; | |
163 | |
164 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ ); | |
165 next if ( !exists $acc_2_serovar->{$acc} ); | |
166 push @{ $snp_2_acc->{$snp_clus_id} }, $acc; | |
167 if ( exists $acc_2_snp->{$acc} ) { | |
168 print $Stderr | |
169 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n"; | |
170 exit 1; | |
171 } | |
172 $acc_2_snp->{$acc} = $snp_clus_id; | |
173 $snp_count->{$snp_clus_id} = 0; | |
174 } | |
175 | |
176 while ( my $line = <$tbl_file> ) { | |
177 chomp $line; | |
178 | |
179 my @cols = split( /\t/, $line ); | |
180 | |
181 # .tbl file columns (Perl index): | |
182 # | |
183 # 0: Accession | |
184 # 1: AssemblyLevel | |
185 # 2: ScaffoldN50 | |
186 # 3: ContigN50 | |
187 | |
188 my $acc = $cols[0]; | |
189 my $asm_lvl = $cols[1]; | |
190 my $scaffold_n50 = $cols[2]; | |
191 my $contig_n50 = $cols[3]; | |
192 | |
193 # my $idx0 = $acc_2_serovar->{$cols[0]}; | |
194 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } ); | |
195 | |
196 if ( not_empty($acc) && defined $idx0 ) { | |
197 my $fna_rel_loc = | |
198 "$pdg_release/ncbi_dataset/data/$acc/" | |
199 . $acc | |
200 . '_scaffolded_genomic.fna.gz'; | |
201 | |
202 if ( not_empty($scaffold_n50) ) { | |
203 next if ( $scaffold_n50 <= $min_asm_size ); | |
204 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } | |
205 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; | |
206 } | |
207 elsif ( not_empty($contig_n50) ) { | |
208 next if ( $contig_n50 <= $min_asm_size ); | |
209 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } | |
210 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; | |
211 } | |
212 } | |
213 } | |
214 | |
215 foreach my $snp_cluster_id ( keys %$snp_2_acc ) { | |
216 my $count = $snp_count->{$snp_cluster_id}; | |
217 foreach my $asm_lvl ( | |
218 sort { $a cmp $b } | |
219 keys %{ $snp_2_serovar->{$snp_cluster_id} } | |
220 ) | |
221 { | |
222 if ( $asm_lvl =~ m/Complete\s+Genome/i ) { | |
223 $count = | |
224 print_dl_metadata( $asm_lvl, | |
225 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
226 $count, $snp_cluster_id ); | |
227 } | |
228 if ( $asm_lvl =~ m/Chromosome/i ) { | |
229 $count = | |
230 print_dl_metadata( $asm_lvl, | |
231 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
232 $count, $snp_cluster_id ); | |
233 } | |
234 if ( $asm_lvl =~ m/Scaffold/i ) { | |
235 $count = | |
236 print_dl_metadata( $asm_lvl, | |
237 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
238 $count, $snp_cluster_id ); | |
239 } | |
240 if ( $asm_lvl =~ m/Contig/i ) { | |
241 $count = | |
242 print_dl_metadata( $asm_lvl, | |
243 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
244 $count, $snp_cluster_id ); | |
245 } | |
246 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count | |
247 if ( $count > 0 ); | |
248 last if ( $count == $serovar_limit ); | |
249 } | |
250 } | |
251 | |
252 close $pdg_file; | |
253 close $tbl_file; | |
254 close $snp_cluster_file; | |
255 close $acc_fh; | |
256 close $accs_snp_fh; | |
257 | |
258 #------------------------------------------- | |
259 # Main ends | |
260 #------------------------------------------- | |
261 # Routines begin | |
262 #------------------------------------------- | |
263 | |
264 sub print_dl_metadata { | |
265 my $asm_lvl = shift; | |
266 my $acc_sizes = shift; | |
267 my $curr_count = shift; | |
268 my $snp_cluster_id = shift; | |
269 | |
270 $asm_lvl =~ s/.+?\_(.+)/$1/; | |
271 | |
272 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) { | |
273 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) { | |
274 my ( $serovar, $url ) = split( /\|/, $serovar_url ); | |
275 return $curr_count if ( exists $multi_cluster_acc->{$url} ); | |
276 $multi_cluster_acc->{$url} = 1; | |
277 $curr_count++; | |
278 my ( $final_acc, $genome_header ) = | |
279 ( split( /\//, $url ) )[ 3 .. 4 ]; | |
280 print $accs_snp_fh "$final_acc\n"; | |
281 print $genome_headers_fh catfile( 'scaffold_genomes', | |
282 $genome_header ) | |
283 . "\n"; | |
284 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n" | |
285 if ( $curr_count > 0 ); | |
286 last if ( $curr_count == $serovar_limit ); | |
287 } | |
288 last if ( $curr_count == $serovar_limit ); | |
289 } | |
290 return $curr_count; | |
291 } | |
292 | |
293 sub sort_asm_level { | |
294 my $level = shift; | |
295 | |
296 $level =~ s/(Complete\s+Genome)/a\_$1/ | |
297 if ( $level =~ m/Complete\s+Genome/i ); | |
298 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i ); | |
299 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i ); | |
300 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i ); | |
301 | |
302 return $level; | |
303 } | |
304 | |
305 sub not_empty { | |
306 my $col = shift; | |
307 | |
308 if ( $col !~ m/^$/ ) { | |
309 return 1; | |
310 } | |
311 else { | |
312 return 0; | |
313 } | |
314 } | |
315 | |
316 __END__ | |
317 | |
318 =head1 SYNOPSIS | |
319 | |
320 This script will take in a PDG metadata file, a C<.tbl> file and generate | |
321 the final list by B<I<waterfall>> priority. | |
322 | |
323 See complete description: | |
324 | |
325 perldoc waterfall_per_snp_cluster.pl | |
326 | |
327 or | |
328 | |
329 waterfall_per_snp_cluster.pl --help | |
330 | |
331 Examples: | |
332 | |
333 waterfall_per_snp_cluster.pl | |
334 | |
335 =head1 DESCRIPTION | |
336 | |
337 We will retain up to N number of genome accessions per SNP cluster. | |
338 It prioritizes SNP Cluster participation over serotype coverage. | |
339 Which N genomes are selected depends on (in order): | |
340 | |
341 1. Genome assembly level, whose priority is | |
342 | |
343 a: Complete Genome | |
344 b: Chromosome | |
345 c: Scaffold | |
346 d: Contig | |
347 | |
348 2. If the genomes are of same assembly level, then | |
349 scaffold N50 followed by contig N50 is chosen. | |
350 | |
351 3. If the scaffold or contig N50 is same, then all | |
352 of them are included | |
353 | |
354 =head1 OPTIONS | |
355 | |
356 =over 3 | |
357 | |
358 =item -p PDGXXXXX.XXXX.metadata.tsv | |
359 | |
360 Absolute UNIX path pointing to the PDG metadata file. | |
361 Example: PDG000000002.2505.metadata.tsv | |
362 | |
363 =item -t asm.tbl | |
364 | |
365 Absolute UNIX path pointing to the file from the result | |
366 of the C<dl_pdg_data.py> script, which is the C<asm.tbl> | |
367 file. | |
368 | |
369 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv | |
370 | |
371 Absolute UNIX path pointing to the SNP Cluster metadata file. | |
372 Examples: PDG000000002.2505.reference_target.cluster_list.tsv | |
373 | |
374 | |
375 =item --serocol <int> (Optional) | |
376 | |
377 Column number (non 0-based index) of the PDG metadata file | |
378 by which the serotypes are collected. Default: 49 | |
379 | |
380 =item --complete_serotype_name (Optional) | |
381 | |
382 Skip indexing serotypes when the serotype name in the column | |
383 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if | |
384 an accession has a I<B<serotype=>> string as such in column | |
385 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-"> | |
386 then, the indexing of that accession is skipped. | |
387 Default: False | |
388 | |
389 =item --not_null_pdg_serovar (Optional) | |
390 | |
391 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based) | |
392 if the B<I<serovar>> column is not C<NULL>. | |
393 | |
394 =item -i <serotype name> (Optional) | |
395 | |
396 Make sure the following serotype is included. Mention C<-i> multiple | |
397 times to include multiple serotypes. | |
398 | |
399 =item -num <int> (Optional) | |
400 | |
401 Number of genome accessions per SNP Cluster. Default: 1 | |
402 | |
403 =back | |
404 | |
405 =head1 AUTHOR | |
406 | |
407 Kranti Konganti | |
408 | |
409 =cut |