comparison 0.5.0/bin/waterfall_per_snp_cluster.pl @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
comparison
equal deleted inserted replaced
0:a4b1ee4b68b1 1:365849f031fd
1 #!/usr/bin/env perl
2
3 # Kranti Konganti
4 # 10/12/2022
5
6 use strict;
7 use warnings;
8 use Getopt::Long;
9 use Data::Dumper;
10 use Pod::Usage;
11 use File::Basename;
12 use File::Spec::Functions;
13
14 my $tbl = {};
15 my $snp_2_serovar = {};
16 my $acc_2_serovar = {};
17 my $acc_2_target = {};
18 my $snp_count = {};
19 my $snp_2_acc = {};
20 my $acc_2_snp = {};
21 my $multi_cluster_acc = {};
22 my (
23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
24 $complete_serotype_name, $PDG_file, $table_file,
25 $not_null_pdg_serovar, $snp_cluster, $help,
26 $out_prefix
27 );
28 my @custom_serovars;
29
30 GetOptions(
31 'help' => \$help,
32 'pdg=s' => \$PDG_file,
33 'tbl=s' => \$table_file,
34 'snp=s' => \$snp_cluster,
35 'min_contig_size=i' => \$min_asm_size,
36 'complete_serotype_name' => \$complete_serotype_name,
37 'serocol:i' => \$serovar_or_type_col,
38 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
39 'num_serotypes_per_serotype:i' => \$serovar_limit,
40 'include_serovar=s' => \@custom_serovars,
41 'op=s' => \$out_prefix
42 ) or pod2usage( -verbose => 2 );
43
44 if ( defined $help ) {
45 pod2usage( -verbose => 2 );
46 }
47
48 if ( !defined $serovar_limit ) {
49 $serovar_limit = 1;
50 }
51
52 if ( !defined $serovar_or_type_col ) {
53 $serovar_or_type_col = 49;
54 }
55
56 if ( !defined $min_asm_size ) {
57 $min_asm_size = 0;
58 }
59
60 if ( defined $out_prefix ) {
61 $out_prefix .= '_';
62 }
63 else {
64 $out_prefix = '';
65 }
66
67 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
68
69 open( my $pdg_file, '<', $PDG_file )
70 || die "\nCannot open PDG file $PDG_file: $!\n\n";
71 open( my $tbl_file, '<', $table_file )
72 || die "\nCannot open tbl file $table_file: $!\n\n";
73 open( my $snp_cluster_file, '<', $snp_cluster )
74 || die "\nCannot open $snp_cluster: $!\n\n";
75 open( my $acc_fh, '>', 'acc2serovar.txt' )
76 || die "\nCannot open acc2serovar.txt: $!\n\n";
77 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
78 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
79 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
80 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
81 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
82 || die "\nCannnot open "
83 . $out_prefix
84 . "mash_snp_genome_list.txt for writing: $!\n\n";
85
86 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
87
88 while ( my $line = <$pdg_file> ) {
89 chomp $line;
90 next if ( $line =~ m/^\#/ );
91
92 # Relevent columns (Perl index):
93 # 9: asm_acc
94 # 33: serovar
95 # 48: computed serotype
96
97 my @cols = split( /\t/, $line );
98 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
99 my $acc = $cols[9];
100 my $serovar = $cols[33];
101 my $target_acc = $cols[41];
102
103 $serovar_or_type =~ s/\"//g;
104
105 my $skip = 1;
106 foreach my $ser (@custom_serovars) {
107 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
108 }
109
110 if ( defined $complete_serotype_name ) {
111 next
112 if ( $skip
113 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
114 );
115 }
116
117 next
118 if (
119 $skip
120 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
121 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
122 );
123
124 # next
125 # if (
126 # (
127 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
128 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
129 # )
130 # );
131
132 if ( defined $not_null_pdg_serovar ) {
133 $acc_2_serovar->{$acc} = $serovar_or_type,
134 $acc_2_target->{$acc} = $target_acc,
135 print $acc_fh "$acc\t$serovar_or_type\n"
136 if ( $acc !~ m/NULL/
137 && $serovar !~ m/NULL/
138 && $serovar_or_type !~ m/NULL/ );
139 }
140 else {
141 $acc_2_serovar->{$acc} = $serovar_or_type,
142 $acc_2_target->{$acc} = $target_acc,
143 print $acc_fh "$acc\t$serovar_or_type\n"
144 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
145 }
146
147 # $snp_count->{$serovar_or_type} = 0;
148 }
149
150 #
151 # SNP to ACC
152 #
153
154 while ( my $line = <$snp_cluster_file> ) {
155 chomp $line;
156 my @cols = split( /\t/, $line );
157
158 # Relevant columns
159 # 0: SNP Cluster ID
160 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
161 my $snp_clus_id = $cols[0];
162 my $acc = $cols[3];
163
164 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
165 next if ( !exists $acc_2_serovar->{$acc} );
166 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
167 if ( exists $acc_2_snp->{$acc} ) {
168 print $Stderr
169 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
170 exit 1;
171 }
172 $acc_2_snp->{$acc} = $snp_clus_id;
173 $snp_count->{$snp_clus_id} = 0;
174 }
175
176 while ( my $line = <$tbl_file> ) {
177 chomp $line;
178
179 my @cols = split( /\t/, $line );
180
181 # .tbl file columns (Perl index):
182 #
183 # 0: Accession
184 # 1: AssemblyLevel
185 # 2: ScaffoldN50
186 # 3: ContigN50
187
188 my $acc = $cols[0];
189 my $asm_lvl = $cols[1];
190 my $scaffold_n50 = $cols[2];
191 my $contig_n50 = $cols[3];
192
193 # my $idx0 = $acc_2_serovar->{$cols[0]};
194 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
195
196 if ( not_empty($acc) && defined $idx0 ) {
197 my $fna_rel_loc =
198 "$pdg_release/ncbi_dataset/data/$acc/"
199 . $acc
200 . '_scaffolded_genomic.fna.gz';
201
202 if ( not_empty($scaffold_n50) ) {
203 next if ( $scaffold_n50 <= $min_asm_size );
204 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
205 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
206 }
207 elsif ( not_empty($contig_n50) ) {
208 next if ( $contig_n50 <= $min_asm_size );
209 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
210 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
211 }
212 }
213 }
214
215 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
216 my $count = $snp_count->{$snp_cluster_id};
217 foreach my $asm_lvl (
218 sort { $a cmp $b }
219 keys %{ $snp_2_serovar->{$snp_cluster_id} }
220 )
221 {
222 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
223 $count =
224 print_dl_metadata( $asm_lvl,
225 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
226 $count, $snp_cluster_id );
227 }
228 if ( $asm_lvl =~ m/Chromosome/i ) {
229 $count =
230 print_dl_metadata( $asm_lvl,
231 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
232 $count, $snp_cluster_id );
233 }
234 if ( $asm_lvl =~ m/Scaffold/i ) {
235 $count =
236 print_dl_metadata( $asm_lvl,
237 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
238 $count, $snp_cluster_id );
239 }
240 if ( $asm_lvl =~ m/Contig/i ) {
241 $count =
242 print_dl_metadata( $asm_lvl,
243 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
244 $count, $snp_cluster_id );
245 }
246 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
247 if ( $count > 0 );
248 last if ( $count == $serovar_limit );
249 }
250 }
251
252 close $pdg_file;
253 close $tbl_file;
254 close $snp_cluster_file;
255 close $acc_fh;
256 close $accs_snp_fh;
257
258 #-------------------------------------------
259 # Main ends
260 #-------------------------------------------
261 # Routines begin
262 #-------------------------------------------
263
264 sub print_dl_metadata {
265 my $asm_lvl = shift;
266 my $acc_sizes = shift;
267 my $curr_count = shift;
268 my $snp_cluster_id = shift;
269
270 $asm_lvl =~ s/.+?\_(.+)/$1/;
271
272 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
273 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
274 my ( $serovar, $url ) = split( /\|/, $serovar_url );
275 return $curr_count if ( exists $multi_cluster_acc->{$url} );
276 $multi_cluster_acc->{$url} = 1;
277 $curr_count++;
278 my ( $final_acc, $genome_header ) =
279 ( split( /\//, $url ) )[ 3 .. 4 ];
280 print $accs_snp_fh "$final_acc\n";
281 print $genome_headers_fh catfile( 'scaffold_genomes',
282 $genome_header )
283 . "\n";
284 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
285 if ( $curr_count > 0 );
286 last if ( $curr_count == $serovar_limit );
287 }
288 last if ( $curr_count == $serovar_limit );
289 }
290 return $curr_count;
291 }
292
293 sub sort_asm_level {
294 my $level = shift;
295
296 $level =~ s/(Complete\s+Genome)/a\_$1/
297 if ( $level =~ m/Complete\s+Genome/i );
298 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
299 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
300 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
301
302 return $level;
303 }
304
305 sub not_empty {
306 my $col = shift;
307
308 if ( $col !~ m/^$/ ) {
309 return 1;
310 }
311 else {
312 return 0;
313 }
314 }
315
316 __END__
317
318 =head1 SYNOPSIS
319
320 This script will take in a PDG metadata file, a C<.tbl> file and generate
321 the final list by B<I<waterfall>> priority.
322
323 See complete description:
324
325 perldoc waterfall_per_snp_cluster.pl
326
327 or
328
329 waterfall_per_snp_cluster.pl --help
330
331 Examples:
332
333 waterfall_per_snp_cluster.pl
334
335 =head1 DESCRIPTION
336
337 We will retain up to N number of genome accessions per SNP cluster.
338 It prioritizes SNP Cluster participation over serotype coverage.
339 Which N genomes are selected depends on (in order):
340
341 1. Genome assembly level, whose priority is
342
343 a: Complete Genome
344 b: Chromosome
345 c: Scaffold
346 d: Contig
347
348 2. If the genomes are of same assembly level, then
349 scaffold N50 followed by contig N50 is chosen.
350
351 3. If the scaffold or contig N50 is same, then all
352 of them are included
353
354 =head1 OPTIONS
355
356 =over 3
357
358 =item -p PDGXXXXX.XXXX.metadata.tsv
359
360 Absolute UNIX path pointing to the PDG metadata file.
361 Example: PDG000000002.2505.metadata.tsv
362
363 =item -t asm.tbl
364
365 Absolute UNIX path pointing to the file from the result
366 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
367 file.
368
369 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
370
371 Absolute UNIX path pointing to the SNP Cluster metadata file.
372 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
373
374
375 =item --serocol <int> (Optional)
376
377 Column number (non 0-based index) of the PDG metadata file
378 by which the serotypes are collected. Default: 49
379
380 =item --complete_serotype_name (Optional)
381
382 Skip indexing serotypes when the serotype name in the column
383 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
384 an accession has a I<B<serotype=>> string as such in column
385 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
386 then, the indexing of that accession is skipped.
387 Default: False
388
389 =item --not_null_pdg_serovar (Optional)
390
391 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
392 if the B<I<serovar>> column is not C<NULL>.
393
394 =item -i <serotype name> (Optional)
395
396 Make sure the following serotype is included. Mention C<-i> multiple
397 times to include multiple serotypes.
398
399 =item -num <int> (Optional)
400
401 Number of genome accessions per SNP Cluster. Default: 1
402
403 =back
404
405 =head1 AUTHOR
406
407 Kranti Konganti
408
409 =cut