Mercurial > repos > galaxytrakr > hfp_bettercallsal_konda
comparison 1.0.0/bin/waterfall_per_snp_cluster.pl @ 0:0a8dda29956e draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:41:10 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0a8dda29956e |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # Kranti Konganti | |
| 4 # 01/02/2024 | |
| 5 | |
| 6 use strict; | |
| 7 use warnings; | |
| 8 use Getopt::Long; | |
| 9 use Data::Dumper; | |
| 10 use Pod::Usage; | |
| 11 use File::Basename; | |
| 12 use File::Spec::Functions; | |
| 13 | |
| 14 my $tbl = {}; | |
| 15 my $snp_2_serovar = {}; | |
| 16 my $acc_2_serovar = {}; | |
| 17 my $acc_2_target = {}; | |
| 18 my $snp_count = {}; | |
| 19 my $snp_2_acc = {}; | |
| 20 my $acc_2_snp = {}; | |
| 21 my $multi_cluster_acc = {}; | |
| 22 my ( | |
| 23 $serovar_limit, $serovar_or_type_col, $min_asm_size, | |
| 24 $complete_serotype_name, $PDG_file, $table_file, | |
| 25 $not_null_pdg_serovar, $snp_cluster, $help, | |
| 26 $out_prefix, $acc_col, $seronamecol, | |
| 27 $target_acc_col | |
| 28 ); | |
| 29 my @custom_serovars; | |
| 30 | |
| 31 GetOptions( | |
| 32 'help' => \$help, | |
| 33 'pdg=s' => \$PDG_file, | |
| 34 'tbl=s' => \$table_file, | |
| 35 'snp=s' => \$snp_cluster, | |
| 36 'min_contig_size=i' => \$min_asm_size, | |
| 37 'complete_serotype_name' => \$complete_serotype_name, | |
| 38 'serocol:i' => \$serovar_or_type_col, | |
| 39 'seronamecol:i' => \$seronamecol, | |
| 40 'target_acc_col:i' => \$target_acc_col, | |
| 41 'acc_col:i' => \$acc_col, | |
| 42 'not_null_pdg_serovar' => \$not_null_pdg_serovar, | |
| 43 'num_serotypes_per_serotype:i' => \$serovar_limit, | |
| 44 'include_serovar=s' => \@custom_serovars, | |
| 45 'op=s' => \$out_prefix | |
| 46 ) or pod2usage( -verbose => 2 ); | |
| 47 | |
| 48 if ( defined $help ) { | |
| 49 pod2usage( -verbose => 2 ); | |
| 50 } | |
| 51 | |
| 52 if ( !defined $serovar_limit ) { | |
| 53 $serovar_limit = 1; | |
| 54 } | |
| 55 | |
| 56 if ( !defined $serovar_or_type_col ) { | |
| 57 $serovar_or_type_col = 50; | |
| 58 } | |
| 59 | |
| 60 if ( !defined $seronamecol ) { | |
| 61 $seronamecol = 34; | |
| 62 } | |
| 63 | |
| 64 if ( !defined $target_acc_col ) { | |
| 65 $target_acc_col = 43; | |
| 66 } | |
| 67 | |
| 68 if ( !defined $acc_col ) { | |
| 69 $acc_col = 10; | |
| 70 } | |
| 71 | |
| 72 if ( !defined $min_asm_size ) { | |
| 73 $min_asm_size = 0; | |
| 74 } | |
| 75 | |
| 76 if ( defined $out_prefix ) { | |
| 77 $out_prefix .= '_'; | |
| 78 } | |
| 79 else { | |
| 80 $out_prefix = ''; | |
| 81 } | |
| 82 | |
| 83 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster ); | |
| 84 | |
| 85 open( my $pdg_file, '<', $PDG_file ) | |
| 86 || die "\nCannot open PDG file $PDG_file: $!\n\n"; | |
| 87 open( my $tbl_file, '<', $table_file ) | |
| 88 || die "\nCannot open tbl file $table_file: $!\n\n"; | |
| 89 open( my $snp_cluster_file, '<', $snp_cluster ) | |
| 90 || die "\nCannot open $snp_cluster: $!\n\n"; | |
| 91 open( my $acc_fh, '>', 'acc2serovar.txt' ) | |
| 92 || die "\nCannot open acc2serovar.txt: $!\n\n"; | |
| 93 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n"; | |
| 94 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n"; | |
| 95 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' ) | |
| 96 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n"; | |
| 97 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' ) | |
| 98 || die "\nCannnot open " | |
| 99 . $out_prefix | |
| 100 . "mash_snp_genome_list.txt for writing: $!\n\n"; | |
| 101 | |
| 102 my $pdg_release = basename( $PDG_file, ".metadata.tsv" ); | |
| 103 | |
| 104 while ( my $line = <$pdg_file> ) { | |
| 105 chomp $line; | |
| 106 next if ( $line =~ m/^\#/ ); | |
| 107 | |
| 108 # Relevent columns (Perl index): | |
| 109 # 10-1 = 9: asm_acc | |
| 110 # 34 -1 = 33: serovar | |
| 111 # 50 -1 = 49: computed serotype | |
| 112 | |
| 113 my @cols = split( /\t/, $line ); | |
| 114 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; | |
| 115 my $acc = $cols[ $acc_col - 1 ]; | |
| 116 my $serovar = $cols[ $seronamecol - 1 ]; | |
| 117 my $target_acc = $cols[ $target_acc_col - 1 ]; | |
| 118 | |
| 119 $serovar_or_type =~ s/\"//g; | |
| 120 | |
| 121 my $skip = 1; | |
| 122 foreach my $ser (@custom_serovars) { | |
| 123 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ ); | |
| 124 } | |
| 125 | |
| 126 if ( defined $complete_serotype_name ) { | |
| 127 next | |
| 128 if ( $skip | |
| 129 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ ) | |
| 130 ); | |
| 131 } | |
| 132 | |
| 133 next | |
| 134 if ( | |
| 135 $skip | |
| 136 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ | |
| 137 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ ) | |
| 138 ); | |
| 139 | |
| 140 # next | |
| 141 # if ( | |
| 142 # ( | |
| 143 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ | |
| 144 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ | |
| 145 # ) | |
| 146 # ); | |
| 147 | |
| 148 if ( defined $not_null_pdg_serovar ) { | |
| 149 $acc_2_serovar->{$acc} = $serovar_or_type, | |
| 150 $acc_2_target->{$acc} = $target_acc, | |
| 151 print $acc_fh "$acc\t$serovar_or_type\n" | |
| 152 if ( $acc !~ m/NULL/ | |
| 153 && $serovar !~ m/NULL/ | |
| 154 && $serovar_or_type !~ m/NULL/ ); | |
| 155 } | |
| 156 else { | |
| 157 $acc_2_serovar->{$acc} = $serovar_or_type, | |
| 158 $acc_2_target->{$acc} = $target_acc, | |
| 159 print $acc_fh "$acc\t$serovar_or_type\n" | |
| 160 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ ); | |
| 161 } | |
| 162 | |
| 163 # $snp_count->{$serovar_or_type} = 0; | |
| 164 } | |
| 165 | |
| 166 # | |
| 167 # SNP to ACC | |
| 168 # | |
| 169 | |
| 170 while ( my $line = <$snp_cluster_file> ) { | |
| 171 chomp $line; | |
| 172 my @cols = split( /\t/, $line ); | |
| 173 | |
| 174 # Relevant columns | |
| 175 # 0: SNP Cluster ID | |
| 176 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank) | |
| 177 my $snp_clus_id = $cols[0]; | |
| 178 my $acc = $cols[3]; | |
| 179 | |
| 180 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ ); | |
| 181 next if ( !exists $acc_2_serovar->{$acc} ); | |
| 182 push @{ $snp_2_acc->{$snp_clus_id} }, $acc; | |
| 183 if ( exists $acc_2_snp->{$acc} ) { | |
| 184 print $Stderr | |
| 185 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n"; | |
| 186 exit 1; | |
| 187 } | |
| 188 $acc_2_snp->{$acc} = $snp_clus_id; | |
| 189 $snp_count->{$snp_clus_id} = 0; | |
| 190 } | |
| 191 | |
| 192 while ( my $line = <$tbl_file> ) { | |
| 193 chomp $line; | |
| 194 | |
| 195 my @cols = split( /\t/, $line ); | |
| 196 | |
| 197 # .tbl file columns (Perl index): | |
| 198 # | |
| 199 # 0: Accession | |
| 200 # 1: AssemblyLevel | |
| 201 # 2: ScaffoldN50 | |
| 202 # 3: ContigN50 | |
| 203 | |
| 204 my $acc = $cols[0]; | |
| 205 my $asm_lvl = $cols[1]; | |
| 206 my $scaffold_n50 = $cols[2]; | |
| 207 my $contig_n50 = $cols[3]; | |
| 208 | |
| 209 # my $idx0 = $acc_2_serovar->{$cols[0]}; | |
| 210 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } ); | |
| 211 | |
| 212 if ( not_empty($acc) && defined $idx0 ) { | |
| 213 my $fna_rel_loc = | |
| 214 "$pdg_release/ncbi_dataset/data/$acc/" | |
| 215 . $acc | |
| 216 . '_scaffolded_genomic.fna.gz'; | |
| 217 | |
| 218 if ( not_empty($scaffold_n50) ) { | |
| 219 next if ( $scaffold_n50 <= $min_asm_size ); | |
| 220 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } | |
| 221 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; | |
| 222 } | |
| 223 elsif ( not_empty($contig_n50) ) { | |
| 224 next if ( $contig_n50 <= $min_asm_size ); | |
| 225 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } | |
| 226 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; | |
| 227 } | |
| 228 } | |
| 229 } | |
| 230 | |
| 231 foreach my $snp_cluster_id ( keys %$snp_2_acc ) { | |
| 232 my $count = $snp_count->{$snp_cluster_id}; | |
| 233 foreach my $asm_lvl ( | |
| 234 sort { $a cmp $b } | |
| 235 keys %{ $snp_2_serovar->{$snp_cluster_id} } | |
| 236 ) | |
| 237 { | |
| 238 if ( $asm_lvl =~ m/Complete\s+Genome/i ) { | |
| 239 $count = | |
| 240 print_dl_metadata( $asm_lvl, | |
| 241 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
| 242 $count, $snp_cluster_id ); | |
| 243 } | |
| 244 if ( $asm_lvl =~ m/Chromosome/i ) { | |
| 245 $count = | |
| 246 print_dl_metadata( $asm_lvl, | |
| 247 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
| 248 $count, $snp_cluster_id ); | |
| 249 } | |
| 250 if ( $asm_lvl =~ m/Scaffold/i ) { | |
| 251 $count = | |
| 252 print_dl_metadata( $asm_lvl, | |
| 253 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
| 254 $count, $snp_cluster_id ); | |
| 255 } | |
| 256 if ( $asm_lvl =~ m/Contig/i ) { | |
| 257 $count = | |
| 258 print_dl_metadata( $asm_lvl, | |
| 259 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, | |
| 260 $count, $snp_cluster_id ); | |
| 261 } | |
| 262 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count | |
| 263 if ( $count > 0 ); | |
| 264 last if ( $count >= $serovar_limit ); | |
| 265 } | |
| 266 } | |
| 267 | |
| 268 close $pdg_file; | |
| 269 close $tbl_file; | |
| 270 close $snp_cluster_file; | |
| 271 close $acc_fh; | |
| 272 close $accs_snp_fh; | |
| 273 | |
| 274 #------------------------------------------- | |
| 275 # Main ends | |
| 276 #------------------------------------------- | |
| 277 # Routines begin | |
| 278 #------------------------------------------- | |
| 279 | |
| 280 sub print_dl_metadata { | |
| 281 my $asm_lvl = shift; | |
| 282 my $acc_sizes = shift; | |
| 283 my $curr_count = shift; | |
| 284 my $snp_cluster_id = shift; | |
| 285 | |
| 286 $asm_lvl =~ s/.+?\_(.+)/$1/; | |
| 287 | |
| 288 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) { | |
| 289 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) { | |
| 290 my ( $serovar, $url ) = split( /\|/, $serovar_url ); | |
| 291 return $curr_count if ( exists $multi_cluster_acc->{$url} ); | |
| 292 $multi_cluster_acc->{$url} = 1; | |
| 293 $curr_count++; | |
| 294 my ( $final_acc, $genome_header ) = | |
| 295 ( split( /\//, $url ) )[ 3 .. 4 ]; | |
| 296 print $accs_snp_fh "$final_acc\n"; | |
| 297 print $genome_headers_fh catfile( 'scaffold_genomes', | |
| 298 $genome_header ) | |
| 299 . "\n"; | |
| 300 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n" | |
| 301 if ( $curr_count > 0 ); | |
| 302 } | |
| 303 last if ( $curr_count >= $serovar_limit ); | |
| 304 } | |
| 305 return $curr_count; | |
| 306 } | |
| 307 | |
| 308 sub sort_asm_level { | |
| 309 my $level = shift; | |
| 310 | |
| 311 $level =~ s/(Complete\s+Genome)/a\_$1/ | |
| 312 if ( $level =~ m/Complete\s+Genome/i ); | |
| 313 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i ); | |
| 314 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i ); | |
| 315 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i ); | |
| 316 | |
| 317 return $level; | |
| 318 } | |
| 319 | |
| 320 sub not_empty { | |
| 321 my $col = shift; | |
| 322 | |
| 323 if ( $col !~ m/^$/ ) { | |
| 324 return 1; | |
| 325 } | |
| 326 else { | |
| 327 return 0; | |
| 328 } | |
| 329 } | |
| 330 | |
| 331 __END__ | |
| 332 | |
| 333 =head1 SYNOPSIS | |
| 334 | |
| 335 This script will take in a PDG metadata file, a C<.tbl> file and generate | |
| 336 the final list by B<I<waterfall>> priority. | |
| 337 | |
| 338 See complete description: | |
| 339 | |
| 340 perldoc waterfall_per_snp_cluster.pl | |
| 341 | |
| 342 or | |
| 343 | |
| 344 waterfall_per_snp_cluster.pl --help | |
| 345 | |
| 346 Examples: | |
| 347 | |
| 348 waterfall_per_snp_cluster.pl | |
| 349 | |
| 350 =head1 DESCRIPTION | |
| 351 | |
| 352 We will retain up to N number of genome accessions per SNP cluster. | |
| 353 It prioritizes SNP Cluster participation over serotype coverage. | |
| 354 Which N genomes are selected depends on (in order): | |
| 355 | |
| 356 1. Genome assembly level, whose priority is | |
| 357 | |
| 358 a: Complete Genome | |
| 359 b: Chromosome | |
| 360 c: Scaffold | |
| 361 d: Contig | |
| 362 | |
| 363 2. If the genomes are of same assembly level, then | |
| 364 scaffold N50 followed by contig N50 is chosen. | |
| 365 | |
| 366 3. If the scaffold or contig N50 is same, then all | |
| 367 of them are included | |
| 368 | |
| 369 =head1 OPTIONS | |
| 370 | |
| 371 =over 3 | |
| 372 | |
| 373 =item -p PDGXXXXX.XXXX.metadata.tsv | |
| 374 | |
| 375 Absolute UNIX path pointing to the PDG metadata file. | |
| 376 Example: PDG000000002.2505.metadata.tsv | |
| 377 | |
| 378 =item -t asm.tbl | |
| 379 | |
| 380 Absolute UNIX path pointing to the file from the result | |
| 381 of the C<dl_pdg_data.py> script, which is the C<asm.tbl> | |
| 382 file. | |
| 383 | |
| 384 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv | |
| 385 | |
| 386 Absolute UNIX path pointing to the SNP Cluster metadata file. | |
| 387 Examples: PDG000000002.2505.reference_target.cluster_list.tsv | |
| 388 | |
| 389 =item --serocol <int> (Optional) | |
| 390 | |
| 391 Column number (non 0-based index) of the PDG metadata file | |
| 392 by which the serotypes are collected | |
| 393 (column name: "computed_types"). Default: 50 | |
| 394 | |
| 395 =item --seronamecol <int> (Optional) | |
| 396 | |
| 397 Column number (non 0-based index) of the PDG metadata file | |
| 398 whose column name is "serovar". Default: 34 | |
| 399 | |
| 400 =item --acc_col <int> (Optional) | |
| 401 | |
| 402 Column number (non 0-based index) of the PDG metadata file | |
| 403 whose column name is "acc". Default: 10 | |
| 404 | |
| 405 =item --target_acc_col <int> (Optional) | |
| 406 | |
| 407 Column number (non 0-based index) of the PDG metadata file | |
| 408 whose column name is "target_acc". Default: 43 | |
| 409 | |
| 410 =item --complete_serotype_name (Optional) | |
| 411 | |
| 412 Skip indexing serotypes when the serotype name in the column | |
| 413 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if | |
| 414 an accession has a I<B<serotype=>> string as such in column | |
| 415 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-"> | |
| 416 then, the indexing of that accession is skipped. | |
| 417 Default: False | |
| 418 | |
| 419 =item --not_null_pdg_serovar (Optional) | |
| 420 | |
| 421 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based) | |
| 422 if the B<I<serovar>> column is not C<NULL>. | |
| 423 | |
| 424 =item -i <serotype name> (Optional) | |
| 425 | |
| 426 Make sure the following serotype is included. Mention C<-i> multiple | |
| 427 times to include multiple serotypes. | |
| 428 | |
| 429 =item -num <int> (Optional) | |
| 430 | |
| 431 Number of genome accessions per SNP Cluster. Default: 1 | |
| 432 | |
| 433 =back | |
| 434 | |
| 435 =head1 AUTHOR | |
| 436 | |
| 437 Kranti Konganti | |
| 438 | |
| 439 =cut |
