annotate 0.7.0/bin/waterfall_per_snp_cluster.pl @ 17:0e7a0053e4a6

planemo upload
author kkonganti
date Mon, 15 Jul 2024 10:42:02 -0400
parents
children
rev   line source
kkonganti@17 1 #!/usr/bin/env perl
kkonganti@17 2
kkonganti@17 3 # Kranti Konganti
kkonganti@17 4 # 01/02/2024
kkonganti@17 5
kkonganti@17 6 use strict;
kkonganti@17 7 use warnings;
kkonganti@17 8 use Getopt::Long;
kkonganti@17 9 use Data::Dumper;
kkonganti@17 10 use Pod::Usage;
kkonganti@17 11 use File::Basename;
kkonganti@17 12 use File::Spec::Functions;
kkonganti@17 13
kkonganti@17 14 my $tbl = {};
kkonganti@17 15 my $snp_2_serovar = {};
kkonganti@17 16 my $acc_2_serovar = {};
kkonganti@17 17 my $acc_2_target = {};
kkonganti@17 18 my $snp_count = {};
kkonganti@17 19 my $snp_2_acc = {};
kkonganti@17 20 my $acc_2_snp = {};
kkonganti@17 21 my $multi_cluster_acc = {};
kkonganti@17 22 my (
kkonganti@17 23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
kkonganti@17 24 $complete_serotype_name, $PDG_file, $table_file,
kkonganti@17 25 $not_null_pdg_serovar, $snp_cluster, $help,
kkonganti@17 26 $out_prefix, $acc_col, $seronamecol,
kkonganti@17 27 $target_acc_col
kkonganti@17 28 );
kkonganti@17 29 my @custom_serovars;
kkonganti@17 30
kkonganti@17 31 GetOptions(
kkonganti@17 32 'help' => \$help,
kkonganti@17 33 'pdg=s' => \$PDG_file,
kkonganti@17 34 'tbl=s' => \$table_file,
kkonganti@17 35 'snp=s' => \$snp_cluster,
kkonganti@17 36 'min_contig_size=i' => \$min_asm_size,
kkonganti@17 37 'complete_serotype_name' => \$complete_serotype_name,
kkonganti@17 38 'serocol:i' => \$serovar_or_type_col,
kkonganti@17 39 'seronamecol:i' => \$seronamecol,
kkonganti@17 40 'target_acc_col:i' => \$target_acc_col,
kkonganti@17 41 'acc_col:i' => \$acc_col,
kkonganti@17 42 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
kkonganti@17 43 'num_serotypes_per_serotype:i' => \$serovar_limit,
kkonganti@17 44 'include_serovar=s' => \@custom_serovars,
kkonganti@17 45 'op=s' => \$out_prefix
kkonganti@17 46 ) or pod2usage( -verbose => 2 );
kkonganti@17 47
kkonganti@17 48 if ( defined $help ) {
kkonganti@17 49 pod2usage( -verbose => 2 );
kkonganti@17 50 }
kkonganti@17 51
kkonganti@17 52 if ( !defined $serovar_limit ) {
kkonganti@17 53 $serovar_limit = 1;
kkonganti@17 54 }
kkonganti@17 55
kkonganti@17 56 if ( !defined $serovar_or_type_col ) {
kkonganti@17 57 $serovar_or_type_col = 50;
kkonganti@17 58 }
kkonganti@17 59
kkonganti@17 60 if ( !defined $seronamecol ) {
kkonganti@17 61 $seronamecol = 34;
kkonganti@17 62 }
kkonganti@17 63
kkonganti@17 64 if ( !defined $target_acc_col ) {
kkonganti@17 65 $target_acc_col = 43;
kkonganti@17 66 }
kkonganti@17 67
kkonganti@17 68 if ( !defined $acc_col ) {
kkonganti@17 69 $acc_col = 10;
kkonganti@17 70 }
kkonganti@17 71
kkonganti@17 72 if ( !defined $min_asm_size ) {
kkonganti@17 73 $min_asm_size = 0;
kkonganti@17 74 }
kkonganti@17 75
kkonganti@17 76 if ( defined $out_prefix ) {
kkonganti@17 77 $out_prefix .= '_';
kkonganti@17 78 }
kkonganti@17 79 else {
kkonganti@17 80 $out_prefix = '';
kkonganti@17 81 }
kkonganti@17 82
kkonganti@17 83 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
kkonganti@17 84
kkonganti@17 85 open( my $pdg_file, '<', $PDG_file )
kkonganti@17 86 || die "\nCannot open PDG file $PDG_file: $!\n\n";
kkonganti@17 87 open( my $tbl_file, '<', $table_file )
kkonganti@17 88 || die "\nCannot open tbl file $table_file: $!\n\n";
kkonganti@17 89 open( my $snp_cluster_file, '<', $snp_cluster )
kkonganti@17 90 || die "\nCannot open $snp_cluster: $!\n\n";
kkonganti@17 91 open( my $acc_fh, '>', 'acc2serovar.txt' )
kkonganti@17 92 || die "\nCannot open acc2serovar.txt: $!\n\n";
kkonganti@17 93 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
kkonganti@17 94 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
kkonganti@17 95 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
kkonganti@17 96 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
kkonganti@17 97 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
kkonganti@17 98 || die "\nCannnot open "
kkonganti@17 99 . $out_prefix
kkonganti@17 100 . "mash_snp_genome_list.txt for writing: $!\n\n";
kkonganti@17 101
kkonganti@17 102 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
kkonganti@17 103
kkonganti@17 104 while ( my $line = <$pdg_file> ) {
kkonganti@17 105 chomp $line;
kkonganti@17 106 next if ( $line =~ m/^\#/ );
kkonganti@17 107
kkonganti@17 108 # Relevent columns (Perl index):
kkonganti@17 109 # 10-1 = 9: asm_acc
kkonganti@17 110 # 34 -1 = 33: serovar
kkonganti@17 111 # 50 -1 = 49: computed serotype
kkonganti@17 112
kkonganti@17 113 my @cols = split( /\t/, $line );
kkonganti@17 114 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
kkonganti@17 115 my $acc = $cols[ $acc_col - 1 ];
kkonganti@17 116 my $serovar = $cols[ $seronamecol - 1 ];
kkonganti@17 117 my $target_acc = $cols[ $target_acc_col - 1 ];
kkonganti@17 118
kkonganti@17 119 $serovar_or_type =~ s/\"//g;
kkonganti@17 120
kkonganti@17 121 my $skip = 1;
kkonganti@17 122 foreach my $ser (@custom_serovars) {
kkonganti@17 123 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
kkonganti@17 124 }
kkonganti@17 125
kkonganti@17 126 if ( defined $complete_serotype_name ) {
kkonganti@17 127 next
kkonganti@17 128 if ( $skip
kkonganti@17 129 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
kkonganti@17 130 );
kkonganti@17 131 }
kkonganti@17 132
kkonganti@17 133 next
kkonganti@17 134 if (
kkonganti@17 135 $skip
kkonganti@17 136 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
kkonganti@17 137 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
kkonganti@17 138 );
kkonganti@17 139
kkonganti@17 140 # next
kkonganti@17 141 # if (
kkonganti@17 142 # (
kkonganti@17 143 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
kkonganti@17 144 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
kkonganti@17 145 # )
kkonganti@17 146 # );
kkonganti@17 147
kkonganti@17 148 if ( defined $not_null_pdg_serovar ) {
kkonganti@17 149 $acc_2_serovar->{$acc} = $serovar_or_type,
kkonganti@17 150 $acc_2_target->{$acc} = $target_acc,
kkonganti@17 151 print $acc_fh "$acc\t$serovar_or_type\n"
kkonganti@17 152 if ( $acc !~ m/NULL/
kkonganti@17 153 && $serovar !~ m/NULL/
kkonganti@17 154 && $serovar_or_type !~ m/NULL/ );
kkonganti@17 155 }
kkonganti@17 156 else {
kkonganti@17 157 $acc_2_serovar->{$acc} = $serovar_or_type,
kkonganti@17 158 $acc_2_target->{$acc} = $target_acc,
kkonganti@17 159 print $acc_fh "$acc\t$serovar_or_type\n"
kkonganti@17 160 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
kkonganti@17 161 }
kkonganti@17 162
kkonganti@17 163 # $snp_count->{$serovar_or_type} = 0;
kkonganti@17 164 }
kkonganti@17 165
kkonganti@17 166 #
kkonganti@17 167 # SNP to ACC
kkonganti@17 168 #
kkonganti@17 169
kkonganti@17 170 while ( my $line = <$snp_cluster_file> ) {
kkonganti@17 171 chomp $line;
kkonganti@17 172 my @cols = split( /\t/, $line );
kkonganti@17 173
kkonganti@17 174 # Relevant columns
kkonganti@17 175 # 0: SNP Cluster ID
kkonganti@17 176 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
kkonganti@17 177 my $snp_clus_id = $cols[0];
kkonganti@17 178 my $acc = $cols[3];
kkonganti@17 179
kkonganti@17 180 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
kkonganti@17 181 next if ( !exists $acc_2_serovar->{$acc} );
kkonganti@17 182 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
kkonganti@17 183 if ( exists $acc_2_snp->{$acc} ) {
kkonganti@17 184 print $Stderr
kkonganti@17 185 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
kkonganti@17 186 exit 1;
kkonganti@17 187 }
kkonganti@17 188 $acc_2_snp->{$acc} = $snp_clus_id;
kkonganti@17 189 $snp_count->{$snp_clus_id} = 0;
kkonganti@17 190 }
kkonganti@17 191
kkonganti@17 192 while ( my $line = <$tbl_file> ) {
kkonganti@17 193 chomp $line;
kkonganti@17 194
kkonganti@17 195 my @cols = split( /\t/, $line );
kkonganti@17 196
kkonganti@17 197 # .tbl file columns (Perl index):
kkonganti@17 198 #
kkonganti@17 199 # 0: Accession
kkonganti@17 200 # 1: AssemblyLevel
kkonganti@17 201 # 2: ScaffoldN50
kkonganti@17 202 # 3: ContigN50
kkonganti@17 203
kkonganti@17 204 my $acc = $cols[0];
kkonganti@17 205 my $asm_lvl = $cols[1];
kkonganti@17 206 my $scaffold_n50 = $cols[2];
kkonganti@17 207 my $contig_n50 = $cols[3];
kkonganti@17 208
kkonganti@17 209 # my $idx0 = $acc_2_serovar->{$cols[0]};
kkonganti@17 210 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
kkonganti@17 211
kkonganti@17 212 if ( not_empty($acc) && defined $idx0 ) {
kkonganti@17 213 my $fna_rel_loc =
kkonganti@17 214 "$pdg_release/ncbi_dataset/data/$acc/"
kkonganti@17 215 . $acc
kkonganti@17 216 . '_scaffolded_genomic.fna.gz';
kkonganti@17 217
kkonganti@17 218 if ( not_empty($scaffold_n50) ) {
kkonganti@17 219 next if ( $scaffold_n50 <= $min_asm_size );
kkonganti@17 220 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
kkonganti@17 221 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
kkonganti@17 222 }
kkonganti@17 223 elsif ( not_empty($contig_n50) ) {
kkonganti@17 224 next if ( $contig_n50 <= $min_asm_size );
kkonganti@17 225 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
kkonganti@17 226 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
kkonganti@17 227 }
kkonganti@17 228 }
kkonganti@17 229 }
kkonganti@17 230
kkonganti@17 231 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
kkonganti@17 232 my $count = $snp_count->{$snp_cluster_id};
kkonganti@17 233 foreach my $asm_lvl (
kkonganti@17 234 sort { $a cmp $b }
kkonganti@17 235 keys %{ $snp_2_serovar->{$snp_cluster_id} }
kkonganti@17 236 )
kkonganti@17 237 {
kkonganti@17 238 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
kkonganti@17 239 $count =
kkonganti@17 240 print_dl_metadata( $asm_lvl,
kkonganti@17 241 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@17 242 $count, $snp_cluster_id );
kkonganti@17 243 }
kkonganti@17 244 if ( $asm_lvl =~ m/Chromosome/i ) {
kkonganti@17 245 $count =
kkonganti@17 246 print_dl_metadata( $asm_lvl,
kkonganti@17 247 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@17 248 $count, $snp_cluster_id );
kkonganti@17 249 }
kkonganti@17 250 if ( $asm_lvl =~ m/Scaffold/i ) {
kkonganti@17 251 $count =
kkonganti@17 252 print_dl_metadata( $asm_lvl,
kkonganti@17 253 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@17 254 $count, $snp_cluster_id );
kkonganti@17 255 }
kkonganti@17 256 if ( $asm_lvl =~ m/Contig/i ) {
kkonganti@17 257 $count =
kkonganti@17 258 print_dl_metadata( $asm_lvl,
kkonganti@17 259 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@17 260 $count, $snp_cluster_id );
kkonganti@17 261 }
kkonganti@17 262 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
kkonganti@17 263 if ( $count > 0 );
kkonganti@17 264 last if ( $count >= $serovar_limit );
kkonganti@17 265 }
kkonganti@17 266 }
kkonganti@17 267
kkonganti@17 268 close $pdg_file;
kkonganti@17 269 close $tbl_file;
kkonganti@17 270 close $snp_cluster_file;
kkonganti@17 271 close $acc_fh;
kkonganti@17 272 close $accs_snp_fh;
kkonganti@17 273
kkonganti@17 274 #-------------------------------------------
kkonganti@17 275 # Main ends
kkonganti@17 276 #-------------------------------------------
kkonganti@17 277 # Routines begin
kkonganti@17 278 #-------------------------------------------
kkonganti@17 279
kkonganti@17 280 sub print_dl_metadata {
kkonganti@17 281 my $asm_lvl = shift;
kkonganti@17 282 my $acc_sizes = shift;
kkonganti@17 283 my $curr_count = shift;
kkonganti@17 284 my $snp_cluster_id = shift;
kkonganti@17 285
kkonganti@17 286 $asm_lvl =~ s/.+?\_(.+)/$1/;
kkonganti@17 287
kkonganti@17 288 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
kkonganti@17 289 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
kkonganti@17 290 my ( $serovar, $url ) = split( /\|/, $serovar_url );
kkonganti@17 291 return $curr_count if ( exists $multi_cluster_acc->{$url} );
kkonganti@17 292 $multi_cluster_acc->{$url} = 1;
kkonganti@17 293 $curr_count++;
kkonganti@17 294 my ( $final_acc, $genome_header ) =
kkonganti@17 295 ( split( /\//, $url ) )[ 3 .. 4 ];
kkonganti@17 296 print $accs_snp_fh "$final_acc\n";
kkonganti@17 297 print $genome_headers_fh catfile( 'scaffold_genomes',
kkonganti@17 298 $genome_header )
kkonganti@17 299 . "\n";
kkonganti@17 300 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
kkonganti@17 301 if ( $curr_count > 0 );
kkonganti@17 302 }
kkonganti@17 303 last if ( $curr_count >= $serovar_limit );
kkonganti@17 304 }
kkonganti@17 305 return $curr_count;
kkonganti@17 306 }
kkonganti@17 307
kkonganti@17 308 sub sort_asm_level {
kkonganti@17 309 my $level = shift;
kkonganti@17 310
kkonganti@17 311 $level =~ s/(Complete\s+Genome)/a\_$1/
kkonganti@17 312 if ( $level =~ m/Complete\s+Genome/i );
kkonganti@17 313 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
kkonganti@17 314 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
kkonganti@17 315 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
kkonganti@17 316
kkonganti@17 317 return $level;
kkonganti@17 318 }
kkonganti@17 319
kkonganti@17 320 sub not_empty {
kkonganti@17 321 my $col = shift;
kkonganti@17 322
kkonganti@17 323 if ( $col !~ m/^$/ ) {
kkonganti@17 324 return 1;
kkonganti@17 325 }
kkonganti@17 326 else {
kkonganti@17 327 return 0;
kkonganti@17 328 }
kkonganti@17 329 }
kkonganti@17 330
kkonganti@17 331 __END__
kkonganti@17 332
kkonganti@17 333 =head1 SYNOPSIS
kkonganti@17 334
kkonganti@17 335 This script will take in a PDG metadata file, a C<.tbl> file and generate
kkonganti@17 336 the final list by B<I<waterfall>> priority.
kkonganti@17 337
kkonganti@17 338 See complete description:
kkonganti@17 339
kkonganti@17 340 perldoc waterfall_per_snp_cluster.pl
kkonganti@17 341
kkonganti@17 342 or
kkonganti@17 343
kkonganti@17 344 waterfall_per_snp_cluster.pl --help
kkonganti@17 345
kkonganti@17 346 Examples:
kkonganti@17 347
kkonganti@17 348 waterfall_per_snp_cluster.pl
kkonganti@17 349
kkonganti@17 350 =head1 DESCRIPTION
kkonganti@17 351
kkonganti@17 352 We will retain up to N number of genome accessions per SNP cluster.
kkonganti@17 353 It prioritizes SNP Cluster participation over serotype coverage.
kkonganti@17 354 Which N genomes are selected depends on (in order):
kkonganti@17 355
kkonganti@17 356 1. Genome assembly level, whose priority is
kkonganti@17 357
kkonganti@17 358 a: Complete Genome
kkonganti@17 359 b: Chromosome
kkonganti@17 360 c: Scaffold
kkonganti@17 361 d: Contig
kkonganti@17 362
kkonganti@17 363 2. If the genomes are of same assembly level, then
kkonganti@17 364 scaffold N50 followed by contig N50 is chosen.
kkonganti@17 365
kkonganti@17 366 3. If the scaffold or contig N50 is same, then all
kkonganti@17 367 of them are included
kkonganti@17 368
kkonganti@17 369 =head1 OPTIONS
kkonganti@17 370
kkonganti@17 371 =over 3
kkonganti@17 372
kkonganti@17 373 =item -p PDGXXXXX.XXXX.metadata.tsv
kkonganti@17 374
kkonganti@17 375 Absolute UNIX path pointing to the PDG metadata file.
kkonganti@17 376 Example: PDG000000002.2505.metadata.tsv
kkonganti@17 377
kkonganti@17 378 =item -t asm.tbl
kkonganti@17 379
kkonganti@17 380 Absolute UNIX path pointing to the file from the result
kkonganti@17 381 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
kkonganti@17 382 file.
kkonganti@17 383
kkonganti@17 384 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
kkonganti@17 385
kkonganti@17 386 Absolute UNIX path pointing to the SNP Cluster metadata file.
kkonganti@17 387 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
kkonganti@17 388
kkonganti@17 389 =item --serocol <int> (Optional)
kkonganti@17 390
kkonganti@17 391 Column number (non 0-based index) of the PDG metadata file
kkonganti@17 392 by which the serotypes are collected
kkonganti@17 393 (column name: "computed_types"). Default: 50
kkonganti@17 394
kkonganti@17 395 =item --seronamecol <int> (Optional)
kkonganti@17 396
kkonganti@17 397 Column number (non 0-based index) of the PDG metadata file
kkonganti@17 398 whose column name is "serovar". Default: 34
kkonganti@17 399
kkonganti@17 400 =item --acc_col <int> (Optional)
kkonganti@17 401
kkonganti@17 402 Column number (non 0-based index) of the PDG metadata file
kkonganti@17 403 whose column name is "acc". Default: 10
kkonganti@17 404
kkonganti@17 405 =item --target_acc_col <int> (Optional)
kkonganti@17 406
kkonganti@17 407 Column number (non 0-based index) of the PDG metadata file
kkonganti@17 408 whose column name is "target_acc". Default: 43
kkonganti@17 409
kkonganti@17 410 =item --complete_serotype_name (Optional)
kkonganti@17 411
kkonganti@17 412 Skip indexing serotypes when the serotype name in the column
kkonganti@17 413 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
kkonganti@17 414 an accession has a I<B<serotype=>> string as such in column
kkonganti@17 415 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
kkonganti@17 416 then, the indexing of that accession is skipped.
kkonganti@17 417 Default: False
kkonganti@17 418
kkonganti@17 419 =item --not_null_pdg_serovar (Optional)
kkonganti@17 420
kkonganti@17 421 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
kkonganti@17 422 if the B<I<serovar>> column is not C<NULL>.
kkonganti@17 423
kkonganti@17 424 =item -i <serotype name> (Optional)
kkonganti@17 425
kkonganti@17 426 Make sure the following serotype is included. Mention C<-i> multiple
kkonganti@17 427 times to include multiple serotypes.
kkonganti@17 428
kkonganti@17 429 =item -num <int> (Optional)
kkonganti@17 430
kkonganti@17 431 Number of genome accessions per SNP Cluster. Default: 1
kkonganti@17 432
kkonganti@17 433 =back
kkonganti@17 434
kkonganti@17 435 =head1 AUTHOR
kkonganti@17 436
kkonganti@17 437 Kranti Konganti
kkonganti@17 438
kkonganti@17 439 =cut