annotate 0.6.1/bin/waterfall_per_snp_cluster.pl @ 12:c5faadb3386f

"planemo upload"
author kkonganti
date Tue, 05 Sep 2023 11:57:15 -0400
parents 749faef1caa9
children
rev   line source
kkonganti@11 1 #!/usr/bin/env perl
kkonganti@11 2
kkonganti@11 3 # Kranti Konganti
kkonganti@11 4 # 08/23/2023
kkonganti@11 5
kkonganti@11 6 use strict;
kkonganti@11 7 use warnings;
kkonganti@11 8 use Getopt::Long;
kkonganti@11 9 use Data::Dumper;
kkonganti@11 10 use Pod::Usage;
kkonganti@11 11 use File::Basename;
kkonganti@11 12 use File::Spec::Functions;
kkonganti@11 13
kkonganti@11 14 my $tbl = {};
kkonganti@11 15 my $snp_2_serovar = {};
kkonganti@11 16 my $acc_2_serovar = {};
kkonganti@11 17 my $acc_2_target = {};
kkonganti@11 18 my $snp_count = {};
kkonganti@11 19 my $snp_2_acc = {};
kkonganti@11 20 my $acc_2_snp = {};
kkonganti@11 21 my $multi_cluster_acc = {};
kkonganti@11 22 my (
kkonganti@11 23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
kkonganti@11 24 $complete_serotype_name, $PDG_file, $table_file,
kkonganti@11 25 $not_null_pdg_serovar, $snp_cluster, $help,
kkonganti@11 26 $out_prefix
kkonganti@11 27 );
kkonganti@11 28 my @custom_serovars;
kkonganti@11 29
kkonganti@11 30 GetOptions(
kkonganti@11 31 'help' => \$help,
kkonganti@11 32 'pdg=s' => \$PDG_file,
kkonganti@11 33 'tbl=s' => \$table_file,
kkonganti@11 34 'snp=s' => \$snp_cluster,
kkonganti@11 35 'min_contig_size=i' => \$min_asm_size,
kkonganti@11 36 'complete_serotype_name' => \$complete_serotype_name,
kkonganti@11 37 'serocol:i' => \$serovar_or_type_col,
kkonganti@11 38 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
kkonganti@11 39 'num_serotypes_per_serotype:i' => \$serovar_limit,
kkonganti@11 40 'include_serovar=s' => \@custom_serovars,
kkonganti@11 41 'op=s' => \$out_prefix
kkonganti@11 42 ) or pod2usage( -verbose => 2 );
kkonganti@11 43
kkonganti@11 44 if ( defined $help ) {
kkonganti@11 45 pod2usage( -verbose => 2 );
kkonganti@11 46 }
kkonganti@11 47
kkonganti@11 48 if ( !defined $serovar_limit ) {
kkonganti@11 49 $serovar_limit = 1;
kkonganti@11 50 }
kkonganti@11 51
kkonganti@11 52 if ( !defined $serovar_or_type_col ) {
kkonganti@11 53 $serovar_or_type_col = 49;
kkonganti@11 54 }
kkonganti@11 55
kkonganti@11 56 if ( !defined $min_asm_size ) {
kkonganti@11 57 $min_asm_size = 0;
kkonganti@11 58 }
kkonganti@11 59
kkonganti@11 60 if ( defined $out_prefix ) {
kkonganti@11 61 $out_prefix .= '_';
kkonganti@11 62 }
kkonganti@11 63 else {
kkonganti@11 64 $out_prefix = '';
kkonganti@11 65 }
kkonganti@11 66
kkonganti@11 67 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
kkonganti@11 68
kkonganti@11 69 open( my $pdg_file, '<', $PDG_file )
kkonganti@11 70 || die "\nCannot open PDG file $PDG_file: $!\n\n";
kkonganti@11 71 open( my $tbl_file, '<', $table_file )
kkonganti@11 72 || die "\nCannot open tbl file $table_file: $!\n\n";
kkonganti@11 73 open( my $snp_cluster_file, '<', $snp_cluster )
kkonganti@11 74 || die "\nCannot open $snp_cluster: $!\n\n";
kkonganti@11 75 open( my $acc_fh, '>', 'acc2serovar.txt' )
kkonganti@11 76 || die "\nCannot open acc2serovar.txt: $!\n\n";
kkonganti@11 77 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
kkonganti@11 78 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
kkonganti@11 79 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
kkonganti@11 80 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
kkonganti@11 81 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
kkonganti@11 82 || die "\nCannnot open "
kkonganti@11 83 . $out_prefix
kkonganti@11 84 . "mash_snp_genome_list.txt for writing: $!\n\n";
kkonganti@11 85
kkonganti@11 86 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
kkonganti@11 87
kkonganti@11 88 while ( my $line = <$pdg_file> ) {
kkonganti@11 89 chomp $line;
kkonganti@11 90 next if ( $line =~ m/^\#/ );
kkonganti@11 91
kkonganti@11 92 # Relevent columns (Perl index):
kkonganti@11 93 # 9: asm_acc
kkonganti@11 94 # 33: serovar
kkonganti@11 95 # 48: computed serotype
kkonganti@11 96
kkonganti@11 97 my @cols = split( /\t/, $line );
kkonganti@11 98 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
kkonganti@11 99 my $acc = $cols[9];
kkonganti@11 100 my $serovar = $cols[33];
kkonganti@11 101 my $target_acc = $cols[41];
kkonganti@11 102
kkonganti@11 103 $serovar_or_type =~ s/\"//g;
kkonganti@11 104
kkonganti@11 105 my $skip = 1;
kkonganti@11 106 foreach my $ser (@custom_serovars) {
kkonganti@11 107 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
kkonganti@11 108 }
kkonganti@11 109
kkonganti@11 110 if ( defined $complete_serotype_name ) {
kkonganti@11 111 next
kkonganti@11 112 if ( $skip
kkonganti@11 113 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
kkonganti@11 114 );
kkonganti@11 115 }
kkonganti@11 116
kkonganti@11 117 next
kkonganti@11 118 if (
kkonganti@11 119 $skip
kkonganti@11 120 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
kkonganti@11 121 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
kkonganti@11 122 );
kkonganti@11 123
kkonganti@11 124 # next
kkonganti@11 125 # if (
kkonganti@11 126 # (
kkonganti@11 127 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
kkonganti@11 128 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
kkonganti@11 129 # )
kkonganti@11 130 # );
kkonganti@11 131
kkonganti@11 132 if ( defined $not_null_pdg_serovar ) {
kkonganti@11 133 $acc_2_serovar->{$acc} = $serovar_or_type,
kkonganti@11 134 $acc_2_target->{$acc} = $target_acc,
kkonganti@11 135 print $acc_fh "$acc\t$serovar_or_type\n"
kkonganti@11 136 if ( $acc !~ m/NULL/
kkonganti@11 137 && $serovar !~ m/NULL/
kkonganti@11 138 && $serovar_or_type !~ m/NULL/ );
kkonganti@11 139 }
kkonganti@11 140 else {
kkonganti@11 141 $acc_2_serovar->{$acc} = $serovar_or_type,
kkonganti@11 142 $acc_2_target->{$acc} = $target_acc,
kkonganti@11 143 print $acc_fh "$acc\t$serovar_or_type\n"
kkonganti@11 144 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
kkonganti@11 145 }
kkonganti@11 146
kkonganti@11 147 # $snp_count->{$serovar_or_type} = 0;
kkonganti@11 148 }
kkonganti@11 149
kkonganti@11 150 #
kkonganti@11 151 # SNP to ACC
kkonganti@11 152 #
kkonganti@11 153
kkonganti@11 154 while ( my $line = <$snp_cluster_file> ) {
kkonganti@11 155 chomp $line;
kkonganti@11 156 my @cols = split( /\t/, $line );
kkonganti@11 157
kkonganti@11 158 # Relevant columns
kkonganti@11 159 # 0: SNP Cluster ID
kkonganti@11 160 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
kkonganti@11 161 my $snp_clus_id = $cols[0];
kkonganti@11 162 my $acc = $cols[3];
kkonganti@11 163
kkonganti@11 164 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
kkonganti@11 165 next if ( !exists $acc_2_serovar->{$acc} );
kkonganti@11 166 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
kkonganti@11 167 if ( exists $acc_2_snp->{$acc} ) {
kkonganti@11 168 print $Stderr
kkonganti@11 169 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
kkonganti@11 170 exit 1;
kkonganti@11 171 }
kkonganti@11 172 $acc_2_snp->{$acc} = $snp_clus_id;
kkonganti@11 173 $snp_count->{$snp_clus_id} = 0;
kkonganti@11 174 }
kkonganti@11 175
kkonganti@11 176 while ( my $line = <$tbl_file> ) {
kkonganti@11 177 chomp $line;
kkonganti@11 178
kkonganti@11 179 my @cols = split( /\t/, $line );
kkonganti@11 180
kkonganti@11 181 # .tbl file columns (Perl index):
kkonganti@11 182 #
kkonganti@11 183 # 0: Accession
kkonganti@11 184 # 1: AssemblyLevel
kkonganti@11 185 # 2: ScaffoldN50
kkonganti@11 186 # 3: ContigN50
kkonganti@11 187
kkonganti@11 188 my $acc = $cols[0];
kkonganti@11 189 my $asm_lvl = $cols[1];
kkonganti@11 190 my $scaffold_n50 = $cols[2];
kkonganti@11 191 my $contig_n50 = $cols[3];
kkonganti@11 192
kkonganti@11 193 # my $idx0 = $acc_2_serovar->{$cols[0]};
kkonganti@11 194 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
kkonganti@11 195
kkonganti@11 196 if ( not_empty($acc) && defined $idx0 ) {
kkonganti@11 197 my $fna_rel_loc =
kkonganti@11 198 "$pdg_release/ncbi_dataset/data/$acc/"
kkonganti@11 199 . $acc
kkonganti@11 200 . '_scaffolded_genomic.fna.gz';
kkonganti@11 201
kkonganti@11 202 if ( not_empty($scaffold_n50) ) {
kkonganti@11 203 next if ( $scaffold_n50 <= $min_asm_size );
kkonganti@11 204 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
kkonganti@11 205 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
kkonganti@11 206 }
kkonganti@11 207 elsif ( not_empty($contig_n50) ) {
kkonganti@11 208 next if ( $contig_n50 <= $min_asm_size );
kkonganti@11 209 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
kkonganti@11 210 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
kkonganti@11 211 }
kkonganti@11 212 }
kkonganti@11 213 }
kkonganti@11 214
kkonganti@11 215 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
kkonganti@11 216 my $count = $snp_count->{$snp_cluster_id};
kkonganti@11 217 foreach my $asm_lvl (
kkonganti@11 218 sort { $a cmp $b }
kkonganti@11 219 keys %{ $snp_2_serovar->{$snp_cluster_id} }
kkonganti@11 220 )
kkonganti@11 221 {
kkonganti@11 222 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
kkonganti@11 223 $count =
kkonganti@11 224 print_dl_metadata( $asm_lvl,
kkonganti@11 225 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@11 226 $count, $snp_cluster_id );
kkonganti@11 227 }
kkonganti@11 228 if ( $asm_lvl =~ m/Chromosome/i ) {
kkonganti@11 229 $count =
kkonganti@11 230 print_dl_metadata( $asm_lvl,
kkonganti@11 231 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@11 232 $count, $snp_cluster_id );
kkonganti@11 233 }
kkonganti@11 234 if ( $asm_lvl =~ m/Scaffold/i ) {
kkonganti@11 235 $count =
kkonganti@11 236 print_dl_metadata( $asm_lvl,
kkonganti@11 237 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@11 238 $count, $snp_cluster_id );
kkonganti@11 239 }
kkonganti@11 240 if ( $asm_lvl =~ m/Contig/i ) {
kkonganti@11 241 $count =
kkonganti@11 242 print_dl_metadata( $asm_lvl,
kkonganti@11 243 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@11 244 $count, $snp_cluster_id );
kkonganti@11 245 }
kkonganti@11 246 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
kkonganti@11 247 if ( $count > 0 );
kkonganti@11 248 last if ( $count >= $serovar_limit );
kkonganti@11 249 }
kkonganti@11 250 }
kkonganti@11 251
kkonganti@11 252 close $pdg_file;
kkonganti@11 253 close $tbl_file;
kkonganti@11 254 close $snp_cluster_file;
kkonganti@11 255 close $acc_fh;
kkonganti@11 256 close $accs_snp_fh;
kkonganti@11 257
kkonganti@11 258 #-------------------------------------------
kkonganti@11 259 # Main ends
kkonganti@11 260 #-------------------------------------------
kkonganti@11 261 # Routines begin
kkonganti@11 262 #-------------------------------------------
kkonganti@11 263
kkonganti@11 264 sub print_dl_metadata {
kkonganti@11 265 my $asm_lvl = shift;
kkonganti@11 266 my $acc_sizes = shift;
kkonganti@11 267 my $curr_count = shift;
kkonganti@11 268 my $snp_cluster_id = shift;
kkonganti@11 269
kkonganti@11 270 $asm_lvl =~ s/.+?\_(.+)/$1/;
kkonganti@11 271
kkonganti@11 272 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
kkonganti@11 273 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
kkonganti@11 274 my ( $serovar, $url ) = split( /\|/, $serovar_url );
kkonganti@11 275 return $curr_count if ( exists $multi_cluster_acc->{$url} );
kkonganti@11 276 $multi_cluster_acc->{$url} = 1;
kkonganti@11 277 $curr_count++;
kkonganti@11 278 my ( $final_acc, $genome_header ) =
kkonganti@11 279 ( split( /\//, $url ) )[ 3 .. 4 ];
kkonganti@11 280 print $accs_snp_fh "$final_acc\n";
kkonganti@11 281 print $genome_headers_fh catfile( 'scaffold_genomes',
kkonganti@11 282 $genome_header )
kkonganti@11 283 . "\n";
kkonganti@11 284 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
kkonganti@11 285 if ( $curr_count > 0 );
kkonganti@11 286 }
kkonganti@11 287 last if ( $curr_count >= $serovar_limit );
kkonganti@11 288 }
kkonganti@11 289 return $curr_count;
kkonganti@11 290 }
kkonganti@11 291
kkonganti@11 292 sub sort_asm_level {
kkonganti@11 293 my $level = shift;
kkonganti@11 294
kkonganti@11 295 $level =~ s/(Complete\s+Genome)/a\_$1/
kkonganti@11 296 if ( $level =~ m/Complete\s+Genome/i );
kkonganti@11 297 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
kkonganti@11 298 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
kkonganti@11 299 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
kkonganti@11 300
kkonganti@11 301 return $level;
kkonganti@11 302 }
kkonganti@11 303
kkonganti@11 304 sub not_empty {
kkonganti@11 305 my $col = shift;
kkonganti@11 306
kkonganti@11 307 if ( $col !~ m/^$/ ) {
kkonganti@11 308 return 1;
kkonganti@11 309 }
kkonganti@11 310 else {
kkonganti@11 311 return 0;
kkonganti@11 312 }
kkonganti@11 313 }
kkonganti@11 314
kkonganti@11 315 __END__
kkonganti@11 316
kkonganti@11 317 =head1 SYNOPSIS
kkonganti@11 318
kkonganti@11 319 This script will take in a PDG metadata file, a C<.tbl> file and generate
kkonganti@11 320 the final list by B<I<waterfall>> priority.
kkonganti@11 321
kkonganti@11 322 See complete description:
kkonganti@11 323
kkonganti@11 324 perldoc waterfall_per_snp_cluster.pl
kkonganti@11 325
kkonganti@11 326 or
kkonganti@11 327
kkonganti@11 328 waterfall_per_snp_cluster.pl --help
kkonganti@11 329
kkonganti@11 330 Examples:
kkonganti@11 331
kkonganti@11 332 waterfall_per_snp_cluster.pl
kkonganti@11 333
kkonganti@11 334 =head1 DESCRIPTION
kkonganti@11 335
kkonganti@11 336 We will retain up to N number of genome accessions per SNP cluster.
kkonganti@11 337 It prioritizes SNP Cluster participation over serotype coverage.
kkonganti@11 338 Which N genomes are selected depends on (in order):
kkonganti@11 339
kkonganti@11 340 1. Genome assembly level, whose priority is
kkonganti@11 341
kkonganti@11 342 a: Complete Genome
kkonganti@11 343 b: Chromosome
kkonganti@11 344 c: Scaffold
kkonganti@11 345 d: Contig
kkonganti@11 346
kkonganti@11 347 2. If the genomes are of same assembly level, then
kkonganti@11 348 scaffold N50 followed by contig N50 is chosen.
kkonganti@11 349
kkonganti@11 350 3. If the scaffold or contig N50 is same, then all
kkonganti@11 351 of them are included
kkonganti@11 352
kkonganti@11 353 =head1 OPTIONS
kkonganti@11 354
kkonganti@11 355 =over 3
kkonganti@11 356
kkonganti@11 357 =item -p PDGXXXXX.XXXX.metadata.tsv
kkonganti@11 358
kkonganti@11 359 Absolute UNIX path pointing to the PDG metadata file.
kkonganti@11 360 Example: PDG000000002.2505.metadata.tsv
kkonganti@11 361
kkonganti@11 362 =item -t asm.tbl
kkonganti@11 363
kkonganti@11 364 Absolute UNIX path pointing to the file from the result
kkonganti@11 365 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
kkonganti@11 366 file.
kkonganti@11 367
kkonganti@11 368 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
kkonganti@11 369
kkonganti@11 370 Absolute UNIX path pointing to the SNP Cluster metadata file.
kkonganti@11 371 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
kkonganti@11 372
kkonganti@11 373
kkonganti@11 374 =item --serocol <int> (Optional)
kkonganti@11 375
kkonganti@11 376 Column number (non 0-based index) of the PDG metadata file
kkonganti@11 377 by which the serotypes are collected. Default: 49
kkonganti@11 378
kkonganti@11 379 =item --complete_serotype_name (Optional)
kkonganti@11 380
kkonganti@11 381 Skip indexing serotypes when the serotype name in the column
kkonganti@11 382 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
kkonganti@11 383 an accession has a I<B<serotype=>> string as such in column
kkonganti@11 384 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
kkonganti@11 385 then, the indexing of that accession is skipped.
kkonganti@11 386 Default: False
kkonganti@11 387
kkonganti@11 388 =item --not_null_pdg_serovar (Optional)
kkonganti@11 389
kkonganti@11 390 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
kkonganti@11 391 if the B<I<serovar>> column is not C<NULL>.
kkonganti@11 392
kkonganti@11 393 =item -i <serotype name> (Optional)
kkonganti@11 394
kkonganti@11 395 Make sure the following serotype is included. Mention C<-i> multiple
kkonganti@11 396 times to include multiple serotypes.
kkonganti@11 397
kkonganti@11 398 =item -num <int> (Optional)
kkonganti@11 399
kkonganti@11 400 Number of genome accessions per SNP Cluster. Default: 1
kkonganti@11 401
kkonganti@11 402 =back
kkonganti@11 403
kkonganti@11 404 =head1 AUTHOR
kkonganti@11 405
kkonganti@11 406 Kranti Konganti
kkonganti@11 407
kkonganti@11 408 =cut