annotate 0.5.0/bin/waterfall_per_snp_cluster.pl @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
rev   line source
kkonganti@1 1 #!/usr/bin/env perl
kkonganti@1 2
kkonganti@1 3 # Kranti Konganti
kkonganti@1 4 # 10/12/2022
kkonganti@1 5
kkonganti@1 6 use strict;
kkonganti@1 7 use warnings;
kkonganti@1 8 use Getopt::Long;
kkonganti@1 9 use Data::Dumper;
kkonganti@1 10 use Pod::Usage;
kkonganti@1 11 use File::Basename;
kkonganti@1 12 use File::Spec::Functions;
kkonganti@1 13
kkonganti@1 14 my $tbl = {};
kkonganti@1 15 my $snp_2_serovar = {};
kkonganti@1 16 my $acc_2_serovar = {};
kkonganti@1 17 my $acc_2_target = {};
kkonganti@1 18 my $snp_count = {};
kkonganti@1 19 my $snp_2_acc = {};
kkonganti@1 20 my $acc_2_snp = {};
kkonganti@1 21 my $multi_cluster_acc = {};
kkonganti@1 22 my (
kkonganti@1 23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
kkonganti@1 24 $complete_serotype_name, $PDG_file, $table_file,
kkonganti@1 25 $not_null_pdg_serovar, $snp_cluster, $help,
kkonganti@1 26 $out_prefix
kkonganti@1 27 );
kkonganti@1 28 my @custom_serovars;
kkonganti@1 29
kkonganti@1 30 GetOptions(
kkonganti@1 31 'help' => \$help,
kkonganti@1 32 'pdg=s' => \$PDG_file,
kkonganti@1 33 'tbl=s' => \$table_file,
kkonganti@1 34 'snp=s' => \$snp_cluster,
kkonganti@1 35 'min_contig_size=i' => \$min_asm_size,
kkonganti@1 36 'complete_serotype_name' => \$complete_serotype_name,
kkonganti@1 37 'serocol:i' => \$serovar_or_type_col,
kkonganti@1 38 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
kkonganti@1 39 'num_serotypes_per_serotype:i' => \$serovar_limit,
kkonganti@1 40 'include_serovar=s' => \@custom_serovars,
kkonganti@1 41 'op=s' => \$out_prefix
kkonganti@1 42 ) or pod2usage( -verbose => 2 );
kkonganti@1 43
kkonganti@1 44 if ( defined $help ) {
kkonganti@1 45 pod2usage( -verbose => 2 );
kkonganti@1 46 }
kkonganti@1 47
kkonganti@1 48 if ( !defined $serovar_limit ) {
kkonganti@1 49 $serovar_limit = 1;
kkonganti@1 50 }
kkonganti@1 51
kkonganti@1 52 if ( !defined $serovar_or_type_col ) {
kkonganti@1 53 $serovar_or_type_col = 49;
kkonganti@1 54 }
kkonganti@1 55
kkonganti@1 56 if ( !defined $min_asm_size ) {
kkonganti@1 57 $min_asm_size = 0;
kkonganti@1 58 }
kkonganti@1 59
kkonganti@1 60 if ( defined $out_prefix ) {
kkonganti@1 61 $out_prefix .= '_';
kkonganti@1 62 }
kkonganti@1 63 else {
kkonganti@1 64 $out_prefix = '';
kkonganti@1 65 }
kkonganti@1 66
kkonganti@1 67 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
kkonganti@1 68
kkonganti@1 69 open( my $pdg_file, '<', $PDG_file )
kkonganti@1 70 || die "\nCannot open PDG file $PDG_file: $!\n\n";
kkonganti@1 71 open( my $tbl_file, '<', $table_file )
kkonganti@1 72 || die "\nCannot open tbl file $table_file: $!\n\n";
kkonganti@1 73 open( my $snp_cluster_file, '<', $snp_cluster )
kkonganti@1 74 || die "\nCannot open $snp_cluster: $!\n\n";
kkonganti@1 75 open( my $acc_fh, '>', 'acc2serovar.txt' )
kkonganti@1 76 || die "\nCannot open acc2serovar.txt: $!\n\n";
kkonganti@1 77 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
kkonganti@1 78 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
kkonganti@1 79 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
kkonganti@1 80 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
kkonganti@1 81 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
kkonganti@1 82 || die "\nCannnot open "
kkonganti@1 83 . $out_prefix
kkonganti@1 84 . "mash_snp_genome_list.txt for writing: $!\n\n";
kkonganti@1 85
kkonganti@1 86 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
kkonganti@1 87
kkonganti@1 88 while ( my $line = <$pdg_file> ) {
kkonganti@1 89 chomp $line;
kkonganti@1 90 next if ( $line =~ m/^\#/ );
kkonganti@1 91
kkonganti@1 92 # Relevent columns (Perl index):
kkonganti@1 93 # 9: asm_acc
kkonganti@1 94 # 33: serovar
kkonganti@1 95 # 48: computed serotype
kkonganti@1 96
kkonganti@1 97 my @cols = split( /\t/, $line );
kkonganti@1 98 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
kkonganti@1 99 my $acc = $cols[9];
kkonganti@1 100 my $serovar = $cols[33];
kkonganti@1 101 my $target_acc = $cols[41];
kkonganti@1 102
kkonganti@1 103 $serovar_or_type =~ s/\"//g;
kkonganti@1 104
kkonganti@1 105 my $skip = 1;
kkonganti@1 106 foreach my $ser (@custom_serovars) {
kkonganti@1 107 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
kkonganti@1 108 }
kkonganti@1 109
kkonganti@1 110 if ( defined $complete_serotype_name ) {
kkonganti@1 111 next
kkonganti@1 112 if ( $skip
kkonganti@1 113 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
kkonganti@1 114 );
kkonganti@1 115 }
kkonganti@1 116
kkonganti@1 117 next
kkonganti@1 118 if (
kkonganti@1 119 $skip
kkonganti@1 120 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
kkonganti@1 121 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
kkonganti@1 122 );
kkonganti@1 123
kkonganti@1 124 # next
kkonganti@1 125 # if (
kkonganti@1 126 # (
kkonganti@1 127 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
kkonganti@1 128 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
kkonganti@1 129 # )
kkonganti@1 130 # );
kkonganti@1 131
kkonganti@1 132 if ( defined $not_null_pdg_serovar ) {
kkonganti@1 133 $acc_2_serovar->{$acc} = $serovar_or_type,
kkonganti@1 134 $acc_2_target->{$acc} = $target_acc,
kkonganti@1 135 print $acc_fh "$acc\t$serovar_or_type\n"
kkonganti@1 136 if ( $acc !~ m/NULL/
kkonganti@1 137 && $serovar !~ m/NULL/
kkonganti@1 138 && $serovar_or_type !~ m/NULL/ );
kkonganti@1 139 }
kkonganti@1 140 else {
kkonganti@1 141 $acc_2_serovar->{$acc} = $serovar_or_type,
kkonganti@1 142 $acc_2_target->{$acc} = $target_acc,
kkonganti@1 143 print $acc_fh "$acc\t$serovar_or_type\n"
kkonganti@1 144 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
kkonganti@1 145 }
kkonganti@1 146
kkonganti@1 147 # $snp_count->{$serovar_or_type} = 0;
kkonganti@1 148 }
kkonganti@1 149
kkonganti@1 150 #
kkonganti@1 151 # SNP to ACC
kkonganti@1 152 #
kkonganti@1 153
kkonganti@1 154 while ( my $line = <$snp_cluster_file> ) {
kkonganti@1 155 chomp $line;
kkonganti@1 156 my @cols = split( /\t/, $line );
kkonganti@1 157
kkonganti@1 158 # Relevant columns
kkonganti@1 159 # 0: SNP Cluster ID
kkonganti@1 160 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
kkonganti@1 161 my $snp_clus_id = $cols[0];
kkonganti@1 162 my $acc = $cols[3];
kkonganti@1 163
kkonganti@1 164 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
kkonganti@1 165 next if ( !exists $acc_2_serovar->{$acc} );
kkonganti@1 166 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
kkonganti@1 167 if ( exists $acc_2_snp->{$acc} ) {
kkonganti@1 168 print $Stderr
kkonganti@1 169 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
kkonganti@1 170 exit 1;
kkonganti@1 171 }
kkonganti@1 172 $acc_2_snp->{$acc} = $snp_clus_id;
kkonganti@1 173 $snp_count->{$snp_clus_id} = 0;
kkonganti@1 174 }
kkonganti@1 175
kkonganti@1 176 while ( my $line = <$tbl_file> ) {
kkonganti@1 177 chomp $line;
kkonganti@1 178
kkonganti@1 179 my @cols = split( /\t/, $line );
kkonganti@1 180
kkonganti@1 181 # .tbl file columns (Perl index):
kkonganti@1 182 #
kkonganti@1 183 # 0: Accession
kkonganti@1 184 # 1: AssemblyLevel
kkonganti@1 185 # 2: ScaffoldN50
kkonganti@1 186 # 3: ContigN50
kkonganti@1 187
kkonganti@1 188 my $acc = $cols[0];
kkonganti@1 189 my $asm_lvl = $cols[1];
kkonganti@1 190 my $scaffold_n50 = $cols[2];
kkonganti@1 191 my $contig_n50 = $cols[3];
kkonganti@1 192
kkonganti@1 193 # my $idx0 = $acc_2_serovar->{$cols[0]};
kkonganti@1 194 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
kkonganti@1 195
kkonganti@1 196 if ( not_empty($acc) && defined $idx0 ) {
kkonganti@1 197 my $fna_rel_loc =
kkonganti@1 198 "$pdg_release/ncbi_dataset/data/$acc/"
kkonganti@1 199 . $acc
kkonganti@1 200 . '_scaffolded_genomic.fna.gz';
kkonganti@1 201
kkonganti@1 202 if ( not_empty($scaffold_n50) ) {
kkonganti@1 203 next if ( $scaffold_n50 <= $min_asm_size );
kkonganti@1 204 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
kkonganti@1 205 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
kkonganti@1 206 }
kkonganti@1 207 elsif ( not_empty($contig_n50) ) {
kkonganti@1 208 next if ( $contig_n50 <= $min_asm_size );
kkonganti@1 209 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
kkonganti@1 210 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
kkonganti@1 211 }
kkonganti@1 212 }
kkonganti@1 213 }
kkonganti@1 214
kkonganti@1 215 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
kkonganti@1 216 my $count = $snp_count->{$snp_cluster_id};
kkonganti@1 217 foreach my $asm_lvl (
kkonganti@1 218 sort { $a cmp $b }
kkonganti@1 219 keys %{ $snp_2_serovar->{$snp_cluster_id} }
kkonganti@1 220 )
kkonganti@1 221 {
kkonganti@1 222 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
kkonganti@1 223 $count =
kkonganti@1 224 print_dl_metadata( $asm_lvl,
kkonganti@1 225 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@1 226 $count, $snp_cluster_id );
kkonganti@1 227 }
kkonganti@1 228 if ( $asm_lvl =~ m/Chromosome/i ) {
kkonganti@1 229 $count =
kkonganti@1 230 print_dl_metadata( $asm_lvl,
kkonganti@1 231 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@1 232 $count, $snp_cluster_id );
kkonganti@1 233 }
kkonganti@1 234 if ( $asm_lvl =~ m/Scaffold/i ) {
kkonganti@1 235 $count =
kkonganti@1 236 print_dl_metadata( $asm_lvl,
kkonganti@1 237 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@1 238 $count, $snp_cluster_id );
kkonganti@1 239 }
kkonganti@1 240 if ( $asm_lvl =~ m/Contig/i ) {
kkonganti@1 241 $count =
kkonganti@1 242 print_dl_metadata( $asm_lvl,
kkonganti@1 243 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
kkonganti@1 244 $count, $snp_cluster_id );
kkonganti@1 245 }
kkonganti@1 246 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
kkonganti@1 247 if ( $count > 0 );
kkonganti@1 248 last if ( $count == $serovar_limit );
kkonganti@1 249 }
kkonganti@1 250 }
kkonganti@1 251
kkonganti@1 252 close $pdg_file;
kkonganti@1 253 close $tbl_file;
kkonganti@1 254 close $snp_cluster_file;
kkonganti@1 255 close $acc_fh;
kkonganti@1 256 close $accs_snp_fh;
kkonganti@1 257
kkonganti@1 258 #-------------------------------------------
kkonganti@1 259 # Main ends
kkonganti@1 260 #-------------------------------------------
kkonganti@1 261 # Routines begin
kkonganti@1 262 #-------------------------------------------
kkonganti@1 263
kkonganti@1 264 sub print_dl_metadata {
kkonganti@1 265 my $asm_lvl = shift;
kkonganti@1 266 my $acc_sizes = shift;
kkonganti@1 267 my $curr_count = shift;
kkonganti@1 268 my $snp_cluster_id = shift;
kkonganti@1 269
kkonganti@1 270 $asm_lvl =~ s/.+?\_(.+)/$1/;
kkonganti@1 271
kkonganti@1 272 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
kkonganti@1 273 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
kkonganti@1 274 my ( $serovar, $url ) = split( /\|/, $serovar_url );
kkonganti@1 275 return $curr_count if ( exists $multi_cluster_acc->{$url} );
kkonganti@1 276 $multi_cluster_acc->{$url} = 1;
kkonganti@1 277 $curr_count++;
kkonganti@1 278 my ( $final_acc, $genome_header ) =
kkonganti@1 279 ( split( /\//, $url ) )[ 3 .. 4 ];
kkonganti@1 280 print $accs_snp_fh "$final_acc\n";
kkonganti@1 281 print $genome_headers_fh catfile( 'scaffold_genomes',
kkonganti@1 282 $genome_header )
kkonganti@1 283 . "\n";
kkonganti@1 284 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
kkonganti@1 285 if ( $curr_count > 0 );
kkonganti@1 286 last if ( $curr_count == $serovar_limit );
kkonganti@1 287 }
kkonganti@1 288 last if ( $curr_count == $serovar_limit );
kkonganti@1 289 }
kkonganti@1 290 return $curr_count;
kkonganti@1 291 }
kkonganti@1 292
kkonganti@1 293 sub sort_asm_level {
kkonganti@1 294 my $level = shift;
kkonganti@1 295
kkonganti@1 296 $level =~ s/(Complete\s+Genome)/a\_$1/
kkonganti@1 297 if ( $level =~ m/Complete\s+Genome/i );
kkonganti@1 298 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
kkonganti@1 299 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
kkonganti@1 300 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
kkonganti@1 301
kkonganti@1 302 return $level;
kkonganti@1 303 }
kkonganti@1 304
kkonganti@1 305 sub not_empty {
kkonganti@1 306 my $col = shift;
kkonganti@1 307
kkonganti@1 308 if ( $col !~ m/^$/ ) {
kkonganti@1 309 return 1;
kkonganti@1 310 }
kkonganti@1 311 else {
kkonganti@1 312 return 0;
kkonganti@1 313 }
kkonganti@1 314 }
kkonganti@1 315
kkonganti@1 316 __END__
kkonganti@1 317
kkonganti@1 318 =head1 SYNOPSIS
kkonganti@1 319
kkonganti@1 320 This script will take in a PDG metadata file, a C<.tbl> file and generate
kkonganti@1 321 the final list by B<I<waterfall>> priority.
kkonganti@1 322
kkonganti@1 323 See complete description:
kkonganti@1 324
kkonganti@1 325 perldoc waterfall_per_snp_cluster.pl
kkonganti@1 326
kkonganti@1 327 or
kkonganti@1 328
kkonganti@1 329 waterfall_per_snp_cluster.pl --help
kkonganti@1 330
kkonganti@1 331 Examples:
kkonganti@1 332
kkonganti@1 333 waterfall_per_snp_cluster.pl
kkonganti@1 334
kkonganti@1 335 =head1 DESCRIPTION
kkonganti@1 336
kkonganti@1 337 We will retain up to N number of genome accessions per SNP cluster.
kkonganti@1 338 It prioritizes SNP Cluster participation over serotype coverage.
kkonganti@1 339 Which N genomes are selected depends on (in order):
kkonganti@1 340
kkonganti@1 341 1. Genome assembly level, whose priority is
kkonganti@1 342
kkonganti@1 343 a: Complete Genome
kkonganti@1 344 b: Chromosome
kkonganti@1 345 c: Scaffold
kkonganti@1 346 d: Contig
kkonganti@1 347
kkonganti@1 348 2. If the genomes are of same assembly level, then
kkonganti@1 349 scaffold N50 followed by contig N50 is chosen.
kkonganti@1 350
kkonganti@1 351 3. If the scaffold or contig N50 is same, then all
kkonganti@1 352 of them are included
kkonganti@1 353
kkonganti@1 354 =head1 OPTIONS
kkonganti@1 355
kkonganti@1 356 =over 3
kkonganti@1 357
kkonganti@1 358 =item -p PDGXXXXX.XXXX.metadata.tsv
kkonganti@1 359
kkonganti@1 360 Absolute UNIX path pointing to the PDG metadata file.
kkonganti@1 361 Example: PDG000000002.2505.metadata.tsv
kkonganti@1 362
kkonganti@1 363 =item -t asm.tbl
kkonganti@1 364
kkonganti@1 365 Absolute UNIX path pointing to the file from the result
kkonganti@1 366 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
kkonganti@1 367 file.
kkonganti@1 368
kkonganti@1 369 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
kkonganti@1 370
kkonganti@1 371 Absolute UNIX path pointing to the SNP Cluster metadata file.
kkonganti@1 372 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
kkonganti@1 373
kkonganti@1 374
kkonganti@1 375 =item --serocol <int> (Optional)
kkonganti@1 376
kkonganti@1 377 Column number (non 0-based index) of the PDG metadata file
kkonganti@1 378 by which the serotypes are collected. Default: 49
kkonganti@1 379
kkonganti@1 380 =item --complete_serotype_name (Optional)
kkonganti@1 381
kkonganti@1 382 Skip indexing serotypes when the serotype name in the column
kkonganti@1 383 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
kkonganti@1 384 an accession has a I<B<serotype=>> string as such in column
kkonganti@1 385 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
kkonganti@1 386 then, the indexing of that accession is skipped.
kkonganti@1 387 Default: False
kkonganti@1 388
kkonganti@1 389 =item --not_null_pdg_serovar (Optional)
kkonganti@1 390
kkonganti@1 391 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
kkonganti@1 392 if the B<I<serovar>> column is not C<NULL>.
kkonganti@1 393
kkonganti@1 394 =item -i <serotype name> (Optional)
kkonganti@1 395
kkonganti@1 396 Make sure the following serotype is included. Mention C<-i> multiple
kkonganti@1 397 times to include multiple serotypes.
kkonganti@1 398
kkonganti@1 399 =item -num <int> (Optional)
kkonganti@1 400
kkonganti@1 401 Number of genome accessions per SNP Cluster. Default: 1
kkonganti@1 402
kkonganti@1 403 =back
kkonganti@1 404
kkonganti@1 405 =head1 AUTHOR
kkonganti@1 406
kkonganti@1 407 Kranti Konganti
kkonganti@1 408
kkonganti@1 409 =cut