kkonganti@17: #!/usr/bin/env perl kkonganti@17: kkonganti@17: # Kranti Konganti kkonganti@17: # 01/02/2024 kkonganti@17: kkonganti@17: use strict; kkonganti@17: use warnings; kkonganti@17: use Getopt::Long; kkonganti@17: use Data::Dumper; kkonganti@17: use Pod::Usage; kkonganti@17: use File::Basename; kkonganti@17: use File::Spec::Functions; kkonganti@17: kkonganti@17: my $tbl = {}; kkonganti@17: my $snp_2_serovar = {}; kkonganti@17: my $acc_2_serovar = {}; kkonganti@17: my $acc_2_target = {}; kkonganti@17: my $snp_count = {}; kkonganti@17: my $snp_2_acc = {}; kkonganti@17: my $acc_2_snp = {}; kkonganti@17: my $multi_cluster_acc = {}; kkonganti@17: my ( kkonganti@17: $serovar_limit, $serovar_or_type_col, $min_asm_size, kkonganti@17: $complete_serotype_name, $PDG_file, $table_file, kkonganti@17: $not_null_pdg_serovar, $snp_cluster, $help, kkonganti@17: $out_prefix, $acc_col, $seronamecol, kkonganti@17: $target_acc_col kkonganti@17: ); kkonganti@17: my @custom_serovars; kkonganti@17: kkonganti@17: GetOptions( kkonganti@17: 'help' => \$help, kkonganti@17: 'pdg=s' => \$PDG_file, kkonganti@17: 'tbl=s' => \$table_file, kkonganti@17: 'snp=s' => \$snp_cluster, kkonganti@17: 'min_contig_size=i' => \$min_asm_size, kkonganti@17: 'complete_serotype_name' => \$complete_serotype_name, kkonganti@17: 'serocol:i' => \$serovar_or_type_col, kkonganti@17: 'seronamecol:i' => \$seronamecol, kkonganti@17: 'target_acc_col:i' => \$target_acc_col, kkonganti@17: 'acc_col:i' => \$acc_col, kkonganti@17: 'not_null_pdg_serovar' => \$not_null_pdg_serovar, kkonganti@17: 'num_serotypes_per_serotype:i' => \$serovar_limit, kkonganti@17: 'include_serovar=s' => \@custom_serovars, kkonganti@17: 'op=s' => \$out_prefix kkonganti@17: ) or pod2usage( -verbose => 2 ); kkonganti@17: kkonganti@17: if ( defined $help ) { kkonganti@17: pod2usage( -verbose => 2 ); kkonganti@17: } kkonganti@17: kkonganti@17: if ( !defined $serovar_limit ) { kkonganti@17: $serovar_limit = 1; kkonganti@17: } kkonganti@17: kkonganti@17: if ( !defined $serovar_or_type_col ) { kkonganti@17: $serovar_or_type_col = 50; kkonganti@17: } kkonganti@17: kkonganti@17: if ( !defined $seronamecol ) { kkonganti@17: $seronamecol = 34; kkonganti@17: } kkonganti@17: kkonganti@17: if ( !defined $target_acc_col ) { kkonganti@17: $target_acc_col = 43; kkonganti@17: } kkonganti@17: kkonganti@17: if ( !defined $acc_col ) { kkonganti@17: $acc_col = 10; kkonganti@17: } kkonganti@17: kkonganti@17: if ( !defined $min_asm_size ) { kkonganti@17: $min_asm_size = 0; kkonganti@17: } kkonganti@17: kkonganti@17: if ( defined $out_prefix ) { kkonganti@17: $out_prefix .= '_'; kkonganti@17: } kkonganti@17: else { kkonganti@17: $out_prefix = ''; kkonganti@17: } kkonganti@17: kkonganti@17: pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster ); kkonganti@17: kkonganti@17: open( my $pdg_file, '<', $PDG_file ) kkonganti@17: || die "\nCannot open PDG file $PDG_file: $!\n\n"; kkonganti@17: open( my $tbl_file, '<', $table_file ) kkonganti@17: || die "\nCannot open tbl file $table_file: $!\n\n"; kkonganti@17: open( my $snp_cluster_file, '<', $snp_cluster ) kkonganti@17: || die "\nCannot open $snp_cluster: $!\n\n"; kkonganti@17: open( my $acc_fh, '>', 'acc2serovar.txt' ) kkonganti@17: || die "\nCannot open acc2serovar.txt: $!\n\n"; kkonganti@17: open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n"; kkonganti@17: open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n"; kkonganti@17: open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' ) kkonganti@17: || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n"; kkonganti@17: open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' ) kkonganti@17: || die "\nCannnot open " kkonganti@17: . $out_prefix kkonganti@17: . "mash_snp_genome_list.txt for writing: $!\n\n"; kkonganti@17: kkonganti@17: my $pdg_release = basename( $PDG_file, ".metadata.tsv" ); kkonganti@17: kkonganti@17: while ( my $line = <$pdg_file> ) { kkonganti@17: chomp $line; kkonganti@17: next if ( $line =~ m/^\#/ ); kkonganti@17: kkonganti@17: # Relevent columns (Perl index): kkonganti@17: # 10-1 = 9: asm_acc kkonganti@17: # 34 -1 = 33: serovar kkonganti@17: # 50 -1 = 49: computed serotype kkonganti@17: kkonganti@17: my @cols = split( /\t/, $line ); kkonganti@17: my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; kkonganti@17: my $acc = $cols[ $acc_col - 1 ]; kkonganti@17: my $serovar = $cols[ $seronamecol - 1 ]; kkonganti@17: my $target_acc = $cols[ $target_acc_col - 1 ]; kkonganti@17: kkonganti@17: $serovar_or_type =~ s/\"//g; kkonganti@17: kkonganti@17: my $skip = 1; kkonganti@17: foreach my $ser (@custom_serovars) { kkonganti@17: $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ ); kkonganti@17: } kkonganti@17: kkonganti@17: if ( defined $complete_serotype_name ) { kkonganti@17: next kkonganti@17: if ( $skip kkonganti@17: && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ ) kkonganti@17: ); kkonganti@17: } kkonganti@17: kkonganti@17: next kkonganti@17: if ( kkonganti@17: $skip kkonganti@17: && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ kkonganti@17: || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ ) kkonganti@17: ); kkonganti@17: kkonganti@17: # next kkonganti@17: # if ( kkonganti@17: # ( kkonganti@17: # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ kkonganti@17: # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ kkonganti@17: # ) kkonganti@17: # ); kkonganti@17: kkonganti@17: if ( defined $not_null_pdg_serovar ) { kkonganti@17: $acc_2_serovar->{$acc} = $serovar_or_type, kkonganti@17: $acc_2_target->{$acc} = $target_acc, kkonganti@17: print $acc_fh "$acc\t$serovar_or_type\n" kkonganti@17: if ( $acc !~ m/NULL/ kkonganti@17: && $serovar !~ m/NULL/ kkonganti@17: && $serovar_or_type !~ m/NULL/ ); kkonganti@17: } kkonganti@17: else { kkonganti@17: $acc_2_serovar->{$acc} = $serovar_or_type, kkonganti@17: $acc_2_target->{$acc} = $target_acc, kkonganti@17: print $acc_fh "$acc\t$serovar_or_type\n" kkonganti@17: if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ ); kkonganti@17: } kkonganti@17: kkonganti@17: # $snp_count->{$serovar_or_type} = 0; kkonganti@17: } kkonganti@17: kkonganti@17: # kkonganti@17: # SNP to ACC kkonganti@17: # kkonganti@17: kkonganti@17: while ( my $line = <$snp_cluster_file> ) { kkonganti@17: chomp $line; kkonganti@17: my @cols = split( /\t/, $line ); kkonganti@17: kkonganti@17: # Relevant columns kkonganti@17: # 0: SNP Cluster ID kkonganti@17: # 3: Genome Accession belonging to the cluster (RefSeq or GenBank) kkonganti@17: my $snp_clus_id = $cols[0]; kkonganti@17: my $acc = $cols[3]; kkonganti@17: kkonganti@17: next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ ); kkonganti@17: next if ( !exists $acc_2_serovar->{$acc} ); kkonganti@17: push @{ $snp_2_acc->{$snp_clus_id} }, $acc; kkonganti@17: if ( exists $acc_2_snp->{$acc} ) { kkonganti@17: print $Stderr kkonganti@17: "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n"; kkonganti@17: exit 1; kkonganti@17: } kkonganti@17: $acc_2_snp->{$acc} = $snp_clus_id; kkonganti@17: $snp_count->{$snp_clus_id} = 0; kkonganti@17: } kkonganti@17: kkonganti@17: while ( my $line = <$tbl_file> ) { kkonganti@17: chomp $line; kkonganti@17: kkonganti@17: my @cols = split( /\t/, $line ); kkonganti@17: kkonganti@17: # .tbl file columns (Perl index): kkonganti@17: # kkonganti@17: # 0: Accession kkonganti@17: # 1: AssemblyLevel kkonganti@17: # 2: ScaffoldN50 kkonganti@17: # 3: ContigN50 kkonganti@17: kkonganti@17: my $acc = $cols[0]; kkonganti@17: my $asm_lvl = $cols[1]; kkonganti@17: my $scaffold_n50 = $cols[2]; kkonganti@17: my $contig_n50 = $cols[3]; kkonganti@17: kkonganti@17: # my $idx0 = $acc_2_serovar->{$cols[0]}; kkonganti@17: my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } ); kkonganti@17: kkonganti@17: if ( not_empty($acc) && defined $idx0 ) { kkonganti@17: my $fna_rel_loc = kkonganti@17: "$pdg_release/ncbi_dataset/data/$acc/" kkonganti@17: . $acc kkonganti@17: . '_scaffolded_genomic.fna.gz'; kkonganti@17: kkonganti@17: if ( not_empty($scaffold_n50) ) { kkonganti@17: next if ( $scaffold_n50 <= $min_asm_size ); kkonganti@17: push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } kkonganti@17: ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; kkonganti@17: } kkonganti@17: elsif ( not_empty($contig_n50) ) { kkonganti@17: next if ( $contig_n50 <= $min_asm_size ); kkonganti@17: push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } kkonganti@17: ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; kkonganti@17: } kkonganti@17: } kkonganti@17: } kkonganti@17: kkonganti@17: foreach my $snp_cluster_id ( keys %$snp_2_acc ) { kkonganti@17: my $count = $snp_count->{$snp_cluster_id}; kkonganti@17: foreach my $asm_lvl ( kkonganti@17: sort { $a cmp $b } kkonganti@17: keys %{ $snp_2_serovar->{$snp_cluster_id} } kkonganti@17: ) kkonganti@17: { kkonganti@17: if ( $asm_lvl =~ m/Complete\s+Genome/i ) { kkonganti@17: $count = kkonganti@17: print_dl_metadata( $asm_lvl, kkonganti@17: \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, kkonganti@17: $count, $snp_cluster_id ); kkonganti@17: } kkonganti@17: if ( $asm_lvl =~ m/Chromosome/i ) { kkonganti@17: $count = kkonganti@17: print_dl_metadata( $asm_lvl, kkonganti@17: \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, kkonganti@17: $count, $snp_cluster_id ); kkonganti@17: } kkonganti@17: if ( $asm_lvl =~ m/Scaffold/i ) { kkonganti@17: $count = kkonganti@17: print_dl_metadata( $asm_lvl, kkonganti@17: \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, kkonganti@17: $count, $snp_cluster_id ); kkonganti@17: } kkonganti@17: if ( $asm_lvl =~ m/Contig/i ) { kkonganti@17: $count = kkonganti@17: print_dl_metadata( $asm_lvl, kkonganti@17: \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, kkonganti@17: $count, $snp_cluster_id ); kkonganti@17: } kkonganti@17: printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count kkonganti@17: if ( $count > 0 ); kkonganti@17: last if ( $count >= $serovar_limit ); kkonganti@17: } kkonganti@17: } kkonganti@17: kkonganti@17: close $pdg_file; kkonganti@17: close $tbl_file; kkonganti@17: close $snp_cluster_file; kkonganti@17: close $acc_fh; kkonganti@17: close $accs_snp_fh; kkonganti@17: kkonganti@17: #------------------------------------------- kkonganti@17: # Main ends kkonganti@17: #------------------------------------------- kkonganti@17: # Routines begin kkonganti@17: #------------------------------------------- kkonganti@17: kkonganti@17: sub print_dl_metadata { kkonganti@17: my $asm_lvl = shift; kkonganti@17: my $acc_sizes = shift; kkonganti@17: my $curr_count = shift; kkonganti@17: my $snp_cluster_id = shift; kkonganti@17: kkonganti@17: $asm_lvl =~ s/.+?\_(.+)/$1/; kkonganti@17: kkonganti@17: foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) { kkonganti@17: foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) { kkonganti@17: my ( $serovar, $url ) = split( /\|/, $serovar_url ); kkonganti@17: return $curr_count if ( exists $multi_cluster_acc->{$url} ); kkonganti@17: $multi_cluster_acc->{$url} = 1; kkonganti@17: $curr_count++; kkonganti@17: my ( $final_acc, $genome_header ) = kkonganti@17: ( split( /\//, $url ) )[ 3 .. 4 ]; kkonganti@17: print $accs_snp_fh "$final_acc\n"; kkonganti@17: print $genome_headers_fh catfile( 'scaffold_genomes', kkonganti@17: $genome_header ) kkonganti@17: . "\n"; kkonganti@17: print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n" kkonganti@17: if ( $curr_count > 0 ); kkonganti@17: } kkonganti@17: last if ( $curr_count >= $serovar_limit ); kkonganti@17: } kkonganti@17: return $curr_count; kkonganti@17: } kkonganti@17: kkonganti@17: sub sort_asm_level { kkonganti@17: my $level = shift; kkonganti@17: kkonganti@17: $level =~ s/(Complete\s+Genome)/a\_$1/ kkonganti@17: if ( $level =~ m/Complete\s+Genome/i ); kkonganti@17: $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i ); kkonganti@17: $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i ); kkonganti@17: $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i ); kkonganti@17: kkonganti@17: return $level; kkonganti@17: } kkonganti@17: kkonganti@17: sub not_empty { kkonganti@17: my $col = shift; kkonganti@17: kkonganti@17: if ( $col !~ m/^$/ ) { kkonganti@17: return 1; kkonganti@17: } kkonganti@17: else { kkonganti@17: return 0; kkonganti@17: } kkonganti@17: } kkonganti@17: kkonganti@17: __END__ kkonganti@17: kkonganti@17: =head1 SYNOPSIS kkonganti@17: kkonganti@17: This script will take in a PDG metadata file, a C<.tbl> file and generate kkonganti@17: the final list by B> priority. kkonganti@17: kkonganti@17: See complete description: kkonganti@17: kkonganti@17: perldoc waterfall_per_snp_cluster.pl kkonganti@17: kkonganti@17: or kkonganti@17: kkonganti@17: waterfall_per_snp_cluster.pl --help kkonganti@17: kkonganti@17: Examples: kkonganti@17: kkonganti@17: waterfall_per_snp_cluster.pl kkonganti@17: kkonganti@17: =head1 DESCRIPTION kkonganti@17: kkonganti@17: We will retain up to N number of genome accessions per SNP cluster. kkonganti@17: It prioritizes SNP Cluster participation over serotype coverage. kkonganti@17: Which N genomes are selected depends on (in order): kkonganti@17: kkonganti@17: 1. Genome assembly level, whose priority is kkonganti@17: kkonganti@17: a: Complete Genome kkonganti@17: b: Chromosome kkonganti@17: c: Scaffold kkonganti@17: d: Contig kkonganti@17: kkonganti@17: 2. If the genomes are of same assembly level, then kkonganti@17: scaffold N50 followed by contig N50 is chosen. kkonganti@17: kkonganti@17: 3. If the scaffold or contig N50 is same, then all kkonganti@17: of them are included kkonganti@17: kkonganti@17: =head1 OPTIONS kkonganti@17: kkonganti@17: =over 3 kkonganti@17: kkonganti@17: =item -p PDGXXXXX.XXXX.metadata.tsv kkonganti@17: kkonganti@17: Absolute UNIX path pointing to the PDG metadata file. kkonganti@17: Example: PDG000000002.2505.metadata.tsv kkonganti@17: kkonganti@17: =item -t asm.tbl kkonganti@17: kkonganti@17: Absolute UNIX path pointing to the file from the result kkonganti@17: of the C script, which is the C kkonganti@17: file. kkonganti@17: kkonganti@17: =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv kkonganti@17: kkonganti@17: Absolute UNIX path pointing to the SNP Cluster metadata file. kkonganti@17: Examples: PDG000000002.2505.reference_target.cluster_list.tsv kkonganti@17: kkonganti@17: =item --serocol (Optional) kkonganti@17: kkonganti@17: Column number (non 0-based index) of the PDG metadata file kkonganti@17: by which the serotypes are collected kkonganti@17: (column name: "computed_types"). Default: 50 kkonganti@17: kkonganti@17: =item --seronamecol (Optional) kkonganti@17: kkonganti@17: Column number (non 0-based index) of the PDG metadata file kkonganti@17: whose column name is "serovar". Default: 34 kkonganti@17: kkonganti@17: =item --acc_col (Optional) kkonganti@17: kkonganti@17: Column number (non 0-based index) of the PDG metadata file kkonganti@17: whose column name is "acc". Default: 10 kkonganti@17: kkonganti@17: =item --target_acc_col (Optional) kkonganti@17: kkonganti@17: Column number (non 0-based index) of the PDG metadata file kkonganti@17: whose column name is "target_acc". Default: 43 kkonganti@17: kkonganti@17: =item --complete_serotype_name (Optional) kkonganti@17: kkonganti@17: Skip indexing serotypes when the serotype name in the column kkonganti@17: number 49 (non 0-based) of PDG metadata file consists a "-". For example, if kkonganti@17: an accession has a I> string as such in column kkonganti@17: number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-"> kkonganti@17: then, the indexing of that accession is skipped. kkonganti@17: Default: False kkonganti@17: kkonganti@17: =item --not_null_pdg_serovar (Optional) kkonganti@17: kkonganti@17: Only index the B> column i.e. column number 49 (non 0-based) kkonganti@17: if the B> column is not C. kkonganti@17: kkonganti@17: =item -i (Optional) kkonganti@17: kkonganti@17: Make sure the following serotype is included. Mention C<-i> multiple kkonganti@17: times to include multiple serotypes. kkonganti@17: kkonganti@17: =item -num (Optional) kkonganti@17: kkonganti@17: Number of genome accessions per SNP Cluster. Default: 1 kkonganti@17: kkonganti@17: =back kkonganti@17: kkonganti@17: =head1 AUTHOR kkonganti@17: kkonganti@17: Kranti Konganti kkonganti@17: kkonganti@17: =cut