Mercurial > repos > kkonganti > cfsan_bettercallsal
diff 0.5.0/bin/waterfall_per_snp_cluster.pl @ 1:365849f031fd
"planemo upload"
author | kkonganti |
---|---|
date | Mon, 05 Jun 2023 18:48:51 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/0.5.0/bin/waterfall_per_snp_cluster.pl Mon Jun 05 18:48:51 2023 -0400 @@ -0,0 +1,409 @@ +#!/usr/bin/env perl + +# Kranti Konganti +# 10/12/2022 + +use strict; +use warnings; +use Getopt::Long; +use Data::Dumper; +use Pod::Usage; +use File::Basename; +use File::Spec::Functions; + +my $tbl = {}; +my $snp_2_serovar = {}; +my $acc_2_serovar = {}; +my $acc_2_target = {}; +my $snp_count = {}; +my $snp_2_acc = {}; +my $acc_2_snp = {}; +my $multi_cluster_acc = {}; +my ( + $serovar_limit, $serovar_or_type_col, $min_asm_size, + $complete_serotype_name, $PDG_file, $table_file, + $not_null_pdg_serovar, $snp_cluster, $help, + $out_prefix +); +my @custom_serovars; + +GetOptions( + 'help' => \$help, + 'pdg=s' => \$PDG_file, + 'tbl=s' => \$table_file, + 'snp=s' => \$snp_cluster, + 'min_contig_size=i' => \$min_asm_size, + 'complete_serotype_name' => \$complete_serotype_name, + 'serocol:i' => \$serovar_or_type_col, + 'not_null_pdg_serovar' => \$not_null_pdg_serovar, + 'num_serotypes_per_serotype:i' => \$serovar_limit, + 'include_serovar=s' => \@custom_serovars, + 'op=s' => \$out_prefix +) or pod2usage( -verbose => 2 ); + +if ( defined $help ) { + pod2usage( -verbose => 2 ); +} + +if ( !defined $serovar_limit ) { + $serovar_limit = 1; +} + +if ( !defined $serovar_or_type_col ) { + $serovar_or_type_col = 49; +} + +if ( !defined $min_asm_size ) { + $min_asm_size = 0; +} + +if ( defined $out_prefix ) { + $out_prefix .= '_'; +} +else { + $out_prefix = ''; +} + +pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster ); + +open( my $pdg_file, '<', $PDG_file ) + || die "\nCannot open PDG file $PDG_file: $!\n\n"; +open( my $tbl_file, '<', $table_file ) + || die "\nCannot open tbl file $table_file: $!\n\n"; +open( my $snp_cluster_file, '<', $snp_cluster ) + || die "\nCannot open $snp_cluster: $!\n\n"; +open( my $acc_fh, '>', 'acc2serovar.txt' ) + || die "\nCannot open acc2serovar.txt: $!\n\n"; +open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n"; +open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n"; +open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' ) + || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n"; +open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' ) + || die "\nCannnot open " + . $out_prefix + . "mash_snp_genome_list.txt for writing: $!\n\n"; + +my $pdg_release = basename( $PDG_file, ".metadata.tsv" ); + +while ( my $line = <$pdg_file> ) { + chomp $line; + next if ( $line =~ m/^\#/ ); + + # Relevent columns (Perl index): + # 9: asm_acc + # 33: serovar + # 48: computed serotype + + my @cols = split( /\t/, $line ); + my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ]; + my $acc = $cols[9]; + my $serovar = $cols[33]; + my $target_acc = $cols[41]; + + $serovar_or_type =~ s/\"//g; + + my $skip = 1; + foreach my $ser (@custom_serovars) { + $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ ); + } + + if ( defined $complete_serotype_name ) { + next + if ( $skip + && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ ) + ); + } + + next + if ( + $skip + && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ + || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ ) + ); + + # next + # if ( + # ( + # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/ + # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ + # ) + # ); + + if ( defined $not_null_pdg_serovar ) { + $acc_2_serovar->{$acc} = $serovar_or_type, + $acc_2_target->{$acc} = $target_acc, + print $acc_fh "$acc\t$serovar_or_type\n" + if ( $acc !~ m/NULL/ + && $serovar !~ m/NULL/ + && $serovar_or_type !~ m/NULL/ ); + } + else { + $acc_2_serovar->{$acc} = $serovar_or_type, + $acc_2_target->{$acc} = $target_acc, + print $acc_fh "$acc\t$serovar_or_type\n" + if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ ); + } + + # $snp_count->{$serovar_or_type} = 0; +} + +# +# SNP to ACC +# + +while ( my $line = <$snp_cluster_file> ) { + chomp $line; + my @cols = split( /\t/, $line ); + + # Relevant columns + # 0: SNP Cluster ID + # 3: Genome Accession belonging to the cluster (RefSeq or GenBank) + my $snp_clus_id = $cols[0]; + my $acc = $cols[3]; + + next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ ); + next if ( !exists $acc_2_serovar->{$acc} ); + push @{ $snp_2_acc->{$snp_clus_id} }, $acc; + if ( exists $acc_2_snp->{$acc} ) { + print $Stderr + "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n"; + exit 1; + } + $acc_2_snp->{$acc} = $snp_clus_id; + $snp_count->{$snp_clus_id} = 0; +} + +while ( my $line = <$tbl_file> ) { + chomp $line; + + my @cols = split( /\t/, $line ); + + # .tbl file columns (Perl index): + # + # 0: Accession + # 1: AssemblyLevel + # 2: ScaffoldN50 + # 3: ContigN50 + + my $acc = $cols[0]; + my $asm_lvl = $cols[1]; + my $scaffold_n50 = $cols[2]; + my $contig_n50 = $cols[3]; + + # my $idx0 = $acc_2_serovar->{$cols[0]}; + my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } ); + + if ( not_empty($acc) && defined $idx0 ) { + my $fna_rel_loc = + "$pdg_release/ncbi_dataset/data/$acc/" + . $acc + . '_scaffolded_genomic.fna.gz'; + + if ( not_empty($scaffold_n50) ) { + next if ( $scaffold_n50 <= $min_asm_size ); + push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } + ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; + } + elsif ( not_empty($contig_n50) ) { + next if ( $contig_n50 <= $min_asm_size ); + push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) } + ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc"; + } + } +} + +foreach my $snp_cluster_id ( keys %$snp_2_acc ) { + my $count = $snp_count->{$snp_cluster_id}; + foreach my $asm_lvl ( + sort { $a cmp $b } + keys %{ $snp_2_serovar->{$snp_cluster_id} } + ) + { + if ( $asm_lvl =~ m/Complete\s+Genome/i ) { + $count = + print_dl_metadata( $asm_lvl, + \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, + $count, $snp_cluster_id ); + } + if ( $asm_lvl =~ m/Chromosome/i ) { + $count = + print_dl_metadata( $asm_lvl, + \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, + $count, $snp_cluster_id ); + } + if ( $asm_lvl =~ m/Scaffold/i ) { + $count = + print_dl_metadata( $asm_lvl, + \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, + $count, $snp_cluster_id ); + } + if ( $asm_lvl =~ m/Contig/i ) { + $count = + print_dl_metadata( $asm_lvl, + \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl}, + $count, $snp_cluster_id ); + } + printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count + if ( $count > 0 ); + last if ( $count == $serovar_limit ); + } +} + +close $pdg_file; +close $tbl_file; +close $snp_cluster_file; +close $acc_fh; +close $accs_snp_fh; + +#------------------------------------------- +# Main ends +#------------------------------------------- +# Routines begin +#------------------------------------------- + +sub print_dl_metadata { + my $asm_lvl = shift; + my $acc_sizes = shift; + my $curr_count = shift; + my $snp_cluster_id = shift; + + $asm_lvl =~ s/.+?\_(.+)/$1/; + + foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) { + foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) { + my ( $serovar, $url ) = split( /\|/, $serovar_url ); + return $curr_count if ( exists $multi_cluster_acc->{$url} ); + $multi_cluster_acc->{$url} = 1; + $curr_count++; + my ( $final_acc, $genome_header ) = + ( split( /\//, $url ) )[ 3 .. 4 ]; + print $accs_snp_fh "$final_acc\n"; + print $genome_headers_fh catfile( 'scaffold_genomes', + $genome_header ) + . "\n"; + print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n" + if ( $curr_count > 0 ); + last if ( $curr_count == $serovar_limit ); + } + last if ( $curr_count == $serovar_limit ); + } + return $curr_count; +} + +sub sort_asm_level { + my $level = shift; + + $level =~ s/(Complete\s+Genome)/a\_$1/ + if ( $level =~ m/Complete\s+Genome/i ); + $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i ); + $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i ); + $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i ); + + return $level; +} + +sub not_empty { + my $col = shift; + + if ( $col !~ m/^$/ ) { + return 1; + } + else { + return 0; + } +} + +__END__ + +=head1 SYNOPSIS + +This script will take in a PDG metadata file, a C<.tbl> file and generate +the final list by B<I<waterfall>> priority. + +See complete description: + + perldoc waterfall_per_snp_cluster.pl + + or + + waterfall_per_snp_cluster.pl --help + +Examples: + + waterfall_per_snp_cluster.pl + +=head1 DESCRIPTION + +We will retain up to N number of genome accessions per SNP cluster. +It prioritizes SNP Cluster participation over serotype coverage. +Which N genomes are selected depends on (in order): + +1. Genome assembly level, whose priority is + + a: Complete Genome + b: Chromosome + c: Scaffold + d: Contig + +2. If the genomes are of same assembly level, then + scaffold N50 followed by contig N50 is chosen. + +3. If the scaffold or contig N50 is same, then all + of them are included + +=head1 OPTIONS + +=over 3 + +=item -p PDGXXXXX.XXXX.metadata.tsv + +Absolute UNIX path pointing to the PDG metadata file. +Example: PDG000000002.2505.metadata.tsv + +=item -t asm.tbl + +Absolute UNIX path pointing to the file from the result +of the C<dl_pdg_data.py> script, which is the C<asm.tbl> +file. + +=item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv + +Absolute UNIX path pointing to the SNP Cluster metadata file. +Examples: PDG000000002.2505.reference_target.cluster_list.tsv + + +=item --serocol <int> (Optional) + +Column number (non 0-based index) of the PDG metadata file +by which the serotypes are collected. Default: 49 + +=item --complete_serotype_name (Optional) + +Skip indexing serotypes when the serotype name in the column +number 49 (non 0-based) of PDG metadata file consists a "-". For example, if +an accession has a I<B<serotype=>> string as such in column +number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-"> +then, the indexing of that accession is skipped. +Default: False + +=item --not_null_pdg_serovar (Optional) + +Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based) +if the B<I<serovar>> column is not C<NULL>. + +=item -i <serotype name> (Optional) + +Make sure the following serotype is included. Mention C<-i> multiple +times to include multiple serotypes. + +=item -num <int> (Optional) + +Number of genome accessions per SNP Cluster. Default: 1 + +=back + +=head1 AUTHOR + +Kranti Konganti + +=cut