view 0.7.0/bin/waterfall_per_snp_cluster.pl @ 21:4ce0e079377d tip

planemo upload
author kkonganti
date Mon, 15 Jul 2024 12:01:00 -0400
parents 0e7a0053e4a6
children
line wrap: on
line source
#!/usr/bin/env perl

# Kranti Konganti
# 01/02/2024

use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;
use Pod::Usage;
use File::Basename;
use File::Spec::Functions;

my $tbl               = {};
my $snp_2_serovar     = {};
my $acc_2_serovar     = {};
my $acc_2_target      = {};
my $snp_count         = {};
my $snp_2_acc         = {};
my $acc_2_snp         = {};
my $multi_cluster_acc = {};
my (
    $serovar_limit,          $serovar_or_type_col, $min_asm_size,
    $complete_serotype_name, $PDG_file,            $table_file,
    $not_null_pdg_serovar,   $snp_cluster,         $help,
    $out_prefix,             $acc_col,             $seronamecol,
    $target_acc_col
);
my @custom_serovars;

GetOptions(
    'help'                         => \$help,
    'pdg=s'                        => \$PDG_file,
    'tbl=s'                        => \$table_file,
    'snp=s'                        => \$snp_cluster,
    'min_contig_size=i'            => \$min_asm_size,
    'complete_serotype_name'       => \$complete_serotype_name,
    'serocol:i'                    => \$serovar_or_type_col,
    'seronamecol:i'                => \$seronamecol,
    'target_acc_col:i'             => \$target_acc_col,
    'acc_col:i'                    => \$acc_col,
    'not_null_pdg_serovar'         => \$not_null_pdg_serovar,
    'num_serotypes_per_serotype:i' => \$serovar_limit,
    'include_serovar=s'            => \@custom_serovars,
    'op=s'                         => \$out_prefix
) or pod2usage( -verbose => 2 );

if ( defined $help ) {
    pod2usage( -verbose => 2 );
}

if ( !defined $serovar_limit ) {
    $serovar_limit = 1;
}

if ( !defined $serovar_or_type_col ) {
    $serovar_or_type_col = 50;
}

if ( !defined $seronamecol ) {
    $seronamecol = 34;
}

if ( !defined $target_acc_col ) {
    $target_acc_col = 43;
}

if ( !defined $acc_col ) {
    $acc_col = 10;
}

if ( !defined $min_asm_size ) {
    $min_asm_size = 0;
}

if ( defined $out_prefix ) {
    $out_prefix .= '_';
}
else {
    $out_prefix = '';
}

pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );

open( my $pdg_file, '<', $PDG_file )
  || die "\nCannot open PDG file $PDG_file: $!\n\n";
open( my $tbl_file, '<', $table_file )
  || die "\nCannot open tbl file $table_file: $!\n\n";
open( my $snp_cluster_file, '<', $snp_cluster )
  || die "\nCannot open $snp_cluster: $!\n\n";
open( my $acc_fh, '>', 'acc2serovar.txt' )
  || die "\nCannot open acc2serovar.txt: $!\n\n";
open( my $Stdout,      '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
open( my $Stderr,      '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
open( my $accs_snp_fh, '>',  $out_prefix . 'accs_snp.txt' )
  || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
  || die "\nCannnot open "
  . $out_prefix
  . "mash_snp_genome_list.txt for writing: $!\n\n";

my $pdg_release = basename( $PDG_file, ".metadata.tsv" );

while ( my $line = <$pdg_file> ) {
    chomp $line;
    next if ( $line =~ m/^\#/ );

    # Relevent columns (Perl index):
    #   10-1 = 9: asm_acc
    # 34 -1 = 33: serovar
    # 50 -1 = 49: computed serotype

    my @cols            = split( /\t/, $line );
    my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
    my $acc             = $cols[ $acc_col - 1 ];
    my $serovar         = $cols[ $seronamecol - 1 ];
    my $target_acc      = $cols[ $target_acc_col - 1 ];

    $serovar_or_type =~ s/\"//g;

    my $skip = 1;
    foreach my $ser (@custom_serovars) {
        $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
    }

    if ( defined $complete_serotype_name ) {
        next
          if ( $skip
            && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
          );
    }

    next
      if (
        $skip
        && (   $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
            || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
      );

    # next
    #   if (
    #     (
    #            $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
    #         || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
    #     )
    #   );

    if ( defined $not_null_pdg_serovar ) {
        $acc_2_serovar->{$acc} = $serovar_or_type,
          $acc_2_target->{$acc} = $target_acc,
          print $acc_fh "$acc\t$serovar_or_type\n"
          if ( $acc !~ m/NULL/
            && $serovar         !~ m/NULL/
            && $serovar_or_type !~ m/NULL/ );
    }
    else {
        $acc_2_serovar->{$acc} = $serovar_or_type,
          $acc_2_target->{$acc} = $target_acc,
          print $acc_fh "$acc\t$serovar_or_type\n"
          if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
    }

    # $snp_count->{$serovar_or_type} = 0;
}

#
# SNP to ACC
#

while ( my $line = <$snp_cluster_file> ) {
    chomp $line;
    my @cols = split( /\t/, $line );

    # Relevant columns
    # 0: SNP Cluster ID
    # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
    my $snp_clus_id = $cols[0];
    my $acc         = $cols[3];

    next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
    next if ( !exists $acc_2_serovar->{$acc} );
    push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
    if ( exists $acc_2_snp->{$acc} ) {
        print $Stderr
          "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
        exit 1;
    }
    $acc_2_snp->{$acc}         = $snp_clus_id;
    $snp_count->{$snp_clus_id} = 0;
}

while ( my $line = <$tbl_file> ) {
    chomp $line;

    my @cols = split( /\t/, $line );

    # .tbl file columns (Perl index):
    #
    # 0: Accession
    # 1: AssemblyLevel
    # 2: ScaffoldN50
    # 3: ContigN50

    my $acc          = $cols[0];
    my $asm_lvl      = $cols[1];
    my $scaffold_n50 = $cols[2];
    my $contig_n50   = $cols[3];

    # my $idx0 = $acc_2_serovar->{$cols[0]};
    my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );

    if ( not_empty($acc) && defined $idx0 ) {
        my $fna_rel_loc =
            "$pdg_release/ncbi_dataset/data/$acc/"
          . $acc
          . '_scaffolded_genomic.fna.gz';

        if ( not_empty($scaffold_n50) ) {
            next if ( $scaffold_n50 <= $min_asm_size );
            push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
                  ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
        }
        elsif ( not_empty($contig_n50) ) {
            next if ( $contig_n50 <= $min_asm_size );
            push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
                  ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
        }
    }
}

foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
    my $count = $snp_count->{$snp_cluster_id};
    foreach my $asm_lvl (
        sort { $a cmp $b }
        keys %{ $snp_2_serovar->{$snp_cluster_id} }
      )
    {
        if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
            $count =
              print_dl_metadata( $asm_lvl,
                \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
                $count, $snp_cluster_id );
        }
        if ( $asm_lvl =~ m/Chromosome/i ) {
            $count =
              print_dl_metadata( $asm_lvl,
                \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
                $count, $snp_cluster_id );
        }
        if ( $asm_lvl =~ m/Scaffold/i ) {
            $count =
              print_dl_metadata( $asm_lvl,
                \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
                $count, $snp_cluster_id );
        }
        if ( $asm_lvl =~ m/Contig/i ) {
            $count =
              print_dl_metadata( $asm_lvl,
                \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
                $count, $snp_cluster_id );
        }
        printf $Stderr "%-17s  |  %s\n", $snp_cluster_id, $count
          if ( $count > 0 );
        last if ( $count >= $serovar_limit );
    }
}

close $pdg_file;
close $tbl_file;
close $snp_cluster_file;
close $acc_fh;
close $accs_snp_fh;

#-------------------------------------------
# Main ends
#-------------------------------------------
# Routines begin
#-------------------------------------------

sub print_dl_metadata {
    my $asm_lvl        = shift;
    my $acc_sizes      = shift;
    my $curr_count     = shift;
    my $snp_cluster_id = shift;

    $asm_lvl =~ s/.+?\_(.+)/$1/;

    foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
        foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
            my ( $serovar, $url ) = split( /\|/, $serovar_url );
            return $curr_count if ( exists $multi_cluster_acc->{$url} );
            $multi_cluster_acc->{$url} = 1;
            $curr_count++;
            my ( $final_acc, $genome_header ) =
              ( split( /\//, $url ) )[ 3 .. 4 ];
            print $accs_snp_fh "$final_acc\n";
            print $genome_headers_fh catfile( 'scaffold_genomes',
                $genome_header )
              . "\n";
            print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
              if ( $curr_count > 0 );
        }
        last if ( $curr_count >= $serovar_limit );
    }
    return $curr_count;
}

sub sort_asm_level {
    my $level = shift;

    $level =~ s/(Complete\s+Genome)/a\_$1/
      if ( $level =~ m/Complete\s+Genome/i );
    $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
    $level =~ s/(Scaffold)/c\_$1/   if ( $level =~ m/Scaffold/i );
    $level =~ s/(Contig)/d\_$1/     if ( $level =~ m/Contig/i );

    return $level;
}

sub not_empty {
    my $col = shift;

    if ( $col !~ m/^$/ ) {
        return 1;
    }
    else {
        return 0;
    }
}

__END__

=head1 SYNOPSIS

This script will take in a PDG metadata file, a C<.tbl> file and generate
the final list by B<I<waterfall>> priority.

See complete description:

  perldoc waterfall_per_snp_cluster.pl

    or

  waterfall_per_snp_cluster.pl --help

Examples:

  waterfall_per_snp_cluster.pl

=head1 DESCRIPTION

We will retain up to N number of genome accessions per SNP cluster.
It prioritizes SNP Cluster participation over serotype coverage.
Which N genomes are selected depends on (in order):

1. Genome assembly level, whose priority is

    a: Complete Genome
    b: Chromosome
    c: Scaffold
    d: Contig

2. If the genomes are of same assembly level, then
    scaffold N50 followed by contig N50 is chosen.

3. If the scaffold or contig N50 is same, then all
    of them are included

=head1 OPTIONS

=over 3

=item -p PDGXXXXX.XXXX.metadata.tsv

Absolute UNIX path pointing to the PDG metadata file.
Example: PDG000000002.2505.metadata.tsv

=item -t asm.tbl

Absolute UNIX path pointing to the file from the result
of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
file.

=item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv

Absolute UNIX path pointing to the SNP Cluster metadata file.
Examples: PDG000000002.2505.reference_target.cluster_list.tsv

=item --serocol <int> (Optional)

Column number (non 0-based index) of the PDG metadata file
by which the serotypes are collected
(column name: "computed_types"). Default: 50

=item --seronamecol <int> (Optional)

Column number (non 0-based index) of the PDG metadata file
whose column name is "serovar". Default: 34

=item --acc_col <int> (Optional)

Column number (non 0-based index) of the PDG metadata file
whose column name is "acc". Default: 10

=item --target_acc_col <int> (Optional)

Column number (non 0-based index) of the PDG metadata file
whose column name is "target_acc". Default: 43

=item --complete_serotype_name (Optional)

Skip indexing serotypes when the serotype name in the column
number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
an accession has a I<B<serotype=>> string as such in column
number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
then, the indexing of that accession is skipped.
Default: False

=item --not_null_pdg_serovar (Optional)

Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
if the B<I<serovar>> column is not C<NULL>.

=item -i <serotype name> (Optional)

Make sure the following serotype is included. Mention C<-i> multiple
times to include multiple serotypes.

=item -num <int> (Optional)

Number of genome accessions per SNP Cluster. Default: 1

=back

=head1 AUTHOR

Kranti Konganti

=cut