Mercurial > repos > kkonganti > cfsan_bettercallsal
view 0.7.0/bin/fasta_join.pl @ 21:4ce0e079377d tip
planemo upload
author | kkonganti |
---|---|
date | Mon, 15 Jul 2024 12:01:00 -0400 |
parents | 0e7a0053e4a6 |
children |
line wrap: on
line source
#!/usr/bin/env perl # Kranti Konganti # Takes in a gzipped multi-fasta file # and joins contigs by 10 N's use strict; use warnings; use Cwd; use Bio::SeqIO; use Getopt::Long; use File::Find; use File::Basename; use File::Spec::Functions; my ( $in_dir, $out_dir, $suffix, @uncatted_genomes ); GetOptions( 'in_dir=s' => \$in_dir, 'out_dir=s' => \$out_dir, 'suffix=s' => \$suffix ) or die usage(); $in_dir = getcwd if ( !defined $in_dir ); $out_dir = getcwd if ( !defined $out_dir ); $suffix = '_genomic.fna.gz' if ( !defined $suffix ); find( { wanted => sub { push @uncatted_genomes, $File::Find::name if ( $_ =~ m/$suffix$/ ); } }, $in_dir ); if ( $out_dir ne getcwd && !-d $out_dir ) { mkdir $out_dir || die "\nCannot create directory $out_dir: $!\n\n"; } open( my $geno_path, '>genome_paths.txt' ) || die "\nCannot open file genome_paths.txt: $!\n\n"; foreach my $uncatted_genome_path (@uncatted_genomes) { my $catted_genome_header = '>' . basename( $uncatted_genome_path, $suffix ); $catted_genome_header =~ s/(GC[AF]\_\d+\.\d+)\_*.*/$1/; my $catted_genome = catfile( $out_dir, $catted_genome_header . '_scaffolded' . $suffix ); $catted_genome =~ s/\/\>(GC[AF])/\/$1/; print $geno_path "$catted_genome\n"; open( my $fh, "gunzip -c $uncatted_genome_path |" ) || die "\nCannot create pipe for $uncatted_genome_path: $!\n\n"; open( my $fho, '|-', "gzip -c > $catted_genome" ) || die "\nCannot pipe to gzip: $!\n\n"; my $seq_obj = Bio::SeqIO->new( -fh => $fh, -format => 'Fasta' ); my $joined_seq = ''; while ( my $seq = $seq_obj->next_seq ) { $joined_seq = $joined_seq . 'NNNNNNNNNN' . $seq->seq; } $joined_seq =~ s/NNNNNNNNNN$//; $joined_seq =~ s/^NNNNNNNNNN//; # $joined_seq =~ s/.{80}\K/\n/g; # $joined_seq =~ s/\n$//; print $fho $catted_genome_header, "\n", $joined_seq, "\n"; $seq_obj->close(); close $fh; close $fho; } sub usage { print "\nUsage: $0 [-in IN_DIR] [-ou OUT_DIR] [-su Filename Suffix for Header]\n\n"; exit; }