annotate 0.7.0/bin/fasta_join.pl @ 21:4ce0e079377d tip

planemo upload
author kkonganti
date Mon, 15 Jul 2024 12:01:00 -0400
parents 0e7a0053e4a6
children
rev   line source
kkonganti@17 1 #!/usr/bin/env perl
kkonganti@17 2
kkonganti@17 3 # Kranti Konganti
kkonganti@17 4 # Takes in a gzipped multi-fasta file
kkonganti@17 5 # and joins contigs by 10 N's
kkonganti@17 6
kkonganti@17 7 use strict;
kkonganti@17 8 use warnings;
kkonganti@17 9 use Cwd;
kkonganti@17 10 use Bio::SeqIO;
kkonganti@17 11 use Getopt::Long;
kkonganti@17 12 use File::Find;
kkonganti@17 13 use File::Basename;
kkonganti@17 14 use File::Spec::Functions;
kkonganti@17 15
kkonganti@17 16 my ( $in_dir, $out_dir, $suffix, @uncatted_genomes );
kkonganti@17 17
kkonganti@17 18 GetOptions(
kkonganti@17 19 'in_dir=s' => \$in_dir,
kkonganti@17 20 'out_dir=s' => \$out_dir,
kkonganti@17 21 'suffix=s' => \$suffix
kkonganti@17 22 ) or die usage();
kkonganti@17 23
kkonganti@17 24 $in_dir = getcwd if ( !defined $in_dir );
kkonganti@17 25 $out_dir = getcwd if ( !defined $out_dir );
kkonganti@17 26 $suffix = '_genomic.fna.gz' if ( !defined $suffix );
kkonganti@17 27
kkonganti@17 28 find(
kkonganti@17 29 {
kkonganti@17 30 wanted => sub {
kkonganti@17 31 push @uncatted_genomes, $File::Find::name if ( $_ =~ m/$suffix$/ );
kkonganti@17 32 }
kkonganti@17 33 },
kkonganti@17 34 $in_dir
kkonganti@17 35 );
kkonganti@17 36
kkonganti@17 37 if ( $out_dir ne getcwd && !-d $out_dir ) {
kkonganti@17 38 mkdir $out_dir || die "\nCannot create directory $out_dir: $!\n\n";
kkonganti@17 39 }
kkonganti@17 40
kkonganti@17 41 open( my $geno_path, '>genome_paths.txt' )
kkonganti@17 42 || die "\nCannot open file genome_paths.txt: $!\n\n";
kkonganti@17 43
kkonganti@17 44 foreach my $uncatted_genome_path (@uncatted_genomes) {
kkonganti@17 45 my $catted_genome_header = '>' . basename( $uncatted_genome_path, $suffix );
kkonganti@17 46 $catted_genome_header =~ s/(GC[AF]\_\d+\.\d+)\_*.*/$1/;
kkonganti@17 47
kkonganti@17 48 my $catted_genome =
kkonganti@17 49 catfile( $out_dir, $catted_genome_header . '_scaffolded' . $suffix );
kkonganti@17 50
kkonganti@17 51 $catted_genome =~ s/\/\>(GC[AF])/\/$1/;
kkonganti@17 52
kkonganti@17 53 print $geno_path "$catted_genome\n";
kkonganti@17 54
kkonganti@17 55 open( my $fh, "gunzip -c $uncatted_genome_path |" )
kkonganti@17 56 || die "\nCannot create pipe for $uncatted_genome_path: $!\n\n";
kkonganti@17 57
kkonganti@17 58 open( my $fho, '|-', "gzip -c > $catted_genome" )
kkonganti@17 59 || die "\nCannot pipe to gzip: $!\n\n";
kkonganti@17 60
kkonganti@17 61 my $seq_obj = Bio::SeqIO->new(
kkonganti@17 62 -fh => $fh,
kkonganti@17 63 -format => 'Fasta'
kkonganti@17 64 );
kkonganti@17 65
kkonganti@17 66 my $joined_seq = '';
kkonganti@17 67 while ( my $seq = $seq_obj->next_seq ) {
kkonganti@17 68 $joined_seq = $joined_seq . 'NNNNNNNNNN' . $seq->seq;
kkonganti@17 69 }
kkonganti@17 70
kkonganti@17 71 $joined_seq =~ s/NNNNNNNNNN$//;
kkonganti@17 72 $joined_seq =~ s/^NNNNNNNNNN//;
kkonganti@17 73
kkonganti@17 74 # $joined_seq =~ s/.{80}\K/\n/g;
kkonganti@17 75 # $joined_seq =~ s/\n$//;
kkonganti@17 76 print $fho $catted_genome_header, "\n", $joined_seq, "\n";
kkonganti@17 77
kkonganti@17 78 $seq_obj->close();
kkonganti@17 79 close $fh;
kkonganti@17 80 close $fho;
kkonganti@17 81 }
kkonganti@17 82
kkonganti@17 83 sub usage {
kkonganti@17 84 print
kkonganti@17 85 "\nUsage: $0 [-in IN_DIR] [-ou OUT_DIR] [-su Filename Suffix for Header]\n\n";
kkonganti@17 86 exit;
kkonganti@17 87 }
kkonganti@17 88