#!/usr/bin/env perl

# Kranti Konganti
#
# Scans a directory tree for gzipped multi-FASTA files and, for each one,
# writes a gzipped single-record FASTA in which all contigs are joined by
# 10 N's.
#
# Options:
#   -in_dir   Directory searched recursively for inputs   (default: cwd)
#   -out_dir  Directory the joined genomes are written to (default: cwd)
#   -suffix   File name suffix identifying input files; also appended to
#             the output file names (default: _genomic.fna.gz)
#
# Side effects: writes genome_paths.txt in the current working directory,
# listing one output path per line. Requires the external gunzip and gzip
# programs on PATH.

use strict;
use warnings;
use Cwd;
use Bio::SeqIO;
use Getopt::Long;
use File::Find;
use File::Basename;
use File::Spec::Functions;

# Spacer placed between consecutive contigs in the joined sequence.
my $CONTIG_SPACER = 'N' x 10;

# Tag inserted before the suffix in every output file name.
my $OUTPUT_TAG = '_scaffolded';

my ( $in_dir, $out_dir, $suffix, @uncatted_genomes );

# usage() exits by itself; a non-zero status tells callers the options
# were rejected.
GetOptions(
    'in_dir=s'  => \$in_dir,
    'out_dir=s' => \$out_dir,
    'suffix=s'  => \$suffix
) or usage(1);

$in_dir  = getcwd()          if ( !defined $in_dir );
$out_dir = getcwd()          if ( !defined $out_dir );
$suffix  = '_genomic.fna.gz' if ( !defined $suffix );

find(
    {
        wanted => sub {

            # Only regular files can be decompressed.
            return if ( !-f $_ );

            # The suffix is a literal string, not a pattern: quote it so
            # that '.' does not match arbitrary characters.
            return if ( $_ !~ m/\Q$suffix\E\z/ );

            # Outputs of an earlier run also end in $suffix. When in_dir
            # and out_dir overlap (the default), re-reading them would
            # make gzip truncate the very file gunzip is reading.
            return if ( $_ =~ m/\Q$OUTPUT_TAG$suffix\E\z/ );

            push @uncatted_genomes, $File::Find::name;
        }
    },
    $in_dir
);

if ( !-d $out_dir ) {

    # 'or', not '||': with '||' the die binds to $out_dir and never fires.
    mkdir $out_dir or die "\nCannot create directory $out_dir: $!\n\n";
}

open( my $geno_path, '>', 'genome_paths.txt' )
  or die "\nCannot open file genome_paths.txt: $!\n\n";

foreach my $uncatted_genome_path (@uncatted_genomes) {

    # Reduce names such as GCA_000001.1_ASM1v1 to the bare accession
    # GCA_000001.1; names without an accession are kept unchanged.
    my $genome_id = basename( $uncatted_genome_path, $suffix );
    $genome_id =~ s/(GC[AF]\_\d+\.\d+)\_*.*/$1/;

    # The '>' belongs to the FASTA header only, never to the file name.
    my $catted_genome_header = '>' . $genome_id;
    my $catted_genome =
      catfile( $out_dir, $genome_id . $OUTPUT_TAG . $suffix );

    # List-form pipe open: the path is passed to gunzip as a single
    # argument without going through a shell.
    open( my $fh, '-|', 'gunzip', '-c', $uncatted_genome_path )
      or die "\nCannot create pipe for $uncatted_genome_path: $!\n\n";

    my $seq_obj = Bio::SeqIO->new(
        -fh     => $fh,
        -format => 'Fasta'
    );

    # Collect the contigs and join them once. Appending a spacer per
    # contig and trimming it afterwards would also remove genuine N's
    # at the end of the last contig.
    my @contigs;
    while ( my $seq = $seq_obj->next_seq ) {
        my $residues = $seq->seq;

        # A record without residues would only add a stray spacer.
        next if ( !defined $residues || $residues eq '' );
        push @contigs, $residues;
    }

    $seq_obj->close();

    # NOTE(review): Bio::SeqIO->close() is expected to close the pipe
    # itself (which sets $?); close it here only if it is still open.
    # Confirm against the installed BioPerl version.
    close $fh if ( defined fileno $fh );
    die "\ngunzip failed for $uncatted_genome_path (wait status $?)\n\n"
      if ( $? != 0 );

    warn "\nNo sequences found in $uncatted_genome_path\n\n"
      if ( !@contigs );

    # The redirection needs a shell, so the output path is quoted.
    open( my $fho, '|-', 'gzip -c > ' . shell_quote($catted_genome) )
      or die "\nCannot pipe to gzip: $!\n\n";

    # $joined_seq =~ s/.{80}\K/\n/g;
    # $joined_seq =~ s/\n$//;
    print {$fho} $catted_genome_header, "\n",
      join( $CONTIG_SPACER, @contigs ), "\n";

    # Closing a pipe waits for the child: a false return means a write
    # error ($! set) or a non-zero exit status of gzip ($? set).
    close $fho
      or die "\nCannot write $catted_genome: "
      . ( $! ? $! : "gzip wait status $?" ) . "\n\n";

    print {$geno_path} "$catted_genome\n";
}

close $geno_path
  or die "\nCannot close file genome_paths.txt: $!\n\n";

# Print the usage message and terminate the program.
#
# Parameters:
#   $exit_status - optional process exit status (default 0). A non-zero
#                  status sends the message to STDERR instead of STDOUT.
sub usage {
    my ($exit_status) = @_;
    $exit_status = 0 if ( !defined $exit_status );

    my $out_fh = $exit_status ? \*STDERR : \*STDOUT;
    print {$out_fh}
"\nUsage: $0 [-in IN_DIR] [-ou OUT_DIR] [-su Filename Suffix for Header]\n\n";
    exit $exit_status;
}

# Quote a string so that a POSIX shell treats it as one literal word.
#
# Parameters:
#   $text - the string to quote.
# Returns:
#   $text wrapped in single quotes, with embedded single quotes escaped.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}