kkonganti@17
|
1 #!/usr/bin/env perl
|
kkonganti@17
|
2
|
kkonganti@17
|
3 # Kranti Konganti
|
kkonganti@17
|
4 # Takes in a gzipped multi-fasta file
|
kkonganti@17
|
5 # and joins contigs by 10 N's
|
kkonganti@17
|
6
|
kkonganti@17
|
7 use strict;
|
kkonganti@17
|
8 use warnings;
|
kkonganti@17
|
9 use Cwd;
|
kkonganti@17
|
10 use Bio::SeqIO;
|
kkonganti@17
|
11 use Getopt::Long;
|
kkonganti@17
|
12 use File::Find;
|
kkonganti@17
|
13 use File::Basename;
|
kkonganti@17
|
14 use File::Spec::Functions;
|
kkonganti@17
|
15
|
kkonganti@17
|
16 my ( $in_dir, $out_dir, $suffix, @uncatted_genomes );
|
kkonganti@17
|
17
|
kkonganti@17
|
18 GetOptions(
|
kkonganti@17
|
19 'in_dir=s' => \$in_dir,
|
kkonganti@17
|
20 'out_dir=s' => \$out_dir,
|
kkonganti@17
|
21 'suffix=s' => \$suffix
|
kkonganti@17
|
22 ) or die usage();
|
kkonganti@17
|
23
|
kkonganti@17
|
24 $in_dir = getcwd if ( !defined $in_dir );
|
kkonganti@17
|
25 $out_dir = getcwd if ( !defined $out_dir );
|
kkonganti@17
|
26 $suffix = '_genomic.fna.gz' if ( !defined $suffix );
|
kkonganti@17
|
27
|
kkonganti@17
|
28 find(
|
kkonganti@17
|
29 {
|
kkonganti@17
|
30 wanted => sub {
|
kkonganti@17
|
31 push @uncatted_genomes, $File::Find::name if ( $_ =~ m/$suffix$/ );
|
kkonganti@17
|
32 }
|
kkonganti@17
|
33 },
|
kkonganti@17
|
34 $in_dir
|
kkonganti@17
|
35 );
|
kkonganti@17
|
36
|
kkonganti@17
|
37 if ( $out_dir ne getcwd && !-d $out_dir ) {
|
kkonganti@17
|
38 mkdir $out_dir || die "\nCannot create directory $out_dir: $!\n\n";
|
kkonganti@17
|
39 }
|
kkonganti@17
|
40
|
kkonganti@17
|
41 open( my $geno_path, '>genome_paths.txt' )
|
kkonganti@17
|
42 || die "\nCannot open file genome_paths.txt: $!\n\n";
|
kkonganti@17
|
43
|
kkonganti@17
|
44 foreach my $uncatted_genome_path (@uncatted_genomes) {
|
kkonganti@17
|
45 my $catted_genome_header = '>' . basename( $uncatted_genome_path, $suffix );
|
kkonganti@17
|
46 $catted_genome_header =~ s/(GC[AF]\_\d+\.\d+)\_*.*/$1/;
|
kkonganti@17
|
47
|
kkonganti@17
|
48 my $catted_genome =
|
kkonganti@17
|
49 catfile( $out_dir, $catted_genome_header . '_scaffolded' . $suffix );
|
kkonganti@17
|
50
|
kkonganti@17
|
51 $catted_genome =~ s/\/\>(GC[AF])/\/$1/;
|
kkonganti@17
|
52
|
kkonganti@17
|
53 print $geno_path "$catted_genome\n";
|
kkonganti@17
|
54
|
kkonganti@17
|
55 open( my $fh, "gunzip -c $uncatted_genome_path |" )
|
kkonganti@17
|
56 || die "\nCannot create pipe for $uncatted_genome_path: $!\n\n";
|
kkonganti@17
|
57
|
kkonganti@17
|
58 open( my $fho, '|-', "gzip -c > $catted_genome" )
|
kkonganti@17
|
59 || die "\nCannot pipe to gzip: $!\n\n";
|
kkonganti@17
|
60
|
kkonganti@17
|
61 my $seq_obj = Bio::SeqIO->new(
|
kkonganti@17
|
62 -fh => $fh,
|
kkonganti@17
|
63 -format => 'Fasta'
|
kkonganti@17
|
64 );
|
kkonganti@17
|
65
|
kkonganti@17
|
66 my $joined_seq = '';
|
kkonganti@17
|
67 while ( my $seq = $seq_obj->next_seq ) {
|
kkonganti@17
|
68 $joined_seq = $joined_seq . 'NNNNNNNNNN' . $seq->seq;
|
kkonganti@17
|
69 }
|
kkonganti@17
|
70
|
kkonganti@17
|
71 $joined_seq =~ s/NNNNNNNNNN$//;
|
kkonganti@17
|
72 $joined_seq =~ s/^NNNNNNNNNN//;
|
kkonganti@17
|
73
|
kkonganti@17
|
74 # $joined_seq =~ s/.{80}\K/\n/g;
|
kkonganti@17
|
75 # $joined_seq =~ s/\n$//;
|
kkonganti@17
|
76 print $fho $catted_genome_header, "\n", $joined_seq, "\n";
|
kkonganti@17
|
77
|
kkonganti@17
|
78 $seq_obj->close();
|
kkonganti@17
|
79 close $fh;
|
kkonganti@17
|
80 close $fho;
|
kkonganti@17
|
81 }
|
kkonganti@17
|
82
|
kkonganti@17
|
83 sub usage {
|
kkonganti@17
|
84 print
|
kkonganti@17
|
85 "\nUsage: $0 [-in IN_DIR] [-ou OUT_DIR] [-su Filename Suffix for Header]\n\n";
|
kkonganti@17
|
86 exit;
|
kkonganti@17
|
87 }
|
kkonganti@17
|
88
|