annotate 0.5.0/bin/fasta_join.pl @ 0:3c767f9cfd88 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:37:56 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env perl
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
2
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
3 # Kranti Konganti
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
4 # Takes in a gzipped multi-fasta file
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
5 # and joins contigs by 10 N's
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
6
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
7 use strict;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
8 use warnings;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
9 use Cwd;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
10 use Bio::SeqIO;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
11 use Getopt::Long;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
12 use File::Find;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
13 use File::Basename;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
14 use File::Spec::Functions;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
15
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# File-level state: the three command-line settings plus the list of input
# genome files that is filled in later by the directory scan.
my ( $in_dir, $out_dir, $suffix, @uncatted_genomes );

# Map each accepted long option (all take a string value) to the variable
# that receives it.
my %cli_spec = (
    'in_dir=s'  => \$in_dir,
    'out_dir=s' => \$out_dir,
    'suffix=s'  => \$suffix,
);

# On a parsing error, usage() prints the help text and terminates the script.
GetOptions(%cli_spec) or die usage();

# Anything not given on the command line falls back to a default: both
# directories default to the current working directory.
$in_dir  = getcwd()          unless defined $in_dir;
$out_dir = getcwd()          unless defined $out_dir;
$suffix  = '_genomic.fna.gz' unless defined $suffix;
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
27
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# Recursively scan $in_dir and collect the full path of every plain file
# whose name ends with $suffix.
#
# The suffix is matched literally (\Q...\E): it normally contains dots
# (e.g. "_genomic.fna.gz") which would otherwise act as regex wildcards, and
# the rest of the script (basename, output file naming) already treats it as
# a literal string. \z anchors the match at the very end of the file name.
find(
    {
        wanted => sub {
            # File::Find sets $_ to the entry's name within the directory it
            # has changed into, and $File::Find::name to the full path.
            return unless $_ =~ m/\Q$suffix\E\z/;

            # Skip directories or other non-file entries that happen to
            # carry the suffix; they cannot be decompressed.
            return unless -f $_;

            push @uncatted_genomes, $File::Find::name;
        }
    },
    $in_dir
);
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
36
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# Create the output directory when it is not the current working directory
# and does not exist yet. Only the final path component is created; parent
# directories must already exist.
if ( $out_dir ne getcwd() && !-d $out_dir ) {

    # Low-precedence "or" is required here: with "||" the die would attach
    # to $out_dir (always true) and a failed mkdir would go unnoticed.
    mkdir $out_dir or die "\nCannot create directory $out_dir: $!\n\n";
}
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
40
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# Manifest of every scaffolded genome written below, one path per line.
# It is created in the current working directory, not in $out_dir.
open( my $geno_path, '>', 'genome_paths.txt' )
  or die "\nCannot open file genome_paths.txt: $!\n\n";

# For each gzipped multi-FASTA input: concatenate all of its records into a
# single sequence, separated by 10 N's, and write it gzipped to $out_dir as
# <identifier>_scaffolded<suffix> under the header ">identifier".
foreach my $uncatted_genome_path (@uncatted_genomes) {

    # Genome identifier: the file name minus $suffix, cut off right after
    # the GCA_/GCF_ accession (e.g. GCF_000001.1) when one is present.
    my $genome_id = basename( $uncatted_genome_path, $suffix );
    $genome_id =~ s/(GC[AF]\_\d+\.\d+)\_*.*/$1/;

    my $catted_genome_header = '>' . $genome_id;

    # Build the output path from the bare identifier so that the FASTA '>'
    # marker never ends up in the file name, whatever the input is called.
    my $catted_genome =
      catfile( $out_dir, $genome_id . '_scaffolded' . $suffix );

    print {$geno_path} "$catted_genome\n";

    # List-form pipe open runs gunzip directly, without a shell, so spaces
    # or shell metacharacters in the input path cannot break the command.
    open( my $fh, '-|', 'gunzip', '-c', $uncatted_genome_path )
      or die "\nCannot create pipe for $uncatted_genome_path: $!\n\n";

    # The output redirection still needs a shell, so the path is wrapped in
    # single quotes with any embedded single quote escaped as '\''.
    ( my $quoted_out = $catted_genome ) =~ s/'/'\\''/g;
    open( my $fho, '|-', "gzip -c > '$quoted_out'" )
      or die "\nCannot pipe to gzip: $!\n\n";

    my $seq_obj = Bio::SeqIO->new(
        -fh     => $fh,
        -format => 'Fasta'
    );

    # Collect the residues of every record, then join them with the spacer.
    # Joining (instead of appending a spacer and trimming it afterwards)
    # guarantees that genuine N bases at the start or end of the assembly
    # are never removed.
    my @contigs;
    while ( my $seq = $seq_obj->next_seq ) {
        my $residues = $seq->seq;

        # Records without sequence contribute nothing and must not
        # introduce a doubled spacer.
        next unless defined $residues && length $residues;
        push @contigs, $residues;
    }
    my $joined_seq = join( 'NNNNNNNNNN', @contigs );

    # The sequence is written as one unwrapped line; the 80-column wrapping
    # below is left disabled.
    # $joined_seq =~ s/.{80}\K/\n/g;
    # $joined_seq =~ s/\n$//;
    print {$fho} $catted_genome_header, "\n", $joined_seq, "\n";

    $seq_obj->close();
    close $fh;

    # Closing the pipe waits for gzip; a false return means gzip failed or
    # the data could not be flushed, so the output file cannot be trusted.
    close($fho)
      or die
      "\nError while closing gzip pipe for $catted_genome (status $?): $!\n\n";
}

# Buffered write errors on the manifest only surface at close time.
close($geno_path)
  or die "\nCannot close file genome_paths.txt: $!\n\n";
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
82
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
# Print a one-line usage summary and terminate the script.
#
# This is reached when command-line parsing fails, so the message goes to
# STDERR and the exit status is non-zero, letting callers and pipelines
# detect the failure. The sub never returns.
sub usage {
    print STDERR
"\nUsage: $0 [-in IN_DIR] [-ou OUT_DIR] [-su Filename Suffix for Header]\n\n";
    exit 1;
}
3c767f9cfd88 planemo upload
galaxytrakr
parents:
diff changeset
88