Mercurial > repos > galaxytrakr > hfp_bettercallsal_awsbatch
comparison 1.0.0/bin/fasta_join.pl @ 0:801b85b03a17 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Thu, 28 May 2026 20:31:42 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:801b85b03a17 |
|---|---|
#!/usr/bin/env perl

# Kranti Konganti
# Takes in a directory of gzipped multi-fasta files
# and, for each file, joins its contigs by 10 N's.
#
# For every input file whose name ends in the suffix (default
# '_genomic.fna.gz') a single-record gzipped FASTA named
# <name>_scaffolded<suffix> is written to the output directory, and the
# path of each output file is appended to genome_paths.txt in the current
# working directory.

use strict;
use warnings;
use Cwd;
use Bio::SeqIO;
use Getopt::Long;
use File::Find;
use File::Basename;
use File::Spec::Functions;
use IO::Compress::Gzip qw($GzipError);

# Spacer placed between consecutive contigs of the same genome.
my $contig_spacer = 'N' x 10;

my ( $in_dir, $out_dir, $suffix, @uncatted_genomes );

# usage() exits by itself; a non-zero code is passed so that a bad command
# line is reported as a failure to the caller.
GetOptions(
    'in_dir=s'  => \$in_dir,
    'out_dir=s' => \$out_dir,
    'suffix=s'  => \$suffix
) or usage(1);

$in_dir  = getcwd            if ( !defined $in_dir );
$out_dir = getcwd            if ( !defined $out_dir );
$suffix  = '_genomic.fna.gz' if ( !defined $suffix );

die "\nInput directory $in_dir does not exist.\n\n" if ( !-d $in_dir );

# Collect input files. The suffix is matched literally (basename() below
# also treats it literally). Files that already carry the '_scaffolded'
# marker are skipped so that a re-run with in_dir == out_dir never reads a
# file while overwriting it.
find(
    {
        wanted => sub {
            return if ( !-f $_ );
            return if ( $_ !~ m/\Q$suffix\E\z/ );
            return if ( $_ =~ m/_scaffolded\Q$suffix\E\z/ );
            push @uncatted_genomes, $File::Find::name;
        }
    },
    $in_dir
);

warn "\nNo files ending in $suffix found under $in_dir\n\n"
  if ( !@uncatted_genomes );

# 'or' (not '||') so that the die applies to mkdir's return value.
if ( !-d $out_dir ) {
    mkdir $out_dir or die "\nCannot create directory $out_dir: $!\n\n";
}

open( my $geno_path, '>', 'genome_paths.txt' )
  or die "\nCannot open file genome_paths.txt: $!\n\n";

# Sorted so that genome_paths.txt has a reproducible order.
foreach my $uncatted_genome_path ( sort @uncatted_genomes ) {

    # Reduce names such as GCF_000001.1_ASM1v1 to the bare accession
    # (GCF_000001.1); names without an accession are left unchanged.
    my $genome_name = basename( $uncatted_genome_path, $suffix );
    $genome_name =~ s/(GC[AF]_\d+\.\d+)_*.*/$1/;

    # The FASTA header carries '>', the file name never does.
    my $catted_genome_header = '>' . $genome_name;
    my $catted_genome =
      catfile( $out_dir, $genome_name . '_scaffolded' . $suffix );

    print $geno_path "$catted_genome\n";

    # List-form pipe open: the path is passed to gunzip as a single
    # argument and is never interpreted by a shell.
    open( my $fh, '-|', 'gunzip', '-c', '--', $uncatted_genome_path )
      or die "\nCannot create pipe for $uncatted_genome_path: $!\n\n";

    # -noclose leaves closing of the pipe to this script so that the exit
    # status of gunzip can be checked below.
    my $seq_obj = Bio::SeqIO->new(
        -fh      => $fh,
        -format  => 'Fasta',
        -noclose => 1
    );

    # Records without any residues are skipped so that they do not
    # introduce doubled or dangling spacers.
    my @contigs;
    while ( my $seq = $seq_obj->next_seq ) {
        my $residues = $seq->seq;
        push @contigs, $residues
          if ( defined $residues && length $residues );
    }

    # Closing a pipe handle waits for the child; a false return with
    # $! == 0 means gunzip exited with a non-zero status.
    close $fh
      or die "\nCannot read $uncatted_genome_path: "
      . ( $! ? $! : 'gunzip exit status ' . ( $? >> 8 ) ) . "\n\n";

    warn "\nNo sequences found in $uncatted_genome_path\n\n"
      if ( !@contigs );

    # The output is opened only after the input was read successfully, so
    # a failed read does not leave a truncated output file behind.
    my $fho = IO::Compress::Gzip->new($catted_genome)
      or die "\nCannot write gzip file $catted_genome: $GzipError\n\n";

    # join() puts the spacer only between contigs, so no N's need to be
    # trimmed afterwards and genuine leading/trailing N's are preserved.
    $fho->print( $catted_genome_header, "\n",
        join( $contig_spacer, @contigs ), "\n" )
      or die "\nCannot write to $catted_genome: $GzipError\n\n";

    $fho->close
      or die "\nCannot close $catted_genome: $GzipError\n\n";
}

close $geno_path
  or die "\nCannot close file genome_paths.txt: $!\n\n";

# Print the usage message and exit.
# Takes an optional exit code (default 0). With a non-zero code the
# message is written to STDERR instead of STDOUT.
sub usage {
    my ($exit_code) = @_;
    $exit_code = 0 if ( !defined $exit_code );

    my $out_fh = $exit_code ? \*STDERR : \*STDOUT;
    print {$out_fh}
"\nUsage: $0 [-in IN_DIR] [-ou OUT_DIR] [-su Filename Suffix for Header]\n\n";
    exit $exit_code;
}
