"""
Rewrite of rgFastQC.py for Version 0.11.2 of FastQC.

Changes implemented from tmcgowan at
https://testtoolshed.g2.bx.psu.edu/view/tmcgowan/fastqc
and iuc at https://toolshed.g2.bx.psu.edu/view/iuc/fastqc
with minor changes and bug fixes

SYNOPSIS

    rgFastQC.py -i input_file -j input_file.name -o output_html_file [-t output_text_file]
        [-d output_directory] [-f fastq|bam|sam] [-n job_name] [-c contaminant_file]
        [-l limits_file] [-e fastqc_executable]

EXAMPLE (generated by Galaxy)

    rgFastQC.py -i path/dataset_1.dat -j 1000gsample.fastq -o path/dataset_3.dat -d path/job_working_directory/subfolder
        -f fastq -n FastQC -c path/dataset_2.dat -e fastqc

"""

import re
import os
import shutil
import subprocess
import optparse
import tempfile
import glob
import gzip
import bz2
import zipfile
import shlex
import zlib


class FastQCRunner(object):
    """Build and execute a FastQC command line for one Galaxy dataset.

    Attributes set by the methods of this class:
        opts: the parsed options object handed to the constructor.
        fastqinfilename: sanitized name of the symlink FastQC is run on.
        command_args: the FastQC invocation as an argument list.
        command_line: the same invocation as a single shell-quoted string
            (written to the log file).
    """

    def __init__(self, opts=None):
        """Store the parsed options; use run_fastqc() to start the process.

        Args:
            opts: object exposing the command line options as attributes
                (input, inputfilename, htmloutput, textoutput, outputdir,
                informat, contaminants, limits, executable).
        """
        assert opts is not None, 'FastQCRunner requires an options object'
        self.opts = opts

    @staticmethod
    def _first_line_readable(opener, path):
        """Return True if *path* opens with *opener* and one line can be read."""
        try:
            with opener(path, 'rb') as handle:
                handle.readline()
        except (OSError, EOFError, zlib.error):
            # Not (or not validly) compressed in the format *opener* expects.
            return False
        return True

    @staticmethod
    def _copy_first_match(pattern, destination, description):
        """Copy the first file matching glob *pattern* to *destination*.

        Raises:
            OSError: when no file matches *pattern*.
        """
        # Sorted so the choice is deterministic if several files match.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise OSError("FastQC did not produce the expected %s (no file matches %s)"
                          % (description, pattern))
        with open(matches[0], 'rb') as fsrc, open(destination, 'wb') as fdest:
            shutil.copyfileobj(fsrc, fdest)

    def prepare_command_line(self):
        """Develop the command line to run FastQC in Galaxy.

        Sets self.fastqinfilename, self.command_args and self.command_line.
        As before, self.opts.informat is normalized to 'fastq' in place when
        it contains 'fastq'.

        Raises:
            Exception: when the file name claims a compression format the
                content does not have and the content cannot be read as
                plain text either.
        """
        opts = self.opts

        # Check whether a given file compression format is valid.
        # Decompression at upload does not remove the now bogus ending, which
        # FastQC cannot cope with, so the extension is trimmed when the
        # content is not actually compressed in the advertised format.
        infname = opts.inputfilename
        linf = infname.lower()
        trimext = False
        if linf.endswith(('.gz', '.gzip')):
            trimext = not self._first_line_readable(gzip.open, opts.input)
        elif linf.endswith('bz2'):
            trimext = not self._first_line_readable(bz2.open, opts.input)
        elif linf.endswith('.zip'):
            trimext = not zipfile.is_zipfile(opts.input)
        if trimext:
            with open(opts.input) as handle:
                try:
                    handle.readline()
                except (OSError, UnicodeError) as err:
                    raise Exception("Input file corruption, could not identify the filetype") from err
            infname = os.path.splitext(infname)[0]

        # Replace unwanted or problematic characters in the input file name
        self.fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        # Check that the symbolic link gets a proper ending, FastQC seems to
        # ignore the given format otherwise.
        if 'fastq' in opts.informat:
            # With fastq the extension is ignored, but when a format is
            # actually passed it must comply with FastQC's accepted formats.
            opts.informat = 'fastq'
        elif not self.fastqinfilename.endswith(opts.informat):
            self.fastqinfilename += '.%s' % opts.informat

        # Build the command as an argument list so that paths containing
        # spaces or shell metacharacters reach FastQC unchanged.  The
        # executable is split on whitespace, as the shell used to do.
        command_args = shlex.split(opts.executable)
        command_args += ['--outdir', opts.outputdir]
        if opts.contaminants is not None:
            command_args += ['--contaminants', opts.contaminants]
        if opts.limits is not None:
            command_args += ['--limits', opts.limits]
        command_args.append('--quiet')
        command_args.append('--extract')  # to access the output text file
        command_args.append(self.fastqinfilename)
        command_args += ['-f', opts.informat]
        self.command_args = command_args
        # Quoted rendering, kept for logging.
        self.command_line = ' '.join(shlex.quote(arg) for arg in command_args)

    def copy_output_file_to_dataset(self):
        """Copy the FastQC html and text reports to the Galaxy output files.

        The html report is always copied to self.opts.htmloutput.  The text
        report (fastqc_data.txt) is copied only when self.opts.textoutput is
        set, because that option defaults to None.

        Raises:
            OSError: when an expected report is missing from the output
                directory.
        """
        outputdir = self.opts.outputdir

        # retrieve html file
        self._copy_first_match(os.path.join(outputdir, '*html'),
                               self.opts.htmloutput, 'html report')

        # retrieve text file
        if self.opts.textoutput is not None:
            self._copy_first_match(os.path.join(outputdir, '*', 'fastqc_data.txt'),
                                   self.opts.textoutput, 'text report')

    def run_fastqc(self):
        """Execute FastQC and collect its reports.

        Make sure the mandatory parameters input, inputfilename, outputdir
        and htmloutput have been specified in the options (opts).  Progress
        is written to a rgFastQC*.log file created in the output directory.

        Raises:
            subprocess.CalledProcessError: when FastQC exits with a non-zero
                status.
        """
        # Create a log file; reuse the descriptor returned by mkstemp instead
        # of leaking it, and close the log even when a step fails.
        log_fd, _log_path = tempfile.mkstemp(prefix='rgFastQC', suffix=".log",
                                             dir=self.opts.outputdir)
        with os.fdopen(log_fd, 'w') as sout:
            self.prepare_command_line()
            sout.write(self.command_line)
            sout.write('\n')
            # Link the input (.dat) file to the given input file name
            sout.write("Creating symlink\n")
            os.symlink(self.opts.input, self.fastqinfilename)
            sout.write("check_call\n")
            subprocess.check_call(self.command_args)
            sout.write("Copying working %s file to %s \n" % (self.fastqinfilename, self.opts.htmloutput))
            self.copy_output_file_to_dataset()
            sout.write("Finished")


def main():
    """Parse the command line options and run FastQC."""
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-t', '--textoutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-l', '--limits', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, _args = op.parse_args()

    # Explicit validation instead of assert, which is stripped under -O.
    for required in ('input', 'inputfilename', 'htmloutput'):
        if getattr(opts, required) is None:
            op.error('missing required option --%s' % required)
    # The executable is not checked with os.path.isfile: the default
    # 'fastqc' is a bare command name rather than a file path.
    os.makedirs(opts.outputdir, exist_ok=True)

    fastqc_runner = FastQCRunner(opts)
    fastqc_runner.run_fastqc()


if __name__ == '__main__':
    main()