Mercurial > repos > jpayne > cfsan_qaqc
changeset 0:dafec28bd37e
planemo upload
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cfsan-qaqc.xml Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,73 @@ +<tool id="cfsan-qaqc" name="CFSAN QA/QC process" version="0.1.0"> + <requirements> + <requirement type="package" version="1.65">biopython</requirement> + <requirement type="package" version="1.4.3">matplotlib</requirement> + <requirement type="package" version="1.9.2">numpy</requirement> + <requirement type="package" version="0.16.1">pandas</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + #if $reads.reads_select == 'collection' + #set name=$reads.coll.name + #set forward=$reads.coll.forward + #set reverse=$reads.coll.reverse + #else + #set name=$forward.name + #end if + mkdir plots && + qa_core/gims_core.py --sequencingType "Illumina MiSeq" + --approximateGenomeSize $genomeSize + --coverageThreshold $covThreshold + --minimumQScore $minQ + --output_plots plots + --parallelism \${GALAXY_SLOTS:-6} + --annotate $name + --log - + suite + $forward + $reverse + -o $output + ]]></command> + <inputs> + <conditional name="reads"> + <param name="reads_select" type="select" label="Paired-end collection, or two datasets from your history"> + <option value="collection">Paired collection from your history</option> + <option value="history">Two FASTQ datasets from your history</option> + </param> + <when value="collection"> + <param label="Paired reads" name="coll" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" collection_type="paired" /> + </when> + <when value="history"> + <param label="Forward reads" type="data" name="forward" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" /> + <param label="Reverse reads" type="data" name="reverse" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" /> + </when> + </conditional> + <param name="genomeSize" type="text" label="Approximate genome size" value="4700000" /> + <param name="covThreshold" type="text" label="Coverage threshold" value="20" /> + <param name="minQ" type="text" label="Minimum average Q score" value="27.0" /> + </inputs> + <outputs> + <data label="QA/QC Results" name="output" format="tabular" /> + <data label="Nucleotide plot" name="nucleo_plot" format="png" from_work_dir="nucleotide_distribution.png" /> + <data label="QScore Distribution" name="qscore_dist" format="png" from_work_dir="qscore_distribution.png" /> + <data label="QScore Histogram" name="qscore_histro" format="png" from_work_dir="qscore_histogram.png" /> + </outputs> + <tests> + <test> + <param name="reads_select" value="collection" /> + <param name="coll"> + <collection type="list:paired"> + <element name="TEST TEST TEST"> + <collection type="paired"> + <element name="forward" value="Input/forward_big.fastq.gz" ftype="fastqsanger.gz" /> + <element name="reverse" value="Input/reverse_big.fastq.gz" ftype="fastqsanger.gz" /> + </collection> + </element> + </collection> + </param> + <output name="output" value="Output/output.tsv" /> + </test> + </tests> + <help><![CDATA[ + TODO: Fill in help. + ]]></help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/.gitignore Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,17 @@ +*.pyc +sample/Output/*xml +./deploy-dev +./deploy-dev.pub +./deploy-prep +./deploy-prep.pub +./deploy-prod +./deploy-prod.pub +deploy-* +venv +venv/* +.DS_Store +*.patch +*bak +tar +.vscode +.vscode/* \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/.gitmodules Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,3 @@ +[submodule "task_core"] + path = task_core + url = http://10.12.207.89/justin.payne/task_core.git
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/README.md Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,7 @@ +# qa_core +## Genomics QA Core for CFSAN Sequencing Operations + +### Installing + virtualenv venv + source venv/bin/activate + pip install -r requirements.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_core.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,300 @@ +#! /usr/bin/env python + +import argparse +import binascii +from collections import OrderedDict +from contextlib import contextmanager +import xml.dom.minidom as dom +from xml.etree import ElementTree as xml +from functools import partial, wraps +from multiprocessing.pool import ThreadPool as Pool +from inspect import getargspec +import os, os.path, sys, errno +import imp +import subprocess +import logging + +ver = "unknown" + + +usage = """ CFSAN Genomics QA Core +May 15, 2015 +Justin Payne +justin.payne@fda.hhs.gov + + usage: + {0} /path/to/sandbox (production mode) + {0} {{0}} + +""".format(os.path.basename(__file__)) + +PHRED_OFFSET = 33 + +input_subdir = 'Input' +output_subdir = 'Output' + +#Heng Li's readfq implementation +#https://github.com/lh3/readfq/blob/master/readfq.py +def readfq(fp): # this is a generator function + last = None # this is a buffer keeping the last unprocessed line + while True: # mimic closure; is it a bad idea? + if not last: # the first record or a record following a fastq + for l in fp: # search for the start of the next record + if l[0] in '>@': # fasta/q header line + last = l[:-1] # save this line + break + if not last: break + name, seqs, last = last[1:].partition(" ")[0], [], None + for l in fp: # read the sequence + if l[0] in '@+>': + last = l[:-1] + break + seqs.append(l[:-1]) + if not last or last[0] != '+': # this is a fasta record + yield name, ''.join(seqs), [] # yield a fasta record + if not last: break + else: # this is a fastq record + seq, leng, seqs = ''.join(seqs), 0, [] + for l in fp: # read the quality + seqs.append(l[:-1]) + leng += len(l) - 1 + if leng >= len(seq): # have read enough quality + last = None + yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record + break + if last: # reach EOF before reading enough quality + yield name, seq, [] # yield a fasta record instead + break + +#from qa_core import qa_test, new_output_file, open_input, readfq, qa_params + +qa_tests = OrderedDict() + +qa_params = OrderedDict() + +qa_core = imp.new_module("qa_core") +sys.modules['qa_core'] = qa_core + +def qa_test(func=None, disable=False, profile=True): + "QA Test decorator" + def qa_test_decorator(qa_func): + #define a parameter-binding wrapper + @wraps(qa_func) + def wrap(namespace): + "Use partial application of functions to bind parameters declared in the QA function from a namespace object" + monad = qa_func + for argument in getargspec(qa_func).args: + if hasattr(namespace, argument) and getattr(namespace, argument): + monad = partial(monad, **{argument: getattr(namespace, argument)}) + + #and then call it + return monad() + + wrap.__profile_enabled__ = profile + if not disable: + # if not name: + # name = qa_func.__name__ + qa_tests[qa_func.__name__] = wrap + return wrap + if func: + return qa_test_decorator(func) + return qa_test_decorator + +qa_core.qa_test = qa_test +qa_core.readfq = readfq +qa_core.qa_params = qa_params + +@qa_test +def version(): + "Collect repository commit id from git" + p = os.path.abspath(os.path.dirname(__file__)) + try: + ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '') + except subprocess.CalledProcessError: + ref = "+".join(qa_tests.keys())[:255] + return ref + +@qa_test +def name(annotation=None): + if annotation: + return annotation + +@contextmanager +def open_file(name, path='.', mode='w'): + with open(os.path.join(path, name), mode) as f: + if 'r' in mode and binascii.hexlify(f.read(2)) == b'1f8b': + import gzip + with gzip.open(os.path.join(path, name), mode) as g: + yield g + else: + f.seek(0) + yield f + +def suite(namespace, log=logging.getLogger('qa.suite')): + for name, test in qa_tests.items(): + logg = log.getChild(name) + try: + result = test(namespace) + if result is not None: + logg.info(result) + yield name, result + except Exception as e: + import traceback + logg.error(traceback.format_exc()) + + +def main(namespace): + if namespace.annotation: + qa_params['annotation'] = namespace.annotation + parallelism = vars(namespace).get('parallelism', 1) + if parallelism > 1: + qa_params['map implementation'] = Pool(parallelism).map + if namespace.mode not in ('suite', 'profile'): + #it's a path to a sandbox job + logging.getLogger('qa.input').info("Load job description xml from {}...".format(namespace.mode)) + tree = xml.parse(os.path.join(namespace.mode, "Input/input.xml")) + root = tree.getroot() + #logging.getLogger('qa.input').debug(xml.tostring(root).strip()) + for element in root: + val = getattr(namespace, element.tag) + if isinstance(val, list): + val.append(element.text) + else: + setattr(namespace, element.tag, element.text) + #set up utility functions + path = os.path.join(namespace.mode, 'Output') + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + except Exception as e: + logging.getLogger('qa.setup').error(e) + qa_core.new_output_file = partial(open_file, path=path) + #qa_core.new_output_file = lambda f: open_file(f, path=path) + qa_core.open_input = partial(open_file, path=os.path.join(namespace.mode, "Input"), mode='r') + #import and run tests + logging.getLogger('qa.tests').info("Loading and running tests...") + import gims_qa_tests + root = xml.Element("qaQcCoverageOutput") + tree = xml.ElementTree(element=root) + [logging.getLogger('qa.params').info("{}:{}".format(k,v)) for k,v in vars(namespace).items()] + for i, (name, result) in enumerate(suite(namespace)): + if isinstance(result, xml.Element): + root.append(result) + else: + if result: + xml.SubElement(root, name).text = str(result) + + try: + with qa_core.new_output_file('output.xml') as out: + d = dom.parseString(xml.tostring(root)) + d.writexml(out, addindent=' ', newl='\n') + logging.getLogger('qa.results').info("Finished tests and wrote to {}".format(namespace.mode)) + except Exception as e: + logging.getLogger('qa.results').error(e) + elif namespace.mode == 'suite': + qa_core.open_input = partial(open_file, mode='rb') + qa_core.new_output_file = open_file + if not namespace.output_plots: + qa_params['plots'] = False + else: + #set up directory for plots + pass + import gims_qa_tests + def fix_value(result): + if isinstance(result, xml.Element): + return xml.tostring(result) + return result + results = {name:fix_value(result) for name, result in suite(namespace)} + import csv + wtr = csv.DictWriter(namespace.output, delimiter='\t', fieldnames=sorted(results.keys())) + wtr.writeheader() + wtr.writerow(results) + elif namespace.mode == 'profile': + return profile(namespace) + + +def profile(namespace): + log = logging.getLogger('qa.profile') + import cProfile + import StringIO + from Bio.Seq import Seq + from Bio.SeqRecord import SeqRecord + from Bio.Alphabet import IUPAC + import random + from random import choice as c + from tempfile import TemporaryFile as tmp + + qa_params['plots'] = False + + profile_fastq = StringIO.StringIO() + log.info("Setup...") + #create some fake FASTQ 'reads' + for r in range(10000): + #boring random distribution + #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)]) + #interesting diamond distribution + s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)]) + s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)]) + + q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)] + rec = SeqRecord(Seq(s, IUPAC.unambiguous_dna), id="read_{}".format(r)) + rec.letter_annotations['phred_quality'] = q + profile_fastq.write(rec.format('fastq')) + @contextmanager + def input_stub(filename, mode=None): + if 'xml' in filename: + profile_xml.seek(0) + yield profile_xml + elif 'fastq' in filename: + profile_fastq.seek(0) + yield profile_fastq + else: + yield open('/dev/null', 'r') + + qa_core.open_input = input_stub + profiling_buffer = StringIO.StringIO() + + @contextmanager + def get_profiling_buffer(*a, **k): + yield profiling_buffer + profiling_buffer.seek(0) + + qa_core.new_output_file = get_profiling_buffer + import gims_qa_tests + + namespace.sequenceFileName = ('forward.fastq', 'reverse.fastq') + namespace.sequencingType = 'Illumina MiSeq' + namespace.approximateGenomeSize = 1000000 + + def run_tests(namespace, log): + log.getChild('run') + log.info(vars(namespace)) + for name, test in qa_tests.items(): + if test.__profile_enabled__: + test(namespace) + log.info("Profiling...") + return cProfile.runctx('run_tests(namespace, log)', None, locals(), sort='tottime') + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument("mode") + parser.add_argument('sequenceFileName', nargs='*') + parser.add_argument('--genus','-g') + parser.add_argument('--species', '-s') + parser.add_argument('--sequencingType', '-t') + parser.add_argument('--approximateGenomeSize','--gs', type=int, default=4500000) + parser.add_argument('--coverageThreshold', '-c', type=int, default=20) + parser.add_argument('--minimumQScore', '-q', type=float, default=27.0) + parser.add_argument('--databasePath', '--db') + parser.add_argument('--log', '-l', type=argparse.FileType('w'), default=sys.stdout) + parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout) + parser.add_argument('--output_plots') + parser.add_argument('--parallelism', '-p', type=int, default=1) + parser.add_argument('--annotate', '-a', type=str, default=None, dest='annotation') + params = parser.parse_args() + logging.basicConfig(format='%(name)s\t%(levelname)s:%(message)s', level=logging.DEBUG, stream = params.log) + logging.getLogger('qa.cmd').info(" ".join(sys.argv)) + main(params) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_core.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,390 @@ +#!/usr/bin/env python + +#from __future__ import print_function + +import xml.dom.minidom as dom +import xml.etree.ElementTree as xml +import sys +import os, os.path +from functools import partial, wraps +from inspect import getargspec, getfile, currentframe, getsource +from os.path import join as j +import subprocess +import textwrap +import imp +from collections import OrderedDict +from contextlib import contextmanager +import StringIO +import gzip + +ver = "core+plots+fxt" + + +usage = """ CFSAN Genomics QA Core +May 15, 2015 +Justin Payne +justin.payne@fda.hhs.gov + + usage: + {0} /path/to/sandbox (production mode) + {0} {{0}} + +""".format(os.path.basename(__file__)) + +PHRED_OFFSET = 33 + +input_subdir = 'Input' +output_subdir = 'Output' + +path = os.getcwd() + +qa_tests = OrderedDict() +qa_undecorated_tests = OrderedDict() +qa_params = OrderedDict() + +class QaParamsDict(OrderedDict): + "orderedDict that sets on get access" + + def get(self, key, value=None): + self[key] = result = super(QaParamsDict, self).get(key, value) + return result + + +qa_params = QaParamsDict() + +def int_then_string(value): + try: + return int(value) + except ValueError: + return str(value) + +class IncompleteXMLError(Exception): + pass + +def qa_test(func=None, disable=False, profile=True): + "Decorator to simplify XML interaction" + def wrap(qa_func): + "Inner function to decorate qa test" + #print "registering", qa_func.__name__ + qa_undecorated_tests[qa_func.__name__] = qa_func + if disable: + return qa_func + @wraps(qa_func) + def bind_metadata_to_arguments(metadata, qa_func): + "Find elements in XML with same name as function argument and bind their content" + try: + arguments = list(getargspec(qa_func).args) + except TypeError: + arguments = list(getargspec(qa_func.func).args) + for argument in arguments: + elements = metadata.findall(".//{0}".format(argument)) + if len(elements) == 1: + qa_func = partial(qa_func, **{argument:int_then_string(elements[0].text)}) + elif len(elements) > 1: + qa_func = partial(qa_func, **{argument:[int_then_string(e.text) for e in elements]}) + else: + pass + try: + return qa_func() +# except TypeError as e: +# for el in metadata: +# if el.tag in args: +# arguments.remove(el.tag) +# message = "Missing argument or arguments in input.xml: {} not present.".format(', '.join(arguments)) +# raise IncompleteXMLError, message, sys.exc_info()[-1] + finally: + pass + #bind_metadata_to_arguments + decorated_qa_func = partial(bind_metadata_to_arguments, qa_func=qa_func) + decorated_qa_func.__profile_enabled__ = profile + qa_tests[qa_func.__name__] = decorated_qa_func + return decorated_qa_func + #qa_test_decorator + if func: + return wrap(func) + return wrap + + +qa_core = imp.new_module("qa_core") +qa_core.qa_test = qa_test +qa_core.test_output_buffer = StringIO.StringIO() + +def present(file): + return "/dev/null" + +qa_core.present = present + +@contextmanager +def get_test_buffer(*args, **kwargs): + yield qa_core.test_output_buffer + +@contextmanager +def get_test_input_file(*args, **kwargs): + yield open('/dev/null', 'r') + +qa_core.new_output_file = get_test_buffer +qa_core.open_input = get_test_input_file +qa_core.qa_params = qa_params +sys.modules['qa_core'] = qa_core + + +def main(sandbox, profile=False, *a, **k): + output = "qa_core: ran " + with qa_core.open_input('input.xml') as inputfile: + metadata = xml.parse(inputfile).getroot() + + qaoutput = xml.Element("qaQcCoverageOutput") + outputxml = xml.ElementTree(element=qaoutput) + + for name, test in qa_tests.items(): + if (profile and test.__profile_enabled__) or not profile: + output += "{0}, ".format(name) + try: + result = test(metadata) + if isinstance(result, xml.Element): + qaoutput.append(result) + else: + if result: + xml.SubElement(qaoutput, name).text = str(result) + except Exception as e: + output += 'and died on error.' + import traceback + traceback.print_exc() + print(test) + xml.SubElement(qaoutput, 'reasonForFailure').text = str(e) + break +# if gims_qa_tests.failure_mode: +# xml.SubElement(qaoutput, "reasonForFailure").text = str(gims_qa_tests.failure_mode) + + with qa_core.new_output_file('output.xml') as outputfile: + d = dom.parseString(xml.tostring(qaoutput)) + d.writexml(outputfile, addindent=' ', newl='\n') + return output + + + + + +@qa_test +def version(): + "Collect repository commit id from git" + p = os.path.abspath(os.path.dirname(__file__)) + try: + ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '') + except subprocess.CalledProcessError: + ref = ver + return ref + +@qa_test(disable=True) +def profile(): + "profiling command to test performance" + + print("Starting profiler. Imports...") + + import cProfile + import StringIO + import Bio + import random + from random import choice as c + + from tempfile import TemporaryFile as tmp + + + print("Setting up test data...") + + profile_xml = StringIO.StringIO("""<?xml version='1.0' encoding='UTF-8'?> +<qaQcCoverageInput> + <sequenceFileName>forward.fastq</sequenceFileName> + <sequenceFileName>reverse.fastq</sequenceFileName> + <sequencingType>Illumina MiSeq</sequencingType> + <approximateGenomeSize>1000000</approximateGenomeSize> +</qaQcCoverageInput> +""") + + qa_params['plots'] = False + + profile_fastq = StringIO.StringIO() + #create some fake FASTQ 'reads' + for r in range(10000): + #boring random distribution + #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)]) + #interesting diamond distribution + s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)]) + s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)]) + + q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)] + rec = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(s, Bio.Alphabet.IUPAC.unambiguous_dna), id="read_{}".format(r)) + rec.letter_annotations['phred_quality'] = q + try: + profile_fastq.write(rec.format('fastq')) + except: + print q + print rec + raise + + + + + @contextmanager + def input_stub(filename, mode=None): + if 'xml' in filename: + profile_xml.seek(0) + yield profile_xml + elif 'fastq' in filename: + profile_fastq.seek(0) + yield profile_fastq + else: + yield open('/dev/null', 'r') + + qa_core.open_input = input_stub + #qa_core.new_output_file = get_test_buffer + + profiling_buffer = StringIO.StringIO() + + @contextmanager + def get_profiling_buffer(*a, **k): + yield profiling_buffer + profiling_buffer.seek(0) + + qa_core.new_output_file = get_profiling_buffer + + reload(gims_qa_tests) + + print("Beginning profiling...") + try: + s = cProfile.run('main(profile=True)', sort='tottime') + #s.sort_stats('pcalls', 'tottime') + return str(profiling_buffer.read()) + except NameError: + import traceback + traceback.print_exc() + main(profile=True) + quit() + except Exception: + import traceback + traceback.print_exc() + #print str(profile_fastq.getvalue())[:1024] + quit() + #return exec('main') + + +#Heng Li's readfq implementation +#https://github.com/lh3/readfq/blob/master/readfq.py +def readfq(fp): # this is a generator function + last = None # this is a buffer keeping the last unprocessed line + while True: # mimic closure; is it a bad idea? + if not last: # the first record or a record following a fastq + for l in fp: # search for the start of the next record + if l[0] in '>@': # fasta/q header line + last = l[:-1] # save this line + break + if not last: break + name, seqs, last = last[1:].partition(" ")[0], [], None + for l in fp: # read the sequence + if l[0] in '@+>': + last = l[:-1] + break + seqs.append(l[:-1]) + if not last or last[0] != '+': # this is a fasta record + yield name, ''.join(seqs), [] # yield a fasta record + if not last: break + else: # this is a fastq record + seq, leng, seqs = ''.join(seqs), 0, [] + for l in fp: # read the quality + seqs.append(l[:-1]) + leng += len(l) - 1 + if leng >= len(seq): # have read enough quality + last = None + yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record + break + if last: # reach EOF before reading enough quality + yield name, seq, [] # yield a fasta record instead + break + +qa_core.readfq = readfq + + + +if __name__ == "__main__": + try: + name, sandbox_path = sys.argv[0:2] + args = sys.argv[2:] + + if '-p' in args: + from multiprocessing import Pool + try: + m = int(args[args.index('-p') + 1]) + p = Pool(processes=m) + del args[args.index('-p') + 1] + qa_core.qa_params['parallelism'] = m + except IndexError, ValueError: + p = Pool() + args.remove('-p') + qa_core.qa_params['map implementation'] = p.map + + if '-q' in args: + try: + q = float(args.pop(args.index('-q') + 1)) + args.remove('-q') + qa_params['user Q threshold'] = q + except IndexError, ValueError: + print("no Q score threshold provided with '-q' option.") + quit(1) + + + @contextmanager + def new_output_file(filename, mode='w'): + if not os.path.exists(j(sandbox_path, output_subdir)): + os.mkdir(j(sandbox_path, output_subdir)) + fileh = open(j(sandbox_path, output_subdir, filename), mode) + yield fileh + fileh.close() + + + def open_input_from(path): + @contextmanager + def open_input(filename, mode='rU'): + if 'gz' in filename: + fopen = gzip.open + else: + fopen = open + fileh = fopen(j(path, filename), mode=mode) + yield fileh + fileh.close() + return open_input + + def present_from(path): + def present(file): + return os.path.abspath(j(path, file)) + return present + + qa_core.new_output_file = new_output_file + main_func = main + + if not os.path.exists(sandbox_path): + qa_core.open_input = open_input_from(os.getcwd()) + qa_core.present = present_from(os.getcwd()) + try: + main_func = lambda s, *a: qa_undecorated_tests[sandbox_path](*a) + except KeyError: + print("Invalid QA test name: {0}".format(sandbox_path)) + quit(1) + else: + qa_core.open_input = open_input_from(j(sandbox_path, input_subdir)) + qa_core.present = present_from(j(sandbox_path, input_subdir)) + + + + except ValueError: + import gims_qa_tests + print("Missing argument: path to sandbox or name of QA test.") + commands = "" + #commands = "\n\t\t".join([' '.join((n, " ".join(["<{0}>".format(a) for a in getargspec(f)[0]]) or 'DELETEME')).replace(' DELETEME', '') for n, f in qa_undecorated_tests.items()]) + for name, func in qa_undecorated_tests.items(): + args = " ".join(["<{0}>".format(a) for a in getargspec(func).args]) + commands += "{0} {1}\n\t\t\t".format(name, textwrap.fill(args, 60, subsequent_indent='\t\t\t\t')) + print(usage.format(commands)) + quit(1) + else: + import gims_qa_tests + print(main_func(sandbox_path, profile=False, *args))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_kraken.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,217 @@ +import csv +from collections import Counter, OrderedDict +from hashlib import md5 +import os.path +import tempfile +from subprocess import check_call +from gims_qa_tests import TaxTuple +#from qa_core import present + +def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)): + "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options" + + #hash the kraken db files + md5hash = md5() + #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None) + md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read()) + + db_hash = md5hash.hexdigest() + + + with tempfile.NamedTemporaryFile('rU') as kraken_out: + with tempfile.NamedTemporaryFile() as seq_data: + #cat all the files together + check_call("cat {files} > {seq_data.name}".format( + files = " ".join([present(f) for f in sequenceFiles]), + seq_data = seq_data + ), + shell=True) + + def make_option(key, value): + if value is not None: + return "--{} {}".format(key, value) + return "--{}".format(key) + + #build options + options = " ".join([make_option(k,v) for k,v in kraken_options.items()]) + + #call kraken + check_call("kraken {options} " + "--db {database} " + "--gzip-compressed " + "--fastq {seq_data.name} " + "--output {kraken_out.name}".format(**locals()), shell=True) + + counts = Counter() + + with tempfile.NamedTemporaryFile('rU') as kraken_report: + check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True) + rows = csv.reader(kraken_report, delimiter='\t') + for seq_id, taxon in rows: + counts[taxon] += 1.0 + counts["total"] += 1.0 + + #clean up all the temp files by releasing contexts + + top_results = counts.most_common(6) #total, plus five most common taxa + total, total_count = top_results.pop(0) + top_hit, top_hit_count = top_results.pop(0) + next_hits = [h[0].replace(';', ' ') for h in top_results] + next_hit_counts = [h[1] for h in top_results] + + top_hit_frac = top_hit_count / total_count + next_hit_frac = [h / total_count for h in next_hit_counts] + + return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash) + +def runKrakenSeqtest(sequenceFiles,database): + + krakenoutoutpath = "/home/charles.strittmatter/output/kraken/" + + w = datetime.datetime.now() + + krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M") + krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M") + + sequenceFiles[0] + sequenceFiles[1] + + os.path.abspath(os.path.join("Input", sequenceFileName[0])) + + command = "kraken --preload " + command += "--db " + database + ' ' + #if len(fastqPaths) > 1: + command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1])) + # command += ' '.join(fastqPaths) + command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout + + print command + + check_call(command, shell=True) + print "%s -- Kraken called..." % datetime.datetime.now() + + + kraken_report_cmd = "kraken-report --db " + kraken_report_cmd += database + kraken_report_cmd += " " + krakenoutoutpath + krakenout + kraken_report_cmd += " > " + krakenreport + + print kraken_report_cmd + + check_call(kraken_report_cmd, shell=True) + + + + + parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5" + + kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True) + + split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines) + + print "=====================" + krakenreport + "==========================" + print split_kraken_rpt_line + split_kraken_rpt_length = len(split_kraken_rpt_line) + print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4] + taxLen = 6; + taxIdIdx = 4; + outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv' + + metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id" + + + + + splitMetadata = metadata.split("\t") + for i in range(0, len(split_kraken_rpt_line)): + if split_kraken_rpt_line[i] == '': + print "don't add to the metadata...split_kraken_rpt_line.pop(i)" + elif i%(taxLen) == taxLen-1: + print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1" + splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip()) + elif i%(taxLen) == taxIdIdx: + print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx" + splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) + elif i%(taxLen) == 0: + print "i is "+str(i)+"under this condition: i%(taxLen) == 0" + splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) + + splitMetadataLength = len(splitMetadata) + first_kraken_percentage = splitMetadataLength-15 + with open(outputFile, 'wb') as f: + #writer = csv.writer(f, delimiter="\t") + + # print splitMetadata + for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): + print "Split metadata of i "+splitMetadata[i] + for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): + if float(splitMetadata[j]) > float(splitMetadata[i]): + print "i: "+str(i)+" and j: "+str(j) + print "swapping "+splitMetadata[i]+" with "+splitMetadata[j] + old_hit_org = splitMetadata[i-1] + old_hit_percentage = splitMetadata[i] + old_hit_tx_id = splitMetadata[i+1] + + splitMetadata[i-1] = splitMetadata[j-1] + splitMetadata[i] = splitMetadata[j] + splitMetadata[i+1] = splitMetadata[j+1] + + splitMetadata[j-1] = old_hit_org + splitMetadata[j] = old_hit_percentage + splitMetadata[j+1] = old_hit_tx_id + + df = pandas.DataFrame(splitMetadata) + + print df + + + # Here aree dataframe collumns for ! + + print df.iat[0,0] + print df.iat[1,0] + print df.iat[3,0] + print df.iat[4,0] + print df.iat[6,0] + print df.iat[7,0] + print df.iat[9,0] + print df.iat[10,0] + + OrganismOne = df.iat[0,0] + OrganismTwo = df.iat[3,0] + OrganismThree = df.iat[6,0] + OrganismFour = df.iat[9,0] + + + OrganismOnePer = df.iat[1,0] + OrganismTwoPer = df.iat[4,0] + OrganismThreePer = df.iat[7,0] + OrganismFourPer = df.iat[10,0] + + + df = {'OrganismOne':OrganismOne, + 'OrganismTwo':OrganismTwo, + 'OrganismThree':OrganismThree, + 'OrganismFour':OrganismFour, + 'OrganismOne%':OrganismOnePer, + 'OrganismTwo%':OrganismTwoPer, + 'OrganismThree%':OrganismThreePer, + 'OrganismFour%':OrganismFourPer } + + + #Clean up files + + command = "" + command += "rm " + krakenoutoutpath + krakenout + + print command + + check_call(command, shell=True) + + + + command = "" + command += "rm " + outputFile + + print command + + return pandas.DataFrame(df, index=[0]) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_plot.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,148 @@ +import subprocess +from qa_core import new_output_file, qa_params +from collections import defaultdict +import warnings +warnings.filterwarnings("ignore") +import pandas +from pandas import DataFrame +import matplotlib.pyplot as plot +from numpy import zeros, add, mean, array, percentile, arange +import traceback +from gims_qa_tests import cluster +import logging + +bases = ("A", "T", "G", "C", "N") + +def qa_plots(df, annotation, log=logging.getLogger('qa.plots'), **kwargs): + #log.debug(df) + #log.debug(annotation) + plot_quality_histo(df['qualityHisto'], annotation, **kwargs) + plot_nucleotide_distribution_histo(df['nucleotideHisto'], annotation) + plot_quality_distribution_histo(df['baseQScoreHisto'], annotation) + plot_base_balance(df, annotation, **kwargs) + #plot.show() + +def plot_base_balance(df, + annotation=qa_params.get('annotation', ''), + threshold=qa_params.get('permitted base balance variance', .2), + **kwargs): + try: + plot.figure() + plot.axis((0,2,0,2)) + plot.suptitle("{} Base Balance".format(annotation)) + plot.xlabel("G/C") + plot.ylabel("A/T") + thres_min = 1 - threshold + thres_max = 1 + threshold + for coord in (thres_min, thres_max): + for linef in (plot.axvline, plot.axhline): + linef(coord, linestyle='dashed', label=str(coord)) + + plot.annotate(thres_max, xy=(0, thres_max)) + plot.annotate(thres_min, xy=(0, thres_min)) + plot.annotate(thres_max, xy=(thres_max, 0)) + plot.annotate(thres_min, xy=(thres_min, 0)) + + a, t, g, c = [sum(df['nucleotideHisto'][b]) for b in ('A', 'T', 'G', 'C')] + y, x = a / float(t), g / float(c) + plot.plot((x,), (y,), marker="+") + plot.annotate("({:.2},{:.2})".format(x, y), xy=(x+.05, y+.05), textcoords='data') + with new_output_file("base_balance.png") as base_bal: + plot.savefig(base_bal, format='png', dpi=92) + except Exception: + traceback.print_exc() + + +def plot_quality_histo(histo, + annotation=qa_params.get('annotation', ''), + q=qa_params.get('user Q threshold', 27)): + try: + plot.figure() + labels = range(max(histo.keys()) + 1) + plot.bar(left=labels, + height=[histo[n] for n in labels], + color='y', + linewidth=1) +# plot.hist(histo, bins=range(max(histo))) + plot.xticks() + plot.axvline(x=q, label="Q {}".format(q)) + #plot.show() + plot.xlabel(r"""$Q (-10 log_(10) \rho)$""") + plot.ylabel('Number of Bases') + plot.suptitle('{} Q-Score Histogram'.format(annotation)) + plot.legend(loc='upper left', fontsize='x-small') + with new_output_file("qscore_histogram.png") as qual_histogram: + plot.savefig(qual_histogram, format='png', dpi=92) +# except ValueError: +# pass + except Exception: + traceback.print_exc() + +def plot_nucleotide_distribution_histo(dataframe, + annotation=qa_params.get('annotation', '')): + try: + #dataframe = defaultdict(list) + colors = iter(('r', 'y', 'g', 'b', 'pink')) + plot.figure() + last_base = zeros((len(dataframe.values()[0]),)) #set all zeros for bottom of first bar row + for base in bases: + try: + plot.bar(left=range(len(dataframe[base])), + height=dataframe[base], + width=2, + bottom=last_base, + color=colors.next(), + label=base, + linewidth=0) + last_base = add(last_base, dataframe[base]) + except: + #print [len(d) for d in dataframe.data.values()] + raise + plot.autoscale(tight=True) + plot.xlabel('Base Position') + plot.ylabel('Nucleotide Count') + plot.suptitle('{} Nucleotide Distribution Plot'.format(annotation)) + plot.legend(loc='upper right', fontsize='xx-small') + with new_output_file("nucleotide_distribution.png") as nuc_histogram: + plot.savefig(nuc_histogram, format='png', dpi=108) + except Exception: + traceback.print_exc() + +def plot_quality_distribution_histo(histo, + annotation=qa_params.get('annotation', ''), + q=qa_params.get('user Q threshold', 27) + ): + try: + binning = 20 + plot.figure(figsize=(12, 6)) + vec = [tuple(c.elements()) for c in histo] + means = [mean(t) for t in vec] + upper_iqr = array([percentile(t, 75) for t in vec]) + lower_iqr = array([percentile(t, 25) for t in vec]) +# plot.boxplot(vec, +# widths=.000005, +# whis=(5,95), +# showfliers=False, +# showbox=True, +# showmeans=True, +# meanline=True, +# showcaps=True, +# whiskerprops=dict(linestyle='-', color='0.60') +# ) + + plot.fill_between(arange(len(upper_iqr)), upper_iqr, lower_iqr, facecolor='0.50', alpha=0.5, label='interquartile range') + plot.plot(means, 'r-', label='Mean') + plot.plot(upper_iqr, 'b-', label='interquartile range', alpha=0.25) + plot.autoscale(tight=True) + plot.axhline(y=q, label="Q {}".format(q)) + plot.xlabel('Base Position') + plot.ylabel(r"""$Q (-10 log_(10) \rho)$""") + plot.xticks(range(0, len(histo), binning), range(0, len(histo), binning), rotation='vertical') + plot.suptitle('{} Q-Score Distribution Plot'.format(annotation)) + plot.legend(loc='lower left', fontsize='x-small') + plot.grid(True) + with new_output_file("qscore_distribution.png") as q_distro: + plot.savefig(q_distro, format='png', dpi=108) + + except Exception: + traceback.print_exc() \ No newline at end of file