Mercurial > repos > jpayne > cfsan_qaqc
changeset 0:dafec28bd37e
planemo upload
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cfsan-qaqc.xml	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,76 @@
+<tool id="cfsan-qaqc" name="CFSAN QA/QC process" version="0.1.0">
+    <requirements>
+        <requirement type="package" version="1.65">biopython</requirement>
+        <requirement type="package" version="1.4.3">matplotlib</requirement>
+        <requirement type="package" version="1.9.2">numpy</requirement>
+        <requirement type="package" version="0.16.1">pandas</requirement>
+    </requirements>
+    <!-- NOTE(review): both branches of the conditional must bind $name, $forward and $reverse, since the command line below uses all three unconditionally. Conditional params are namespaced under $reads.* -->
+    <command detect_errors="exit_code"><![CDATA[
+        #if $reads.reads_select == 'collection'
+        #set name=$reads.coll.name
+        #set forward=$reads.coll.forward
+        #set reverse=$reads.coll.reverse
+        #else
+        #set name=$reads.forward.name
+        #set forward=$reads.forward
+        #set reverse=$reads.reverse
+        #end if
+        mkdir plots &&
+        qa_core/gims_core.py --sequencingType "Illumina MiSeq"
+        --approximateGenomeSize $genomeSize
+        --coverageThreshold $covThreshold
+        --minimumQScore $minQ
+        --output_plots plots
+        --parallelism \${GALAXY_SLOTS:-6}
+        --annotate $name
+        --log -
+        suite
+        $forward
+        $reverse
+        -o $output
+    ]]></command>
+    <inputs>
+        <conditional name="reads">
+            <param name="reads_select" type="select" label="Paired-end collection, or two datasets from your history">
+                <option value="collection">Paired collection from your history</option>
+                <option value="history">Two FASTQ datasets from your history</option>
+            </param>
+            <when value="collection">
+                <param label="Paired reads" name="coll" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" collection_type="paired" />
+            </when>
+            <when value="history">
+                <param label="Forward reads" type="data" name="forward" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" />
+                <param label="Reverse reads" type="data" name="reverse" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" />
+            </when>
+        </conditional>
+        <param name="genomeSize" type="text" label="Approximate genome size" value="4700000" />
+        <param name="covThreshold" type="text" label="Coverage threshold" value="20" />
+        <param name="minQ" type="text" label="Minimum average Q score" value="27.0" />
+    </inputs>
+    <outputs>
+        <data label="QA/QC Results" name="output" format="tabular" />
+        <data label="Nucleotide plot" name="nucleo_plot" format="png" from_work_dir="nucleotide_distribution.png" />
+        <data label="QScore Distribution" name="qscore_dist" format="png" from_work_dir="qscore_distribution.png" />
+        <data label="QScore Histogram" name="qscore_histro" format="png" from_work_dir="qscore_histogram.png" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="reads_select" value="collection" />
+            <param name="coll">
+                <collection type="list:paired">
+                    <element name="TEST TEST TEST">
+                        <collection type="paired">
+                            <element name="forward" value="Input/forward_big.fastq.gz" ftype="fastqsanger.gz" />
+                            <element name="reverse" value="Input/reverse_big.fastq.gz" ftype="fastqsanger.gz" />
+                        </collection>
+                    </element>
+                </collection>
+            </param>
+            <output name="output" value="Output/output.tsv" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitignore	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,17 @@
+*.pyc
+sample/Output/*xml
+deploy-dev
+deploy-dev.pub
+deploy-prep
+deploy-prep.pub
+deploy-prod
+deploy-prod.pub
+deploy-*
+venv
+venv/*
+.DS_Store
+*.patch
+*bak
+tar
+.vscode
+.vscode/*
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/.gitmodules Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,3 @@ +[submodule "task_core"] + path = task_core + url = http://10.12.207.89/justin.payne/task_core.git
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/README.md Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,7 @@ +# qa_core +## Genomics QA Core for CFSAN Sequencing Operations + +### Installing + virtualenv venv + source venv/bin/activate + pip install -r requirements.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_core.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,300 @@ +#! /usr/bin/env python + +import argparse +import binascii +from collections import OrderedDict +from contextlib import contextmanager +import xml.dom.minidom as dom +from xml.etree import ElementTree as xml +from functools import partial, wraps +from multiprocessing.pool import ThreadPool as Pool +from inspect import getargspec +import os, os.path, sys, errno +import imp +import subprocess +import logging + +ver = "unknown" + + +usage = """ CFSAN Genomics QA Core +May 15, 2015 +Justin Payne +justin.payne@fda.hhs.gov + + usage: + {0} /path/to/sandbox (production mode) + {0} {{0}} + +""".format(os.path.basename(__file__)) + +PHRED_OFFSET = 33 + +input_subdir = 'Input' +output_subdir = 'Output' + +#Heng Li's readfq implementation +#https://github.com/lh3/readfq/blob/master/readfq.py +def readfq(fp): # this is a generator function + last = None # this is a buffer keeping the last unprocessed line + while True: # mimic closure; is it a bad idea? 
+ if not last: # the first record or a record following a fastq + for l in fp: # search for the start of the next record + if l[0] in '>@': # fasta/q header line + last = l[:-1] # save this line + break + if not last: break + name, seqs, last = last[1:].partition(" ")[0], [], None + for l in fp: # read the sequence + if l[0] in '@+>': + last = l[:-1] + break + seqs.append(l[:-1]) + if not last or last[0] != '+': # this is a fasta record + yield name, ''.join(seqs), [] # yield a fasta record + if not last: break + else: # this is a fastq record + seq, leng, seqs = ''.join(seqs), 0, [] + for l in fp: # read the quality + seqs.append(l[:-1]) + leng += len(l) - 1 + if leng >= len(seq): # have read enough quality + last = None + yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record + break + if last: # reach EOF before reading enough quality + yield name, seq, [] # yield a fasta record instead + break + +#from qa_core import qa_test, new_output_file, open_input, readfq, qa_params + +qa_tests = OrderedDict() + +qa_params = OrderedDict() + +qa_core = imp.new_module("qa_core") +sys.modules['qa_core'] = qa_core + +def qa_test(func=None, disable=False, profile=True): + "QA Test decorator" + def qa_test_decorator(qa_func): + #define a parameter-binding wrapper + @wraps(qa_func) + def wrap(namespace): + "Use partial application of functions to bind parameters declared in the QA function from a namespace object" + monad = qa_func + for argument in getargspec(qa_func).args: + if hasattr(namespace, argument) and getattr(namespace, argument): + monad = partial(monad, **{argument: getattr(namespace, argument)}) + + #and then call it + return monad() + + wrap.__profile_enabled__ = profile + if not disable: + # if not name: + # name = qa_func.__name__ + qa_tests[qa_func.__name__] = wrap + return wrap + if func: + return qa_test_decorator(func) + return qa_test_decorator + +qa_core.qa_test = qa_test +qa_core.readfq = readfq +qa_core.qa_params = 
qa_params + +@qa_test +def version(): + "Collect repository commit id from git" + p = os.path.abspath(os.path.dirname(__file__)) + try: + ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '') + except subprocess.CalledProcessError: + ref = "+".join(qa_tests.keys())[:255] + return ref + +@qa_test +def name(annotation=None): + if annotation: + return annotation + +@contextmanager +def open_file(name, path='.', mode='w'): + with open(os.path.join(path, name), mode) as f: + if 'r' in mode and binascii.hexlify(f.read(2)) == b'1f8b': + import gzip + with gzip.open(os.path.join(path, name), mode) as g: + yield g + else: + f.seek(0) + yield f + +def suite(namespace, log=logging.getLogger('qa.suite')): + for name, test in qa_tests.items(): + logg = log.getChild(name) + try: + result = test(namespace) + if result is not None: + logg.info(result) + yield name, result + except Exception as e: + import traceback + logg.error(traceback.format_exc()) + + +def main(namespace): + if namespace.annotation: + qa_params['annotation'] = namespace.annotation + parallelism = vars(namespace).get('parallelism', 1) + if parallelism > 1: + qa_params['map implementation'] = Pool(parallelism).map + if namespace.mode not in ('suite', 'profile'): + #it's a path to a sandbox job + logging.getLogger('qa.input').info("Load job description xml from {}...".format(namespace.mode)) + tree = xml.parse(os.path.join(namespace.mode, "Input/input.xml")) + root = tree.getroot() + #logging.getLogger('qa.input').debug(xml.tostring(root).strip()) + for element in root: + val = getattr(namespace, element.tag) + if isinstance(val, list): + val.append(element.text) + else: + setattr(namespace, element.tag, element.text) + #set up utility functions + path = os.path.join(namespace.mode, 'Output') + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + except Exception as e: + logging.getLogger('qa.setup').error(e) + qa_core.new_output_file = 
partial(open_file, path=path) + #qa_core.new_output_file = lambda f: open_file(f, path=path) + qa_core.open_input = partial(open_file, path=os.path.join(namespace.mode, "Input"), mode='r') + #import and run tests + logging.getLogger('qa.tests').info("Loading and running tests...") + import gims_qa_tests + root = xml.Element("qaQcCoverageOutput") + tree = xml.ElementTree(element=root) + [logging.getLogger('qa.params').info("{}:{}".format(k,v)) for k,v in vars(namespace).items()] + for i, (name, result) in enumerate(suite(namespace)): + if isinstance(result, xml.Element): + root.append(result) + else: + if result: + xml.SubElement(root, name).text = str(result) + + try: + with qa_core.new_output_file('output.xml') as out: + d = dom.parseString(xml.tostring(root)) + d.writexml(out, addindent=' ', newl='\n') + logging.getLogger('qa.results').info("Finished tests and wrote to {}".format(namespace.mode)) + except Exception as e: + logging.getLogger('qa.results').error(e) + elif namespace.mode == 'suite': + qa_core.open_input = partial(open_file, mode='rb') + qa_core.new_output_file = open_file + if not namespace.output_plots: + qa_params['plots'] = False + else: + #set up directory for plots + pass + import gims_qa_tests + def fix_value(result): + if isinstance(result, xml.Element): + return xml.tostring(result) + return result + results = {name:fix_value(result) for name, result in suite(namespace)} + import csv + wtr = csv.DictWriter(namespace.output, delimiter='\t', fieldnames=sorted(results.keys())) + wtr.writeheader() + wtr.writerow(results) + elif namespace.mode == 'profile': + return profile(namespace) + + +def profile(namespace): + log = logging.getLogger('qa.profile') + import cProfile + import StringIO + from Bio.Seq import Seq + from Bio.SeqRecord import SeqRecord + from Bio.Alphabet import IUPAC + import random + from random import choice as c + from tempfile import TemporaryFile as tmp + + qa_params['plots'] = False + + profile_fastq = StringIO.StringIO() + 
log.info("Setup...") + #create some fake FASTQ 'reads' + for r in range(10000): + #boring random distribution + #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)]) + #interesting diamond distribution + s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)]) + s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)]) + + q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)] + rec = SeqRecord(Seq(s, IUPAC.unambiguous_dna), id="read_{}".format(r)) + rec.letter_annotations['phred_quality'] = q + profile_fastq.write(rec.format('fastq')) + @contextmanager + def input_stub(filename, mode=None): + if 'xml' in filename: + profile_xml.seek(0) + yield profile_xml + elif 'fastq' in filename: + profile_fastq.seek(0) + yield profile_fastq + else: + yield open('/dev/null', 'r') + + qa_core.open_input = input_stub + profiling_buffer = StringIO.StringIO() + + @contextmanager + def get_profiling_buffer(*a, **k): + yield profiling_buffer + profiling_buffer.seek(0) + + qa_core.new_output_file = get_profiling_buffer + import gims_qa_tests + + namespace.sequenceFileName = ('forward.fastq', 'reverse.fastq') + namespace.sequencingType = 'Illumina MiSeq' + namespace.approximateGenomeSize = 1000000 + + def run_tests(namespace, log): + log.getChild('run') + log.info(vars(namespace)) + for name, test in qa_tests.items(): + if test.__profile_enabled__: + test(namespace) + log.info("Profiling...") + return cProfile.runctx('run_tests(namespace, log)', None, locals(), sort='tottime') + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument("mode") + parser.add_argument('sequenceFileName', nargs='*') + parser.add_argument('--genus','-g') + parser.add_argument('--species', '-s') + parser.add_argument('--sequencingType', '-t') + parser.add_argument('--approximateGenomeSize','--gs', type=int, default=4500000) + parser.add_argument('--coverageThreshold', '-c', type=int, 
default=20) + parser.add_argument('--minimumQScore', '-q', type=float, default=27.0) + parser.add_argument('--databasePath', '--db') + parser.add_argument('--log', '-l', type=argparse.FileType('w'), default=sys.stdout) + parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout) + parser.add_argument('--output_plots') + parser.add_argument('--parallelism', '-p', type=int, default=1) + parser.add_argument('--annotate', '-a', type=str, default=None, dest='annotation') + params = parser.parse_args() + logging.basicConfig(format='%(name)s\t%(levelname)s:%(message)s', level=logging.DEBUG, stream = params.log) + logging.getLogger('qa.cmd').info(" ".join(sys.argv)) + main(params) \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_core.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,390 @@ +#!/usr/bin/env python + +#from __future__ import print_function + +import xml.dom.minidom as dom +import xml.etree.ElementTree as xml +import sys +import os, os.path +from functools import partial, wraps +from inspect import getargspec, getfile, currentframe, getsource +from os.path import join as j +import subprocess +import textwrap +import imp +from collections import OrderedDict +from contextlib import contextmanager +import StringIO +import gzip + +ver = "core+plots+fxt" + + +usage = """ CFSAN Genomics QA Core +May 15, 2015 +Justin Payne +justin.payne@fda.hhs.gov + + usage: + {0} /path/to/sandbox (production mode) + {0} {{0}} + +""".format(os.path.basename(__file__)) + +PHRED_OFFSET = 33 + +input_subdir = 'Input' +output_subdir = 'Output' + +path = os.getcwd() + +qa_tests = OrderedDict() +qa_undecorated_tests = OrderedDict() +qa_params = OrderedDict() + +class QaParamsDict(OrderedDict): + "orderedDict that sets on get access" + + def get(self, key, value=None): + self[key] = result = super(QaParamsDict, self).get(key, value) + return result + + +qa_params = QaParamsDict() + +def int_then_string(value): + try: + return int(value) + except ValueError: + return str(value) + +class IncompleteXMLError(Exception): + pass + +def qa_test(func=None, disable=False, profile=True): + "Decorator to simplify XML interaction" + def wrap(qa_func): + "Inner function to decorate qa test" + #print "registering", qa_func.__name__ + qa_undecorated_tests[qa_func.__name__] = qa_func + if disable: + return qa_func + @wraps(qa_func) + def bind_metadata_to_arguments(metadata, qa_func): + "Find elements in XML with same name as function argument and bind their content" + try: + arguments = list(getargspec(qa_func).args) + except TypeError: + arguments = list(getargspec(qa_func.func).args) + for argument in arguments: + elements = 
metadata.findall(".//{0}".format(argument)) + if len(elements) == 1: + qa_func = partial(qa_func, **{argument:int_then_string(elements[0].text)}) + elif len(elements) > 1: + qa_func = partial(qa_func, **{argument:[int_then_string(e.text) for e in elements]}) + else: + pass + try: + return qa_func() +# except TypeError as e: +# for el in metadata: +# if el.tag in args: +# arguments.remove(el.tag) +# message = "Missing argument or arguments in input.xml: {} not present.".format(', '.join(arguments)) +# raise IncompleteXMLError, message, sys.exc_info()[-1] + finally: + pass + #bind_metadata_to_arguments + decorated_qa_func = partial(bind_metadata_to_arguments, qa_func=qa_func) + decorated_qa_func.__profile_enabled__ = profile + qa_tests[qa_func.__name__] = decorated_qa_func + return decorated_qa_func + #qa_test_decorator + if func: + return wrap(func) + return wrap + + +qa_core = imp.new_module("qa_core") +qa_core.qa_test = qa_test +qa_core.test_output_buffer = StringIO.StringIO() + +def present(file): + return "/dev/null" + +qa_core.present = present + +@contextmanager +def get_test_buffer(*args, **kwargs): + yield qa_core.test_output_buffer + +@contextmanager +def get_test_input_file(*args, **kwargs): + yield open('/dev/null', 'r') + +qa_core.new_output_file = get_test_buffer +qa_core.open_input = get_test_input_file +qa_core.qa_params = qa_params +sys.modules['qa_core'] = qa_core + + +def main(sandbox, profile=False, *a, **k): + output = "qa_core: ran " + with qa_core.open_input('input.xml') as inputfile: + metadata = xml.parse(inputfile).getroot() + + qaoutput = xml.Element("qaQcCoverageOutput") + outputxml = xml.ElementTree(element=qaoutput) + + for name, test in qa_tests.items(): + if (profile and test.__profile_enabled__) or not profile: + output += "{0}, ".format(name) + try: + result = test(metadata) + if isinstance(result, xml.Element): + qaoutput.append(result) + else: + if result: + xml.SubElement(qaoutput, name).text = str(result) + except Exception as e: 
+ output += 'and died on error.' + import traceback + traceback.print_exc() + print(test) + xml.SubElement(qaoutput, 'reasonForFailure').text = str(e) + break +# if gims_qa_tests.failure_mode: +# xml.SubElement(qaoutput, "reasonForFailure").text = str(gims_qa_tests.failure_mode) + + with qa_core.new_output_file('output.xml') as outputfile: + d = dom.parseString(xml.tostring(qaoutput)) + d.writexml(outputfile, addindent=' ', newl='\n') + return output + + + + + +@qa_test +def version(): + "Collect repository commit id from git" + p = os.path.abspath(os.path.dirname(__file__)) + try: + ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '') + except subprocess.CalledProcessError: + ref = ver + return ref + +@qa_test(disable=True) +def profile(): + "profiling command to test performance" + + print("Starting profiler. Imports...") + + import cProfile + import StringIO + import Bio + import random + from random import choice as c + + from tempfile import TemporaryFile as tmp + + + print("Setting up test data...") + + profile_xml = StringIO.StringIO("""<?xml version='1.0' encoding='UTF-8'?> +<qaQcCoverageInput> + <sequenceFileName>forward.fastq</sequenceFileName> + <sequenceFileName>reverse.fastq</sequenceFileName> + <sequencingType>Illumina MiSeq</sequencingType> + <approximateGenomeSize>1000000</approximateGenomeSize> +</qaQcCoverageInput> +""") + + qa_params['plots'] = False + + profile_fastq = StringIO.StringIO() + #create some fake FASTQ 'reads' + for r in range(10000): + #boring random distribution + #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)]) + #interesting diamond distribution + s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)]) + s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)]) + + q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)] + rec = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(s, 
Bio.Alphabet.IUPAC.unambiguous_dna), id="read_{}".format(r)) + rec.letter_annotations['phred_quality'] = q + try: + profile_fastq.write(rec.format('fastq')) + except: + print q + print rec + raise + + + + + @contextmanager + def input_stub(filename, mode=None): + if 'xml' in filename: + profile_xml.seek(0) + yield profile_xml + elif 'fastq' in filename: + profile_fastq.seek(0) + yield profile_fastq + else: + yield open('/dev/null', 'r') + + qa_core.open_input = input_stub + #qa_core.new_output_file = get_test_buffer + + profiling_buffer = StringIO.StringIO() + + @contextmanager + def get_profiling_buffer(*a, **k): + yield profiling_buffer + profiling_buffer.seek(0) + + qa_core.new_output_file = get_profiling_buffer + + reload(gims_qa_tests) + + print("Beginning profiling...") + try: + s = cProfile.run('main(profile=True)', sort='tottime') + #s.sort_stats('pcalls', 'tottime') + return str(profiling_buffer.read()) + except NameError: + import traceback + traceback.print_exc() + main(profile=True) + quit() + except Exception: + import traceback + traceback.print_exc() + #print str(profile_fastq.getvalue())[:1024] + quit() + #return exec('main') + + +#Heng Li's readfq implementation +#https://github.com/lh3/readfq/blob/master/readfq.py +def readfq(fp): # this is a generator function + last = None # this is a buffer keeping the last unprocessed line + while True: # mimic closure; is it a bad idea? 
+ if not last: # the first record or a record following a fastq + for l in fp: # search for the start of the next record + if l[0] in '>@': # fasta/q header line + last = l[:-1] # save this line + break + if not last: break + name, seqs, last = last[1:].partition(" ")[0], [], None + for l in fp: # read the sequence + if l[0] in '@+>': + last = l[:-1] + break + seqs.append(l[:-1]) + if not last or last[0] != '+': # this is a fasta record + yield name, ''.join(seqs), [] # yield a fasta record + if not last: break + else: # this is a fastq record + seq, leng, seqs = ''.join(seqs), 0, [] + for l in fp: # read the quality + seqs.append(l[:-1]) + leng += len(l) - 1 + if leng >= len(seq): # have read enough quality + last = None + yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record + break + if last: # reach EOF before reading enough quality + yield name, seq, [] # yield a fasta record instead + break + +qa_core.readfq = readfq + + + +if __name__ == "__main__": + try: + name, sandbox_path = sys.argv[0:2] + args = sys.argv[2:] + + if '-p' in args: + from multiprocessing import Pool + try: + m = int(args[args.index('-p') + 1]) + p = Pool(processes=m) + del args[args.index('-p') + 1] + qa_core.qa_params['parallelism'] = m + except IndexError, ValueError: + p = Pool() + args.remove('-p') + qa_core.qa_params['map implementation'] = p.map + + if '-q' in args: + try: + q = float(args.pop(args.index('-q') + 1)) + args.remove('-q') + qa_params['user Q threshold'] = q + except IndexError, ValueError: + print("no Q score threshold provided with '-q' option.") + quit(1) + + + @contextmanager + def new_output_file(filename, mode='w'): + if not os.path.exists(j(sandbox_path, output_subdir)): + os.mkdir(j(sandbox_path, output_subdir)) + fileh = open(j(sandbox_path, output_subdir, filename), mode) + yield fileh + fileh.close() + + + def open_input_from(path): + @contextmanager + def open_input(filename, mode='rU'): + if 'gz' in filename: + fopen = 
gzip.open + else: + fopen = open + fileh = fopen(j(path, filename), mode=mode) + yield fileh + fileh.close() + return open_input + + def present_from(path): + def present(file): + return os.path.abspath(j(path, file)) + return present + + qa_core.new_output_file = new_output_file + main_func = main + + if not os.path.exists(sandbox_path): + qa_core.open_input = open_input_from(os.getcwd()) + qa_core.present = present_from(os.getcwd()) + try: + main_func = lambda s, *a: qa_undecorated_tests[sandbox_path](*a) + except KeyError: + print("Invalid QA test name: {0}".format(sandbox_path)) + quit(1) + else: + qa_core.open_input = open_input_from(j(sandbox_path, input_subdir)) + qa_core.present = present_from(j(sandbox_path, input_subdir)) + + + + except ValueError: + import gims_qa_tests + print("Missing argument: path to sandbox or name of QA test.") + commands = "" + #commands = "\n\t\t".join([' '.join((n, " ".join(["<{0}>".format(a) for a in getargspec(f)[0]]) or 'DELETEME')).replace(' DELETEME', '') for n, f in qa_undecorated_tests.items()]) + for name, func in qa_undecorated_tests.items(): + args = " ".join(["<{0}>".format(a) for a in getargspec(func).args]) + commands += "{0} {1}\n\t\t\t".format(name, textwrap.fill(args, 60, subsequent_indent='\t\t\t\t')) + print(usage.format(commands)) + quit(1) + else: + import gims_qa_tests + print(main_func(sandbox_path, profile=False, *args))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_kraken.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,217 @@ +import csv +from collections import Counter, OrderedDict +from hashlib import md5 +import os.path +import tempfile +from subprocess import check_call +from gims_qa_tests import TaxTuple +#from qa_core import present + +def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)): + "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options" + + #hash the kraken db files + md5hash = md5() + #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None) + md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read()) + + db_hash = md5hash.hexdigest() + + + with tempfile.NamedTemporaryFile('rU') as kraken_out: + with tempfile.NamedTemporaryFile() as seq_data: + #cat all the files together + check_call("cat {files} > {seq_data.name}".format( + files = " ".join([present(f) for f in sequenceFiles]), + seq_data = seq_data + ), + shell=True) + + def make_option(key, value): + if value is not None: + return "--{} {}".format(key, value) + return "--{}".format(key) + + #build options + options = " ".join([make_option(k,v) for k,v in kraken_options.items()]) + + #call kraken + check_call("kraken {options} " + "--db {database} " + "--gzip-compressed " + "--fastq {seq_data.name} " + "--output {kraken_out.name}".format(**locals()), shell=True) + + counts = Counter() + + with tempfile.NamedTemporaryFile('rU') as kraken_report: + check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True) + rows = csv.reader(kraken_report, delimiter='\t') + for seq_id, taxon in rows: + counts[taxon] += 1.0 + counts["total"] += 1.0 + + #clean up all the temp files by releasing contexts + + top_results = counts.most_common(6) #total, plus five most common 
taxa + total, total_count = top_results.pop(0) + top_hit, top_hit_count = top_results.pop(0) + next_hits = [h[0].replace(';', ' ') for h in top_results] + next_hit_counts = [h[1] for h in top_results] + + top_hit_frac = top_hit_count / total_count + next_hit_frac = [h / total_count for h in next_hit_counts] + + return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash) + +def runKrakenSeqtest(sequenceFiles,database): + + krakenoutoutpath = "/home/charles.strittmatter/output/kraken/" + + w = datetime.datetime.now() + + krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M") + krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M") + + sequenceFiles[0] + sequenceFiles[1] + + os.path.abspath(os.path.join("Input", sequenceFileName[0])) + + command = "kraken --preload " + command += "--db " + database + ' ' + #if len(fastqPaths) > 1: + command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1])) + # command += ' '.join(fastqPaths) + command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout + + print command + + check_call(command, shell=True) + print "%s -- Kraken called..." 
% datetime.datetime.now() + + + kraken_report_cmd = "kraken-report --db " + kraken_report_cmd += database + kraken_report_cmd += " " + krakenoutoutpath + krakenout + kraken_report_cmd += " > " + krakenreport + + print kraken_report_cmd + + check_call(kraken_report_cmd, shell=True) + + + + + parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5" + + kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True) + + split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines) + + print "=====================" + krakenreport + "==========================" + print split_kraken_rpt_line + split_kraken_rpt_length = len(split_kraken_rpt_line) + print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4] + taxLen = 6; + taxIdIdx = 4; + outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv' + + metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id" + + + + + splitMetadata = metadata.split("\t") + for i in range(0, len(split_kraken_rpt_line)): + if split_kraken_rpt_line[i] == '': + print "don't add to the metadata...split_kraken_rpt_line.pop(i)" + elif i%(taxLen) == taxLen-1: + print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1" + splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip()) + elif i%(taxLen) == taxIdIdx: + print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx" + splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) + elif i%(taxLen) == 0: + print "i is "+str(i)+"under this condition: i%(taxLen) == 0" + splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) + + splitMetadataLength = len(splitMetadata) + first_kraken_percentage = 
splitMetadataLength-15 + with open(outputFile, 'wb') as f: + #writer = csv.writer(f, delimiter="\t") + + # print splitMetadata + for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): + print "Split metadata of i "+splitMetadata[i] + for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): + if float(splitMetadata[j]) > float(splitMetadata[i]): + print "i: "+str(i)+" and j: "+str(j) + print "swapping "+splitMetadata[i]+" with "+splitMetadata[j] + old_hit_org = splitMetadata[i-1] + old_hit_percentage = splitMetadata[i] + old_hit_tx_id = splitMetadata[i+1] + + splitMetadata[i-1] = splitMetadata[j-1] + splitMetadata[i] = splitMetadata[j] + splitMetadata[i+1] = splitMetadata[j+1] + + splitMetadata[j-1] = old_hit_org + splitMetadata[j] = old_hit_percentage + splitMetadata[j+1] = old_hit_tx_id + + df = pandas.DataFrame(splitMetadata) + + print df + + + # Here aree dataframe collumns for ! + + print df.iat[0,0] + print df.iat[1,0] + print df.iat[3,0] + print df.iat[4,0] + print df.iat[6,0] + print df.iat[7,0] + print df.iat[9,0] + print df.iat[10,0] + + OrganismOne = df.iat[0,0] + OrganismTwo = df.iat[3,0] + OrganismThree = df.iat[6,0] + OrganismFour = df.iat[9,0] + + + OrganismOnePer = df.iat[1,0] + OrganismTwoPer = df.iat[4,0] + OrganismThreePer = df.iat[7,0] + OrganismFourPer = df.iat[10,0] + + + df = {'OrganismOne':OrganismOne, + 'OrganismTwo':OrganismTwo, + 'OrganismThree':OrganismThree, + 'OrganismFour':OrganismFour, + 'OrganismOne%':OrganismOnePer, + 'OrganismTwo%':OrganismTwoPer, + 'OrganismThree%':OrganismThreePer, + 'OrganismFour%':OrganismFourPer } + + + #Clean up files + + command = "" + command += "rm " + krakenoutoutpath + krakenout + + print command + + check_call(command, shell=True) + + + + command = "" + command += "rm " + outputFile + + print command + + return pandas.DataFrame(df, index=[0]) \ No newline at end of file
"""Plot generation for the QA/QC core.

Renders four matplotlib figures from the metrics dict produced by
gims_qa_tests.ResultTuple.to_dataFrame(): the aggregate Q-score
histogram, the per-position nucleotide distribution, the per-position
Q-score distribution, and the A/T vs G/C base-balance plot.  Figures
are written as PNGs through qa_core.new_output_file.
"""

import subprocess
from qa_core import new_output_file, qa_params
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")  # keep matplotlib/pandas chatter out of tool logs
import pandas
from pandas import DataFrame
import matplotlib.pyplot as plot
from numpy import zeros, add, mean, array, percentile, arange
import traceback
from gims_qa_tests import cluster
import logging

# Canonical nucleotide order for the stacked distribution plot.
bases = ("A", "T", "G", "C", "N")


def qa_plots(df, annotation, log=logging.getLogger('qa.plots'), **kwargs):
    """Render every QA plot for one analysis run.

    df         -- dict with 'qualityHisto', 'nucleotideHisto' and
                  'baseQScoreHisto' keys (see ResultTuple.to_dataFrame)
    annotation -- sample name used in the plot titles
    """
    plot_quality_histo(df['qualityHisto'], annotation, **kwargs)
    plot_nucleotide_distribution_histo(df['nucleotideHisto'], annotation)
    plot_quality_distribution_histo(df['baseQScoreHisto'], annotation)
    plot_base_balance(df, annotation, **kwargs)


def plot_base_balance(df,
                      annotation=qa_params.get('annotation', ''),
                      threshold=qa_params.get('permitted base balance variance', .2),
                      **kwargs):
    """Plot the A/T ratio against the G/C ratio inside the permitted
    variance box; writes base_balance.png.

    All exceptions are trapped and printed so a plotting failure never
    aborts the QA run.
    """
    try:
        plot.figure()
        plot.axis((0, 2, 0, 2))
        plot.suptitle("{} Base Balance".format(annotation))
        plot.xlabel("G/C")
        plot.ylabel("A/T")
        thres_min = 1 - threshold
        thres_max = 1 + threshold
        # Dashed guide lines marking the acceptance box on both axes.
        for coord in (thres_min, thres_max):
            for linef in (plot.axvline, plot.axhline):
                linef(coord, linestyle='dashed', label=str(coord))

        plot.annotate(thres_max, xy=(0, thres_max))
        plot.annotate(thres_min, xy=(0, thres_min))
        plot.annotate(thres_max, xy=(thres_max, 0))
        plot.annotate(thres_min, xy=(thres_min, 0))

        a, t, g, c = [sum(df['nucleotideHisto'][b]) for b in ('A', 'T', 'G', 'C')]
        # Guard against zero denominators (a pathological file with no T
        # or no C calls), matching at_gc_baseBalance in gims_qa_tests.
        y = a / float(t) if t else 1.0
        x = g / float(c) if c else 1.0
        plot.plot((x,), (y,), marker="+")
        plot.annotate("({:.2},{:.2})".format(x, y), xy=(x + .05, y + .05), textcoords='data')
        with new_output_file("base_balance.png") as base_bal:
            plot.savefig(base_bal, format='png', dpi=92)
        plot.close()  # release the figure; matplotlib keeps them alive otherwise
    except Exception:
        traceback.print_exc()


def plot_quality_histo(histo,
                       annotation=qa_params.get('annotation', ''),
                       q=qa_params.get('user Q threshold', 27)):
    """Bar-plot the aggregate Q-score histogram; writes qscore_histogram.png.

    histo -- mapping of Q value -> number of bases
    q     -- user Q threshold, drawn as a vertical reference line
    """
    try:
        plot.figure()
        labels = range(max(histo.keys()) + 1)
        plot.bar(left=labels,
                 height=[histo[n] for n in labels],
                 color='y',
                 linewidth=1)
        plot.xticks()
        plot.axvline(x=q, label="Q {}".format(q))
        plot.xlabel(r"""$Q (-10 log_(10) \rho)$""")
        plot.ylabel('Number of Bases')
        plot.suptitle('{} Q-Score Histogram'.format(annotation))
        plot.legend(loc='upper left', fontsize='x-small')
        with new_output_file("qscore_histogram.png") as qual_histogram:
            plot.savefig(qual_histogram, format='png', dpi=92)
        plot.close()
    except Exception:
        traceback.print_exc()


def plot_nucleotide_distribution_histo(dataframe,
                                       annotation=qa_params.get('annotation', '')):
    """Stacked per-position nucleotide counts; writes nucleotide_distribution.png.

    dataframe -- dict mapping base -> list of per-position counts; the
                 five lists are assumed to be the same length.
    """
    try:
        colors = iter(('r', 'y', 'g', 'b', 'pink'))
        plot.figure()
        # Running top of the stack; all zeros for the first base row.
        last_base = zeros((len(dataframe.values()[0]),))
        for base in bases:
            plot.bar(left=range(len(dataframe[base])),
                     height=dataframe[base],
                     width=2,
                     bottom=last_base,
                     color=colors.next(),
                     label=base,
                     linewidth=0)
            last_base = add(last_base, dataframe[base])
        plot.autoscale(tight=True)
        plot.xlabel('Base Position')
        plot.ylabel('Nucleotide Count')
        plot.suptitle('{} Nucleotide Distribution Plot'.format(annotation))
        plot.legend(loc='upper right', fontsize='xx-small')
        with new_output_file("nucleotide_distribution.png") as nuc_histogram:
            plot.savefig(nuc_histogram, format='png', dpi=108)
        plot.close()
    except Exception:
        traceback.print_exc()


def plot_quality_distribution_histo(histo,
                                    annotation=qa_params.get('annotation', ''),
                                    q=qa_params.get('user Q threshold', 27)):
    """Per-position Q-score mean and interquartile band; writes
    qscore_distribution.png.

    histo -- sequence of Counters, one per base position, each mapping
             Q value -> count
    """
    try:
        binning = 20  # x-axis tick spacing, in base positions
        plot.figure(figsize=(12, 6))
        # Expand each positional Counter back into its raw observations
        # so numpy can compute the mean and percentiles directly.
        vec = [tuple(c.elements()) for c in histo]
        means = [mean(t) for t in vec]
        upper_iqr = array([percentile(t, 75) for t in vec])
        lower_iqr = array([percentile(t, 25) for t in vec])

        plot.fill_between(arange(len(upper_iqr)), upper_iqr, lower_iqr, facecolor='0.50', alpha=0.5, label='interquartile range')
        plot.plot(means, 'r-', label='Mean')
        plot.plot(upper_iqr, 'b-', label='interquartile range', alpha=0.25)
        plot.autoscale(tight=True)
        plot.axhline(y=q, label="Q {}".format(q))
        plot.xlabel('Base Position')
        plot.ylabel(r"""$Q (-10 log_(10) \rho)$""")
        plot.xticks(range(0, len(histo), binning), range(0, len(histo), binning), rotation='vertical')
        plot.suptitle('{} Q-Score Distribution Plot'.format(annotation))
        plot.legend(loc='lower left', fontsize='x-small')
        plot.grid(True)
        with new_output_file("qscore_distribution.png") as q_distro:
            plot.savefig(q_distro, format='png', dpi=108)
        plot.close()
    except Exception:
        traceback.print_exc()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_serotyping.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,110 @@ +import csv +import shutil +from collections import Counter, OrderedDict +from hashlib import md5 +import os.path +import tempfile +import glob +import subprocess +from subprocess import check_call, check_output +from gims_qa_tests import TaxTuple +import logging +#from qa_core import present + + + +def run_serotype_usingseqsero( + sequenceFileName1, sequenceFileName2, + sequencingType, + Oantigen="Undetermined", + H1antigen="Undetermined", + H2antigen="Undetermined", + AntigenicProfile="Undetermined", + PredictedSerotype="Undetermined", + log=logging.getLogger('qa.seqsero')): + + log.info(check_output("SeqSero.py -m 2 -i {sequenceFileName1} {sequenceFileName2} -b sam".format(**locals()), shell=True)) + + + data_lines = [] + filepath = "SeqSero_result*/Seqsero_result.txt" + + + i = -1 + + for name in glob.glob(filepath): + #with name2 as seqsero_report: + rows = csv.reader(open(name)) + fields = rows.next() + #data_lines = [] + for row in rows: + i += 1 + str1 = ''.join(row) + words2 = str1.split(':') + if i == 0: + Oantigen = words2[1] + if i == 1: + H1antigen = words2[1] + if i == 2: + H2antigen = words2[1] + if i == 3: + if len(words2) > 2: + print words2[1] + ":" + words2[2] + ":" + words2[3] + AntigenicProfile = words2[1] + ":" + words2[2] + ":" + words2[3] + if i == 4: + if words2[1].find("N/A") != -1: + PredictedSerotype = "No Salmonella Serotype" + else: + PredictedSerotype = words2[1] + + + import xml.etree.ElementTree as xml + + root = xml.Element('SerotypeQualityResultType') + global failure_mode + + #Saving for posssible future change to xml + #el = xml.SubElement + + #el(root, 'Oantigen').text = Oantigen + #el(root, 'H1antigen').text = H1antigen + #el(root, 'H2antigen').text = H2antigen + #el(root, 'AntigenicProfile').text = AntigenicProfile + #el(root, 'PredictedSerotype').text = PredictedSerotype + + 
root.set('Oantigen',Oantigen.strip()) + root.set('H1antigen',H1antigen.strip()) + root.set('H2antigen',H2antigen.strip()) + root.set('AntigenicProfile',AntigenicProfile.strip()) + root.set('PredictedSerotype', PredictedSerotype.strip()) + + + + # print Oantigen + # print H1antigen + # print H2antigen + # print AntigenicProfile + # print PredictedSerotype + + log.getChild('Oantigen').info(Oantigen) + log.getChild('H1antigen').info(H1antigen) + log.getChild('H2antigen').info(H2antigen) + log.getChild('AntigenicProfile').info(AntigenicProfile) + log.getChild('PredictedSerotype').info(PredictedSerotype) + + + #shutil.rmtree("SeqSero_result*", ignore_errors=True) + + check_call("rm -rf SeqSero_result*".format(**locals()), shell=True, stderr=subprocess.STDOUT) + + + + + return root + + + + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/gims_qa_tests.py Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,434 @@ +from qa_core import qa_test, new_output_file, open_input, readfq, qa_params +from collections import namedtuple, defaultdict, Counter +from Bio import SeqIO +from multiprocessing import Pool, Process, cpu_count +from functools import partial +from itertools import izip, count, izip_longest, islice +from tempfile import NamedTemporaryFile, mkdtemp +from subprocess import check_call, CalledProcessError +from numpy import genfromtxt +from os.path import abspath, join, split, splitext +from contextlib import contextmanager +from subprocess import call +from subprocess import check_output +from hashlib import md5 + + +import sys +import shutil +import os +import pandas +import datetime +import re +import glob, string + +#Test implementation classes and functions. The actual tests are down at the bottom. + +SuperResultTuple = namedtuple("SuperResultTuple", ("readCount", "numBases", "baseQScoreHisto", "nucleotideHisto", "qualityHisto")) + +# #Global Variables for kraken methods +# krakendatabase = "/scratch/biostat/kraken/miniDB/" +# we don't actually need this as a global + +class ResultTuple(SuperResultTuple): + def __add__(self, other): + r1, n1, h1, nuc1, qual1 = self + r2, n2, h2, nuc2, qual2 = other + return ResultTuple(r1 + r2, + n1 + n2, + [p + q for p, q in izip_longest(h1, h2, fillvalue=Counter())], + [p + q for p, q in izip_longest(nuc1, nuc2, fillvalue=Counter())], + qual1 + qual2) + + def __and__(self, other): + r1, n1, h1, nuc1, qual1 = self + r2, n2, h2, nuc2, qual2 = other + return ResultTuple(r1 + r2, + n1 + n2, + h1 + h2, + nuc1 + nuc2, + qual1 + qual2) + + @classmethod + def zero(cls): + return cls(0, 0, [], [], Counter()) + + def to_dataFrame(self): + try: + from pandas import DataFrame + #from_dict = partial(DataFrame.from_dict, DataFrame) + except: + #no pandas + def DataFrame(self, data): + return ResultTuple(**data) + + 
pivotNucHisto = defaultdict(list) + for cnt in self.nucleotideHisto: + for base in ('A', 'T', 'G', 'C', 'N'): + pivotNucHisto[base].append(cnt[base]) + pivotQBaseHisto = self.baseQScoreHisto + pivotQualHisto = {} + for q in range(max(self.qualityHisto.keys())): + pivotQualHisto[q] = self.qualityHisto[q] + df = {'readCount':self.readCount, + 'numBases':self.numBases, + 'baseQScoreHisto':pivotQBaseHisto, + 'nucleotideHisto':pivotNucHisto, + 'qualityHisto':pivotQualHisto} + return df + +class MergedResultTuple(): + "Merged multi-file results object. You can iterate on it for filename and ResultTuple pairs, or call its attributes for combined results" + def __init__(self, **kwargs): + self.per_file = {} + self.per_file.update(kwargs) + + def __getattr__(self, name): + return getattr(reduce(lambda p, q: p & q, [v for k, v in sorted(self.per_file.items())], ResultTuple.zero()), name) + + def __iter__(self): + return iter(self.per_file.items()) + + +def analyze_batch_reads(reads): + readCount = 0 + numBases = 0 + baseQScoreHisto = [] + nucleotideHisto = [] + qualityHisto = Counter() + for name, seq, quals in reads: + qualityHisto.update(quals) + def scan_a_read(read): + #for base, qual, index in zip(seq, quals, count(0)): + base, qual, index = read + try: + nucleotideHisto[index][base] += 1 + except IndexError: + nucleotideHisto.append(Counter(base)) + try: + baseQScoreHisto[index][qual] += 1 + except IndexError: + baseQScoreHisto.append(Counter({qual:1})) + return index + m = map(scan_a_read, zip(seq, quals, count(0))) #numBases is the last index of the read + #print(name, m[:3]) + numBases += m[-1] + readCount += 1 + r = ResultTuple(readCount, numBases, baseQScoreHisto, nucleotideHisto, qualityHisto) + return r + + +def cluster(slicable, n=2): + "return an n-tuple of iterators over an in-memory sequence" + #determine the offset size + offset = (len(slicable) / n) + 1 + #return tuple([islice(slicable, i, i+offset) for i in range(n)]) + return 
tuple([tuple(slicable[i*offset:(i+1)*offset-1]) for i in range(n)]) + + +def tupleize_parse(fileh, format): + for read in SeqIO.parse(fileh, format): + yield (read.id, str(read.seq), read.letter_annotations['phred_quality']) + +#@profile +def analyze_shortread(seqFile, map_f=map, multi=qa_params.get('parallelism', cpu_count())): + "analyze one file whose reads are typical short-read technology" + if 'sff' in seqFile: + format, quality, mode = 'sff', 'phred_quality', 'rb' + rds_f = partial(tupleize_parse, format=format) + else: + format, quality, mode = 'fastq', 'phred_quality', 'rU' + rds_f = readfq + with open_input(seqFile) as seqFileH: + rds = list(rds_f(seqFileH)) + rds = cluster(rds, multi) + result = sum(map_f(analyze_batch_reads, rds), ResultTuple.zero()) + return result + + +def analyze_longread(seqFile, map_f=map): + "analyze one file from pacbio or nanopore" + + return ResultTuple.zero() + #what to do here? if nanopore, plot 2d/1d length vs accuracy + + +def panic(*a, **k): + raise ValueError("No analysis method for this sequencing type") + +config = {'Roche 454':analyze_shortread, + 'Illumina MiSeq':analyze_shortread, + 'Illumina NextSeq':analyze_shortread, + 'Illumina HiSeq':analyze_shortread, + 'IonTorrent PGM':analyze_shortread, + 'PacBio RS':analyze_longread, + 'Oxford Nanopore MiniION':analyze_longread} + + +class QaMemo(object): + "lazy-evaluated QA operation" + + result = None #named tuple results object + + @classmethod + def analyze(cls, + sequenceFiles, + sequencingType, + map_f=qa_params.get('map implementation', map), + plots=qa_params.get('plots', True), + annotation=qa_params.get('annotation', '')): + if type(sequenceFiles) == str: #if this is a single file + sequenceFiles = [sequenceFiles] + analysis = partial(config.get(sequencingType, panic), map_f=map_f) +# result = reduce(lambda a, b: a & b, map(analysis, sequenceFiles), ResultTuple.zero()) + result = MergedResultTuple(**{fi:analysis(fi) for fi in sequenceFiles}) + if plots: + try: + 
from gims_qa_plot import qa_plots + result_df = result.to_dataFrame() + qa_plots(result_df, annotation) + except Exception: + import traceback + traceback.print_exc() + return result + + @classmethod + def of(cls, sequenceFiles, sequencingType): + try: + if not cls.result: + cls.result = QaMemo.analyze(sequenceFiles, sequencingType) + return cls.result + except TypeError as e: + raise ValueError(str(e)), None, sys.exc_info()[2] + + +TaxTuple = namedtuple("TaxTuple", ("taxon", "taxon_prevalence", "contaminant", "contaminant_prevalence", "checksum")) + + +class TaxMemo(object): + "lazy-evaluated Tax check operation" + + result = None # + + @classmethod + def analyze(cls, sequenceFiles, database): + from gims_qa_kraken import run_kraken + return run_kraken(sequenceFiles, database, dict(preload=None, + threads=qa_params.get('parallelism', 1))) + + # ## print cls + # ## print sequenceFiles + # from gims_qa_kraken import runKrakenSeqTest + + # #hash the kraken db + # #this isn't right yet, the db is a directory + # db_hash = md5(open(database, 'rb').read()).hexdigest() + + + # #Run Kraken on the current sequence and collect result and pass to TaxTuple to be written to xml + # KrakenResult = runKrakenSeqtest(sequenceFiles,krakendatabase) + + # #do this more readably lines with list comprehensions + # org_headers = ('OrganismTwo', 'OrganismThree', 'OrganismFour', 'OrganismFive') + # org_percent_headers = ('OrganismTwo%', 'OrganismThree%', 'OrganismFour%', 'OrganismFive%') + # Organisms = [KrakenResult[h].iloc[0] for h in org_headers] + # OrganismsPer = [KrakenResult[h].iloc[0] for h in org_percent_headers] + + + # return TaxTuple(KrakenResult['OrganismOne'].iloc[0], KrakenResult['OrganismOne%'].iloc[0], Organisms, OrganismsPer, db_hash) + + @classmethod + def of(cls, *args, **kwargs): + if not cls.result: + cls.result = TaxMemo.analyze(*args, **kwargs) + return cls.result + +failure_mode = "" + +def update_failure_mode(failure): + global failure_mode + if failure_mode: + 
failure_mode += "; {}".format(failure) + else: + failure_mode = failure + + +########################################################################################## +# +# START HERE to design and configure the tests ---------- +# +########################################################################################## + + + + +@qa_test(disable=True) +def parametersUsed(sequencingType='not given', + coverageThreshold='not given', + approximateGenomeSize=4500000): + "Simple function to save the test and evaluation parameters used" + import xml.etree.ElementTree as xml + el = xml.SubElement + root = xml.Element("parametersUsed") + for key, value in qa_params.items(): + el(root, "parameter", attrib=dict(key=str(key), value=str(value))) + return root + +@qa_test +def readCount(sequenceFileName, sequencingType): + return QaMemo.of(sequenceFileName, sequencingType).readCount + +@qa_test +def coverage(sequenceFileName, sequencingType, approximateGenomeSize=4500000): + return QaMemo.of(sequenceFileName, sequencingType).numBases / int(approximateGenomeSize) + + +# @qa_test(disable=False) +# def coverageQThirtyPlus(sequenceFileName, sequencingType, approximateGenomeSize=4500000, minimumQScore=qa_params.get('user Q threshold', 27.0)): +# numOverMinQ = 0 +# for score, num in QaMemo.of(sequenceFileName, sequencingType).baseQScoreHisto.items(): +# if score >= int(minimumQScore): +# numOverMinQ += num +# return numOverMinQ + + +def mean(counter): + return float(sum([v * c for v, c in counter.items()])) / float(sum(counter.values())) + + +@qa_test +def averageQPerFile(sequenceFileName, sequencingType): + return sorted({fi:mean(res.qualityHisto) for fi,res in QaMemo.of(sequenceFileName, sequencingType)}.items()) + + +def averages_exceed(qualityHisto, q): + return mean(qualityHisto) >= q + + +@qa_test +def qcStatus(sequenceFileName, sequencingType, + coverageThreshold=None, + approximateGenomeSize=4500000, + minimumQScore=qa_params.get('user Q threshold', 27.0), + 
bbThreshold=qa_params.get('permitted base balance variance', 0.2)): + global failure_mode + results = QaMemo.of(sequenceFileName, sequencingType) + if not all([averages_exceed(r.qualityHisto, minimumQScore) for f, r in results]): + update_failure_mode("qscore below {}".format(minimumQScore)) + return "Fail" + a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')] + at = float(a) / float(t) + gc = float(g) / float(c) + if any((at > 1.0 + bbThreshold, at < 1.0 - bbThreshold, gc > 1.0 + bbThreshold, gc < 1.0 - bbThreshold)): + update_failure_mode("AT or GC base-balance outside of {} threshold".format(bbThreshold)) + return "Fail" + if coverageThreshold: + if not (results.numBases / int(approximateGenomeSize)) >= float(coverageThreshold): + update_failure_mode("coverage below {}".format(coverageThreshold)) + return "Fail" + return "Pass" + return "Undetermined" + + + + +def baseBalance(sequenceFileName, sequencingType): + results = QaMemo.of(sequenceFileName, sequencingType) + a, t, g, c = [sum([n[b] for n in results.nucleotideHisto]) for b in ('A', 'T', 'G', 'C')] + return (float(a) / float(t) if t else 1, float(g) / float(c) if c else 1) + +@qa_test +def at_gc_baseBalance(sequenceFileName, sequencingType): + a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')] + at = float(a) / float(t) if t else 1 + gc = float(g) / float(c) if c else 1 + return "{:02}/{:02}".format(at, gc) + +@qa_test(disable=True) +def baseBalance(sequenceFileName, sequencingType, threshold=qa_params.get('permitted base balance variance', 0.2) ): + import xml.etree.ElementTree as xml + el = xml.SubElement + a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')] + at = float(a) / float(t) if t else 1 + gc = float(g) / float(c) if c else 1 + status = "Pass" + if any((at > 1.0 + threshold, 
at < 1.0 - threshold, gc > 1.0 + threshold, gc < 1.0 - threshold)): + status = "Fail" + update_failure_mode("Base balance out of variance threshold ({:.2},{:.2} > {})".format(at, gc, threshold)) + root = xml.Element('baseBalanceResult') + el(root, 'AT').text = str(at) + el(root, 'GC').text = str(gc) + el(root, 'baseBalanceStatus').text = status + return root + + + + + +@qa_test(disable = ~os.path.exists(qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')), + profile=False) +def taxonomyQualityUsingKraken(sequenceFileName, + sequencingType, + genus=None, + species=None, + databasePath=qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB') + ): + + #print sequenceFileName + #print sequencingType + #print sequenceFileName + #print sequencingType + #print genus + #print species + + + import xml.etree.ElementTree as xml + global failure_mode + el = xml.SubElement + res = TaxMemo.of(sequenceFileName, databasePath) + #make return element + root = xml.Element('taxonomyQualityResult') + if not genus: + el(root, 'taxStatus').text = "Undetermined" + else: + if genus in res.taxon.split() and species in res.taxon.split(): + el(root, 'taxStatus').text = "Pass" + else: + el(root, 'taxStatus').text = "Fail" + update_failure_mode('"{} {}" was not in agreement with calculated ID "{}"'.format(genus, species, res.taxon)) + el(root, 'mostPrevalentTaxon', dict(prevalence=str(res.taxon_prevalence))).text = res.taxon[-230:] #last 255 characters + + contams = el(root, 'contaminants') + for i in range(4): + el(contams, 'prevalentContaminant', dict(prevalence=str(res.contaminant_prevalence[i]), ordinal=str(i))).text = res.contaminant[i][-230:] + el(root, 'database', dict(checksum=str(res.checksum))).text = databasePath + + + return root + + + +@qa_test +def reasonForFailure(): + global failure_mode + return failure_mode[0:254] #255 character limit in destination DB field + +@qa_test(disable=True) +def run_serotype_usingseqsero( + sequenceFileName, + 
sequencingType): + from gims_qa_serotyping import run_serotype_usingseqsero + + seq1 = "" + sequenceFileName[0] + seq2 = "" + sequenceFileName[1] + + return run_serotype_usingseqsero( + seq1, seq2, + sequencingType, + Oantigen="Undetermined", + H1antigen="Undetermined", + H2antigen="Undetermined", + AntigenicProfile="Undetermined", + PredictedSerotype="Undetermined") + + return root
#!/bin/bash
# Entry point for the QA core on the cluster: load environment modules,
# self-update the deployed checkout, then launch the core.

#export MODULEPATH=/etc/modulefiles:/opt/mellanox/bupc/2.2/modules:/sw/Modules/
#export GIT_SSH_COMMAND="ssh -i /home/justin.payne/.ssh/deploy_key_dev"

module load qa-core
module load kraken
module load git
#module load fastqc
module load pandas
module load SeqSero

# Self-update the deployment.  Redirection order matters: '>/dev/null 2>&1'
# silences both streams; the previous '2>&1>/dev/null' pointed stderr at the
# terminal and only stdout at /dev/null.
GIT_SSH_COMMAND="ssh -qi /sw/apps/qa-core/deploy_key_dev" git -C /sw/apps/qa-core/ pull origin deployment --quiet >/dev/null 2>&1

# Quote "$1" so an input path containing spaces survives word-splitting.
TEMP=/isilon/gpfs_scratch python /sw/apps/qa-core/gims_core.py "$1" -p 8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/sample/Input/input.xml Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,45 @@ +<?xml version='1.0' encoding='UTF-8'?> +<qaQcCoverageInput> +<!-- + <sequenceFileName comment="passes cov, fails quality">FNW19G43_S8_L001_R1_001.fastq.gz</sequenceFileName> + <sequenceFileName>FNW19G43_S8_L001_R2_001.fastq.gz</sequenceFileName> + --> + + + + + + <!-- <sequenceFileName comment="passes cov and quality">NY65357675_S20_L001_R1_001.fastq.gz</sequenceFileName> + <sequenceFileName>NY65357675_S20_L001_R2_001.fastq.gz</sequenceFileName> --> + <genus>Campylobacter</genus> + <species>jejuni</species> + + + + +<!-- + <sequenceFileName comment="small file">forward.fastq.gz</sequenceFileName> + <sequenceFileName>reverse.fastq.gz</sequenceFileName> --> + + + + + + + + + <sequenceFileName comment="moderate size file">forward_big.fastq.gz</sequenceFileName> + <sequenceFileName>reverse_big.fastq.gz</sequenceFileName> + + + +<!-- <sequenceFileName comment="454 file">roche_10000.sff</sequenceFileName> --> + + <sequencingType>Illumina MiSeq</sequencingType> + +<!-- <sequencingType>Roche 454</sequencingType> --> + + <approximateGenomeSize>4500000</approximateGenomeSize> + <coverageThreshold>20</coverageThreshold> + +</qaQcCoverageInput>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/sample/Output/output.tsv Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,2 @@ +at_gc_baseBalance averageQPerFile coverage name qcStatus readCount reasonForFailure version +0.957667053211/0.982688971661 [('qa_core/sample/Input/forward_big.fastq.gz', 35.54551581739571), ('qa_core/sample/Input/reverse_big.fastq.gz', 25.590399012754776)] 2 TEST TEST TEST Fail 59994 qscore below 27.0 galaxy/pure+mulcore+plots+qscore+basebal
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/qa_core/sample/Output/output.xml Tue May 08 18:54:20 2018 -0400 @@ -0,0 +1,14 @@ +<?xml version="1.0" ?> +<qaQcCoverageOutput> + <version>galaxy/pure+mulcore+plots+qscore+basebal</version> + <readCount>59994</readCount> + <coverage>2</coverage> + <averageQPerFile>[('forward_big.fastq.gz', 35.54551581739571), ('reverse_big.fastq.gz', 25.590399012754776)]</averageQPerFile> + <qcStatus>Fail</qcStatus> + <baseBalanceResult> + <AT>0.957667053211</AT> + <GC>0.982688971661</GC> + <baseBalanceStatus>Pass</baseBalanceStatus> + </baseBalanceResult> + <reasonForFailure>qscore below 27.0</reasonForFailure> +</qaQcCoverageOutput>