# HG changeset patch
# User jpayne
# Date 1525820060 14400
# Node ID dafec28bd37efe099da72fab19437f1ae2068a7c
planemo upload
diff -r 000000000000 -r dafec28bd37e base_balance.png
Binary file base_balance.png has changed
diff -r 000000000000 -r dafec28bd37e cfsan-qaqc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cfsan-qaqc.xml Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,73 @@
+
+
+ biopython
+ matplotlib
+ numpy
+ pandas
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitignore Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,17 @@
+*.pyc
+sample/Output/*xml
+./deploy-dev
+./deploy-dev.pub
+./deploy-prep
+./deploy-prep.pub
+./deploy-prod
+./deploy-prod.pub
+deploy-*
+venv
+venv/*
+.DS_Store
+*.patch
+*bak
+tar
+.vscode
+.vscode/*
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/.gitmodules
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitmodules Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,3 @@
+[submodule "task_core"]
+ path = task_core
+ url = http://10.12.207.89/justin.payne/task_core.git
diff -r 000000000000 -r dafec28bd37e qa_core/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/README.md Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,7 @@
+# qa_core
+## Genomics QA Core for CFSAN Sequencing Operations
+
+### Installing
+ virtualenv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
diff -r 000000000000 -r dafec28bd37e qa_core/gims_core.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_core.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,300 @@
+#! /usr/bin/env python
+
+import argparse
+import binascii
+from collections import OrderedDict
+from contextlib import contextmanager
+import xml.dom.minidom as dom
+from xml.etree import ElementTree as xml
+from functools import partial, wraps
+from multiprocessing.pool import ThreadPool as Pool
+from inspect import getargspec
+import os, os.path, sys, errno
+import imp
+import subprocess
+import logging
+
+ver = "unknown"
+
+
+usage = """ CFSAN Genomics QA Core
+May 15, 2015
+Justin Payne
+justin.payne@fda.hhs.gov
+
+ usage:
+ {0} /path/to/sandbox (production mode)
+ {0} {{0}}
+
+""".format(os.path.basename(__file__))
+
+PHRED_OFFSET = 33
+
+input_subdir = 'Input'
+output_subdir = 'Output'
+
+#Heng Li's readfq implementation
+#https://github.com/lh3/readfq/blob/master/readfq.py
+def readfq(fp): # this is a generator function
+ last = None # this is a buffer keeping the last unprocessed line
+ while True: # mimic closure; is it a bad idea?
+ if not last: # the first record or a record following a fastq
+ for l in fp: # search for the start of the next record
+ if l[0] in '>@': # fasta/q header line
+ last = l[:-1] # save this line
+ break
+ if not last: break
+ name, seqs, last = last[1:].partition(" ")[0], [], None
+ for l in fp: # read the sequence
+ if l[0] in '@+>':
+ last = l[:-1]
+ break
+ seqs.append(l[:-1])
+ if not last or last[0] != '+': # this is a fasta record
+ yield name, ''.join(seqs), [] # yield a fasta record
+ if not last: break
+ else: # this is a fastq record
+ seq, leng, seqs = ''.join(seqs), 0, []
+ for l in fp: # read the quality
+ seqs.append(l[:-1])
+ leng += len(l) - 1
+ if leng >= len(seq): # have read enough quality
+ last = None
+ yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record
+ break
+ if last: # reach EOF before reading enough quality
+ yield name, seq, [] # yield a fasta record instead
+ break
+
+#from qa_core import qa_test, new_output_file, open_input, readfq, qa_params
+
+qa_tests = OrderedDict()
+
+qa_params = OrderedDict()
+
+qa_core = imp.new_module("qa_core")
+sys.modules['qa_core'] = qa_core
+
+def qa_test(func=None, disable=False, profile=True):
+ "QA Test decorator"
+ def qa_test_decorator(qa_func):
+ #define a parameter-binding wrapper
+ @wraps(qa_func)
+ def wrap(namespace):
+ "Use partial application of functions to bind parameters declared in the QA function from a namespace object"
+ monad = qa_func
+ for argument in getargspec(qa_func).args:
+ if hasattr(namespace, argument) and getattr(namespace, argument):
+ monad = partial(monad, **{argument: getattr(namespace, argument)})
+
+ #and then call it
+ return monad()
+
+ wrap.__profile_enabled__ = profile
+ if not disable:
+ # if not name:
+ # name = qa_func.__name__
+ qa_tests[qa_func.__name__] = wrap
+ return wrap
+ if func:
+ return qa_test_decorator(func)
+ return qa_test_decorator
+
+qa_core.qa_test = qa_test
+qa_core.readfq = readfq
+qa_core.qa_params = qa_params
+
+@qa_test
+def version():
+ "Collect repository commit id from git"
+ p = os.path.abspath(os.path.dirname(__file__))
+ try:
+ ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '')
+ except subprocess.CalledProcessError:
+ ref = "+".join(qa_tests.keys())[:255]
+ return ref
+
+@qa_test
+def name(annotation=None):
+ if annotation:
+ return annotation
+
+@contextmanager
+def open_file(name, path='.', mode='w'):
+ with open(os.path.join(path, name), mode) as f:
+ if 'r' in mode and binascii.hexlify(f.read(2)) == b'1f8b':
+ import gzip
+ with gzip.open(os.path.join(path, name), mode) as g:
+ yield g
+ else:
+ f.seek(0)
+ yield f
+
+def suite(namespace, log=logging.getLogger('qa.suite')):
+ for name, test in qa_tests.items():
+ logg = log.getChild(name)
+ try:
+ result = test(namespace)
+ if result is not None:
+ logg.info(result)
+ yield name, result
+ except Exception as e:
+ import traceback
+ logg.error(traceback.format_exc())
+
+
+def main(namespace):
+ if namespace.annotation:
+ qa_params['annotation'] = namespace.annotation
+ parallelism = vars(namespace).get('parallelism', 1)
+ if parallelism > 1:
+ qa_params['map implementation'] = Pool(parallelism).map
+ if namespace.mode not in ('suite', 'profile'):
+ #it's a path to a sandbox job
+ logging.getLogger('qa.input').info("Load job description xml from {}...".format(namespace.mode))
+ tree = xml.parse(os.path.join(namespace.mode, "Input/input.xml"))
+ root = tree.getroot()
+ #logging.getLogger('qa.input').debug(xml.tostring(root).strip())
+ for element in root:
+ val = getattr(namespace, element.tag)
+ if isinstance(val, list):
+ val.append(element.text)
+ else:
+ setattr(namespace, element.tag, element.text)
+ #set up utility functions
+ path = os.path.join(namespace.mode, 'Output')
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+ except Exception as e:
+ logging.getLogger('qa.setup').error(e)
+ qa_core.new_output_file = partial(open_file, path=path)
+ #qa_core.new_output_file = lambda f: open_file(f, path=path)
+ qa_core.open_input = partial(open_file, path=os.path.join(namespace.mode, "Input"), mode='r')
+ #import and run tests
+ logging.getLogger('qa.tests').info("Loading and running tests...")
+ import gims_qa_tests
+ root = xml.Element("qaQcCoverageOutput")
+ tree = xml.ElementTree(element=root)
+ [logging.getLogger('qa.params').info("{}:{}".format(k,v)) for k,v in vars(namespace).items()]
+ for i, (name, result) in enumerate(suite(namespace)):
+ if isinstance(result, xml.Element):
+ root.append(result)
+ else:
+ if result:
+ xml.SubElement(root, name).text = str(result)
+
+ try:
+ with qa_core.new_output_file('output.xml') as out:
+ d = dom.parseString(xml.tostring(root))
+ d.writexml(out, addindent=' ', newl='\n')
+ logging.getLogger('qa.results').info("Finished tests and wrote to {}".format(namespace.mode))
+ except Exception as e:
+ logging.getLogger('qa.results').error(e)
+ elif namespace.mode == 'suite':
+ qa_core.open_input = partial(open_file, mode='rb')
+ qa_core.new_output_file = open_file
+ if not namespace.output_plots:
+ qa_params['plots'] = False
+ else:
+ #set up directory for plots
+ pass
+ import gims_qa_tests
+ def fix_value(result):
+ if isinstance(result, xml.Element):
+ return xml.tostring(result)
+ return result
+ results = {name:fix_value(result) for name, result in suite(namespace)}
+ import csv
+ wtr = csv.DictWriter(namespace.output, delimiter='\t', fieldnames=sorted(results.keys()))
+ wtr.writeheader()
+ wtr.writerow(results)
+ elif namespace.mode == 'profile':
+ return profile(namespace)
+
+
+def profile(namespace):
+ log = logging.getLogger('qa.profile')
+ import cProfile
+ import StringIO
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Alphabet import IUPAC
+ import random
+ from random import choice as c
+ from tempfile import TemporaryFile as tmp
+
+ qa_params['plots'] = False
+
+ profile_fastq = StringIO.StringIO()
+ log.info("Setup...")
+ #create some fake FASTQ 'reads'
+ for r in range(10000):
+ #boring random distribution
+ #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)])
+ #interesting diamond distribution
+ s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)])
+ s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)])
+
+ q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)]
+ rec = SeqRecord(Seq(s, IUPAC.unambiguous_dna), id="read_{}".format(r))
+ rec.letter_annotations['phred_quality'] = q
+ profile_fastq.write(rec.format('fastq'))
+ @contextmanager
+ def input_stub(filename, mode=None):
+ if 'xml' in filename:
+ profile_xml.seek(0)
+ yield profile_xml
+ elif 'fastq' in filename:
+ profile_fastq.seek(0)
+ yield profile_fastq
+ else:
+ yield open('/dev/null', 'r')
+
+ qa_core.open_input = input_stub
+ profiling_buffer = StringIO.StringIO()
+
+ @contextmanager
+ def get_profiling_buffer(*a, **k):
+ yield profiling_buffer
+ profiling_buffer.seek(0)
+
+ qa_core.new_output_file = get_profiling_buffer
+ import gims_qa_tests
+
+ namespace.sequenceFileName = ('forward.fastq', 'reverse.fastq')
+ namespace.sequencingType = 'Illumina MiSeq'
+ namespace.approximateGenomeSize = 1000000
+
+ def run_tests(namespace, log):
+ log.getChild('run')
+ log.info(vars(namespace))
+ for name, test in qa_tests.items():
+ if test.__profile_enabled__:
+ test(namespace)
+ log.info("Profiling...")
+ return cProfile.runctx('run_tests(namespace, log)', None, locals(), sort='tottime')
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("mode")
+ parser.add_argument('sequenceFileName', nargs='*')
+ parser.add_argument('--genus','-g')
+ parser.add_argument('--species', '-s')
+ parser.add_argument('--sequencingType', '-t')
+ parser.add_argument('--approximateGenomeSize','--gs', type=int, default=4500000)
+ parser.add_argument('--coverageThreshold', '-c', type=int, default=20)
+ parser.add_argument('--minimumQScore', '-q', type=float, default=27.0)
+ parser.add_argument('--databasePath', '--db')
+ parser.add_argument('--log', '-l', type=argparse.FileType('w'), default=sys.stdout)
+ parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout)
+ parser.add_argument('--output_plots')
+ parser.add_argument('--parallelism', '-p', type=int, default=1)
+ parser.add_argument('--annotate', '-a', type=str, default=None, dest='annotation')
+ params = parser.parse_args()
+ logging.basicConfig(format='%(name)s\t%(levelname)s:%(message)s', level=logging.DEBUG, stream = params.log)
+ logging.getLogger('qa.cmd').info(" ".join(sys.argv))
+ main(params)
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_core.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_core.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,390 @@
+#!/usr/bin/env python
+
+#from __future__ import print_function
+
+import xml.dom.minidom as dom
+import xml.etree.ElementTree as xml
+import sys
+import os, os.path
+from functools import partial, wraps
+from inspect import getargspec, getfile, currentframe, getsource
+from os.path import join as j
+import subprocess
+import textwrap
+import imp
+from collections import OrderedDict
+from contextlib import contextmanager
+import StringIO
+import gzip
+
+ver = "core+plots+fxt"
+
+
+usage = """ CFSAN Genomics QA Core
+May 15, 2015
+Justin Payne
+justin.payne@fda.hhs.gov
+
+ usage:
+ {0} /path/to/sandbox (production mode)
+ {0} {{0}}
+
+""".format(os.path.basename(__file__))
+
+PHRED_OFFSET = 33
+
+input_subdir = 'Input'
+output_subdir = 'Output'
+
+path = os.getcwd()
+
+qa_tests = OrderedDict()
+qa_undecorated_tests = OrderedDict()
+qa_params = OrderedDict()
+
+class QaParamsDict(OrderedDict):
+ "orderedDict that sets on get access"
+
+ def get(self, key, value=None):
+ self[key] = result = super(QaParamsDict, self).get(key, value)
+ return result
+
+
+qa_params = QaParamsDict()
+
+def int_then_string(value):
+ try:
+ return int(value)
+ except ValueError:
+ return str(value)
+
+class IncompleteXMLError(Exception):
+ pass
+
+def qa_test(func=None, disable=False, profile=True):
+ "Decorator to simplify XML interaction"
+ def wrap(qa_func):
+ "Inner function to decorate qa test"
+ #print "registering", qa_func.__name__
+ qa_undecorated_tests[qa_func.__name__] = qa_func
+ if disable:
+ return qa_func
+ @wraps(qa_func)
+ def bind_metadata_to_arguments(metadata, qa_func):
+ "Find elements in XML with same name as function argument and bind their content"
+ try:
+ arguments = list(getargspec(qa_func).args)
+ except TypeError:
+ arguments = list(getargspec(qa_func.func).args)
+ for argument in arguments:
+ elements = metadata.findall(".//{0}".format(argument))
+ if len(elements) == 1:
+ qa_func = partial(qa_func, **{argument:int_then_string(elements[0].text)})
+ elif len(elements) > 1:
+ qa_func = partial(qa_func, **{argument:[int_then_string(e.text) for e in elements]})
+ else:
+ pass
+ try:
+ return qa_func()
+# except TypeError as e:
+# for el in metadata:
+# if el.tag in args:
+# arguments.remove(el.tag)
+# message = "Missing argument or arguments in input.xml: {} not present.".format(', '.join(arguments))
+# raise IncompleteXMLError, message, sys.exc_info()[-1]
+ finally:
+ pass
+ #bind_metadata_to_arguments
+ decorated_qa_func = partial(bind_metadata_to_arguments, qa_func=qa_func)
+ decorated_qa_func.__profile_enabled__ = profile
+ qa_tests[qa_func.__name__] = decorated_qa_func
+ return decorated_qa_func
+ #qa_test_decorator
+ if func:
+ return wrap(func)
+ return wrap
+
+
+qa_core = imp.new_module("qa_core")
+qa_core.qa_test = qa_test
+qa_core.test_output_buffer = StringIO.StringIO()
+
+def present(file):
+ return "/dev/null"
+
+qa_core.present = present
+
+@contextmanager
+def get_test_buffer(*args, **kwargs):
+ yield qa_core.test_output_buffer
+
+@contextmanager
+def get_test_input_file(*args, **kwargs):
+ yield open('/dev/null', 'r')
+
+qa_core.new_output_file = get_test_buffer
+qa_core.open_input = get_test_input_file
+qa_core.qa_params = qa_params
+sys.modules['qa_core'] = qa_core
+
+
+def main(sandbox, profile=False, *a, **k):
+ output = "qa_core: ran "
+ with qa_core.open_input('input.xml') as inputfile:
+ metadata = xml.parse(inputfile).getroot()
+
+ qaoutput = xml.Element("qaQcCoverageOutput")
+ outputxml = xml.ElementTree(element=qaoutput)
+
+ for name, test in qa_tests.items():
+ if (profile and test.__profile_enabled__) or not profile:
+ output += "{0}, ".format(name)
+ try:
+ result = test(metadata)
+ if isinstance(result, xml.Element):
+ qaoutput.append(result)
+ else:
+ if result:
+ xml.SubElement(qaoutput, name).text = str(result)
+ except Exception as e:
+ output += 'and died on error.'
+ import traceback
+ traceback.print_exc()
+ print(test)
+ xml.SubElement(qaoutput, 'reasonForFailure').text = str(e)
+ break
+# if gims_qa_tests.failure_mode:
+# xml.SubElement(qaoutput, "reasonForFailure").text = str(gims_qa_tests.failure_mode)
+
+ with qa_core.new_output_file('output.xml') as outputfile:
+ d = dom.parseString(xml.tostring(qaoutput))
+ d.writexml(outputfile, addindent=' ', newl='\n')
+ return output
+
+
+
+
+
+@qa_test
+def version():
+ "Collect repository commit id from git"
+ p = os.path.abspath(os.path.dirname(__file__))
+ try:
+ ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '')
+ except subprocess.CalledProcessError:
+ ref = ver
+ return ref
+
+@qa_test(disable=True)
+def profile():
+ "profiling command to test performance"
+
+ print("Starting profiler. Imports...")
+
+ import cProfile
+ import StringIO
+ import Bio
+ import random
+ from random import choice as c
+
+ from tempfile import TemporaryFile as tmp
+
+
+ print("Setting up test data...")
+
+ profile_xml = StringIO.StringIO("""
+
+ forward.fastq
+ reverse.fastq
+ Illumina MiSeq
+ 1000000
+
+""")
+
+ qa_params['plots'] = False
+
+ profile_fastq = StringIO.StringIO()
+ #create some fake FASTQ 'reads'
+ for r in range(10000):
+ #boring random distribution
+ #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)])
+ #interesting diamond distribution
+ s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)])
+ s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)])
+
+ q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)]
+ rec = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(s, Bio.Alphabet.IUPAC.unambiguous_dna), id="read_{}".format(r))
+ rec.letter_annotations['phred_quality'] = q
+ try:
+ profile_fastq.write(rec.format('fastq'))
+ except:
+ print q
+ print rec
+ raise
+
+
+
+
+ @contextmanager
+ def input_stub(filename, mode=None):
+ if 'xml' in filename:
+ profile_xml.seek(0)
+ yield profile_xml
+ elif 'fastq' in filename:
+ profile_fastq.seek(0)
+ yield profile_fastq
+ else:
+ yield open('/dev/null', 'r')
+
+ qa_core.open_input = input_stub
+ #qa_core.new_output_file = get_test_buffer
+
+ profiling_buffer = StringIO.StringIO()
+
+ @contextmanager
+ def get_profiling_buffer(*a, **k):
+ yield profiling_buffer
+ profiling_buffer.seek(0)
+
+ qa_core.new_output_file = get_profiling_buffer
+
+ reload(gims_qa_tests)
+
+ print("Beginning profiling...")
+ try:
+ s = cProfile.run('main(profile=True)', sort='tottime')
+ #s.sort_stats('pcalls', 'tottime')
+ return str(profiling_buffer.read())
+ except NameError:
+ import traceback
+ traceback.print_exc()
+ main(profile=True)
+ quit()
+ except Exception:
+ import traceback
+ traceback.print_exc()
+ #print str(profile_fastq.getvalue())[:1024]
+ quit()
+ #return exec('main')
+
+
+#Heng Li's readfq implementation
+#https://github.com/lh3/readfq/blob/master/readfq.py
+def readfq(fp): # this is a generator function
+ last = None # this is a buffer keeping the last unprocessed line
+ while True: # mimic closure; is it a bad idea?
+ if not last: # the first record or a record following a fastq
+ for l in fp: # search for the start of the next record
+ if l[0] in '>@': # fasta/q header line
+ last = l[:-1] # save this line
+ break
+ if not last: break
+ name, seqs, last = last[1:].partition(" ")[0], [], None
+ for l in fp: # read the sequence
+ if l[0] in '@+>':
+ last = l[:-1]
+ break
+ seqs.append(l[:-1])
+ if not last or last[0] != '+': # this is a fasta record
+ yield name, ''.join(seqs), [] # yield a fasta record
+ if not last: break
+ else: # this is a fastq record
+ seq, leng, seqs = ''.join(seqs), 0, []
+ for l in fp: # read the quality
+ seqs.append(l[:-1])
+ leng += len(l) - 1
+ if leng >= len(seq): # have read enough quality
+ last = None
+ yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record
+ break
+ if last: # reach EOF before reading enough quality
+ yield name, seq, [] # yield a fasta record instead
+ break
+
+qa_core.readfq = readfq
+
+
+
+if __name__ == "__main__":
+ try:
+ name, sandbox_path = sys.argv[0:2]
+ args = sys.argv[2:]
+
+ if '-p' in args:
+ from multiprocessing import Pool
+ try:
+ m = int(args[args.index('-p') + 1])
+ p = Pool(processes=m)
+ del args[args.index('-p') + 1]
+ qa_core.qa_params['parallelism'] = m
+ except IndexError, ValueError:
+ p = Pool()
+ args.remove('-p')
+ qa_core.qa_params['map implementation'] = p.map
+
+ if '-q' in args:
+ try:
+ q = float(args.pop(args.index('-q') + 1))
+ args.remove('-q')
+ qa_params['user Q threshold'] = q
+ except IndexError, ValueError:
+ print("no Q score threshold provided with '-q' option.")
+ quit(1)
+
+
+ @contextmanager
+ def new_output_file(filename, mode='w'):
+ if not os.path.exists(j(sandbox_path, output_subdir)):
+ os.mkdir(j(sandbox_path, output_subdir))
+ fileh = open(j(sandbox_path, output_subdir, filename), mode)
+ yield fileh
+ fileh.close()
+
+
+ def open_input_from(path):
+ @contextmanager
+ def open_input(filename, mode='rU'):
+ if 'gz' in filename:
+ fopen = gzip.open
+ else:
+ fopen = open
+ fileh = fopen(j(path, filename), mode=mode)
+ yield fileh
+ fileh.close()
+ return open_input
+
+ def present_from(path):
+ def present(file):
+ return os.path.abspath(j(path, file))
+ return present
+
+ qa_core.new_output_file = new_output_file
+ main_func = main
+
+ if not os.path.exists(sandbox_path):
+ qa_core.open_input = open_input_from(os.getcwd())
+ qa_core.present = present_from(os.getcwd())
+ try:
+ main_func = lambda s, *a: qa_undecorated_tests[sandbox_path](*a)
+ except KeyError:
+ print("Invalid QA test name: {0}".format(sandbox_path))
+ quit(1)
+ else:
+ qa_core.open_input = open_input_from(j(sandbox_path, input_subdir))
+ qa_core.present = present_from(j(sandbox_path, input_subdir))
+
+
+
+ except ValueError:
+ import gims_qa_tests
+ print("Missing argument: path to sandbox or name of QA test.")
+ commands = ""
+ #commands = "\n\t\t".join([' '.join((n, " ".join(["<{0}>".format(a) for a in getargspec(f)[0]]) or 'DELETEME')).replace(' DELETEME', '') for n, f in qa_undecorated_tests.items()])
+ for name, func in qa_undecorated_tests.items():
+ args = " ".join(["<{0}>".format(a) for a in getargspec(func).args])
+ commands += "{0} {1}\n\t\t\t".format(name, textwrap.fill(args, 60, subsequent_indent='\t\t\t\t'))
+ print(usage.format(commands))
+ quit(1)
+ else:
+ import gims_qa_tests
+ print(main_func(sandbox_path, profile=False, *args))
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_kraken.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_kraken.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,217 @@
+import csv
+from collections import Counter, OrderedDict
+from hashlib import md5
+import os.path
+import tempfile
+from subprocess import check_call
+from gims_qa_tests import TaxTuple
+#from qa_core import present
+
+def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
+ "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
+
+ #hash the kraken db files
+ md5hash = md5()
+ #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None)
+ md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read())
+
+ db_hash = md5hash.hexdigest()
+
+
+ with tempfile.NamedTemporaryFile('rU') as kraken_out:
+ with tempfile.NamedTemporaryFile() as seq_data:
+ #cat all the files together
+ check_call("cat {files} > {seq_data.name}".format(
+ files = " ".join([present(f) for f in sequenceFiles]),
+ seq_data = seq_data
+ ),
+ shell=True)
+
+ def make_option(key, value):
+ if value is not None:
+ return "--{} {}".format(key, value)
+ return "--{}".format(key)
+
+ #build options
+ options = " ".join([make_option(k,v) for k,v in kraken_options.items()])
+
+ #call kraken
+ check_call("kraken {options} "
+ "--db {database} "
+ "--gzip-compressed "
+ "--fastq {seq_data.name} "
+ "--output {kraken_out.name}".format(**locals()), shell=True)
+
+ counts = Counter()
+
+ with tempfile.NamedTemporaryFile('rU') as kraken_report:
+ check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
+ rows = csv.reader(kraken_report, delimiter='\t')
+ for seq_id, taxon in rows:
+ counts[taxon] += 1.0
+ counts["total"] += 1.0
+
+ #clean up all the temp files by releasing contexts
+
+ top_results = counts.most_common(6) #total, plus five most common taxa
+ total, total_count = top_results.pop(0)
+ top_hit, top_hit_count = top_results.pop(0)
+ next_hits = [h[0].replace(';', ' ') for h in top_results]
+ next_hit_counts = [h[1] for h in top_results]
+
+ top_hit_frac = top_hit_count / total_count
+ next_hit_frac = [h / total_count for h in next_hit_counts]
+
+ return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
+
+def runKrakenSeqtest(sequenceFiles,database):
+
+ krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
+
+ w = datetime.datetime.now()
+
+ krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
+ krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
+
+ sequenceFiles[0]
+ sequenceFiles[1]
+
+ os.path.abspath(os.path.join("Input", sequenceFileName[0]))
+
+ command = "kraken --preload "
+ command += "--db " + database + ' '
+ #if len(fastqPaths) > 1:
+ command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1]))
+ # command += ' '.join(fastqPaths)
+ command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout
+
+ print command
+
+ check_call(command, shell=True)
+ print "%s -- Kraken called..." % datetime.datetime.now()
+
+
+ kraken_report_cmd = "kraken-report --db "
+ kraken_report_cmd += database
+ kraken_report_cmd += " " + krakenoutoutpath + krakenout
+ kraken_report_cmd += " > " + krakenreport
+
+ print kraken_report_cmd
+
+ check_call(kraken_report_cmd, shell=True)
+
+
+
+
+ parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
+
+ kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
+
+ split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
+
+ print "=====================" + krakenreport + "=========================="
+ print split_kraken_rpt_line
+ split_kraken_rpt_length = len(split_kraken_rpt_line)
+ print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4]
+ taxLen = 6;
+ taxIdIdx = 4;
+ outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv'
+
+ metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id"
+
+
+
+
+ splitMetadata = metadata.split("\t")
+ for i in range(0, len(split_kraken_rpt_line)):
+ if split_kraken_rpt_line[i] == '':
+ print "don't add to the metadata...split_kraken_rpt_line.pop(i)"
+ elif i%(taxLen) == taxLen-1:
+ print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1"
+ splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip())
+ elif i%(taxLen) == taxIdIdx:
+ print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx"
+ splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
+ elif i%(taxLen) == 0:
+ print "i is "+str(i)+"under this condition: i%(taxLen) == 0"
+ splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
+
+ splitMetadataLength = len(splitMetadata)
+ first_kraken_percentage = splitMetadataLength-15
+ with open(outputFile, 'wb') as f:
+ #writer = csv.writer(f, delimiter="\t")
+
+ # print splitMetadata
+ for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+ print "Split metadata of i "+splitMetadata[i]
+ for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+ if float(splitMetadata[j]) > float(splitMetadata[i]):
+ print "i: "+str(i)+" and j: "+str(j)
+ print "swapping "+splitMetadata[i]+" with "+splitMetadata[j]
+ old_hit_org = splitMetadata[i-1]
+ old_hit_percentage = splitMetadata[i]
+ old_hit_tx_id = splitMetadata[i+1]
+
+ splitMetadata[i-1] = splitMetadata[j-1]
+ splitMetadata[i] = splitMetadata[j]
+ splitMetadata[i+1] = splitMetadata[j+1]
+
+ splitMetadata[j-1] = old_hit_org
+ splitMetadata[j] = old_hit_percentage
+ splitMetadata[j+1] = old_hit_tx_id
+
+ df = pandas.DataFrame(splitMetadata)
+
+ print df
+
+
+    # Here are the dataframe columns for the top Kraken hits
+
+ print df.iat[0,0]
+ print df.iat[1,0]
+ print df.iat[3,0]
+ print df.iat[4,0]
+ print df.iat[6,0]
+ print df.iat[7,0]
+ print df.iat[9,0]
+ print df.iat[10,0]
+
+ OrganismOne = df.iat[0,0]
+ OrganismTwo = df.iat[3,0]
+ OrganismThree = df.iat[6,0]
+ OrganismFour = df.iat[9,0]
+
+
+ OrganismOnePer = df.iat[1,0]
+ OrganismTwoPer = df.iat[4,0]
+ OrganismThreePer = df.iat[7,0]
+ OrganismFourPer = df.iat[10,0]
+
+
+ df = {'OrganismOne':OrganismOne,
+ 'OrganismTwo':OrganismTwo,
+ 'OrganismThree':OrganismThree,
+ 'OrganismFour':OrganismFour,
+ 'OrganismOne%':OrganismOnePer,
+ 'OrganismTwo%':OrganismTwoPer,
+ 'OrganismThree%':OrganismThreePer,
+ 'OrganismFour%':OrganismFourPer }
+
+
+ #Clean up files
+
+ command = ""
+ command += "rm " + krakenoutoutpath + krakenout
+
+ print command
+
+ check_call(command, shell=True)
+
+
+
+ command = ""
+ command += "rm " + outputFile
+
+ print command
+
+ return pandas.DataFrame(df, index=[0])
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_plot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_plot.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,148 @@
+import subprocess
+from qa_core import new_output_file, qa_params
+from collections import defaultdict
+import warnings
+warnings.filterwarnings("ignore")
+import pandas
+from pandas import DataFrame
+import matplotlib.pyplot as plot
+from numpy import zeros, add, mean, array, percentile, arange
+import traceback
+from gims_qa_tests import cluster
+import logging
+
+bases = ("A", "T", "G", "C", "N")
+
+def qa_plots(df, annotation, log=logging.getLogger('qa.plots'), **kwargs):
+ #log.debug(df)
+ #log.debug(annotation)
+ plot_quality_histo(df['qualityHisto'], annotation, **kwargs)
+ plot_nucleotide_distribution_histo(df['nucleotideHisto'], annotation)
+ plot_quality_distribution_histo(df['baseQScoreHisto'], annotation)
+ plot_base_balance(df, annotation, **kwargs)
+ #plot.show()
+
+def plot_base_balance(df,
+ annotation=qa_params.get('annotation', ''),
+ threshold=qa_params.get('permitted base balance variance', .2),
+ **kwargs):
+ try:
+ plot.figure()
+ plot.axis((0,2,0,2))
+ plot.suptitle("{} Base Balance".format(annotation))
+ plot.xlabel("G/C")
+ plot.ylabel("A/T")
+ thres_min = 1 - threshold
+ thres_max = 1 + threshold
+ for coord in (thres_min, thres_max):
+ for linef in (plot.axvline, plot.axhline):
+ linef(coord, linestyle='dashed', label=str(coord))
+
+ plot.annotate(thres_max, xy=(0, thres_max))
+ plot.annotate(thres_min, xy=(0, thres_min))
+ plot.annotate(thres_max, xy=(thres_max, 0))
+ plot.annotate(thres_min, xy=(thres_min, 0))
+
+ a, t, g, c = [sum(df['nucleotideHisto'][b]) for b in ('A', 'T', 'G', 'C')]
+ y, x = a / float(t), g / float(c)
+ plot.plot((x,), (y,), marker="+")
+ plot.annotate("({:.2},{:.2})".format(x, y), xy=(x+.05, y+.05), textcoords='data')
+ with new_output_file("base_balance.png") as base_bal:
+ plot.savefig(base_bal, format='png', dpi=92)
+ except Exception:
+ traceback.print_exc()
+
+
+def plot_quality_histo(histo,
+ annotation=qa_params.get('annotation', ''),
+ q=qa_params.get('user Q threshold', 27)):
+ try:
+ plot.figure()
+ labels = range(max(histo.keys()) + 1)
+ plot.bar(left=labels,
+ height=[histo[n] for n in labels],
+ color='y',
+ linewidth=1)
+# plot.hist(histo, bins=range(max(histo)))
+ plot.xticks()
+ plot.axvline(x=q, label="Q {}".format(q))
+ #plot.show()
+ plot.xlabel(r"""$Q (-10 log_(10) \rho)$""")
+ plot.ylabel('Number of Bases')
+ plot.suptitle('{} Q-Score Histogram'.format(annotation))
+ plot.legend(loc='upper left', fontsize='x-small')
+ with new_output_file("qscore_histogram.png") as qual_histogram:
+ plot.savefig(qual_histogram, format='png', dpi=92)
+# except ValueError:
+# pass
+ except Exception:
+ traceback.print_exc()
+
+def plot_nucleotide_distribution_histo(dataframe,
+ annotation=qa_params.get('annotation', '')):
+ try:
+ #dataframe = defaultdict(list)
+ colors = iter(('r', 'y', 'g', 'b', 'pink'))
+ plot.figure()
+ last_base = zeros((len(dataframe.values()[0]),)) #set all zeros for bottom of first bar row
+ for base in bases:
+ try:
+ plot.bar(left=range(len(dataframe[base])),
+ height=dataframe[base],
+ width=2,
+ bottom=last_base,
+ color=colors.next(),
+ label=base,
+ linewidth=0)
+ last_base = add(last_base, dataframe[base])
+ except:
+ #print [len(d) for d in dataframe.data.values()]
+ raise
+ plot.autoscale(tight=True)
+ plot.xlabel('Base Position')
+ plot.ylabel('Nucleotide Count')
+ plot.suptitle('{} Nucleotide Distribution Plot'.format(annotation))
+ plot.legend(loc='upper right', fontsize='xx-small')
+ with new_output_file("nucleotide_distribution.png") as nuc_histogram:
+ plot.savefig(nuc_histogram, format='png', dpi=108)
+ except Exception:
+ traceback.print_exc()
+
+def plot_quality_distribution_histo(histo,
+ annotation=qa_params.get('annotation', ''),
+ q=qa_params.get('user Q threshold', 27)
+ ):
+ try:
+ binning = 20
+ plot.figure(figsize=(12, 6))
+ vec = [tuple(c.elements()) for c in histo]
+ means = [mean(t) for t in vec]
+ upper_iqr = array([percentile(t, 75) for t in vec])
+ lower_iqr = array([percentile(t, 25) for t in vec])
+# plot.boxplot(vec,
+# widths=.000005,
+# whis=(5,95),
+# showfliers=False,
+# showbox=True,
+# showmeans=True,
+# meanline=True,
+# showcaps=True,
+# whiskerprops=dict(linestyle='-', color='0.60')
+# )
+
+ plot.fill_between(arange(len(upper_iqr)), upper_iqr, lower_iqr, facecolor='0.50', alpha=0.5, label='interquartile range')
+ plot.plot(means, 'r-', label='Mean')
+ plot.plot(upper_iqr, 'b-', label='interquartile range', alpha=0.25)
+ plot.autoscale(tight=True)
+ plot.axhline(y=q, label="Q {}".format(q))
+ plot.xlabel('Base Position')
+ plot.ylabel(r"""$Q (-10 log_(10) \rho)$""")
+ plot.xticks(range(0, len(histo), binning), range(0, len(histo), binning), rotation='vertical')
+ plot.suptitle('{} Q-Score Distribution Plot'.format(annotation))
+ plot.legend(loc='lower left', fontsize='x-small')
+ plot.grid(True)
+ with new_output_file("qscore_distribution.png") as q_distro:
+ plot.savefig(q_distro, format='png', dpi=108)
+
+ except Exception:
+ traceback.print_exc()
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_plot.pyc
Binary file qa_core/gims_qa_plot.pyc has changed
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_serotyping.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_serotyping.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,110 @@
+import csv
+import shutil
+from collections import Counter, OrderedDict
+from hashlib import md5
+import os.path
+import tempfile
+import glob
+import subprocess
+from subprocess import check_call, check_output
+from gims_qa_tests import TaxTuple
+import logging
+#from qa_core import present
+
+
+
def run_serotype_usingseqsero(
        sequenceFileName1, sequenceFileName2,
        sequencingType,
        Oantigen="Undetermined",
        H1antigen="Undetermined",
        H2antigen="Undetermined",
        AntigenicProfile="Undetermined",
        PredictedSerotype="Undetermined",
        log=logging.getLogger('qa.seqsero')):
    """Run SeqSero on a read pair and parse its report into an XML element.

    Invokes ``SeqSero.py`` on the two read files, scrapes the antigen calls
    out of ``SeqSero_result*/Seqsero_result.txt``, and returns a
    ``SerotypeQualityResultType`` element whose attributes carry the O, H1,
    and H2 antigens, the antigenic profile, and the predicted serotype.
    Any field SeqSero does not report keeps its "Undetermined" default.
    The SeqSero scratch directory is removed before returning.

    NOTE(review): the file names are interpolated into a shell command, so
    paths containing spaces or shell metacharacters will break (or worse);
    consider check_output(['SeqSero.py', ...], shell=False).
    """
    log.info(check_output(
        "SeqSero.py -m 2 -i {sequenceFileName1} {sequenceFileName2} -b sam".format(**locals()),
        shell=True))

    import xml.etree.ElementTree as xml

    for name in glob.glob("SeqSero_result*/Seqsero_result.txt"):
        rows = csv.reader(open(name))
        next(rows)  # skip the header row
        # The report is one "key: value" line per field, in a fixed order.
        # The line counter is reset per report file (previously it carried
        # over between glob matches, so later reports were ignored).
        for i, row in enumerate(rows):
            parts = ''.join(row).split(':')
            if i == 0:
                Oantigen = parts[1]
            elif i == 1:
                H1antigen = parts[1]
            elif i == 2:
                H2antigen = parts[1]
            elif i == 3 and len(parts) > 2:
                # the antigenic profile itself contains ':' separators
                AntigenicProfile = "{}:{}:{}".format(parts[1], parts[2], parts[3])
            elif i == 4:
                if "N/A" in parts[1]:
                    PredictedSerotype = "No Salmonella Serotype"
                else:
                    PredictedSerotype = parts[1]

    root = xml.Element('SerotypeQualityResultType')
    root.set('Oantigen', Oantigen.strip())
    root.set('H1antigen', H1antigen.strip())
    root.set('H2antigen', H2antigen.strip())
    root.set('AntigenicProfile', AntigenicProfile.strip())
    root.set('PredictedSerotype', PredictedSerotype.strip())

    log.getChild('Oantigen').info(Oantigen)
    log.getChild('H1antigen').info(H1antigen)
    log.getChild('H2antigen').info(H2antigen)
    log.getChild('AntigenicProfile').info(AntigenicProfile)
    log.getChild('PredictedSerotype').info(PredictedSerotype)

    # clean up SeqSero's per-run output directory
    check_call("rm -rf SeqSero_result*", shell=True, stderr=subprocess.STDOUT)

    return root
+
+
+
+
+
+
+
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_tests.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_tests.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,434 @@
+from qa_core import qa_test, new_output_file, open_input, readfq, qa_params
+from collections import namedtuple, defaultdict, Counter
+from Bio import SeqIO
+from multiprocessing import Pool, Process, cpu_count
+from functools import partial
+from itertools import izip, count, izip_longest, islice
+from tempfile import NamedTemporaryFile, mkdtemp
+from subprocess import check_call, CalledProcessError
+from numpy import genfromtxt
+from os.path import abspath, join, split, splitext
+from contextlib import contextmanager
+from subprocess import call
+from subprocess import check_output
+from hashlib import md5
+
+
+import sys
+import shutil
+import os
+import pandas
+import datetime
+import re
+import glob, string
+
+#Test implementation classes and functions. The actual tests are down at the bottom.
+
# Raw per-run QA aggregate: total reads, total bases, per-position Q-score
# Counters, per-position nucleotide Counters, and a whole-file quality Counter.
SuperResultTuple = namedtuple("SuperResultTuple", ("readCount", "numBases", "baseQScoreHisto", "nucleotideHisto", "qualityHisto"))
+
+# #Global Variables for kraken methods
+# krakendatabase = "/scratch/biostat/kraken/miniDB/"
+# we don't actually need this as a global
+
class ResultTuple(SuperResultTuple):
    """SuperResultTuple with combination operators.

    ``+`` merges two results position-wise (parallel chunks of the same
    file); ``&`` concatenates histograms end-to-end (distinct files).
    ``zero()`` is the identity element for both.
    """

    def __add__(self, other):
        r1, n1, h1, nuc1, qual1 = self
        r2, n2, h2, nuc2, qual2 = other
        return ResultTuple(r1 + r2,
                           n1 + n2,
                           # pad the shorter histogram with empty Counters
                           [p + q for p, q in izip_longest(h1, h2, fillvalue=Counter())],
                           [p + q for p, q in izip_longest(nuc1, nuc2, fillvalue=Counter())],
                           qual1 + qual2)

    def __and__(self, other):
        r1, n1, h1, nuc1, qual1 = self
        r2, n2, h2, nuc2, qual2 = other
        return ResultTuple(r1 + r2,
                           n1 + n2,
                           h1 + h2,
                           nuc1 + nuc2,
                           qual1 + qual2)

    @classmethod
    def zero(cls):
        "Empty result; identity for both + and &."
        return cls(0, 0, [], [], Counter())

    def to_dataFrame(self):
        """Pivot this result into a dict of plain columns for plotting.

        Despite the name this returns a dict, not a pandas DataFrame;
        gims_qa_plot consumes it directly.  (A previous revision carried a
        dead pandas-import fallback that was never used.)
        """
        pivotNucHisto = defaultdict(list)
        for cnt in self.nucleotideHisto:
            for base in ('A', 'T', 'G', 'C', 'N'):
                pivotNucHisto[base].append(cnt[base])
        pivotQualHisto = {}
        if self.qualityHisto:
            # range() excludes its stop value, so iterate to max+1 or the
            # highest observed quality score would be dropped from the pivot
            for q in range(max(self.qualityHisto.keys()) + 1):
                pivotQualHisto[q] = self.qualityHisto[q]
        return {'readCount': self.readCount,
                'numBases': self.numBases,
                'baseQScoreHisto': self.baseQScoreHisto,
                'nucleotideHisto': pivotNucHisto,
                'qualityHisto': pivotQualHisto}
+
class MergedResultTuple():
    """Merged multi-file results object.

    Iterating yields (filename, ResultTuple) pairs; attribute access
    returns the corresponding value from all files combined with '&'.
    """

    def __init__(self, **kwargs):
        # per-file results, keyed by sequence file name
        self.per_file = dict(kwargs)

    def __getattr__(self, name):
        # Fold every per-file result together (in filename order), then
        # delegate the attribute lookup to the combined tuple.
        combined = ResultTuple.zero()
        for _, result in sorted(self.per_file.items()):
            combined = combined & result
        return getattr(combined, name)

    def __iter__(self):
        return iter(self.per_file.items())
+
+
def analyze_batch_reads(reads):
    """Accumulate QA statistics over an iterable of (name, seq, quals) reads.

    Builds, across all reads in this batch:
      - nucleotideHisto: one Counter per base position (base -> count)
      - baseQScoreHisto: one Counter per base position (Q-score -> count)
      - qualityHisto: one Counter of Q-scores over every base of every read
    and returns them in a ResultTuple together with read and base tallies.
    """
    readCount = 0
    numBases = 0
    baseQScoreHisto = []
    nucleotideHisto = []
    qualityHisto = Counter()
    for name, seq, quals in reads:
        qualityHisto.update(quals)
        def scan_a_read(read):
            # Tally one (base, quality, position) triple; the per-position
            # Counter lists grow lazily the first time a position is seen.
            #for base, qual, index in zip(seq, quals, count(0)):
            base, qual, index = read
            try:
                nucleotideHisto[index][base] += 1
            except IndexError:
                nucleotideHisto.append(Counter(base))
            try:
                baseQScoreHisto[index][qual] += 1
            except IndexError:
                baseQScoreHisto.append(Counter({qual:1}))
            return index
        m = map(scan_a_read, zip(seq, quals, count(0))) #numBases is the last index of the read
        #print(name, m[:3])
        # NOTE(review): m[-1] is len(seq) - 1, so each read contributes one
        # base fewer than its true length to numBases, and a zero-length
        # read would raise IndexError here.  Confirm whether intentional.
        numBases += m[-1]
        readCount += 1
    r = ResultTuple(readCount, numBases, baseQScoreHisto, nucleotideHisto, qualityHisto)
    return r
+
+
def cluster(slicable, n=2):
    """Split an in-memory sequence into an n-tuple of roughly equal tuples.

    Every element of `slicable` appears exactly once, in order; trailing
    chunks may be shorter (or empty when len(slicable) < n).
    """
    # near-ceiling chunk size; '//' keeps slice indices integral on py2 and py3
    offset = (len(slicable) // n) + 1
    # NOTE: the upper slice bound was previously (i+1)*offset - 1, which
    # silently dropped the final element of every chunk.
    return tuple(tuple(slicable[i * offset:(i + 1) * offset]) for i in range(n))
+
+
def tupleize_parse(fileh, format):
    "Adapt Bio.SeqIO records to the (name, sequence, qualities) tuples that readfq yields."
    for read in SeqIO.parse(fileh, format):
        yield (read.id, str(read.seq), read.letter_annotations['phred_quality'])
+
+#@profile
def analyze_shortread(seqFile, map_f=map, multi=qa_params.get('parallelism', cpu_count())):
    """Analyze one file whose reads are typical short-read technology.

    Reads are materialized, split into `multi` chunks, analyzed (possibly
    in parallel via map_f), and the chunk results merged with '+'.
    """
    # NOTE(review): substring test -- any path merely containing 'sff'
    # (e.g. in a directory name) is parsed as SFF; endswith('.sff') would
    # be stricter.  Left as-is to preserve behavior.
    if 'sff' in seqFile:
        rds_f = partial(tupleize_parse, format='sff')
    else:
        rds_f = readfq
    with open_input(seqFile) as seqFileH:
        rds = cluster(list(rds_f(seqFileH)), multi)
        result = sum(map_f(analyze_batch_reads, rds), ResultTuple.zero())
    return result
+
+
def analyze_longread(seqFile, map_f=map):
    "analyze one file from pacbio or nanopore"
    # Placeholder: long-read QA is not implemented, so contribute an empty
    # result to the merged totals.
    return ResultTuple.zero()
    #what to do here? if nanopore, plot 2d/1d length vs accuracy
+
+
def panic(*a, **k):
    "Fallback analysis hook: reject sequencing types with no registered analyzer."
    raise ValueError("No analysis method for this sequencing type")
+
# Dispatch table mapping sequencing platform name -> analysis routine.
# Callers look up with config.get(sequencingType, panic), so unrecognized
# platforms raise via panic().
config = {'Roche 454':analyze_shortread,
          'Illumina MiSeq':analyze_shortread,
          'Illumina NextSeq':analyze_shortread,
          'Illumina HiSeq':analyze_shortread,
          'IonTorrent PGM':analyze_shortread,
          'PacBio RS':analyze_longread,
          'Oxford Nanopore MiniION':analyze_longread}
+
+
+class QaMemo(object):
+ "lazy-evaluated QA operation"
+
+ result = None #named tuple results object
+
+ @classmethod
+ def analyze(cls,
+ sequenceFiles,
+ sequencingType,
+ map_f=qa_params.get('map implementation', map),
+ plots=qa_params.get('plots', True),
+ annotation=qa_params.get('annotation', '')):
+ if type(sequenceFiles) == str: #if this is a single file
+ sequenceFiles = [sequenceFiles]
+ analysis = partial(config.get(sequencingType, panic), map_f=map_f)
+# result = reduce(lambda a, b: a & b, map(analysis, sequenceFiles), ResultTuple.zero())
+ result = MergedResultTuple(**{fi:analysis(fi) for fi in sequenceFiles})
+ if plots:
+ try:
+ from gims_qa_plot import qa_plots
+ result_df = result.to_dataFrame()
+ qa_plots(result_df, annotation)
+ except Exception:
+ import traceback
+ traceback.print_exc()
+ return result
+
+ @classmethod
+ def of(cls, sequenceFiles, sequencingType):
+ try:
+ if not cls.result:
+ cls.result = QaMemo.analyze(sequenceFiles, sequencingType)
+ return cls.result
+ except TypeError as e:
+ raise ValueError(str(e)), None, sys.exc_info()[2]
+
+
# Kraken summary: most prevalent taxon and its prevalence, the runner-up
# "contaminant" taxa with their prevalences, and a checksum identifying
# the database that produced the call.
TaxTuple = namedtuple("TaxTuple", ("taxon", "taxon_prevalence", "contaminant", "contaminant_prevalence", "checksum"))
+
+
class TaxMemo(object):
    """Lazy-evaluated taxonomy (kraken) check.

    Like QaMemo, the result of the first of() call is cached on the class
    and the arguments of later calls are ignored.
    """

    result = None  # cached TaxTuple from the first kraken run

    @classmethod
    def analyze(cls, sequenceFiles, database):
        "Run kraken against `database` and return its TaxTuple."
        from gims_qa_kraken import run_kraken
        return run_kraken(sequenceFiles, database,
                          dict(preload=None,
                               threads=qa_params.get('parallelism', 1)))

    @classmethod
    def of(cls, *args, **kwargs):
        "Return the cached kraken result, computing it on first use."
        if not cls.result:
            cls.result = TaxMemo.analyze(*args, **kwargs)
        return cls.result
+
failure_mode = ""


def update_failure_mode(failure):
    """Record a QA failure description in the module-global failure_mode.

    Successive failures accumulate into one '; '-separated string, which
    reasonForFailure() later reports.
    """
    global failure_mode
    failure_mode = "{}; {}".format(failure_mode, failure) if failure_mode else failure
+
+
+##########################################################################################
+#
+# START HERE to design and configure the tests ----------
+#
+##########################################################################################
+
+
+
+
@qa_test(disable=True)
def parametersUsed(sequencingType='not given',
                   coverageThreshold='not given',
                   approximateGenomeSize=4500000):
    "Simple function to save the test and evaluation parameters used"
    # NOTE(review): the three parameters above are accepted but never used;
    # the returned element is built solely from the global qa_params mapping.
    import xml.etree.ElementTree as xml
    el = xml.SubElement
    root = xml.Element("parametersUsed")
    for key, value in qa_params.items():
        el(root, "parameter", attrib=dict(key=str(key), value=str(value)))
    return root
+
@qa_test
def readCount(sequenceFileName, sequencingType):
    "Total number of reads across all input files."
    return QaMemo.of(sequenceFileName, sequencingType).readCount
+
@qa_test
def coverage(sequenceFileName, sequencingType, approximateGenomeSize=4500000):
    """Approximate depth of coverage: total bases / assumed genome size.

    NOTE(review): integer division under Python 2, so the result is floored
    (the sample output shows a whole-number coverage of 2).
    """
    return QaMemo.of(sequenceFileName, sequencingType).numBases / int(approximateGenomeSize)
+
+
+# @qa_test(disable=False)
+# def coverageQThirtyPlus(sequenceFileName, sequencingType, approximateGenomeSize=4500000, minimumQScore=qa_params.get('user Q threshold', 27.0)):
+# numOverMinQ = 0
+# for score, num in QaMemo.of(sequenceFileName, sequencingType).baseQScoreHisto.items():
+# if score >= int(minimumQScore):
+# numOverMinQ += num
+# return numOverMinQ
+
+
def mean(counter):
    """Weighted mean of a Counter that maps value -> occurrence count.

    Raises ZeroDivisionError on an empty counter, as before.
    """
    total = 0.0
    weight = 0
    for value, occurrences in counter.items():
        total += value * occurrences
        weight += occurrences
    return total / float(weight)
+
+
@qa_test
def averageQPerFile(sequenceFileName, sequencingType):
    "Sorted list of (filename, mean Q-score) pairs, one per input file."
    return sorted({fi:mean(res.qualityHisto) for fi,res in QaMemo.of(sequenceFileName, sequencingType)}.items())
+
+
def averages_exceed(qualityHisto, q):
    "True when the weighted mean quality in qualityHisto is at least q."
    # inline weighted mean: sum(score * count) / sum(count)
    total = float(sum(score * cnt for score, cnt in qualityHisto.items()))
    observations = float(sum(qualityHisto.values()))
    return total / observations >= q
+
+
@qa_test
def qcStatus(sequenceFileName, sequencingType,
             coverageThreshold=None,
             approximateGenomeSize=4500000,
             minimumQScore=qa_params.get('user Q threshold', 27.0),
             bbThreshold=qa_params.get('permitted base balance variance', 0.2)):
    """Overall pass/fail verdict: per-file mean Q, AT/GC base balance, and
    (when a threshold is given) coverage must all clear their limits.

    Each failing check is recorded via update_failure_mode so that
    reasonForFailure() can report it.
    """
    results = QaMemo.of(sequenceFileName, sequencingType)
    # 1) every file's mean quality must reach minimumQScore
    if not all([averages_exceed(r.qualityHisto, minimumQScore) for f, r in results]):
        update_failure_mode("qscore below {}".format(minimumQScore))
        return "Fail"
    # 2) A/T and G/C ratios must stay within +/- bbThreshold of 1.0.
    #    Guard the ratios (as at_gc_baseBalance does) so an absent T or C
    #    count can't divide by zero; reuse `results` rather than re-running
    #    QaMemo.of.
    a, t, g, c = [sum([n[b] for n in results.nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    at = float(a) / float(t) if t else 1
    gc = float(g) / float(c) if c else 1
    if any((at > 1.0 + bbThreshold, at < 1.0 - bbThreshold, gc > 1.0 + bbThreshold, gc < 1.0 - bbThreshold)):
        update_failure_mode("AT or GC base-balance outside of {} threshold".format(bbThreshold))
        return "Fail"
    # 3) optional coverage check (integer division under py2, as in coverage())
    if coverageThreshold:
        if not (results.numBases / int(approximateGenomeSize)) >= float(coverageThreshold):
            update_failure_mode("coverage below {}".format(coverageThreshold))
            return "Fail"
    return "Pass"
+
+
+
+
def baseBalance(sequenceFileName, sequencingType):
    """Return the (A/T, G/C) count ratios; a zero denominator yields 1.

    NOTE(review): this definition is shadowed by the decorated baseBalance
    defined below it, so at module level it is effectively dead code.
    """
    results = QaMemo.of(sequenceFileName, sequencingType)
    a, t, g, c = [sum([n[b] for n in results.nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    return (float(a) / float(t) if t else 1, float(g) / float(c) if c else 1)
+
@qa_test
def at_gc_baseBalance(sequenceFileName, sequencingType):
    """Report the A/T and G/C count ratios as an 'AT/GC' string.

    A balanced library is ~1.0/1.0; a zero T or C count falls back to a
    ratio of 1 to avoid dividing by zero.
    """
    a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    at = float(a) / float(t) if t else 1
    gc = float(g) / float(c) if c else 1
    return "{:02}/{:02}".format(at, gc)
+
@qa_test(disable=True)
def baseBalance(sequenceFileName, sequencingType, threshold=qa_params.get('permitted base balance variance', 0.2) ):
    """XML-producing base-balance check: Fail when either the A/T or G/C
    ratio strays more than `threshold` from 1.0, recording the failure via
    update_failure_mode.

    NOTE(review): this def shadows the undecorated baseBalance above.
    """
    import xml.etree.ElementTree as xml
    el = xml.SubElement
    a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    at = float(a) / float(t) if t else 1
    gc = float(g) / float(c) if c else 1
    status = "Pass"
    if any((at > 1.0 + threshold, at < 1.0 - threshold, gc > 1.0 + threshold, gc < 1.0 - threshold)):
        status = "Fail"
        update_failure_mode("Base balance out of variance threshold ({:.2},{:.2} > {})".format(at, gc, threshold))
    root = xml.Element('baseBalanceResult')
    el(root, 'AT').text = str(at)
    el(root, 'GC').text = str(gc)
    el(root, 'baseBalanceStatus').text = status
    return root
+
+
+
+
+
@qa_test(disable=not os.path.exists(qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')),
         profile=False)
def taxonomyQualityUsingKraken(sequenceFileName,
                               sequencingType,
                               genus=None,
                               species=None,
                               databasePath=qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')
                               ):
    """Check the kraken-derived taxonomy against the submitted genus/species.

    Returns a taxonomyQualityResult element carrying the pass/fail status,
    the most prevalent taxon, the four runner-up "contaminant" taxa, and
    the database path with its checksum.  Status is "Undetermined" when no
    genus was submitted.

    Fix: the decorator previously used `disable = ~os.path.exists(...)`;
    bitwise NOT of a bool is -1 or -2 (always truthy), so the test was
    unconditionally disabled.  `not` expresses the intended "disable when
    the database is missing".
    """
    import xml.etree.ElementTree as xml
    el = xml.SubElement
    res = TaxMemo.of(sequenceFileName, databasePath)
    root = xml.Element('taxonomyQualityResult')
    if not genus:
        el(root, 'taxStatus').text = "Undetermined"
    else:
        if genus in res.taxon.split() and species in res.taxon.split():
            el(root, 'taxStatus').text = "Pass"
        else:
            el(root, 'taxStatus').text = "Fail"
            update_failure_mode('"{} {}" was not in agreement with calculated ID "{}"'.format(genus, species, res.taxon))
    # keep only the trailing 230 characters to fit the destination DB field
    el(root, 'mostPrevalentTaxon', dict(prevalence=str(res.taxon_prevalence))).text = res.taxon[-230:]

    contams = el(root, 'contaminants')
    for i in range(4):
        el(contams, 'prevalentContaminant', dict(prevalence=str(res.contaminant_prevalence[i]), ordinal=str(i))).text = res.contaminant[i][-230:]
    el(root, 'database', dict(checksum=str(res.checksum))).text = databasePath

    return root
+
+
+
@qa_test
def reasonForFailure():
    "Semicolon-joined list of recorded QA failures, truncated to fit the DB field."
    global failure_mode
    # destination DB field holds 255 characters; the previous slice [0:254]
    # kept only 254, wasting one character of capacity
    return failure_mode[:255]
+
@qa_test(disable=True)
def run_serotype_usingseqsero(
        sequenceFileName,
        sequencingType):
    """Delegate serotyping of a paired read set to gims_qa_serotyping.

    `sequenceFileName` is expected to hold at least two paths (forward and
    reverse reads); returns the SerotypeQualityResultType XML element built
    by the SeqSero wrapper.  (A previous revision had an unreachable
    `return root` after this return, referencing an undefined name, plus
    pointless `"" +` string concatenations; both removed.)
    """
    from gims_qa_serotyping import run_serotype_usingseqsero

    return run_serotype_usingseqsero(
        sequenceFileName[0], sequenceFileName[1],
        sequencingType,
        Oantigen="Undetermined",
        H1antigen="Undetermined",
        H2antigen="Undetermined",
        AntigenicProfile="Undetermined",
        PredictedSerotype="Undetermined")
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_tests.pyc
Binary file qa_core/gims_qa_tests.pyc has changed
diff -r 000000000000 -r dafec28bd37e qa_core/qa_core.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/qa_core.sh Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,14 @@
#!/bin/bash

# Launcher for the GIMS QA core on the cluster:
#   1. load the runtime environment modules,
#   2. quietly self-update the deployed checkout from the 'deployment' branch,
#   3. run the QA core on the input XML ($1) with 8 parallel workers.

#export MODULEPATH=/etc/modulefiles:/opt/mellanox/bupc/2.2/modules:/sw/Modules/
#export GIT_SSH_COMMAND="ssh -i /home/justin.payne/.ssh/deploy_key_dev"

module load qa-core
module load kraken
module load git
#module load fastqc
module load pandas
module load SeqSero

# '>/dev/null 2>&1' discards both streams; the previous '2>&1>/dev/null'
# order redirected stderr to the terminal's stdout *before* stdout was
# pointed at /dev/null, so git errors still reached the terminal.
GIT_SSH_COMMAND="ssh -qi /sw/apps/qa-core/deploy_key_dev" git -C /sw/apps/qa-core/ pull origin deployment --quiet >/dev/null 2>&1
# quote $1 so input paths containing spaces survive word-splitting
TEMP=/isilon/gpfs_scratch python /sw/apps/qa-core/gims_core.py "$1" -p 8
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/NY65357675_S20_L001_R1_001.fastq.gz
Binary file qa_core/sample/Input/NY65357675_S20_L001_R1_001.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/NY65357675_S20_L001_R2_001.fastq.gz
Binary file qa_core/sample/Input/NY65357675_S20_L001_R2_001.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/forward.fastq.gz
Binary file qa_core/sample/Input/forward.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/forward_big.fastq.gz
Binary file qa_core/sample/Input/forward_big.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/input.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/sample/Input/input.xml Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,45 @@
+
+
+
+
+
+
+
+
+
+ Campylobacter
+ jejuni
+
+
+
+
+
+
+
+
+
+
+
+
+
+ forward_big.fastq.gz
+ reverse_big.fastq.gz
+
+
+
+
+
+ Illumina MiSeq
+
+
+
+ 4500000
+ 20
+
+
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/reverse.fastq.gz
Binary file qa_core/sample/Input/reverse.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/reverse_big.fastq.gz
Binary file qa_core/sample/Input/reverse_big.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/roche_10000.sff
Binary file qa_core/sample/Input/roche_10000.sff has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/base_balance.png
Binary file qa_core/sample/Output/base_balance.png has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/nucleotide_distribution.png
Binary file qa_core/sample/Output/nucleotide_distribution.png has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/output.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/sample/Output/output.tsv Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,2 @@
+at_gc_baseBalance averageQPerFile coverage name qcStatus readCount reasonForFailure version
+0.957667053211/0.982688971661 [('qa_core/sample/Input/forward_big.fastq.gz', 35.54551581739571), ('qa_core/sample/Input/reverse_big.fastq.gz', 25.590399012754776)] 2 TEST TEST TEST Fail 59994 qscore below 27.0 galaxy/pure+mulcore+plots+qscore+basebal
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/sample/Output/output.xml Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,14 @@
+
+
+ galaxy/pure+mulcore+plots+qscore+basebal
+ 59994
+ 2
+ [('forward_big.fastq.gz', 35.54551581739571), ('reverse_big.fastq.gz', 25.590399012754776)]
+ Fail
+
+ 0.957667053211
+ 0.982688971661
+ Pass
+
+ qscore below 27.0
+
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/qscore_distribution.png
Binary file qa_core/sample/Output/qscore_distribution.png has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/qscore_histogram.png
Binary file qa_core/sample/Output/qscore_histogram.png has changed