# HG changeset patch
# User jpayne
# Date 1525820060 14400
# Node ID dafec28bd37efe099da72fab19437f1ae2068a7c
planemo upload
diff -r 000000000000 -r dafec28bd37e base_balance.png
Binary file base_balance.png has changed
diff -r 000000000000 -r dafec28bd37e cfsan-qaqc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cfsan-qaqc.xml Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,73 @@
+
+
+ biopython
+ matplotlib
+ numpy
+ pandas
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/.gitignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitignore Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,17 @@
+*.pyc
+sample/Output/*xml
+./deploy-dev
+./deploy-dev.pub
+./deploy-prep
+./deploy-prep.pub
+./deploy-prod
+./deploy-prod.pub
+deploy-*
+venv
+venv/*
+.DS_Store
+*.patch
+*bak
+tar
+.vscode
+.vscode/*
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/.gitmodules
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitmodules Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,3 @@
+[submodule "task_core"]
+ path = task_core
+ url = http://10.12.207.89/justin.payne/task_core.git
diff -r 000000000000 -r dafec28bd37e qa_core/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/README.md Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,7 @@
+# qa_core
+## Genomics QA Core for CFSAN Sequencing Operations
+
+### Installing
+ virtualenv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
diff -r 000000000000 -r dafec28bd37e qa_core/gims_core.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_core.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,300 @@
+#! /usr/bin/env python
+
+import argparse
+import binascii
+from collections import OrderedDict
+from contextlib import contextmanager
+import xml.dom.minidom as dom
+from xml.etree import ElementTree as xml
+from functools import partial, wraps
+from multiprocessing.pool import ThreadPool as Pool
+from inspect import getargspec
+import os, os.path, sys, errno
+import imp
+import subprocess
+import logging
+
+ver = "unknown"
+
+
+usage = """ CFSAN Genomics QA Core
+May 15, 2015
+Justin Payne
+justin.payne@fda.hhs.gov
+
+ usage:
+ {0} /path/to/sandbox (production mode)
+ {0} {{0}}
+
+""".format(os.path.basename(__file__))
+
+PHRED_OFFSET = 33
+
+input_subdir = 'Input'
+output_subdir = 'Output'
+
+#Heng Li's readfq implementation
+#https://github.com/lh3/readfq/blob/master/readfq.py
+def readfq(fp): # this is a generator function
+ last = None # this is a buffer keeping the last unprocessed line
+ while True: # mimic closure; is it a bad idea?
+ if not last: # the first record or a record following a fastq
+ for l in fp: # search for the start of the next record
+ if l[0] in '>@': # fasta/q header line
+ last = l[:-1] # save this line
+ break
+ if not last: break
+ name, seqs, last = last[1:].partition(" ")[0], [], None
+ for l in fp: # read the sequence
+ if l[0] in '@+>':
+ last = l[:-1]
+ break
+ seqs.append(l[:-1])
+ if not last or last[0] != '+': # this is a fasta record
+ yield name, ''.join(seqs), [] # yield a fasta record
+ if not last: break
+ else: # this is a fastq record
+ seq, leng, seqs = ''.join(seqs), 0, []
+ for l in fp: # read the quality
+ seqs.append(l[:-1])
+ leng += len(l) - 1
+ if leng >= len(seq): # have read enough quality
+ last = None
+ yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record
+ break
+ if last: # reach EOF before reading enough quality
+ yield name, seq, [] # yield a fasta record instead
+ break
+
+#from qa_core import qa_test, new_output_file, open_input, readfq, qa_params
+
+qa_tests = OrderedDict()
+
+qa_params = OrderedDict()
+
+qa_core = imp.new_module("qa_core")
+sys.modules['qa_core'] = qa_core
+
+def qa_test(func=None, disable=False, profile=True):
+ "QA Test decorator"
+ def qa_test_decorator(qa_func):
+ #define a parameter-binding wrapper
+ @wraps(qa_func)
+ def wrap(namespace):
+ "Use partial application of functions to bind parameters declared in the QA function from a namespace object"
+ monad = qa_func
+ for argument in getargspec(qa_func).args:
+ if hasattr(namespace, argument) and getattr(namespace, argument):
+ monad = partial(monad, **{argument: getattr(namespace, argument)})
+
+ #and then call it
+ return monad()
+
+ wrap.__profile_enabled__ = profile
+ if not disable:
+ # if not name:
+ # name = qa_func.__name__
+ qa_tests[qa_func.__name__] = wrap
+ return wrap
+ if func:
+ return qa_test_decorator(func)
+ return qa_test_decorator
+
+qa_core.qa_test = qa_test
+qa_core.readfq = readfq
+qa_core.qa_params = qa_params
+
+@qa_test
+def version():
+ "Collect repository commit id from git"
+ p = os.path.abspath(os.path.dirname(__file__))
+ try:
+ ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '')
+ except subprocess.CalledProcessError:
+ ref = "+".join(qa_tests.keys())[:255]
+ return ref
+
+@qa_test
+def name(annotation=None):
+ if annotation:
+ return annotation
+
+@contextmanager
+def open_file(name, path='.', mode='w'):
+ with open(os.path.join(path, name), mode) as f:
+ if 'r' in mode and binascii.hexlify(f.read(2)) == b'1f8b':
+ import gzip
+ with gzip.open(os.path.join(path, name), mode) as g:
+ yield g
+ else:
+ f.seek(0)
+ yield f
+
+def suite(namespace, log=logging.getLogger('qa.suite')):
+ for name, test in qa_tests.items():
+ logg = log.getChild(name)
+ try:
+ result = test(namespace)
+ if result is not None:
+ logg.info(result)
+ yield name, result
+ except Exception as e:
+ import traceback
+ logg.error(traceback.format_exc())
+
+
+def main(namespace):
+ if namespace.annotation:
+ qa_params['annotation'] = namespace.annotation
+ parallelism = vars(namespace).get('parallelism', 1)
+ if parallelism > 1:
+ qa_params['map implementation'] = Pool(parallelism).map
+ if namespace.mode not in ('suite', 'profile'):
+ #it's a path to a sandbox job
+ logging.getLogger('qa.input').info("Load job description xml from {}...".format(namespace.mode))
+ tree = xml.parse(os.path.join(namespace.mode, "Input/input.xml"))
+ root = tree.getroot()
+ #logging.getLogger('qa.input').debug(xml.tostring(root).strip())
+ for element in root:
+ val = getattr(namespace, element.tag)
+ if isinstance(val, list):
+ val.append(element.text)
+ else:
+ setattr(namespace, element.tag, element.text)
+ #set up utility functions
+ path = os.path.join(namespace.mode, 'Output')
+ try:
+ os.makedirs(path)
+ except OSError as e:
+ if e.errno != errno.EEXIST:
+ raise
+ except Exception as e:
+ logging.getLogger('qa.setup').error(e)
+ qa_core.new_output_file = partial(open_file, path=path)
+ #qa_core.new_output_file = lambda f: open_file(f, path=path)
+ qa_core.open_input = partial(open_file, path=os.path.join(namespace.mode, "Input"), mode='r')
+ #import and run tests
+ logging.getLogger('qa.tests').info("Loading and running tests...")
+ import gims_qa_tests
+ root = xml.Element("qaQcCoverageOutput")
+ tree = xml.ElementTree(element=root)
+ [logging.getLogger('qa.params').info("{}:{}".format(k,v)) for k,v in vars(namespace).items()]
+ for i, (name, result) in enumerate(suite(namespace)):
+ if isinstance(result, xml.Element):
+ root.append(result)
+ else:
+ if result:
+ xml.SubElement(root, name).text = str(result)
+
+ try:
+ with qa_core.new_output_file('output.xml') as out:
+ d = dom.parseString(xml.tostring(root))
+ d.writexml(out, addindent=' ', newl='\n')
+ logging.getLogger('qa.results').info("Finished tests and wrote to {}".format(namespace.mode))
+ except Exception as e:
+ logging.getLogger('qa.results').error(e)
+ elif namespace.mode == 'suite':
+ qa_core.open_input = partial(open_file, mode='rb')
+ qa_core.new_output_file = open_file
+ if not namespace.output_plots:
+ qa_params['plots'] = False
+ else:
+ #set up directory for plots
+ pass
+ import gims_qa_tests
+ def fix_value(result):
+ if isinstance(result, xml.Element):
+ return xml.tostring(result)
+ return result
+ results = {name:fix_value(result) for name, result in suite(namespace)}
+ import csv
+ wtr = csv.DictWriter(namespace.output, delimiter='\t', fieldnames=sorted(results.keys()))
+ wtr.writeheader()
+ wtr.writerow(results)
+ elif namespace.mode == 'profile':
+ return profile(namespace)
+
+
+def profile(namespace):
+ log = logging.getLogger('qa.profile')
+ import cProfile
+ import StringIO
+ from Bio.Seq import Seq
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Alphabet import IUPAC
+ import random
+ from random import choice as c
+ from tempfile import TemporaryFile as tmp
+
+ qa_params['plots'] = False
+
+ profile_fastq = StringIO.StringIO()
+ log.info("Setup...")
+ #create some fake FASTQ 'reads'
+ for r in range(10000):
+ #boring random distribution
+ #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)])
+ #interesting diamond distribution
+ s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)])
+ s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)])
+
+ q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)]
+ rec = SeqRecord(Seq(s, IUPAC.unambiguous_dna), id="read_{}".format(r))
+ rec.letter_annotations['phred_quality'] = q
+ profile_fastq.write(rec.format('fastq'))
+ @contextmanager
+ def input_stub(filename, mode=None):
+ if 'xml' in filename:
+ profile_xml.seek(0)
+ yield profile_xml
+ elif 'fastq' in filename:
+ profile_fastq.seek(0)
+ yield profile_fastq
+ else:
+ yield open('/dev/null', 'r')
+
+ qa_core.open_input = input_stub
+ profiling_buffer = StringIO.StringIO()
+
+ @contextmanager
+ def get_profiling_buffer(*a, **k):
+ yield profiling_buffer
+ profiling_buffer.seek(0)
+
+ qa_core.new_output_file = get_profiling_buffer
+ import gims_qa_tests
+
+ namespace.sequenceFileName = ('forward.fastq', 'reverse.fastq')
+ namespace.sequencingType = 'Illumina MiSeq'
+ namespace.approximateGenomeSize = 1000000
+
+ def run_tests(namespace, log):
+ log.getChild('run')
+ log.info(vars(namespace))
+ for name, test in qa_tests.items():
+ if test.__profile_enabled__:
+ test(namespace)
+ log.info("Profiling...")
+ return cProfile.runctx('run_tests(namespace, log)', None, locals(), sort='tottime')
+
+if __name__ == '__main__':
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("mode")
+ parser.add_argument('sequenceFileName', nargs='*')
+ parser.add_argument('--genus','-g')
+ parser.add_argument('--species', '-s')
+ parser.add_argument('--sequencingType', '-t')
+ parser.add_argument('--approximateGenomeSize','--gs', type=int, default=4500000)
+ parser.add_argument('--coverageThreshold', '-c', type=int, default=20)
+ parser.add_argument('--minimumQScore', '-q', type=float, default=27.0)
+ parser.add_argument('--databasePath', '--db')
+ parser.add_argument('--log', '-l', type=argparse.FileType('w'), default=sys.stdout)
+ parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout)
+ parser.add_argument('--output_plots')
+ parser.add_argument('--parallelism', '-p', type=int, default=1)
+ parser.add_argument('--annotate', '-a', type=str, default=None, dest='annotation')
+ params = parser.parse_args()
+ logging.basicConfig(format='%(name)s\t%(levelname)s:%(message)s', level=logging.DEBUG, stream = params.log)
+ logging.getLogger('qa.cmd').info(" ".join(sys.argv))
+ main(params)
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_core.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_core.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,390 @@
+#!/usr/bin/env python
+
+#from __future__ import print_function
+
+import xml.dom.minidom as dom
+import xml.etree.ElementTree as xml
+import sys
+import os, os.path
+from functools import partial, wraps
+from inspect import getargspec, getfile, currentframe, getsource
+from os.path import join as j
+import subprocess
+import textwrap
+import imp
+from collections import OrderedDict
+from contextlib import contextmanager
+import StringIO
+import gzip
+
+ver = "core+plots+fxt"
+
+
+usage = """ CFSAN Genomics QA Core
+May 15, 2015
+Justin Payne
+justin.payne@fda.hhs.gov
+
+ usage:
+ {0} /path/to/sandbox (production mode)
+ {0} {{0}}
+
+""".format(os.path.basename(__file__))
+
+PHRED_OFFSET = 33
+
+input_subdir = 'Input'
+output_subdir = 'Output'
+
+path = os.getcwd()
+
+qa_tests = OrderedDict()
+qa_undecorated_tests = OrderedDict()
+qa_params = OrderedDict()
+
+class QaParamsDict(OrderedDict):
+ "orderedDict that sets on get access"
+
+ def get(self, key, value=None):
+ self[key] = result = super(QaParamsDict, self).get(key, value)
+ return result
+
+
+qa_params = QaParamsDict()
+
+def int_then_string(value):
+ try:
+ return int(value)
+ except ValueError:
+ return str(value)
+
+class IncompleteXMLError(Exception):
+ pass
+
+def qa_test(func=None, disable=False, profile=True):
+ "Decorator to simplify XML interaction"
+ def wrap(qa_func):
+ "Inner function to decorate qa test"
+ #print "registering", qa_func.__name__
+ qa_undecorated_tests[qa_func.__name__] = qa_func
+ if disable:
+ return qa_func
+ @wraps(qa_func)
+ def bind_metadata_to_arguments(metadata, qa_func):
+ "Find elements in XML with same name as function argument and bind their content"
+ try:
+ arguments = list(getargspec(qa_func).args)
+ except TypeError:
+ arguments = list(getargspec(qa_func.func).args)
+ for argument in arguments:
+ elements = metadata.findall(".//{0}".format(argument))
+ if len(elements) == 1:
+ qa_func = partial(qa_func, **{argument:int_then_string(elements[0].text)})
+ elif len(elements) > 1:
+ qa_func = partial(qa_func, **{argument:[int_then_string(e.text) for e in elements]})
+ else:
+ pass
+ try:
+ return qa_func()
+# except TypeError as e:
+# for el in metadata:
+# if el.tag in args:
+# arguments.remove(el.tag)
+# message = "Missing argument or arguments in input.xml: {} not present.".format(', '.join(arguments))
+# raise IncompleteXMLError, message, sys.exc_info()[-1]
+ finally:
+ pass
+ #bind_metadata_to_arguments
+ decorated_qa_func = partial(bind_metadata_to_arguments, qa_func=qa_func)
+ decorated_qa_func.__profile_enabled__ = profile
+ qa_tests[qa_func.__name__] = decorated_qa_func
+ return decorated_qa_func
+ #qa_test_decorator
+ if func:
+ return wrap(func)
+ return wrap
+
+
+qa_core = imp.new_module("qa_core")
+qa_core.qa_test = qa_test
+qa_core.test_output_buffer = StringIO.StringIO()
+
+def present(file):
+ return "/dev/null"
+
+qa_core.present = present
+
+@contextmanager
+def get_test_buffer(*args, **kwargs):
+ yield qa_core.test_output_buffer
+
+@contextmanager
+def get_test_input_file(*args, **kwargs):
+ yield open('/dev/null', 'r')
+
+qa_core.new_output_file = get_test_buffer
+qa_core.open_input = get_test_input_file
+qa_core.qa_params = qa_params
+sys.modules['qa_core'] = qa_core
+
+
+def main(sandbox, profile=False, *a, **k):
+ output = "qa_core: ran "
+ with qa_core.open_input('input.xml') as inputfile:
+ metadata = xml.parse(inputfile).getroot()
+
+ qaoutput = xml.Element("qaQcCoverageOutput")
+ outputxml = xml.ElementTree(element=qaoutput)
+
+ for name, test in qa_tests.items():
+ if (profile and test.__profile_enabled__) or not profile:
+ output += "{0}, ".format(name)
+ try:
+ result = test(metadata)
+ if isinstance(result, xml.Element):
+ qaoutput.append(result)
+ else:
+ if result:
+ xml.SubElement(qaoutput, name).text = str(result)
+ except Exception as e:
+ output += 'and died on error.'
+ import traceback
+ traceback.print_exc()
+ print(test)
+ xml.SubElement(qaoutput, 'reasonForFailure').text = str(e)
+ break
+# if gims_qa_tests.failure_mode:
+# xml.SubElement(qaoutput, "reasonForFailure").text = str(gims_qa_tests.failure_mode)
+
+ with qa_core.new_output_file('output.xml') as outputfile:
+ d = dom.parseString(xml.tostring(qaoutput))
+ d.writexml(outputfile, addindent=' ', newl='\n')
+ return output
+
+
+
+
+
+@qa_test
+def version():
+ "Collect repository commit id from git"
+ p = os.path.abspath(os.path.dirname(__file__))
+ try:
+ ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '')
+ except subprocess.CalledProcessError:
+ ref = ver
+ return ref
+
+@qa_test(disable=True)
+def profile():
+ "profiling command to test performance"
+
+ print("Starting profiler. Imports...")
+
+ import cProfile
+ import StringIO
+ import Bio
+ import random
+ from random import choice as c
+
+ from tempfile import TemporaryFile as tmp
+
+
+ print("Setting up test data...")
+
+ profile_xml = StringIO.StringIO("""
+
+ forward.fastq
+ reverse.fastq
+ Illumina MiSeq
+ 1000000
+
+""")
+
+ qa_params['plots'] = False
+
+ profile_fastq = StringIO.StringIO()
+ #create some fake FASTQ 'reads'
+ for r in range(10000):
+ #boring random distribution
+ #s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)])
+ #interesting diamond distribution
+ s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)])
+ s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)])
+
+ q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)]
+ rec = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(s, Bio.Alphabet.IUPAC.unambiguous_dna), id="read_{}".format(r))
+ rec.letter_annotations['phred_quality'] = q
+ try:
+ profile_fastq.write(rec.format('fastq'))
+ except:
+ print q
+ print rec
+ raise
+
+
+
+
+ @contextmanager
+ def input_stub(filename, mode=None):
+ if 'xml' in filename:
+ profile_xml.seek(0)
+ yield profile_xml
+ elif 'fastq' in filename:
+ profile_fastq.seek(0)
+ yield profile_fastq
+ else:
+ yield open('/dev/null', 'r')
+
+ qa_core.open_input = input_stub
+ #qa_core.new_output_file = get_test_buffer
+
+ profiling_buffer = StringIO.StringIO()
+
+ @contextmanager
+ def get_profiling_buffer(*a, **k):
+ yield profiling_buffer
+ profiling_buffer.seek(0)
+
+ qa_core.new_output_file = get_profiling_buffer
+
+ reload(gims_qa_tests)
+
+ print("Beginning profiling...")
+ try:
+ s = cProfile.run('main(profile=True)', sort='tottime')
+ #s.sort_stats('pcalls', 'tottime')
+ return str(profiling_buffer.read())
+ except NameError:
+ import traceback
+ traceback.print_exc()
+ main(profile=True)
+ quit()
+ except Exception:
+ import traceback
+ traceback.print_exc()
+ #print str(profile_fastq.getvalue())[:1024]
+ quit()
+ #return exec('main')
+
+
+#Heng Li's readfq implementation
+#https://github.com/lh3/readfq/blob/master/readfq.py
+def readfq(fp): # this is a generator function
+ last = None # this is a buffer keeping the last unprocessed line
+ while True: # mimic closure; is it a bad idea?
+ if not last: # the first record or a record following a fastq
+ for l in fp: # search for the start of the next record
+ if l[0] in '>@': # fasta/q header line
+ last = l[:-1] # save this line
+ break
+ if not last: break
+ name, seqs, last = last[1:].partition(" ")[0], [], None
+ for l in fp: # read the sequence
+ if l[0] in '@+>':
+ last = l[:-1]
+ break
+ seqs.append(l[:-1])
+ if not last or last[0] != '+': # this is a fasta record
+ yield name, ''.join(seqs), [] # yield a fasta record
+ if not last: break
+ else: # this is a fastq record
+ seq, leng, seqs = ''.join(seqs), 0, []
+ for l in fp: # read the quality
+ seqs.append(l[:-1])
+ leng += len(l) - 1
+ if leng >= len(seq): # have read enough quality
+ last = None
+ yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record
+ break
+ if last: # reach EOF before reading enough quality
+ yield name, seq, [] # yield a fasta record instead
+ break
+
+qa_core.readfq = readfq
+
+
+
+if __name__ == "__main__":
+ try:
+ name, sandbox_path = sys.argv[0:2]
+ args = sys.argv[2:]
+
+ if '-p' in args:
+ from multiprocessing import Pool
+ try:
+ m = int(args[args.index('-p') + 1])
+ p = Pool(processes=m)
+ del args[args.index('-p') + 1]
+ qa_core.qa_params['parallelism'] = m
+ except IndexError, ValueError:
+ p = Pool()
+ args.remove('-p')
+ qa_core.qa_params['map implementation'] = p.map
+
+ if '-q' in args:
+ try:
+ q = float(args.pop(args.index('-q') + 1))
+ args.remove('-q')
+ qa_params['user Q threshold'] = q
+ except IndexError, ValueError:
+ print("no Q score threshold provided with '-q' option.")
+ quit(1)
+
+
+ @contextmanager
+ def new_output_file(filename, mode='w'):
+ if not os.path.exists(j(sandbox_path, output_subdir)):
+ os.mkdir(j(sandbox_path, output_subdir))
+ fileh = open(j(sandbox_path, output_subdir, filename), mode)
+ yield fileh
+ fileh.close()
+
+
+ def open_input_from(path):
+ @contextmanager
+ def open_input(filename, mode='rU'):
+ if 'gz' in filename:
+ fopen = gzip.open
+ else:
+ fopen = open
+ fileh = fopen(j(path, filename), mode=mode)
+ yield fileh
+ fileh.close()
+ return open_input
+
+ def present_from(path):
+ def present(file):
+ return os.path.abspath(j(path, file))
+ return present
+
+ qa_core.new_output_file = new_output_file
+ main_func = main
+
+ if not os.path.exists(sandbox_path):
+ qa_core.open_input = open_input_from(os.getcwd())
+ qa_core.present = present_from(os.getcwd())
+ try:
+ main_func = lambda s, *a: qa_undecorated_tests[sandbox_path](*a)
+ except KeyError:
+ print("Invalid QA test name: {0}".format(sandbox_path))
+ quit(1)
+ else:
+ qa_core.open_input = open_input_from(j(sandbox_path, input_subdir))
+ qa_core.present = present_from(j(sandbox_path, input_subdir))
+
+
+
+ except ValueError:
+ import gims_qa_tests
+ print("Missing argument: path to sandbox or name of QA test.")
+ commands = ""
+ #commands = "\n\t\t".join([' '.join((n, " ".join(["<{0}>".format(a) for a in getargspec(f)[0]]) or 'DELETEME')).replace(' DELETEME', '') for n, f in qa_undecorated_tests.items()])
+ for name, func in qa_undecorated_tests.items():
+ args = " ".join(["<{0}>".format(a) for a in getargspec(func).args])
+ commands += "{0} {1}\n\t\t\t".format(name, textwrap.fill(args, 60, subsequent_indent='\t\t\t\t'))
+ print(usage.format(commands))
+ quit(1)
+ else:
+ import gims_qa_tests
+ print(main_func(sandbox_path, profile=False, *args))
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_kraken.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_kraken.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,217 @@
+import csv
+from collections import Counter, OrderedDict
+from hashlib import md5
+import os.path
+import tempfile
+from subprocess import check_call
+from gims_qa_tests import TaxTuple
+#from qa_core import present
+
+def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
+ "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
+
+ #hash the kraken db files
+ md5hash = md5()
+ #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None)
+ md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read())
+
+ db_hash = md5hash.hexdigest()
+
+
+ with tempfile.NamedTemporaryFile('rU') as kraken_out:
+ with tempfile.NamedTemporaryFile() as seq_data:
+ #cat all the files together
+ check_call("cat {files} > {seq_data.name}".format(
+ files = " ".join([present(f) for f in sequenceFiles]),
+ seq_data = seq_data
+ ),
+ shell=True)
+
+ def make_option(key, value):
+ if value is not None:
+ return "--{} {}".format(key, value)
+ return "--{}".format(key)
+
+ #build options
+ options = " ".join([make_option(k,v) for k,v in kraken_options.items()])
+
+ #call kraken
+ check_call("kraken {options} "
+ "--db {database} "
+ "--gzip-compressed "
+ "--fastq {seq_data.name} "
+ "--output {kraken_out.name}".format(**locals()), shell=True)
+
+ counts = Counter()
+
+ with tempfile.NamedTemporaryFile('rU') as kraken_report:
+ check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
+ rows = csv.reader(kraken_report, delimiter='\t')
+ for seq_id, taxon in rows:
+ counts[taxon] += 1.0
+ counts["total"] += 1.0
+
+ #clean up all the temp files by releasing contexts
+
+ top_results = counts.most_common(6) #total, plus five most common taxa
+ total, total_count = top_results.pop(0)
+ top_hit, top_hit_count = top_results.pop(0)
+ next_hits = [h[0].replace(';', ' ') for h in top_results]
+ next_hit_counts = [h[1] for h in top_results]
+
+ top_hit_frac = top_hit_count / total_count
+ next_hit_frac = [h / total_count for h in next_hit_counts]
+
+ return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
+
+def runKrakenSeqtest(sequenceFiles,database):
+
+ krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
+
+ w = datetime.datetime.now()
+
+ krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
+ krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
+
+ sequenceFiles[0]
+ sequenceFiles[1]
+
+ os.path.abspath(os.path.join("Input", sequenceFileName[0]))
+
+ command = "kraken --preload "
+ command += "--db " + database + ' '
+ #if len(fastqPaths) > 1:
+ command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1]))
+ # command += ' '.join(fastqPaths)
+ command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout
+
+ print command
+
+ check_call(command, shell=True)
+ print "%s -- Kraken called..." % datetime.datetime.now()
+
+
+ kraken_report_cmd = "kraken-report --db "
+ kraken_report_cmd += database
+ kraken_report_cmd += " " + krakenoutoutpath + krakenout
+ kraken_report_cmd += " > " + krakenreport
+
+ print kraken_report_cmd
+
+ check_call(kraken_report_cmd, shell=True)
+
+
+
+
+ parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
+
+ kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
+
+ split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
+
+ print "=====================" + krakenreport + "=========================="
+ print split_kraken_rpt_line
+ split_kraken_rpt_length = len(split_kraken_rpt_line)
+ print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4]
+ taxLen = 6;
+ taxIdIdx = 4;
+ outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv'
+
+ metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id"
+
+
+
+
+ splitMetadata = metadata.split("\t")
+ for i in range(0, len(split_kraken_rpt_line)):
+ if split_kraken_rpt_line[i] == '':
+ print "don't add to the metadata...split_kraken_rpt_line.pop(i)"
+ elif i%(taxLen) == taxLen-1:
+ print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1"
+ splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip())
+ elif i%(taxLen) == taxIdIdx:
+ print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx"
+ splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
+ elif i%(taxLen) == 0:
+ print "i is "+str(i)+"under this condition: i%(taxLen) == 0"
+ splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
+
+ splitMetadataLength = len(splitMetadata)
+ first_kraken_percentage = splitMetadataLength-15
+ with open(outputFile, 'wb') as f:
+ #writer = csv.writer(f, delimiter="\t")
+
+ # print splitMetadata
+ for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+ print "Split metadata of i "+splitMetadata[i]
+ for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+ if float(splitMetadata[j]) > float(splitMetadata[i]):
+ print "i: "+str(i)+" and j: "+str(j)
+ print "swapping "+splitMetadata[i]+" with "+splitMetadata[j]
+ old_hit_org = splitMetadata[i-1]
+ old_hit_percentage = splitMetadata[i]
+ old_hit_tx_id = splitMetadata[i+1]
+
+ splitMetadata[i-1] = splitMetadata[j-1]
+ splitMetadata[i] = splitMetadata[j]
+ splitMetadata[i+1] = splitMetadata[j+1]
+
+ splitMetadata[j-1] = old_hit_org
+ splitMetadata[j] = old_hit_percentage
+ splitMetadata[j+1] = old_hit_tx_id
+
+ df = pandas.DataFrame(splitMetadata)
+
+ print df
+
+
+    # Here are the dataframe columns for the top Kraken hits
+
+ print df.iat[0,0]
+ print df.iat[1,0]
+ print df.iat[3,0]
+ print df.iat[4,0]
+ print df.iat[6,0]
+ print df.iat[7,0]
+ print df.iat[9,0]
+ print df.iat[10,0]
+
+ OrganismOne = df.iat[0,0]
+ OrganismTwo = df.iat[3,0]
+ OrganismThree = df.iat[6,0]
+ OrganismFour = df.iat[9,0]
+
+
+ OrganismOnePer = df.iat[1,0]
+ OrganismTwoPer = df.iat[4,0]
+ OrganismThreePer = df.iat[7,0]
+ OrganismFourPer = df.iat[10,0]
+
+
+ df = {'OrganismOne':OrganismOne,
+ 'OrganismTwo':OrganismTwo,
+ 'OrganismThree':OrganismThree,
+ 'OrganismFour':OrganismFour,
+ 'OrganismOne%':OrganismOnePer,
+ 'OrganismTwo%':OrganismTwoPer,
+ 'OrganismThree%':OrganismThreePer,
+ 'OrganismFour%':OrganismFourPer }
+
+
+ #Clean up files
+
+ command = ""
+ command += "rm " + krakenoutoutpath + krakenout
+
+ print command
+
+ check_call(command, shell=True)
+
+
+
+ command = ""
+ command += "rm " + outputFile
+
+ print command
+
+ return pandas.DataFrame(df, index=[0])
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_plot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_plot.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,148 @@
+import subprocess
+from qa_core import new_output_file, qa_params
+from collections import defaultdict
+import warnings
+warnings.filterwarnings("ignore")
+import pandas
+from pandas import DataFrame
+import matplotlib.pyplot as plot
+from numpy import zeros, add, mean, array, percentile, arange
+import traceback
+from gims_qa_tests import cluster
+import logging
+
+bases = ("A", "T", "G", "C", "N")
+
+def qa_plots(df, annotation, log=logging.getLogger('qa.plots'), **kwargs):
+ #log.debug(df)
+ #log.debug(annotation)
+ plot_quality_histo(df['qualityHisto'], annotation, **kwargs)
+ plot_nucleotide_distribution_histo(df['nucleotideHisto'], annotation)
+ plot_quality_distribution_histo(df['baseQScoreHisto'], annotation)
+ plot_base_balance(df, annotation, **kwargs)
+ #plot.show()
+
+def plot_base_balance(df,
+ annotation=qa_params.get('annotation', ''),
+ threshold=qa_params.get('permitted base balance variance', .2),
+ **kwargs):
+ try:
+ plot.figure()
+ plot.axis((0,2,0,2))
+ plot.suptitle("{} Base Balance".format(annotation))
+ plot.xlabel("G/C")
+ plot.ylabel("A/T")
+ thres_min = 1 - threshold
+ thres_max = 1 + threshold
+ for coord in (thres_min, thres_max):
+ for linef in (plot.axvline, plot.axhline):
+ linef(coord, linestyle='dashed', label=str(coord))
+
+ plot.annotate(thres_max, xy=(0, thres_max))
+ plot.annotate(thres_min, xy=(0, thres_min))
+ plot.annotate(thres_max, xy=(thres_max, 0))
+ plot.annotate(thres_min, xy=(thres_min, 0))
+
+ a, t, g, c = [sum(df['nucleotideHisto'][b]) for b in ('A', 'T', 'G', 'C')]
+ y, x = a / float(t), g / float(c)
+ plot.plot((x,), (y,), marker="+")
+ plot.annotate("({:.2},{:.2})".format(x, y), xy=(x+.05, y+.05), textcoords='data')
+ with new_output_file("base_balance.png") as base_bal:
+ plot.savefig(base_bal, format='png', dpi=92)
+ except Exception:
+ traceback.print_exc()
+
+
+def plot_quality_histo(histo,
+ annotation=qa_params.get('annotation', ''),
+ q=qa_params.get('user Q threshold', 27)):
+ try:
+ plot.figure()
+ labels = range(max(histo.keys()) + 1)
+ plot.bar(left=labels,
+ height=[histo[n] for n in labels],
+ color='y',
+ linewidth=1)
+# plot.hist(histo, bins=range(max(histo)))
+ plot.xticks()
+ plot.axvline(x=q, label="Q {}".format(q))
+ #plot.show()
+ plot.xlabel(r"""$Q (-10 log_(10) \rho)$""")
+ plot.ylabel('Number of Bases')
+ plot.suptitle('{} Q-Score Histogram'.format(annotation))
+ plot.legend(loc='upper left', fontsize='x-small')
+ with new_output_file("qscore_histogram.png") as qual_histogram:
+ plot.savefig(qual_histogram, format='png', dpi=92)
+# except ValueError:
+# pass
+ except Exception:
+ traceback.print_exc()
+
+def plot_nucleotide_distribution_histo(dataframe,
+ annotation=qa_params.get('annotation', '')):
+ try:
+ #dataframe = defaultdict(list)
+ colors = iter(('r', 'y', 'g', 'b', 'pink'))
+ plot.figure()
+ last_base = zeros((len(dataframe.values()[0]),)) #set all zeros for bottom of first bar row
+ for base in bases:
+ try:
+ plot.bar(left=range(len(dataframe[base])),
+ height=dataframe[base],
+ width=2,
+ bottom=last_base,
+ color=colors.next(),
+ label=base,
+ linewidth=0)
+ last_base = add(last_base, dataframe[base])
+ except:
+ #print [len(d) for d in dataframe.data.values()]
+ raise
+ plot.autoscale(tight=True)
+ plot.xlabel('Base Position')
+ plot.ylabel('Nucleotide Count')
+ plot.suptitle('{} Nucleotide Distribution Plot'.format(annotation))
+ plot.legend(loc='upper right', fontsize='xx-small')
+ with new_output_file("nucleotide_distribution.png") as nuc_histogram:
+ plot.savefig(nuc_histogram, format='png', dpi=108)
+ except Exception:
+ traceback.print_exc()
+
+def plot_quality_distribution_histo(histo,
+ annotation=qa_params.get('annotation', ''),
+ q=qa_params.get('user Q threshold', 27)
+ ):
+ try:
+ binning = 20
+ plot.figure(figsize=(12, 6))
+ vec = [tuple(c.elements()) for c in histo]
+ means = [mean(t) for t in vec]
+ upper_iqr = array([percentile(t, 75) for t in vec])
+ lower_iqr = array([percentile(t, 25) for t in vec])
+# plot.boxplot(vec,
+# widths=.000005,
+# whis=(5,95),
+# showfliers=False,
+# showbox=True,
+# showmeans=True,
+# meanline=True,
+# showcaps=True,
+# whiskerprops=dict(linestyle='-', color='0.60')
+# )
+
+ plot.fill_between(arange(len(upper_iqr)), upper_iqr, lower_iqr, facecolor='0.50', alpha=0.5, label='interquartile range')
+ plot.plot(means, 'r-', label='Mean')
+ plot.plot(upper_iqr, 'b-', label='interquartile range', alpha=0.25)
+ plot.autoscale(tight=True)
+ plot.axhline(y=q, label="Q {}".format(q))
+ plot.xlabel('Base Position')
+ plot.ylabel(r"""$Q (-10 log_(10) \rho)$""")
+ plot.xticks(range(0, len(histo), binning), range(0, len(histo), binning), rotation='vertical')
+ plot.suptitle('{} Q-Score Distribution Plot'.format(annotation))
+ plot.legend(loc='lower left', fontsize='x-small')
+ plot.grid(True)
+ with new_output_file("qscore_distribution.png") as q_distro:
+ plot.savefig(q_distro, format='png', dpi=108)
+
+ except Exception:
+ traceback.print_exc()
\ No newline at end of file
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_plot.pyc
Binary file qa_core/gims_qa_plot.pyc has changed
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_serotyping.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_serotyping.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,110 @@
+import csv
+import shutil
+from collections import Counter, OrderedDict
+from hashlib import md5
+import os.path
+import tempfile
+import glob
+import subprocess
+from subprocess import check_call, check_output
+from gims_qa_tests import TaxTuple
+import logging
+#from qa_core import present
+
+
+
def run_serotype_usingseqsero(
        sequenceFileName1, sequenceFileName2,
        sequencingType,
        Oantigen="Undetermined",
        H1antigen="Undetermined",
        H2antigen="Undetermined",
        AntigenicProfile="Undetermined",
        PredictedSerotype="Undetermined",
        log=logging.getLogger('qa.seqsero')):
    """Run SeqSero on a read pair and parse its report into an XML element.

    Invokes ``SeqSero.py`` on the two read files, scrapes the antigen calls
    out of ``SeqSero_result*/Seqsero_result.txt``, and returns a
    ``SerotypeQualityResultType`` element whose attributes carry the O, H1,
    and H2 antigens, the antigenic profile, and the predicted serotype.
    Any field SeqSero does not report keeps its "Undetermined" default.
    The SeqSero scratch directory is removed before returning.

    NOTE(review): the file names are interpolated into a shell command, so
    paths containing spaces or shell metacharacters will break (or worse);
    consider check_output(['SeqSero.py', ...], shell=False).
    """
    log.info(check_output(
        "SeqSero.py -m 2 -i {sequenceFileName1} {sequenceFileName2} -b sam".format(**locals()),
        shell=True))

    import xml.etree.ElementTree as xml

    for name in glob.glob("SeqSero_result*/Seqsero_result.txt"):
        rows = csv.reader(open(name))
        next(rows)  # skip the header row
        # The report is one "key: value" line per field, in a fixed order.
        # The line counter is reset per report file (previously it carried
        # over between glob matches, so later reports were ignored).
        for i, row in enumerate(rows):
            parts = ''.join(row).split(':')
            if i == 0:
                Oantigen = parts[1]
            elif i == 1:
                H1antigen = parts[1]
            elif i == 2:
                H2antigen = parts[1]
            elif i == 3 and len(parts) > 2:
                # the antigenic profile itself contains ':' separators
                AntigenicProfile = "{}:{}:{}".format(parts[1], parts[2], parts[3])
            elif i == 4:
                if "N/A" in parts[1]:
                    PredictedSerotype = "No Salmonella Serotype"
                else:
                    PredictedSerotype = parts[1]

    root = xml.Element('SerotypeQualityResultType')
    root.set('Oantigen', Oantigen.strip())
    root.set('H1antigen', H1antigen.strip())
    root.set('H2antigen', H2antigen.strip())
    root.set('AntigenicProfile', AntigenicProfile.strip())
    root.set('PredictedSerotype', PredictedSerotype.strip())

    log.getChild('Oantigen').info(Oantigen)
    log.getChild('H1antigen').info(H1antigen)
    log.getChild('H2antigen').info(H2antigen)
    log.getChild('AntigenicProfile').info(AntigenicProfile)
    log.getChild('PredictedSerotype').info(PredictedSerotype)

    # clean up SeqSero's per-run output directory
    check_call("rm -rf SeqSero_result*", shell=True, stderr=subprocess.STDOUT)

    return root
+
+
+
+
+
+
+
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_tests.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_tests.py Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,434 @@
+from qa_core import qa_test, new_output_file, open_input, readfq, qa_params
+from collections import namedtuple, defaultdict, Counter
+from Bio import SeqIO
+from multiprocessing import Pool, Process, cpu_count
+from functools import partial
+from itertools import izip, count, izip_longest, islice
+from tempfile import NamedTemporaryFile, mkdtemp
+from subprocess import check_call, CalledProcessError
+from numpy import genfromtxt
+from os.path import abspath, join, split, splitext
+from contextlib import contextmanager
+from subprocess import call
+from subprocess import check_output
+from hashlib import md5
+
+
+import sys
+import shutil
+import os
+import pandas
+import datetime
+import re
+import glob, string
+
+#Test implementation classes and functions. The actual tests are down at the bottom.
+
# Raw per-run QA aggregate: total reads, total bases, per-position Q-score
# Counters, per-position nucleotide Counters, and a whole-file quality Counter.
SuperResultTuple = namedtuple("SuperResultTuple", ("readCount", "numBases", "baseQScoreHisto", "nucleotideHisto", "qualityHisto"))
+
+# #Global Variables for kraken methods
+# krakendatabase = "/scratch/biostat/kraken/miniDB/"
+# we don't actually need this as a global
+
class ResultTuple(SuperResultTuple):
    """SuperResultTuple with combination operators.

    ``+`` merges two results position-wise (parallel chunks of the same
    file); ``&`` concatenates histograms end-to-end (distinct files).
    ``zero()`` is the identity element for both.
    """

    def __add__(self, other):
        r1, n1, h1, nuc1, qual1 = self
        r2, n2, h2, nuc2, qual2 = other
        return ResultTuple(r1 + r2,
                           n1 + n2,
                           # pad the shorter histogram with empty Counters
                           [p + q for p, q in izip_longest(h1, h2, fillvalue=Counter())],
                           [p + q for p, q in izip_longest(nuc1, nuc2, fillvalue=Counter())],
                           qual1 + qual2)

    def __and__(self, other):
        r1, n1, h1, nuc1, qual1 = self
        r2, n2, h2, nuc2, qual2 = other
        return ResultTuple(r1 + r2,
                           n1 + n2,
                           h1 + h2,
                           nuc1 + nuc2,
                           qual1 + qual2)

    @classmethod
    def zero(cls):
        "Empty result; identity for both + and &."
        return cls(0, 0, [], [], Counter())

    def to_dataFrame(self):
        """Pivot this result into a dict of plain columns for plotting.

        Despite the name this returns a dict, not a pandas DataFrame;
        gims_qa_plot consumes it directly.  (A previous revision carried a
        dead pandas-import fallback that was never used.)
        """
        pivotNucHisto = defaultdict(list)
        for cnt in self.nucleotideHisto:
            for base in ('A', 'T', 'G', 'C', 'N'):
                pivotNucHisto[base].append(cnt[base])
        pivotQualHisto = {}
        if self.qualityHisto:
            # range() excludes its stop value, so iterate to max+1 or the
            # highest observed quality score would be dropped from the pivot
            for q in range(max(self.qualityHisto.keys()) + 1):
                pivotQualHisto[q] = self.qualityHisto[q]
        return {'readCount': self.readCount,
                'numBases': self.numBases,
                'baseQScoreHisto': self.baseQScoreHisto,
                'nucleotideHisto': pivotNucHisto,
                'qualityHisto': pivotQualHisto}
+
class MergedResultTuple():
    """Merged multi-file results object.

    Iterating yields (filename, ResultTuple) pairs; attribute access
    returns the corresponding value from all files combined with '&'.
    """

    def __init__(self, **kwargs):
        # per-file results, keyed by sequence file name
        self.per_file = dict(kwargs)

    def __getattr__(self, name):
        # Fold every per-file result together (in filename order), then
        # delegate the attribute lookup to the combined tuple.
        combined = ResultTuple.zero()
        for _, result in sorted(self.per_file.items()):
            combined = combined & result
        return getattr(combined, name)

    def __iter__(self):
        return iter(self.per_file.items())
+
+
def analyze_batch_reads(reads):
    """Accumulate QA statistics over an iterable of (name, seq, quals) reads.

    Builds, across all reads in this batch:
      - nucleotideHisto: one Counter per base position (base -> count)
      - baseQScoreHisto: one Counter per base position (Q-score -> count)
      - qualityHisto: one Counter of Q-scores over every base of every read
    and returns them in a ResultTuple together with read and base tallies.
    """
    readCount = 0
    numBases = 0
    baseQScoreHisto = []
    nucleotideHisto = []
    qualityHisto = Counter()
    for name, seq, quals in reads:
        qualityHisto.update(quals)
        def scan_a_read(read):
            # Tally one (base, quality, position) triple; the per-position
            # Counter lists grow lazily the first time a position is seen.
            #for base, qual, index in zip(seq, quals, count(0)):
            base, qual, index = read
            try:
                nucleotideHisto[index][base] += 1
            except IndexError:
                nucleotideHisto.append(Counter(base))
            try:
                baseQScoreHisto[index][qual] += 1
            except IndexError:
                baseQScoreHisto.append(Counter({qual:1}))
            return index
        m = map(scan_a_read, zip(seq, quals, count(0))) #numBases is the last index of the read
        #print(name, m[:3])
        # NOTE(review): m[-1] is len(seq) - 1, so each read contributes one
        # base fewer than its true length to numBases, and a zero-length
        # read would raise IndexError here.  Confirm whether intentional.
        numBases += m[-1]
        readCount += 1
    r = ResultTuple(readCount, numBases, baseQScoreHisto, nucleotideHisto, qualityHisto)
    return r
+
+
def cluster(slicable, n=2):
    """Split an in-memory sequence into an n-tuple of roughly equal tuples.

    Every element of `slicable` appears exactly once, in order; trailing
    chunks may be shorter (or empty when len(slicable) < n).
    """
    # near-ceiling chunk size; '//' keeps slice indices integral on py2 and py3
    offset = (len(slicable) // n) + 1
    # NOTE: the upper slice bound was previously (i+1)*offset - 1, which
    # silently dropped the final element of every chunk.
    return tuple(tuple(slicable[i * offset:(i + 1) * offset]) for i in range(n))
+
+
def tupleize_parse(fileh, format):
    "Adapt Bio.SeqIO records to the (name, sequence, qualities) tuples that readfq yields."
    for read in SeqIO.parse(fileh, format):
        yield (read.id, str(read.seq), read.letter_annotations['phred_quality'])
+
+#@profile
def analyze_shortread(seqFile, map_f=map, multi=qa_params.get('parallelism', cpu_count())):
    """Analyze one file whose reads are typical short-read technology.

    Reads are materialized, split into `multi` chunks, analyzed (possibly
    in parallel via map_f), and the chunk results merged with '+'.
    """
    # NOTE(review): substring test -- any path merely containing 'sff'
    # (e.g. in a directory name) is parsed as SFF; endswith('.sff') would
    # be stricter.  Left as-is to preserve behavior.
    if 'sff' in seqFile:
        rds_f = partial(tupleize_parse, format='sff')
    else:
        rds_f = readfq
    with open_input(seqFile) as seqFileH:
        rds = cluster(list(rds_f(seqFileH)), multi)
        result = sum(map_f(analyze_batch_reads, rds), ResultTuple.zero())
    return result
+
+
def analyze_longread(seqFile, map_f=map):
    "analyze one file from pacbio or nanopore"
    # Placeholder: long-read QA is not implemented, so contribute an empty
    # result to the merged totals.
    return ResultTuple.zero()
    #what to do here? if nanopore, plot 2d/1d length vs accuracy
+
+
def panic(*a, **k):
    "Fallback analysis hook: reject sequencing types with no registered analyzer."
    raise ValueError("No analysis method for this sequencing type")
+
# Dispatch table mapping sequencing platform name -> analysis routine.
# Callers look up with config.get(sequencingType, panic), so unrecognized
# platforms raise via panic().
config = {'Roche 454':analyze_shortread,
          'Illumina MiSeq':analyze_shortread,
          'Illumina NextSeq':analyze_shortread,
          'Illumina HiSeq':analyze_shortread,
          'IonTorrent PGM':analyze_shortread,
          'PacBio RS':analyze_longread,
          'Oxford Nanopore MiniION':analyze_longread}
+
+
+class QaMemo(object):
+ "lazy-evaluated QA operation"
+
+ result = None #named tuple results object
+
+ @classmethod
+ def analyze(cls,
+ sequenceFiles,
+ sequencingType,
+ map_f=qa_params.get('map implementation', map),
+ plots=qa_params.get('plots', True),
+ annotation=qa_params.get('annotation', '')):
+ if type(sequenceFiles) == str: #if this is a single file
+ sequenceFiles = [sequenceFiles]
+ analysis = partial(config.get(sequencingType, panic), map_f=map_f)
+# result = reduce(lambda a, b: a & b, map(analysis, sequenceFiles), ResultTuple.zero())
+ result = MergedResultTuple(**{fi:analysis(fi) for fi in sequenceFiles})
+ if plots:
+ try:
+ from gims_qa_plot import qa_plots
+ result_df = result.to_dataFrame()
+ qa_plots(result_df, annotation)
+ except Exception:
+ import traceback
+ traceback.print_exc()
+ return result
+
+ @classmethod
+ def of(cls, sequenceFiles, sequencingType):
+ try:
+ if not cls.result:
+ cls.result = QaMemo.analyze(sequenceFiles, sequencingType)
+ return cls.result
+ except TypeError as e:
+ raise ValueError(str(e)), None, sys.exc_info()[2]
+
+
# Kraken summary: most prevalent taxon and its prevalence, the runner-up
# "contaminant" taxa with their prevalences, and a checksum identifying
# the database that produced the call.
TaxTuple = namedtuple("TaxTuple", ("taxon", "taxon_prevalence", "contaminant", "contaminant_prevalence", "checksum"))
+
+
class TaxMemo(object):
    """Lazy-evaluated taxonomy (kraken) check.

    Like QaMemo, the result of the first of() call is cached on the class
    and the arguments of later calls are ignored.
    """

    result = None  # cached TaxTuple from the first kraken run

    @classmethod
    def analyze(cls, sequenceFiles, database):
        "Run kraken against `database` and return its TaxTuple."
        from gims_qa_kraken import run_kraken
        return run_kraken(sequenceFiles, database,
                          dict(preload=None,
                               threads=qa_params.get('parallelism', 1)))

    @classmethod
    def of(cls, *args, **kwargs):
        "Return the cached kraken result, computing it on first use."
        if not cls.result:
            cls.result = TaxMemo.analyze(*args, **kwargs)
        return cls.result
+
failure_mode = ""


def update_failure_mode(failure):
    """Record a QA failure description in the module-global failure_mode.

    Successive failures accumulate into one '; '-separated string, which
    reasonForFailure() later reports.
    """
    global failure_mode
    failure_mode = "{}; {}".format(failure_mode, failure) if failure_mode else failure
+
+
+##########################################################################################
+#
+# START HERE to design and configure the tests ----------
+#
+##########################################################################################
+
+
+
+
@qa_test(disable=True)
def parametersUsed(sequencingType='not given',
                   coverageThreshold='not given',
                   approximateGenomeSize=4500000):
    "Simple function to save the test and evaluation parameters used"
    # NOTE(review): the three parameters above are accepted but never used;
    # the returned element is built solely from the global qa_params mapping.
    import xml.etree.ElementTree as xml
    el = xml.SubElement
    root = xml.Element("parametersUsed")
    for key, value in qa_params.items():
        el(root, "parameter", attrib=dict(key=str(key), value=str(value)))
    return root
+
@qa_test
def readCount(sequenceFileName, sequencingType):
    "Total number of reads across all input files."
    return QaMemo.of(sequenceFileName, sequencingType).readCount
+
@qa_test
def coverage(sequenceFileName, sequencingType, approximateGenomeSize=4500000):
    """Approximate depth of coverage: total bases / assumed genome size.

    NOTE(review): integer division under Python 2, so the result is floored
    (the sample output shows a whole-number coverage of 2).
    """
    return QaMemo.of(sequenceFileName, sequencingType).numBases / int(approximateGenomeSize)
+
+
+# @qa_test(disable=False)
+# def coverageQThirtyPlus(sequenceFileName, sequencingType, approximateGenomeSize=4500000, minimumQScore=qa_params.get('user Q threshold', 27.0)):
+# numOverMinQ = 0
+# for score, num in QaMemo.of(sequenceFileName, sequencingType).baseQScoreHisto.items():
+# if score >= int(minimumQScore):
+# numOverMinQ += num
+# return numOverMinQ
+
+
def mean(counter):
    """Weighted mean of a Counter that maps value -> occurrence count.

    Raises ZeroDivisionError on an empty counter, as before.
    """
    total = 0.0
    weight = 0
    for value, occurrences in counter.items():
        total += value * occurrences
        weight += occurrences
    return total / float(weight)
+
+
@qa_test
def averageQPerFile(sequenceFileName, sequencingType):
    "Sorted list of (filename, mean Q-score) pairs, one per input file."
    return sorted({fi:mean(res.qualityHisto) for fi,res in QaMemo.of(sequenceFileName, sequencingType)}.items())
+
+
def averages_exceed(qualityHisto, q):
    "True when the weighted mean quality in qualityHisto is at least q."
    # inline weighted mean: sum(score * count) / sum(count)
    total = float(sum(score * cnt for score, cnt in qualityHisto.items()))
    observations = float(sum(qualityHisto.values()))
    return total / observations >= q
+
+
@qa_test
def qcStatus(sequenceFileName, sequencingType,
             coverageThreshold=None,
             approximateGenomeSize=4500000,
             minimumQScore=qa_params.get('user Q threshold', 27.0),
             bbThreshold=qa_params.get('permitted base balance variance', 0.2)):
    """Overall pass/fail verdict: per-file mean Q, AT/GC base balance, and
    (when a threshold is given) coverage must all clear their limits.

    Each failing check is recorded via update_failure_mode so that
    reasonForFailure() can report it.
    """
    results = QaMemo.of(sequenceFileName, sequencingType)
    # 1) every file's mean quality must reach minimumQScore
    if not all([averages_exceed(r.qualityHisto, minimumQScore) for f, r in results]):
        update_failure_mode("qscore below {}".format(minimumQScore))
        return "Fail"
    # 2) A/T and G/C ratios must stay within +/- bbThreshold of 1.0.
    #    Guard the ratios (as at_gc_baseBalance does) so an absent T or C
    #    count can't divide by zero; reuse `results` rather than re-running
    #    QaMemo.of.
    a, t, g, c = [sum([n[b] for n in results.nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    at = float(a) / float(t) if t else 1
    gc = float(g) / float(c) if c else 1
    if any((at > 1.0 + bbThreshold, at < 1.0 - bbThreshold, gc > 1.0 + bbThreshold, gc < 1.0 - bbThreshold)):
        update_failure_mode("AT or GC base-balance outside of {} threshold".format(bbThreshold))
        return "Fail"
    # 3) optional coverage check (integer division under py2, as in coverage())
    if coverageThreshold:
        if not (results.numBases / int(approximateGenomeSize)) >= float(coverageThreshold):
            update_failure_mode("coverage below {}".format(coverageThreshold))
            return "Fail"
    return "Pass"
+
+
+
+
def baseBalance(sequenceFileName, sequencingType):
    """Return the (A/T, G/C) count ratios; a zero denominator yields 1.

    NOTE(review): this definition is shadowed by the decorated baseBalance
    defined below it, so at module level it is effectively dead code.
    """
    results = QaMemo.of(sequenceFileName, sequencingType)
    a, t, g, c = [sum([n[b] for n in results.nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    return (float(a) / float(t) if t else 1, float(g) / float(c) if c else 1)
+
@qa_test
def at_gc_baseBalance(sequenceFileName, sequencingType):
    """Report the A/T and G/C count ratios as an 'AT/GC' string.

    A balanced library is ~1.0/1.0; a zero T or C count falls back to a
    ratio of 1 to avoid dividing by zero.
    """
    a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    at = float(a) / float(t) if t else 1
    gc = float(g) / float(c) if c else 1
    return "{:02}/{:02}".format(at, gc)
+
@qa_test(disable=True)
def baseBalance(sequenceFileName, sequencingType, threshold=qa_params.get('permitted base balance variance', 0.2) ):
    """XML-producing base-balance check: Fail when either the A/T or G/C
    ratio strays more than `threshold` from 1.0, recording the failure via
    update_failure_mode.

    NOTE(review): this def shadows the undecorated baseBalance above.
    """
    import xml.etree.ElementTree as xml
    el = xml.SubElement
    a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
    at = float(a) / float(t) if t else 1
    gc = float(g) / float(c) if c else 1
    status = "Pass"
    if any((at > 1.0 + threshold, at < 1.0 - threshold, gc > 1.0 + threshold, gc < 1.0 - threshold)):
        status = "Fail"
        update_failure_mode("Base balance out of variance threshold ({:.2},{:.2} > {})".format(at, gc, threshold))
    root = xml.Element('baseBalanceResult')
    el(root, 'AT').text = str(at)
    el(root, 'GC').text = str(gc)
    el(root, 'baseBalanceStatus').text = status
    return root
+
+
+
+
+
@qa_test(disable=not os.path.exists(qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')),
         profile=False)
def taxonomyQualityUsingKraken(sequenceFileName,
                               sequencingType,
                               genus=None,
                               species=None,
                               databasePath=qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')
                               ):
    """Check the kraken-derived taxonomy against the submitted genus/species.

    Returns a taxonomyQualityResult element carrying the pass/fail status,
    the most prevalent taxon, the four runner-up "contaminant" taxa, and
    the database path with its checksum.  Status is "Undetermined" when no
    genus was submitted.

    Fix: the decorator previously used `disable = ~os.path.exists(...)`;
    bitwise NOT of a bool is -1 or -2 (always truthy), so the test was
    unconditionally disabled.  `not` expresses the intended "disable when
    the database is missing".
    """
    import xml.etree.ElementTree as xml
    el = xml.SubElement
    res = TaxMemo.of(sequenceFileName, databasePath)
    root = xml.Element('taxonomyQualityResult')
    if not genus:
        el(root, 'taxStatus').text = "Undetermined"
    else:
        if genus in res.taxon.split() and species in res.taxon.split():
            el(root, 'taxStatus').text = "Pass"
        else:
            el(root, 'taxStatus').text = "Fail"
            update_failure_mode('"{} {}" was not in agreement with calculated ID "{}"'.format(genus, species, res.taxon))
    # keep only the trailing 230 characters to fit the destination DB field
    el(root, 'mostPrevalentTaxon', dict(prevalence=str(res.taxon_prevalence))).text = res.taxon[-230:]

    contams = el(root, 'contaminants')
    for i in range(4):
        el(contams, 'prevalentContaminant', dict(prevalence=str(res.contaminant_prevalence[i]), ordinal=str(i))).text = res.contaminant[i][-230:]
    el(root, 'database', dict(checksum=str(res.checksum))).text = databasePath

    return root
+
+
+
@qa_test
def reasonForFailure():
    "Semicolon-joined list of recorded QA failures, truncated to fit the DB field."
    global failure_mode
    # destination DB field holds 255 characters; the previous slice [0:254]
    # kept only 254, wasting one character of capacity
    return failure_mode[:255]
+
@qa_test(disable=True)
def run_serotype_usingseqsero(
        sequenceFileName,
        sequencingType):
    """Delegate serotyping of a paired read set to gims_qa_serotyping.

    `sequenceFileName` is expected to hold at least two paths (forward and
    reverse reads); returns the SerotypeQualityResultType XML element built
    by the SeqSero wrapper.  (A previous revision had an unreachable
    `return root` after this return, referencing an undefined name, plus
    pointless `"" +` string concatenations; both removed.)
    """
    from gims_qa_serotyping import run_serotype_usingseqsero

    return run_serotype_usingseqsero(
        sequenceFileName[0], sequenceFileName[1],
        sequencingType,
        Oantigen="Undetermined",
        H1antigen="Undetermined",
        H2antigen="Undetermined",
        AntigenicProfile="Undetermined",
        PredictedSerotype="Undetermined")
diff -r 000000000000 -r dafec28bd37e qa_core/gims_qa_tests.pyc
Binary file qa_core/gims_qa_tests.pyc has changed
diff -r 000000000000 -r dafec28bd37e qa_core/qa_core.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/qa_core.sh Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,14 @@
#!/bin/bash

# Launcher for the GIMS QA core on the cluster:
#   1. load the runtime environment modules,
#   2. quietly self-update the deployed checkout from the 'deployment' branch,
#   3. run the QA core on the input XML ($1) with 8 parallel workers.

#export MODULEPATH=/etc/modulefiles:/opt/mellanox/bupc/2.2/modules:/sw/Modules/
#export GIT_SSH_COMMAND="ssh -i /home/justin.payne/.ssh/deploy_key_dev"

module load qa-core
module load kraken
module load git
#module load fastqc
module load pandas
module load SeqSero

# '>/dev/null 2>&1' discards both streams; the previous '2>&1>/dev/null'
# order redirected stderr to the terminal's stdout *before* stdout was
# pointed at /dev/null, so git errors still reached the terminal.
GIT_SSH_COMMAND="ssh -qi /sw/apps/qa-core/deploy_key_dev" git -C /sw/apps/qa-core/ pull origin deployment --quiet >/dev/null 2>&1
# quote $1 so input paths containing spaces survive word-splitting
TEMP=/isilon/gpfs_scratch python /sw/apps/qa-core/gims_core.py "$1" -p 8
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/NY65357675_S20_L001_R1_001.fastq.gz
Binary file qa_core/sample/Input/NY65357675_S20_L001_R1_001.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/NY65357675_S20_L001_R2_001.fastq.gz
Binary file qa_core/sample/Input/NY65357675_S20_L001_R2_001.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/forward.fastq.gz
Binary file qa_core/sample/Input/forward.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/forward_big.fastq.gz
Binary file qa_core/sample/Input/forward_big.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/input.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/sample/Input/input.xml Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,45 @@
+
+
+
+
+
+
+
+
+
+ Campylobacter
+ jejuni
+
+
+
+
+
+
+
+
+
+
+
+
+
+ forward_big.fastq.gz
+ reverse_big.fastq.gz
+
+
+
+
+
+ Illumina MiSeq
+
+
+
+ 4500000
+ 20
+
+
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/reverse.fastq.gz
Binary file qa_core/sample/Input/reverse.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/reverse_big.fastq.gz
Binary file qa_core/sample/Input/reverse_big.fastq.gz has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Input/roche_10000.sff
Binary file qa_core/sample/Input/roche_10000.sff has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/base_balance.png
Binary file qa_core/sample/Output/base_balance.png has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/nucleotide_distribution.png
Binary file qa_core/sample/Output/nucleotide_distribution.png has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/output.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/sample/Output/output.tsv Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,2 @@
+at_gc_baseBalance averageQPerFile coverage name qcStatus readCount reasonForFailure version
+0.957667053211/0.982688971661 [('qa_core/sample/Input/forward_big.fastq.gz', 35.54551581739571), ('qa_core/sample/Input/reverse_big.fastq.gz', 25.590399012754776)] 2 TEST TEST TEST Fail 59994 qscore below 27.0 galaxy/pure+mulcore+plots+qscore+basebal
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/sample/Output/output.xml Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,14 @@
+
+
+ galaxy/pure+mulcore+plots+qscore+basebal
+ 59994
+ 2
+ [('forward_big.fastq.gz', 35.54551581739571), ('reverse_big.fastq.gz', 25.590399012754776)]
+ Fail
+
+ 0.957667053211
+ 0.982688971661
+ Pass
+
+ qscore below 27.0
+
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/qscore_distribution.png
Binary file qa_core/sample/Output/qscore_distribution.png has changed
diff -r 000000000000 -r dafec28bd37e qa_core/sample/Output/qscore_histogram.png
Binary file qa_core/sample/Output/qscore_histogram.png has changed