changeset 0:dafec28bd37e

planemo upload
author jpayne
date Tue, 08 May 2018 18:54:20 -0400
parents
children d1f1b65cff7a
files base_balance.png cfsan-qaqc.xml qa_core/.gitignore qa_core/.gitmodules qa_core/README.md qa_core/gims_core.py qa_core/gims_qa_core.py qa_core/gims_qa_kraken.py qa_core/gims_qa_plot.py qa_core/gims_qa_plot.pyc qa_core/gims_qa_serotyping.py qa_core/gims_qa_tests.py qa_core/gims_qa_tests.pyc qa_core/qa_core.sh qa_core/sample/Input/NY65357675_S20_L001_R1_001.fastq.gz qa_core/sample/Input/NY65357675_S20_L001_R2_001.fastq.gz qa_core/sample/Input/forward.fastq.gz qa_core/sample/Input/forward_big.fastq.gz qa_core/sample/Input/input.xml qa_core/sample/Input/reverse.fastq.gz qa_core/sample/Input/reverse_big.fastq.gz qa_core/sample/Input/roche_10000.sff qa_core/sample/Output/base_balance.png qa_core/sample/Output/nucleotide_distribution.png qa_core/sample/Output/output.tsv qa_core/sample/Output/output.xml qa_core/sample/Output/qscore_distribution.png qa_core/sample/Output/qscore_histogram.png
diffstat 28 files changed, 1774 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file base_balance.png has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cfsan-qaqc.xml	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,73 @@
+<tool id="cfsan-qaqc" name="CFSAN QA/QC process" version="0.1.0">
+    <requirements>
+        <requirement type="package" version="1.65">biopython</requirement>
+        <requirement type="package" version="1.4.3">matplotlib</requirement>
+        <requirement type="package" version="1.9.2">numpy</requirement>
+        <requirement type="package" version="0.16.1">pandas</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        #if $reads.reads_select == 'collection'
+            #set name=$reads.coll.name
+            #set forward=$reads.coll.forward
+            #set reverse=$reads.coll.reverse
+        #else
+            #set name=$forward.name
+        #end if
+        mkdir plots &&
+        qa_core/gims_core.py --sequencingType "Illumina MiSeq"
+                             --approximateGenomeSize $genomeSize
+                             --coverageThreshold $covThreshold
+                             --minimumQScore $minQ
+                             --output_plots plots
+                             --parallelism \${GALAXY_SLOTS:-6}
+                             --annotate $name
+                             --log -
+                             suite
+                             $forward
+                             $reverse
+                             -o $output
+    ]]></command>
+    <inputs>
+        <conditional name="reads">
+            <param name="reads_select" type="select" label="Paired-end collection, or two datasets from your history">
+                <option value="collection">Paired collection from your history</option>
+                <option value="history">Two FASTQ datasets from your history</option>
+            </param>
+            <when value="collection">
+                <param label="Paired reads" name="coll" type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" collection_type="paired" />
+            </when>
+            <when value="history">
+                <param label="Forward reads" type="data" name="forward" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" />
+                <param label="Reverse reads" type="data" name="reverse" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,fastq.bz2,fastqsanger.bz2" />
+            </when>
+        </conditional>
+        <param name="genomeSize" type="text" label="Approximate genome size" value="4700000" />
+        <param name="covThreshold" type="text" label="Coverage threshold" value="20" />
+        <param name="minQ" type="text" label="Minimum average Q score" value="27.0" />
+    </inputs>
+    <outputs>
+        <data label="QA/QC Results" name="output" format="tabular" />
+        <data label="Nucleotide plot" name="nucleo_plot" format="png" from_work_dir="nucleotide_distribution.png" />
+        <data label="QScore Distribution" name="qscore_dist" format="png" from_work_dir="qscore_distribution.png" />
+        <data label="QScore Histogram" name="qscore_histro" format="png" from_work_dir="qscore_histogram.png" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="reads_select" value="collection" />
+            <param name="coll">
+                <collection type="list:paired">
+                <element name="TEST TEST TEST">
+                <collection type="paired">
+                    <element name="forward" value="Input/forward_big.fastq.gz" ftype="fastqsanger.gz" />
+                    <element name="reverse" value="Input/reverse_big.fastq.gz" ftype="fastqsanger.gz" />
+                </collection>
+                </element>
+                </collection>
+            </param>
+            <output name="output" value="Output/output.tsv" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitignore	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,17 @@
+*.pyc
+sample/Output/*xml
+./deploy-dev
+./deploy-dev.pub
+./deploy-prep
+./deploy-prep.pub
+./deploy-prod
+./deploy-prod.pub
+deploy-*
+venv
+venv/*
+.DS_Store
+*.patch
+*bak
+tar
+.vscode
+.vscode/*
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/.gitmodules	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,3 @@
+[submodule "task_core"]
+	path = task_core
+	url = http://10.12.207.89/justin.payne/task_core.git
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/README.md	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,7 @@
+# qa_core
+## Genomics QA Core for CFSAN Sequencing Operations
+
+### Installing
+	virtualenv venv
+	source venv/bin/activate
+	pip install -r requirements.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_core.py	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,300 @@
+#! /usr/bin/env python
+
+import argparse
+import binascii
+from collections import OrderedDict
+from contextlib import contextmanager
+import xml.dom.minidom as dom
+from xml.etree import ElementTree as xml
+from functools import partial, wraps
+from multiprocessing.pool import ThreadPool as Pool
+from inspect import getargspec
+import os, os.path, sys, errno
+import imp
+import subprocess
+import logging
+
+ver = "unknown"
+
+
+usage = """ CFSAN Genomics QA Core
+May 15, 2015
+Justin Payne
+justin.payne@fda.hhs.gov
+
+	usage:
+	{0} /path/to/sandbox (production mode)
+	{0} {{0}}
+
+""".format(os.path.basename(__file__))
+
+PHRED_OFFSET = 33
+
+input_subdir = 'Input'
+output_subdir = 'Output'
+
+#Heng Li's readfq implementation
+#https://github.com/lh3/readfq/blob/master/readfq.py
+def readfq(fp): # this is a generator function
+    last = None # this is a buffer keeping the last unprocessed line
+    while True: # mimic closure; is it a bad idea?
+        if not last: # the first record or a record following a fastq
+            for l in fp: # search for the start of the next record
+                if l[0] in '>@': # fasta/q header line
+                    last = l[:-1] # save this line
+                    break
+        if not last: break
+        name, seqs, last = last[1:].partition(" ")[0], [], None
+        for l in fp: # read the sequence
+            if l[0] in '@+>':
+                last = l[:-1]
+                break
+            seqs.append(l[:-1])
+        if not last or last[0] != '+': # this is a fasta record
+            yield name, ''.join(seqs), [] # yield a fasta record
+            if not last: break
+        else: # this is a fastq record
+            seq, leng, seqs = ''.join(seqs), 0, []
+            for l in fp: # read the quality
+                seqs.append(l[:-1])
+                leng += len(l) - 1
+                if leng >= len(seq): # have read enough quality
+                    last = None
+                    yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record
+                    break
+            if last: # reach EOF before reading enough quality
+                yield name, seq, [] # yield a fasta record instead
+                break
+
+#from qa_core import qa_test, new_output_file, open_input, readfq, qa_params
+
+qa_tests = OrderedDict()
+
+qa_params = OrderedDict()
+
+qa_core = imp.new_module("qa_core")
+sys.modules['qa_core'] = qa_core
+
+def qa_test(func=None, disable=False, profile=True):
+	"QA Test decorator"
+	def qa_test_decorator(qa_func):
+		#define a parameter-binding wrapper
+		@wraps(qa_func)
+		def wrap(namespace):
+			"Use partial application of functions to bind parameters declared in the QA function from a namespace object"
+			monad = qa_func
+			for argument in getargspec(qa_func).args:
+				if hasattr(namespace, argument) and getattr(namespace, argument):
+					monad = partial(monad, **{argument: getattr(namespace, argument)})
+
+			#and then call it
+			return monad()
+
+		wrap.__profile_enabled__ = profile
+		if not disable:
+			# if not name:
+			# 	name = qa_func.__name__
+			qa_tests[qa_func.__name__] = wrap
+		return wrap
+	if func:
+		return qa_test_decorator(func)
+	return qa_test_decorator
+
+qa_core.qa_test = qa_test
+qa_core.readfq = readfq
+qa_core.qa_params = qa_params
+
+@qa_test
+def version():
+	"Collect repository commit id from git"
+	p = os.path.abspath(os.path.dirname(__file__))
+	try:
+		ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '')
+	except subprocess.CalledProcessError:
+		ref = "+".join(qa_tests.keys())[:255]
+	return ref
+
+@qa_test
+def name(annotation=None):
+	if annotation:
+		return annotation
+
+@contextmanager
+def open_file(name, path='.', mode='w'):
+	with open(os.path.join(path, name), mode) as f:
+		if 'r' in mode and binascii.hexlify(f.read(2)) == b'1f8b':
+			import gzip
+			with gzip.open(os.path.join(path, name), mode) as g:
+				yield g
+		else:
+			f.seek(0)
+			yield f
+
+def suite(namespace, log=logging.getLogger('qa.suite')):
+	for name, test in qa_tests.items():
+			logg = log.getChild(name)
+			try:
+				result = test(namespace)
+				if result is not None:
+					logg.info(result)
+				yield name, result
+			except Exception as e:
+				import traceback
+				logg.error(traceback.format_exc())
+
+
+def main(namespace):
+	if namespace.annotation:
+		qa_params['annotation'] = namespace.annotation
+	parallelism = vars(namespace).get('parallelism', 1)
+	if parallelism > 1:
+		qa_params['map implementation'] = Pool(parallelism).map
+	if namespace.mode not in ('suite', 'profile'):
+		#it's a path to a sandbox job
+		logging.getLogger('qa.input').info("Load job description xml from {}...".format(namespace.mode))
+		tree = xml.parse(os.path.join(namespace.mode, "Input/input.xml"))
+		root = tree.getroot()
+		#logging.getLogger('qa.input').debug(xml.tostring(root).strip())
+		for element in root:
+			val = getattr(namespace, element.tag)
+			if isinstance(val, list):
+				val.append(element.text)
+			else:
+				setattr(namespace, element.tag, element.text)
+		#set up utility functions
+		path = os.path.join(namespace.mode, 'Output')
+		try:
+			os.makedirs(path)
+		except OSError as e:
+			if e.errno != errno.EEXIST:
+				raise
+		except Exception as e:
+			logging.getLogger('qa.setup').error(e)
+		qa_core.new_output_file = partial(open_file, path=path)
+		#qa_core.new_output_file = lambda f: open_file(f, path=path)
+		qa_core.open_input = partial(open_file, path=os.path.join(namespace.mode, "Input"), mode='r')
+		#import and run tests
+		logging.getLogger('qa.tests').info("Loading and running tests...")
+		import gims_qa_tests
+		root = xml.Element("qaQcCoverageOutput")
+		tree = xml.ElementTree(element=root)
+		[logging.getLogger('qa.params').info("{}:{}".format(k,v)) for k,v in vars(namespace).items()]
+		for i, (name, result) in enumerate(suite(namespace)):
+			if isinstance(result, xml.Element):
+				root.append(result)
+			else:
+				if result:
+					xml.SubElement(root, name).text = str(result)
+
+		try:
+			with qa_core.new_output_file('output.xml') as out:
+				d = dom.parseString(xml.tostring(root))
+				d.writexml(out, addindent='    ', newl='\n')
+			logging.getLogger('qa.results').info("Finished tests and wrote to {}".format(namespace.mode))
+		except Exception as e:
+			logging.getLogger('qa.results').error(e)
+	elif namespace.mode == 'suite':
+		qa_core.open_input = partial(open_file, mode='rb')
+		qa_core.new_output_file = open_file
+		if not namespace.output_plots:
+			qa_params['plots'] = False
+		else:
+			#set up directory for plots
+			pass
+		import gims_qa_tests
+		def fix_value(result):
+			if isinstance(result, xml.Element):
+				return xml.tostring(result)
+			return result
+		results = {name:fix_value(result) for name, result in suite(namespace)}
+		import csv
+		wtr = csv.DictWriter(namespace.output, delimiter='\t', fieldnames=sorted(results.keys()))
+		wtr.writeheader()
+		wtr.writerow(results)
+	elif namespace.mode == 'profile':
+		return profile(namespace)
+	
+
+def profile(namespace):
+	log = logging.getLogger('qa.profile')
+	import cProfile
+	import StringIO
+	from Bio.Seq import Seq
+	from Bio.SeqRecord import SeqRecord
+	from Bio.Alphabet import IUPAC
+	import random
+	from random import choice as c
+	from tempfile import TemporaryFile as tmp
+
+	qa_params['plots'] = False
+
+	profile_fastq = StringIO.StringIO()
+	log.info("Setup...")
+	#create some fake FASTQ 'reads'
+	for r in range(10000):
+		#boring random distribution
+		#s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)])
+		#interesting diamond distribution
+		s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)])
+		s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)])
+
+		q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)]
+		rec = SeqRecord(Seq(s, IUPAC.unambiguous_dna), id="read_{}".format(r))
+		rec.letter_annotations['phred_quality'] = q
+		profile_fastq.write(rec.format('fastq'))
+	@contextmanager
+	def input_stub(filename, mode=None):
+		if 'xml' in filename:
+			profile_xml.seek(0)
+			yield profile_xml
+		elif 'fastq' in filename:
+			profile_fastq.seek(0)
+			yield profile_fastq
+		else:
+			yield open('/dev/null', 'r')
+			
+	qa_core.open_input = input_stub
+	profiling_buffer = StringIO.StringIO()
+
+	@contextmanager
+	def get_profiling_buffer(*a, **k):
+		yield profiling_buffer
+		profiling_buffer.seek(0)
+		
+	qa_core.new_output_file = get_profiling_buffer
+	import gims_qa_tests
+
+	namespace.sequenceFileName = ('forward.fastq', 'reverse.fastq')
+	namespace.sequencingType = 'Illumina MiSeq'
+	namespace.approximateGenomeSize = 1000000
+
+	def run_tests(namespace, log):
+		log.getChild('run')
+		log.info(vars(namespace))
+		for name, test in qa_tests.items():
+			if test.__profile_enabled__:
+				test(namespace)
+	log.info("Profiling...")
+	return cProfile.runctx('run_tests(namespace, log)', None, locals(), sort='tottime')
+
+if __name__ == '__main__':
+	
+	parser = argparse.ArgumentParser()
+	parser.add_argument("mode")
+	parser.add_argument('sequenceFileName', nargs='*')
+	parser.add_argument('--genus','-g')
+	parser.add_argument('--species', '-s')
+	parser.add_argument('--sequencingType', '-t')
+	parser.add_argument('--approximateGenomeSize','--gs', type=int, default=4500000)
+	parser.add_argument('--coverageThreshold', '-c', type=int, default=20)
+	parser.add_argument('--minimumQScore', '-q', type=float, default=27.0)
+	parser.add_argument('--databasePath', '--db')
+	parser.add_argument('--log', '-l', type=argparse.FileType('w'), default=sys.stdout)
+	parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout)
+	parser.add_argument('--output_plots')
+	parser.add_argument('--parallelism', '-p', type=int, default=1)
+	parser.add_argument('--annotate', '-a', type=str, default=None, dest='annotation')
+	params = parser.parse_args()
+	logging.basicConfig(format='%(name)s\t%(levelname)s:%(message)s', level=logging.DEBUG, stream = params.log)
+	logging.getLogger('qa.cmd').info(" ".join(sys.argv))
+	main(params)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_core.py	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,390 @@
+#!/usr/bin/env python
+
+#from __future__ import print_function
+
+import xml.dom.minidom as dom
+import xml.etree.ElementTree as xml
+import sys
+import os, os.path
+from functools import partial, wraps
+from inspect import getargspec, getfile, currentframe, getsource
+from os.path import join as j
+import subprocess
+import textwrap
+import imp
+from collections import OrderedDict
+from contextlib import contextmanager
+import StringIO
+import gzip
+
+ver = "core+plots+fxt"
+
+
+usage = """ CFSAN Genomics QA Core
+May 15, 2015
+Justin Payne
+justin.payne@fda.hhs.gov
+
+	usage:
+	{0} /path/to/sandbox (production mode)
+	{0} {{0}}
+
+""".format(os.path.basename(__file__))
+
+PHRED_OFFSET = 33
+
+input_subdir = 'Input'
+output_subdir = 'Output'
+
+path = os.getcwd()
+
+qa_tests = OrderedDict()
+qa_undecorated_tests = OrderedDict()
+qa_params = OrderedDict()
+
+class QaParamsDict(OrderedDict):
+	"orderedDict that sets on get access"
+	
+	def get(self, key, value=None):
+		self[key] = result = super(QaParamsDict, self).get(key, value)
+		return result
+
+
+qa_params = QaParamsDict()
+
+def int_then_string(value):
+	try:
+		return int(value)
+	except ValueError:
+		return str(value)
+		
+class IncompleteXMLError(Exception):
+	pass
+
+def qa_test(func=None, disable=False, profile=True):
+	"Decorator to simplify XML interaction"
+	def wrap(qa_func):
+		"Inner function to decorate qa test"
+		#print "registering", qa_func.__name__
+		qa_undecorated_tests[qa_func.__name__] = qa_func
+		if disable:
+			return qa_func
+		@wraps(qa_func)
+		def bind_metadata_to_arguments(metadata, qa_func):
+			"Find elements in XML with same name as function argument and bind their content"
+			try:
+				arguments = list(getargspec(qa_func).args)
+			except TypeError:
+				arguments = list(getargspec(qa_func.func).args)
+			for argument in arguments:
+				elements = metadata.findall(".//{0}".format(argument))
+				if len(elements) == 1:
+					qa_func = partial(qa_func, **{argument:int_then_string(elements[0].text)})
+				elif len(elements) > 1:
+					qa_func = partial(qa_func, **{argument:[int_then_string(e.text) for e in elements]})
+				else:
+					pass
+			try:
+				return qa_func()
+# 			except TypeError as e:
+# 				for el in metadata:
+# 					if el.tag in args:
+# 						arguments.remove(el.tag)
+# 				message = "Missing argument or arguments in input.xml: {} not present.".format(', '.join(arguments))
+# 				raise IncompleteXMLError, message, sys.exc_info()[-1]
+			finally:
+				pass
+			#bind_metadata_to_arguments
+		decorated_qa_func = partial(bind_metadata_to_arguments, qa_func=qa_func)
+		decorated_qa_func.__profile_enabled__ = profile
+		qa_tests[qa_func.__name__] = decorated_qa_func
+		return decorated_qa_func
+		#qa_test_decorator
+	if func:
+		return wrap(func)
+	return wrap
+
+
+qa_core = imp.new_module("qa_core")
+qa_core.qa_test = qa_test
+qa_core.test_output_buffer = StringIO.StringIO()
+
+def present(file):
+	return "/dev/null"
+
+qa_core.present = present
+
+@contextmanager
+def get_test_buffer(*args, **kwargs):
+	yield qa_core.test_output_buffer
+
+@contextmanager
+def get_test_input_file(*args, **kwargs):
+	yield open('/dev/null', 'r') 
+
+qa_core.new_output_file = get_test_buffer
+qa_core.open_input = get_test_input_file
+qa_core.qa_params = qa_params
+sys.modules['qa_core'] = qa_core
+	
+
+def main(sandbox, profile=False, *a, **k):
+	output = "qa_core: ran "
+	with qa_core.open_input('input.xml') as inputfile:
+		metadata = xml.parse(inputfile).getroot()
+	
+	qaoutput = xml.Element("qaQcCoverageOutput")
+	outputxml = xml.ElementTree(element=qaoutput)
+	
+	for name, test in qa_tests.items():
+		if (profile and test.__profile_enabled__) or not profile:
+			output += "{0}, ".format(name)
+			try:
+				result = test(metadata)
+				if isinstance(result, xml.Element):
+					qaoutput.append(result)
+				else:
+					if result:
+						xml.SubElement(qaoutput, name).text = str(result)
+			except Exception as e:
+				output += 'and died on error.'
+				import traceback
+				traceback.print_exc()
+				print(test)
+				xml.SubElement(qaoutput, 'reasonForFailure').text = str(e)
+				break
+# 	if gims_qa_tests.failure_mode:
+# 		xml.SubElement(qaoutput, "reasonForFailure").text = str(gims_qa_tests.failure_mode)
+
+	with qa_core.new_output_file('output.xml') as outputfile:
+		d = dom.parseString(xml.tostring(qaoutput))
+		d.writexml(outputfile, addindent='    ', newl='\n')
+	return output
+
+
+
+
+
+@qa_test
+def version():
+	"Collect repository commit id from git"
+	p = os.path.abspath(os.path.dirname(__file__))
+	try:
+		ref = subprocess.check_output("git -C {0} describe".format(p), shell=True).replace("\n", '')
+	except subprocess.CalledProcessError:
+		ref = ver
+	return ref
+	
+@qa_test(disable=True)
+def profile():
+	"profiling command to test performance"
+	
+	print("Starting profiler. Imports...")
+	
+	import cProfile
+	import StringIO
+	import Bio
+	import random
+	from random import choice as c
+
+	from tempfile import TemporaryFile as tmp
+
+	
+	print("Setting up test data...")
+	
+	profile_xml =  StringIO.StringIO("""<?xml version='1.0' encoding='UTF-8'?>
+<qaQcCoverageInput>
+	<sequenceFileName>forward.fastq</sequenceFileName>
+	<sequenceFileName>reverse.fastq</sequenceFileName>
+	<sequencingType>Illumina MiSeq</sequencingType>
+	<approximateGenomeSize>1000000</approximateGenomeSize>
+</qaQcCoverageInput>	
+""")
+
+	qa_params['plots'] = False
+
+	profile_fastq = StringIO.StringIO()
+	#create some fake FASTQ 'reads'
+	for r in range(10000):
+		#boring random distribution
+		#s = "".join([random.choice(('A', 'T', 'G', 'C')) for i in range(300)])
+		#interesting diamond distribution
+		s = "".join([c(c(('A', 'C')) * (300-i) + c(('G','T'))*i) for i in range(150)])
+		s += "".join([c(c(('A', 'C')) * 150 + c(('G','T'))*(150-i)) for i in range(150)])
+
+		q = [int(max(0, min((random.normalvariate(15, 10), 62)))) for i in range(300)]
+		rec = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(s, Bio.Alphabet.IUPAC.unambiguous_dna), id="read_{}".format(r))
+		rec.letter_annotations['phred_quality'] = q
+		try:
+			profile_fastq.write(rec.format('fastq'))
+		except:
+			print q
+			print rec
+			raise
+
+		
+	
+			
+	@contextmanager
+	def input_stub(filename, mode=None):
+		if 'xml' in filename:
+			profile_xml.seek(0)
+			yield profile_xml
+		elif 'fastq' in filename:
+			profile_fastq.seek(0)
+			yield profile_fastq
+		else:
+			yield open('/dev/null', 'r')
+			
+	qa_core.open_input = input_stub
+	#qa_core.new_output_file = get_test_buffer
+	
+	profiling_buffer = StringIO.StringIO()
+	
+	@contextmanager
+	def get_profiling_buffer(*a, **k):
+		yield profiling_buffer
+		profiling_buffer.seek(0)
+		
+	qa_core.new_output_file = get_profiling_buffer
+	
+	reload(gims_qa_tests)
+	
+	print("Beginning profiling...")
+	try:
+		s = cProfile.run('main(profile=True)', sort='tottime')
+		#s.sort_stats('pcalls', 'tottime')
+		return str(profiling_buffer.read())
+	except NameError:
+		import traceback
+		traceback.print_exc()
+		main(profile=True)
+		quit()
+	except Exception:
+		import traceback
+		traceback.print_exc()
+		#print str(profile_fastq.getvalue())[:1024]
+		quit()
+	#return exec('main')
+	
+	
+#Heng Li's readfq implementation
+#https://github.com/lh3/readfq/blob/master/readfq.py
+def readfq(fp): # this is a generator function
+    last = None # this is a buffer keeping the last unprocessed line
+    while True: # mimic closure; is it a bad idea?
+        if not last: # the first record or a record following a fastq
+            for l in fp: # search for the start of the next record
+                if l[0] in '>@': # fasta/q header line
+                    last = l[:-1] # save this line
+                    break
+        if not last: break
+        name, seqs, last = last[1:].partition(" ")[0], [], None
+        for l in fp: # read the sequence
+            if l[0] in '@+>':
+                last = l[:-1]
+                break
+            seqs.append(l[:-1])
+        if not last or last[0] != '+': # this is a fasta record
+            yield name, ''.join(seqs), [] # yield a fasta record
+            if not last: break
+        else: # this is a fastq record
+            seq, leng, seqs = ''.join(seqs), 0, []
+            for l in fp: # read the quality
+                seqs.append(l[:-1])
+                leng += len(l) - 1
+                if leng >= len(seq): # have read enough quality
+                    last = None
+                    yield name, seq, [ord(c) - PHRED_OFFSET for c in ''.join(seqs)]; # yield a fastq record
+                    break
+            if last: # reach EOF before reading enough quality
+                yield name, seq, [] # yield a fasta record instead
+                break
+                
+qa_core.readfq = readfq
+
+
+
+if __name__ == "__main__":
+	try:
+		name, sandbox_path = sys.argv[0:2]
+		args = sys.argv[2:]
+		
+		if '-p' in args:
+			from multiprocessing import Pool
+			try:
+				m = int(args[args.index('-p') + 1])
+				p = Pool(processes=m)
+				del args[args.index('-p') + 1]
+				qa_core.qa_params['parallelism'] = m
+			except IndexError, ValueError:
+				p = Pool()
+			args.remove('-p')
+			qa_core.qa_params['map implementation'] = p.map
+			
+		if '-q' in args:
+			try:
+				q = float(args.pop(args.index('-q') + 1))
+				args.remove('-q')
+				qa_params['user Q threshold'] = q
+			except IndexError, ValueError:
+				print("no Q score threshold provided with '-q' option.")
+				quit(1)
+				
+		
+		@contextmanager
+		def new_output_file(filename, mode='w'):
+			if not os.path.exists(j(sandbox_path, output_subdir)):
+				os.mkdir(j(sandbox_path, output_subdir))
+			fileh = open(j(sandbox_path, output_subdir, filename), mode)
+			yield fileh
+			fileh.close()
+			
+		
+		def open_input_from(path):
+			@contextmanager
+			def open_input(filename, mode='rU'):
+				if 'gz' in filename:
+					fopen = gzip.open
+				else:
+					fopen = open
+				fileh = fopen(j(path, filename), mode=mode)
+				yield fileh
+				fileh.close()
+			return open_input
+
+		def present_from(path):
+			def present(file):
+				return os.path.abspath(j(path, file))
+			return present
+		
+		qa_core.new_output_file = new_output_file
+		main_func = main
+		
+		if not os.path.exists(sandbox_path):
+			qa_core.open_input = open_input_from(os.getcwd())
+			qa_core.present = present_from(os.getcwd())
+			try:
+				main_func = lambda s, *a: qa_undecorated_tests[sandbox_path](*a)
+			except KeyError:
+				print("Invalid QA test name: {0}".format(sandbox_path))
+				quit(1)
+		else:
+			qa_core.open_input = open_input_from(j(sandbox_path, input_subdir))
+			qa_core.present = present_from(j(sandbox_path, input_subdir))
+			
+
+		
+	except ValueError:
+		import gims_qa_tests
+		print("Missing argument: path to sandbox or name of QA test.")
+		commands = ""
+		#commands = "\n\t\t".join([' '.join((n, " ".join(["<{0}>".format(a) for a in getargspec(f)[0]]) or 'DELETEME')).replace(' DELETEME', '') for n, f in qa_undecorated_tests.items()])
+		for name, func in qa_undecorated_tests.items():
+			args = " ".join(["<{0}>".format(a) for a in getargspec(func).args])
+			commands += "{0} {1}\n\t\t\t".format(name, textwrap.fill(args, 60, subsequent_indent='\t\t\t\t'))
+		print(usage.format(commands))
+		quit(1)
+	else:
+		import gims_qa_tests
+		print(main_func(sandbox_path, profile=False, *args))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_kraken.py	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,217 @@
+import csv
+from collections import Counter, OrderedDict
+from hashlib import md5
+import os.path
+import tempfile
+from subprocess import check_call
+from gims_qa_tests import TaxTuple
+#from qa_core import present
+
+def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
+	"kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
+
+	#hash the kraken db files
+	md5hash = md5()
+	#os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None)
+	md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read())
+
+	db_hash = md5hash.hexdigest()
+
+
+	with tempfile.NamedTemporaryFile('rU') as kraken_out:
+		with tempfile.NamedTemporaryFile() as seq_data:
+			#cat all the files together
+			check_call("cat {files} > {seq_data.name}".format(
+				files = " ".join([present(f) for f in sequenceFiles]),
+				seq_data = seq_data
+				), 
+			shell=True)
+
+			def make_option(key, value):
+				if value is not None:
+					return "--{} {}".format(key, value)
+				return "--{}".format(key)
+
+			#build options
+			options = " ".join([make_option(k,v) for k,v in kraken_options.items()])
+
+			#call kraken
+			check_call("kraken {options} "
+					"--db {database} "
+					"--gzip-compressed "
+					"--fastq {seq_data.name} "
+					"--output {kraken_out.name}".format(**locals()), shell=True)
+
+			counts = Counter()
+
+			with tempfile.NamedTemporaryFile('rU') as kraken_report:
+				check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
+				rows = csv.reader(kraken_report, delimiter='\t')
+				for seq_id, taxon in rows:
+					counts[taxon] += 1.0
+					counts["total"] += 1.0
+
+				#clean up all the temp files by releasing contexts
+	
+	top_results = counts.most_common(6) #total, plus five most common taxa
+	total, total_count = top_results.pop(0)
+	top_hit, top_hit_count = top_results.pop(0)
+	next_hits = [h[0].replace(';', ' ') for h in top_results]
+	next_hit_counts = [h[1] for h in top_results]
+
+	top_hit_frac = top_hit_count / total_count
+	next_hit_frac = [h / total_count for h in next_hit_counts]
+
+	return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
+
+def  runKrakenSeqtest(sequenceFiles,database):
+
+	krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
+
+	w = datetime.datetime.now()
+        
+	krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
+	krakenreport = "" +  krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
+	
+	sequenceFiles[0]
+	sequenceFiles[1]
+
+	os.path.abspath(os.path.join("Input", sequenceFileName[0]))
+
+	command = "kraken --preload "
+	command += "--db " + database + ' '
+	#if len(fastqPaths) > 1:
+	command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1])) 
+	#  command += ' '.join(fastqPaths)
+	command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout
+	
+	print command
+
+	check_call(command, shell=True)
+	print "%s -- Kraken called..." % datetime.datetime.now()
+
+
+	kraken_report_cmd = "kraken-report --db "
+	kraken_report_cmd += database
+	kraken_report_cmd += " " + krakenoutoutpath + krakenout
+	kraken_report_cmd += " > " + krakenreport
+	
+	print kraken_report_cmd
+
+	check_call(kraken_report_cmd, shell=True)
+
+
+
+
+	parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
+
+	kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
+
+	split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
+
+	print "=====================" + krakenreport + "=========================="
+	print split_kraken_rpt_line
+	split_kraken_rpt_length = len(split_kraken_rpt_line)
+	print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4]
+	taxLen = 6;
+	taxIdIdx = 4;
+	outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M")  + '_metadata_kraken.tsv'
+
+ 	metadata = "Platform        Run     asm_stats_contig_n50    asm_stats_length_bp     asm_stats_n_contig      attribute_package       bioproject_acc  bioproject_center       bioproject_title        biosample_acc   collected_by    sample_name     scientific_name serovar sra_center      strain  sub_species     tax-id"
+
+
+
+
+	splitMetadata = metadata.split("\t")
+	for i in range(0, len(split_kraken_rpt_line)):
+			if split_kraken_rpt_line[i] == '':
+					print "don't add to the metadata...split_kraken_rpt_line.pop(i)"
+			elif i%(taxLen) == taxLen-1:
+					print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1"
+					splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip())
+			elif i%(taxLen) == taxIdIdx:
+					print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx"
+					splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
+			elif i%(taxLen) == 0:
+					print "i is "+str(i)+"under this condition: i%(taxLen) == 0"
+					splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
+
+			splitMetadataLength = len(splitMetadata)
+			first_kraken_percentage = splitMetadataLength-15
+	with open(outputFile, 'wb') as f:
+			#writer = csv.writer(f, delimiter="\t")
+
+			#         print splitMetadata
+			for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+					print "Split metadata of i "+splitMetadata[i]
+					for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+							if float(splitMetadata[j]) > float(splitMetadata[i]):
+									print "i: "+str(i)+" and j: "+str(j)
+									print "swapping "+splitMetadata[i]+" with "+splitMetadata[j]
+									old_hit_org = splitMetadata[i-1]
+									old_hit_percentage = splitMetadata[i]
+									old_hit_tx_id = splitMetadata[i+1]
+
+									splitMetadata[i-1] = splitMetadata[j-1]
+									splitMetadata[i] = splitMetadata[j]
+									splitMetadata[i+1] = splitMetadata[j+1]
+
+									splitMetadata[j-1] = old_hit_org
+									splitMetadata[j] = old_hit_percentage
+									splitMetadata[j+1] = old_hit_tx_id
+
+	df = pandas.DataFrame(splitMetadata)
+
+	print df
+
+
+	# Here aree dataframe collumns for !
+
+	print df.iat[0,0]
+	print df.iat[1,0]
+	print df.iat[3,0]
+	print df.iat[4,0]
+	print df.iat[6,0]
+	print df.iat[7,0]
+	print df.iat[9,0]
+	print df.iat[10,0]
+
+	OrganismOne = df.iat[0,0]
+	OrganismTwo =  df.iat[3,0]
+	OrganismThree = df.iat[6,0]
+	OrganismFour =  df.iat[9,0]
+
+
+	OrganismOnePer = df.iat[1,0]
+	OrganismTwoPer = df.iat[4,0]
+	OrganismThreePer = df.iat[7,0]
+	OrganismFourPer = df.iat[10,0]
+
+
+	df = {'OrganismOne':OrganismOne,
+			'OrganismTwo':OrganismTwo,
+			'OrganismThree':OrganismThree,
+			'OrganismFour':OrganismFour,
+			'OrganismOne%':OrganismOnePer,
+			'OrganismTwo%':OrganismTwoPer,
+			'OrganismThree%':OrganismThreePer,
+			'OrganismFour%':OrganismFourPer }
+
+
+	#Clean up files
+
+	command = ""
+	command += "rm " + krakenoutoutpath + krakenout
+
+	print command
+
+	check_call(command, shell=True)
+
+
+
+	command = ""
+	command += "rm " + outputFile
+
+	print command
+
+	return pandas.DataFrame(df, index=[0])
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_plot.py	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,148 @@
+import subprocess
+from qa_core import new_output_file, qa_params
+from collections import defaultdict
+import warnings
+warnings.filterwarnings("ignore")
+import pandas
+from pandas import DataFrame
+import matplotlib.pyplot as plot
+from numpy import zeros, add, mean, array, percentile, arange
+import traceback
+from gims_qa_tests import cluster
+import logging
+
+bases = ("A", "T", "G", "C", "N")
+
+def qa_plots(df, annotation, log=logging.getLogger('qa.plots'), **kwargs):
+	#log.debug(df)
+	#log.debug(annotation)
+	plot_quality_histo(df['qualityHisto'], annotation, **kwargs)
+	plot_nucleotide_distribution_histo(df['nucleotideHisto'], annotation)
+	plot_quality_distribution_histo(df['baseQScoreHisto'], annotation)
+	plot_base_balance(df, annotation, **kwargs)
+	#plot.show()
+	
+def plot_base_balance(df, 
+					  annotation=qa_params.get('annotation', ''),
+					  threshold=qa_params.get('permitted base balance variance', .2), 
+					  **kwargs):
+	try:
+		plot.figure()
+		plot.axis((0,2,0,2))
+		plot.suptitle("{} Base Balance".format(annotation))
+		plot.xlabel("G/C")
+		plot.ylabel("A/T")
+		thres_min = 1 - threshold
+		thres_max = 1 + threshold
+		for coord in (thres_min, thres_max):
+			for linef in (plot.axvline, plot.axhline):
+				linef(coord, linestyle='dashed', label=str(coord))
+				
+		plot.annotate(thres_max, xy=(0, thres_max))
+		plot.annotate(thres_min, xy=(0, thres_min))
+		plot.annotate(thres_max, xy=(thres_max, 0))
+		plot.annotate(thres_min, xy=(thres_min, 0))
+				
+		a, t, g, c = [sum(df['nucleotideHisto'][b]) for b in ('A', 'T', 'G', 'C')]
+		y, x = a / float(t), g / float(c)
+		plot.plot((x,), (y,), marker="+")
+		plot.annotate("({:.2},{:.2})".format(x, y), xy=(x+.05, y+.05), textcoords='data')
+		with new_output_file("base_balance.png") as base_bal:
+			plot.savefig(base_bal, format='png', dpi=92)
+	except Exception:
+		traceback.print_exc()
+	
+	
+def plot_quality_histo(histo,
+					   annotation=qa_params.get('annotation', ''),
+					   q=qa_params.get('user Q threshold', 27)):
+	try:
+		plot.figure()
+		labels = range(max(histo.keys()) + 1)
+ 		plot.bar(left=labels,
+ 				 height=[histo[n] for n in labels],
+ 				 color='y',
+ 				 linewidth=1)
+# 		plot.hist(histo, bins=range(max(histo)))
+		plot.xticks()
+		plot.axvline(x=q, label="Q {}".format(q))
+		#plot.show()
+		plot.xlabel(r"""$Q (-10 log_(10) \rho)$""")
+		plot.ylabel('Number of Bases')
+		plot.suptitle('{} Q-Score Histogram'.format(annotation))
+		plot.legend(loc='upper left', fontsize='x-small')
+		with new_output_file("qscore_histogram.png") as qual_histogram:
+			plot.savefig(qual_histogram, format='png', dpi=92)
+# 	except ValueError:
+# 		pass	
+	except Exception:
+		traceback.print_exc()
+	
+def plot_nucleotide_distribution_histo(dataframe,
+									   annotation=qa_params.get('annotation', '')):
+	try:
+		#dataframe = defaultdict(list)
+		colors = iter(('r', 'y', 'g', 'b', 'pink'))
+		plot.figure()
+		last_base = zeros((len(dataframe.values()[0]),)) #set all zeros for bottom of first bar row
+		for base in bases:
+			try:
+				plot.bar(left=range(len(dataframe[base])), 
+									height=dataframe[base], 
+									width=2, 
+									bottom=last_base, 
+									color=colors.next(), 
+									label=base, 
+									linewidth=0)
+				last_base = add(last_base, dataframe[base])
+			except:
+				#print [len(d) for d in dataframe.data.values()]
+				raise
+		plot.autoscale(tight=True)
+		plot.xlabel('Base Position')
+		plot.ylabel('Nucleotide Count')
+		plot.suptitle('{} Nucleotide Distribution Plot'.format(annotation))
+		plot.legend(loc='upper right', fontsize='xx-small')
+		with new_output_file("nucleotide_distribution.png") as nuc_histogram:
+			plot.savefig(nuc_histogram, format='png', dpi=108)
+	except Exception:
+		traceback.print_exc()
+	
+def plot_quality_distribution_histo(histo, 
+									annotation=qa_params.get('annotation', ''),
+									q=qa_params.get('user Q threshold', 27)
+									):
+	try:
+		binning = 20
+		plot.figure(figsize=(12, 6))
+		vec = [tuple(c.elements()) for c in histo]
+		means = [mean(t) for t in vec]
+		upper_iqr = array([percentile(t, 75) for t in vec])
+		lower_iqr = array([percentile(t, 25) for t in vec])
+# 		plot.boxplot(vec, 
+# 					 widths=.000005,
+# 					 whis=(5,95), 
+# 					 showfliers=False, 
+# 					 showbox=True, 
+# 					 showmeans=True,
+# 					 meanline=True, 
+# 					 showcaps=True,
+# 					 whiskerprops=dict(linestyle='-', color='0.60')
+# 					 )
+		
+		plot.fill_between(arange(len(upper_iqr)), upper_iqr, lower_iqr, facecolor='0.50', alpha=0.5, label='interquartile range')
+		plot.plot(means, 'r-', label='Mean')
+		plot.plot(upper_iqr, 'b-', label='interquartile range', alpha=0.25)
+		plot.autoscale(tight=True)
+		plot.axhline(y=q, label="Q {}".format(q))
+		plot.xlabel('Base Position')
+		plot.ylabel(r"""$Q (-10 log_(10) \rho)$""")
+		plot.xticks(range(0, len(histo), binning), range(0, len(histo), binning), rotation='vertical')
+		plot.suptitle('{} Q-Score Distribution Plot'.format(annotation))
+		plot.legend(loc='lower left', fontsize='x-small')
+		plot.grid(True)
+		with new_output_file("qscore_distribution.png") as q_distro:
+			plot.savefig(q_distro, format='png', dpi=108)
+		
+	except Exception:
+		traceback.print_exc()
\ No newline at end of file