cfsan_qaqc: qa_core/gims_qa

annotate qa_core/gims_qa_tests.py @ 0:dafec28bd37e

planemo upload

author	jpayne
date	Tue, 08 May 2018 18:54:20 -0400
parents
children

rev	line source
jpayne@0	1 from qa_core import qa_test, new_output_file, open_input, readfq, qa_params
jpayne@0	2 from collections import namedtuple, defaultdict, Counter
jpayne@0	3 from Bio import SeqIO
jpayne@0	4 from multiprocessing import Pool, Process, cpu_count
jpayne@0	5 from functools import partial
jpayne@0	6 from itertools import izip, count, izip_longest, islice
jpayne@0	7 from tempfile import NamedTemporaryFile, mkdtemp
jpayne@0	8 from subprocess import check_call, CalledProcessError
jpayne@0	9 from numpy import genfromtxt
jpayne@0	10 from os.path import abspath, join, split, splitext
jpayne@0	11 from contextlib import contextmanager
jpayne@0	12 from subprocess import call
jpayne@0	13 from subprocess import check_output
jpayne@0	14 from hashlib import md5
jpayne@0	15
jpayne@0	16
jpayne@0	17 import sys
jpayne@0	18 import shutil
jpayne@0	19 import os
jpayne@0	20 import pandas
jpayne@0	21 import datetime
jpayne@0	22 import re
jpayne@0	23 import glob, string
jpayne@0	24
jpayne@0	25 #Test implementation classes and functions. The actual tests are down at the bottom.
jpayne@0	26
jpayne@0	27 SuperResultTuple = namedtuple("SuperResultTuple", ("readCount", "numBases", "baseQScoreHisto", "nucleotideHisto", "qualityHisto"))
jpayne@0	28
jpayne@0	29 # #Global Variables for kraken methods
jpayne@0	30 # krakendatabase = "/scratch/biostat/kraken/miniDB/"
jpayne@0	31 # we don't actually need this as a global
jpayne@0	32
jpayne@0	33 class ResultTuple(SuperResultTuple):
jpayne@0	34 def __add__(self, other):
jpayne@0	35 r1, n1, h1, nuc1, qual1 = self
jpayne@0	36 r2, n2, h2, nuc2, qual2 = other
jpayne@0	37 return ResultTuple(r1 + r2,
jpayne@0	38 n1 + n2,
jpayne@0	39 [p + q for p, q in izip_longest(h1, h2, fillvalue=Counter())],
jpayne@0	40 [p + q for p, q in izip_longest(nuc1, nuc2, fillvalue=Counter())],
jpayne@0	41 qual1 + qual2)
jpayne@0	42
jpayne@0	43 def __and__(self, other):
jpayne@0	44 r1, n1, h1, nuc1, qual1 = self
jpayne@0	45 r2, n2, h2, nuc2, qual2 = other
jpayne@0	46 return ResultTuple(r1 + r2,
jpayne@0	47 n1 + n2,
jpayne@0	48 h1 + h2,
jpayne@0	49 nuc1 + nuc2,
jpayne@0	50 qual1 + qual2)
jpayne@0	51
jpayne@0	52 @classmethod
jpayne@0	53 def zero(cls):
jpayne@0	54 return cls(0, 0, [], [], Counter())
jpayne@0	55
jpayne@0	56 def to_dataFrame(self):
jpayne@0	57 try:
jpayne@0	58 from pandas import DataFrame
jpayne@0	59 #from_dict = partial(DataFrame.from_dict, DataFrame)
jpayne@0	60 except:
jpayne@0	61 #no pandas
jpayne@0	62 def DataFrame(self, data):
jpayne@0	63 return ResultTuple(**data)
jpayne@0	64
jpayne@0	65 pivotNucHisto = defaultdict(list)
jpayne@0	66 for cnt in self.nucleotideHisto:
jpayne@0	67 for base in ('A', 'T', 'G', 'C', 'N'):
jpayne@0	68 pivotNucHisto[base].append(cnt[base])
jpayne@0	69 pivotQBaseHisto = self.baseQScoreHisto
jpayne@0	70 pivotQualHisto = {}
jpayne@0	71 for q in range(max(self.qualityHisto.keys())):
jpayne@0	72 pivotQualHisto[q] = self.qualityHisto[q]
jpayne@0	73 df = {'readCount':self.readCount,
jpayne@0	74 'numBases':self.numBases,
jpayne@0	75 'baseQScoreHisto':pivotQBaseHisto,
jpayne@0	76 'nucleotideHisto':pivotNucHisto,
jpayne@0	77 'qualityHisto':pivotQualHisto}
jpayne@0	78 return df
jpayne@0	79
jpayne@0	80 class MergedResultTuple():
jpayne@0	81 "Merged multi-file results object. You can iterate on it for filename and ResultTuple pairs, or call its attributes for combined results"
jpayne@0	82 def __init__(self, **kwargs):
jpayne@0	83 self.per_file = {}
jpayne@0	84 self.per_file.update(kwargs)
jpayne@0	85
jpayne@0	86 def __getattr__(self, name):
jpayne@0	87 return getattr(reduce(lambda p, q: p & q, [v for k, v in sorted(self.per_file.items())], ResultTuple.zero()), name)
jpayne@0	88
jpayne@0	89 def __iter__(self):
jpayne@0	90 return iter(self.per_file.items())
jpayne@0	91
jpayne@0	92
jpayne@0	93 def analyze_batch_reads(reads):
jpayne@0	94 readCount = 0
jpayne@0	95 numBases = 0
jpayne@0	96 baseQScoreHisto = []
jpayne@0	97 nucleotideHisto = []
jpayne@0	98 qualityHisto = Counter()
jpayne@0	99 for name, seq, quals in reads:
jpayne@0	100 qualityHisto.update(quals)
jpayne@0	101 def scan_a_read(read):
jpayne@0	102 #for base, qual, index in zip(seq, quals, count(0)):
jpayne@0	103 base, qual, index = read
jpayne@0	104 try:
jpayne@0	105 nucleotideHisto[index][base] += 1
jpayne@0	106 except IndexError:
jpayne@0	107 nucleotideHisto.append(Counter(base))
jpayne@0	108 try:
jpayne@0	109 baseQScoreHisto[index][qual] += 1
jpayne@0	110 except IndexError:
jpayne@0	111 baseQScoreHisto.append(Counter({qual:1}))
jpayne@0	112 return index
jpayne@0	113 m = map(scan_a_read, zip(seq, quals, count(0))) #numBases is the last index of the read
jpayne@0	114 #print(name, m[:3])
jpayne@0	115 numBases += m[-1]
jpayne@0	116 readCount += 1
jpayne@0	117 r = ResultTuple(readCount, numBases, baseQScoreHisto, nucleotideHisto, qualityHisto)
jpayne@0	118 return r
jpayne@0	119
jpayne@0	120
jpayne@0	121 def cluster(slicable, n=2):
jpayne@0	122 "return an n-tuple of iterators over an in-memory sequence"
jpayne@0	123 #determine the offset size
jpayne@0	124 offset = (len(slicable) / n) + 1
jpayne@0	125 #return tuple([islice(slicable, i, i+offset) for i in range(n)])
jpayne@0	126 return tuple([tuple(slicable[ioffset:(i+1)offset-1]) for i in range(n)])
jpayne@0	127
jpayne@0	128
jpayne@0	129 def tupleize_parse(fileh, format):
jpayne@0	130 for read in SeqIO.parse(fileh, format):
jpayne@0	131 yield (read.id, str(read.seq), read.letter_annotations['phred_quality'])
jpayne@0	132
jpayne@0	133 #@profile
jpayne@0	134 def analyze_shortread(seqFile, map_f=map, multi=qa_params.get('parallelism', cpu_count())):
jpayne@0	135 "analyze one file whose reads are typical short-read technology"
jpayne@0	136 if 'sff' in seqFile:
jpayne@0	137 format, quality, mode = 'sff', 'phred_quality', 'rb'
jpayne@0	138 rds_f = partial(tupleize_parse, format=format)
jpayne@0	139 else:
jpayne@0	140 format, quality, mode = 'fastq', 'phred_quality', 'rU'
jpayne@0	141 rds_f = readfq
jpayne@0	142 with open_input(seqFile) as seqFileH:
jpayne@0	143 rds = list(rds_f(seqFileH))
jpayne@0	144 rds = cluster(rds, multi)
jpayne@0	145 result = sum(map_f(analyze_batch_reads, rds), ResultTuple.zero())
jpayne@0	146 return result
jpayne@0	147
jpayne@0	148
jpayne@0	149 def analyze_longread(seqFile, map_f=map):
jpayne@0	150 "analyze one file from pacbio or nanopore"
jpayne@0	151
jpayne@0	152 return ResultTuple.zero()
jpayne@0	153 #what to do here? if nanopore, plot 2d/1d length vs accuracy
jpayne@0	154
jpayne@0	155
jpayne@0	156 def panic(a, *k):
jpayne@0	157 raise ValueError("No analysis method for this sequencing type")
jpayne@0	158
jpayne@0	159 config = {'Roche 454':analyze_shortread,
jpayne@0	160 'Illumina MiSeq':analyze_shortread,
jpayne@0	161 'Illumina NextSeq':analyze_shortread,
jpayne@0	162 'Illumina HiSeq':analyze_shortread,
jpayne@0	163 'IonTorrent PGM':analyze_shortread,
jpayne@0	164 'PacBio RS':analyze_longread,
jpayne@0	165 'Oxford Nanopore MiniION':analyze_longread}
jpayne@0	166
jpayne@0	167
jpayne@0	168 class QaMemo(object):
jpayne@0	169 "lazy-evaluated QA operation"
jpayne@0	170
jpayne@0	171 result = None #named tuple results object
jpayne@0	172
jpayne@0	173 @classmethod
jpayne@0	174 def analyze(cls,
jpayne@0	175 sequenceFiles,
jpayne@0	176 sequencingType,
jpayne@0	177 map_f=qa_params.get('map implementation', map),
jpayne@0	178 plots=qa_params.get('plots', True),
jpayne@0	179 annotation=qa_params.get('annotation', '')):
jpayne@0	180 if type(sequenceFiles) == str: #if this is a single file
jpayne@0	181 sequenceFiles = [sequenceFiles]
jpayne@0	182 analysis = partial(config.get(sequencingType, panic), map_f=map_f)
jpayne@0	183 # result = reduce(lambda a, b: a & b, map(analysis, sequenceFiles), ResultTuple.zero())
jpayne@0	184 result = MergedResultTuple(**{fi:analysis(fi) for fi in sequenceFiles})
jpayne@0	185 if plots:
jpayne@0	186 try:
jpayne@0	187 from gims_qa_plot import qa_plots
jpayne@0	188 result_df = result.to_dataFrame()
jpayne@0	189 qa_plots(result_df, annotation)
jpayne@0	190 except Exception:
jpayne@0	191 import traceback
jpayne@0	192 traceback.print_exc()
jpayne@0	193 return result
jpayne@0	194
jpayne@0	195 @classmethod
jpayne@0	196 def of(cls, sequenceFiles, sequencingType):
jpayne@0	197 try:
jpayne@0	198 if not cls.result:
jpayne@0	199 cls.result = QaMemo.analyze(sequenceFiles, sequencingType)
jpayne@0	200 return cls.result
jpayne@0	201 except TypeError as e:
jpayne@0	202 raise ValueError(str(e)), None, sys.exc_info()[2]
jpayne@0	203
jpayne@0	204
jpayne@0	205 TaxTuple = namedtuple("TaxTuple", ("taxon", "taxon_prevalence", "contaminant", "contaminant_prevalence", "checksum"))
jpayne@0	206
jpayne@0	207
jpayne@0	208 class TaxMemo(object):
jpayne@0	209 "lazy-evaluated Tax check operation"
jpayne@0	210
jpayne@0	211 result = None #
jpayne@0	212
jpayne@0	213 @classmethod
jpayne@0	214 def analyze(cls, sequenceFiles, database):
jpayne@0	215 from gims_qa_kraken import run_kraken
jpayne@0	216 return run_kraken(sequenceFiles, database, dict(preload=None,
jpayne@0	217 threads=qa_params.get('parallelism', 1)))
jpayne@0	218
jpayne@0	219 # ## print cls
jpayne@0	220 # ## print sequenceFiles
jpayne@0	221 # from gims_qa_kraken import runKrakenSeqTest
jpayne@0	222
jpayne@0	223 # #hash the kraken db
jpayne@0	224 # #this isn't right yet, the db is a directory
jpayne@0	225 # db_hash = md5(open(database, 'rb').read()).hexdigest()
jpayne@0	226
jpayne@0	227
jpayne@0	228 # #Run Kraken on the current sequence and collect result and pass to TaxTuple to be written to xml
jpayne@0	229 # KrakenResult = runKrakenSeqtest(sequenceFiles,krakendatabase)
jpayne@0	230
jpayne@0	231 # #do this more readably lines with list comprehensions
jpayne@0	232 # org_headers = ('OrganismTwo', 'OrganismThree', 'OrganismFour', 'OrganismFive')
jpayne@0	233 # org_percent_headers = ('OrganismTwo%', 'OrganismThree%', 'OrganismFour%', 'OrganismFive%')
jpayne@0	234 # Organisms = [KrakenResult[h].iloc[0] for h in org_headers]
jpayne@0	235 # OrganismsPer = [KrakenResult[h].iloc[0] for h in org_percent_headers]
jpayne@0	236
jpayne@0	237
jpayne@0	238 # return TaxTuple(KrakenResult['OrganismOne'].iloc[0], KrakenResult['OrganismOne%'].iloc[0], Organisms, OrganismsPer, db_hash)
jpayne@0	239
jpayne@0	240 @classmethod
jpayne@0	241 def of(cls, args, *kwargs):
jpayne@0	242 if not cls.result:
jpayne@0	243 cls.result = TaxMemo.analyze(args, *kwargs)
jpayne@0	244 return cls.result
jpayne@0	245
jpayne@0	246 failure_mode = ""
jpayne@0	247
jpayne@0	248 def update_failure_mode(failure):
jpayne@0	249 global failure_mode
jpayne@0	250 if failure_mode:
jpayne@0	251 failure_mode += "; {}".format(failure)
jpayne@0	252 else:
jpayne@0	253 failure_mode = failure
jpayne@0	254
jpayne@0	255
jpayne@0	256 ##########################################################################################
jpayne@0	257 #
jpayne@0	258 # START HERE to design and configure the tests ----------
jpayne@0	259 #
jpayne@0	260 ##########################################################################################
jpayne@0	261
jpayne@0	262
jpayne@0	263
jpayne@0	264
jpayne@0	265 @qa_test(disable=True)
jpayne@0	266 def parametersUsed(sequencingType='not given',
jpayne@0	267 coverageThreshold='not given',
jpayne@0	268 approximateGenomeSize=4500000):
jpayne@0	269 "Simple function to save the test and evaluation parameters used"
jpayne@0	270 import xml.etree.ElementTree as xml
jpayne@0	271 el = xml.SubElement
jpayne@0	272 root = xml.Element("parametersUsed")
jpayne@0	273 for key, value in qa_params.items():
jpayne@0	274 el(root, "parameter", attrib=dict(key=str(key), value=str(value)))
jpayne@0	275 return root
jpayne@0	276
jpayne@0	277 @qa_test
jpayne@0	278 def readCount(sequenceFileName, sequencingType):
jpayne@0	279 return QaMemo.of(sequenceFileName, sequencingType).readCount
jpayne@0	280
jpayne@0	281 @qa_test
jpayne@0	282 def coverage(sequenceFileName, sequencingType, approximateGenomeSize=4500000):
jpayne@0	283 return QaMemo.of(sequenceFileName, sequencingType).numBases / int(approximateGenomeSize)
jpayne@0	284
jpayne@0	285
jpayne@0	286 # @qa_test(disable=False)
jpayne@0	287 # def coverageQThirtyPlus(sequenceFileName, sequencingType, approximateGenomeSize=4500000, minimumQScore=qa_params.get('user Q threshold', 27.0)):
jpayne@0	288 # numOverMinQ = 0
jpayne@0	289 # for score, num in QaMemo.of(sequenceFileName, sequencingType).baseQScoreHisto.items():
jpayne@0	290 # if score >= int(minimumQScore):
jpayne@0	291 # numOverMinQ += num
jpayne@0	292 # return numOverMinQ
jpayne@0	293
jpayne@0	294
jpayne@0	295 def mean(counter):
jpayne@0	296 return float(sum([v * c for v, c in counter.items()])) / float(sum(counter.values()))
jpayne@0	297
jpayne@0	298
jpayne@0	299 @qa_test
jpayne@0	300 def averageQPerFile(sequenceFileName, sequencingType):
jpayne@0	301 return sorted({fi:mean(res.qualityHisto) for fi,res in QaMemo.of(sequenceFileName, sequencingType)}.items())
jpayne@0	302
jpayne@0	303
jpayne@0	304 def averages_exceed(qualityHisto, q):
jpayne@0	305 return mean(qualityHisto) >= q
jpayne@0	306
jpayne@0	307
jpayne@0	308 @qa_test
jpayne@0	309 def qcStatus(sequenceFileName, sequencingType,
jpayne@0	310 coverageThreshold=None,
jpayne@0	311 approximateGenomeSize=4500000,
jpayne@0	312 minimumQScore=qa_params.get('user Q threshold', 27.0),
jpayne@0	313 bbThreshold=qa_params.get('permitted base balance variance', 0.2)):
jpayne@0	314 global failure_mode
jpayne@0	315 results = QaMemo.of(sequenceFileName, sequencingType)
jpayne@0	316 if not all([averages_exceed(r.qualityHisto, minimumQScore) for f, r in results]):
jpayne@0	317 update_failure_mode("qscore below {}".format(minimumQScore))
jpayne@0	318 return "Fail"
jpayne@0	319 a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
jpayne@0	320 at = float(a) / float(t)
jpayne@0	321 gc = float(g) / float(c)
jpayne@0	322 if any((at > 1.0 + bbThreshold, at < 1.0 - bbThreshold, gc > 1.0 + bbThreshold, gc < 1.0 - bbThreshold)):
jpayne@0	323 update_failure_mode("AT or GC base-balance outside of {} threshold".format(bbThreshold))
jpayne@0	324 return "Fail"
jpayne@0	325 if coverageThreshold:
jpayne@0	326 if not (results.numBases / int(approximateGenomeSize)) >= float(coverageThreshold):
jpayne@0	327 update_failure_mode("coverage below {}".format(coverageThreshold))
jpayne@0	328 return "Fail"
jpayne@0	329 return "Pass"
jpayne@0	330 return "Undetermined"
jpayne@0	331
jpayne@0	332
jpayne@0	333
jpayne@0	334
jpayne@0	335 def baseBalance(sequenceFileName, sequencingType):
jpayne@0	336 results = QaMemo.of(sequenceFileName, sequencingType)
jpayne@0	337 a, t, g, c = [sum([n[b] for n in results.nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
jpayne@0	338 return (float(a) / float(t) if t else 1, float(g) / float(c) if c else 1)
jpayne@0	339
jpayne@0	340 @qa_test
jpayne@0	341 def at_gc_baseBalance(sequenceFileName, sequencingType):
jpayne@0	342 a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
jpayne@0	343 at = float(a) / float(t) if t else 1
jpayne@0	344 gc = float(g) / float(c) if c else 1
jpayne@0	345 return "{:02}/{:02}".format(at, gc)
jpayne@0	346
jpayne@0	347 @qa_test(disable=True)
jpayne@0	348 def baseBalance(sequenceFileName, sequencingType, threshold=qa_params.get('permitted base balance variance', 0.2) ):
jpayne@0	349 import xml.etree.ElementTree as xml
jpayne@0	350 el = xml.SubElement
jpayne@0	351 a, t, g, c = [sum([n[b] for n in QaMemo.of(sequenceFileName, sequencingType).nucleotideHisto]) for b in ('A', 'T', 'G', 'C')]
jpayne@0	352 at = float(a) / float(t) if t else 1
jpayne@0	353 gc = float(g) / float(c) if c else 1
jpayne@0	354 status = "Pass"
jpayne@0	355 if any((at > 1.0 + threshold, at < 1.0 - threshold, gc > 1.0 + threshold, gc < 1.0 - threshold)):
jpayne@0	356 status = "Fail"
jpayne@0	357 update_failure_mode("Base balance out of variance threshold ({:.2},{:.2} > {})".format(at, gc, threshold))
jpayne@0	358 root = xml.Element('baseBalanceResult')
jpayne@0	359 el(root, 'AT').text = str(at)
jpayne@0	360 el(root, 'GC').text = str(gc)
jpayne@0	361 el(root, 'baseBalanceStatus').text = status
jpayne@0	362 return root
jpayne@0	363
jpayne@0	364
jpayne@0	365
jpayne@0	366
jpayne@0	367
jpayne@0	368 @qa_test(disable = ~os.path.exists(qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')),
jpayne@0	369 profile=False)
jpayne@0	370 def taxonomyQualityUsingKraken(sequenceFileName,
jpayne@0	371 sequencingType,
jpayne@0	372 genus=None,
jpayne@0	373 species=None,
jpayne@0	374 databasePath=qa_params.get('path to kraken database', '/scratch/biostat/kraken/miniDB')
jpayne@0	375 ):
jpayne@0	376
jpayne@0	377 #print sequenceFileName
jpayne@0	378 #print sequencingType
jpayne@0	379 #print sequenceFileName
jpayne@0	380 #print sequencingType
jpayne@0	381 #print genus
jpayne@0	382 #print species
jpayne@0	383
jpayne@0	384
jpayne@0	385 import xml.etree.ElementTree as xml
jpayne@0	386 global failure_mode
jpayne@0	387 el = xml.SubElement
jpayne@0	388 res = TaxMemo.of(sequenceFileName, databasePath)
jpayne@0	389 #make return element
jpayne@0	390 root = xml.Element('taxonomyQualityResult')
jpayne@0	391 if not genus:
jpayne@0	392 el(root, 'taxStatus').text = "Undetermined"
jpayne@0	393 else:
jpayne@0	394 if genus in res.taxon.split() and species in res.taxon.split():
jpayne@0	395 el(root, 'taxStatus').text = "Pass"
jpayne@0	396 else:
jpayne@0	397 el(root, 'taxStatus').text = "Fail"
jpayne@0	398 update_failure_mode('"{} {}" was not in agreement with calculated ID "{}"'.format(genus, species, res.taxon))
jpayne@0	399 el(root, 'mostPrevalentTaxon', dict(prevalence=str(res.taxon_prevalence))).text = res.taxon[-230:] #last 255 characters
jpayne@0	400
jpayne@0	401 contams = el(root, 'contaminants')
jpayne@0	402 for i in range(4):
jpayne@0	403 el(contams, 'prevalentContaminant', dict(prevalence=str(res.contaminant_prevalence[i]), ordinal=str(i))).text = res.contaminant[i][-230:]
jpayne@0	404 el(root, 'database', dict(checksum=str(res.checksum))).text = databasePath
jpayne@0	405
jpayne@0	406
jpayne@0	407 return root
jpayne@0	408
jpayne@0	409
jpayne@0	410
jpayne@0	411 @qa_test
jpayne@0	412 def reasonForFailure():
jpayne@0	413 global failure_mode
jpayne@0	414 return failure_mode[0:254] #255 character limit in destination DB field
jpayne@0	415
jpayne@0	416 @qa_test(disable=True)
jpayne@0	417 def run_serotype_usingseqsero(
jpayne@0	418 sequenceFileName,
jpayne@0	419 sequencingType):
jpayne@0	420 from gims_qa_serotyping import run_serotype_usingseqsero
jpayne@0	421
jpayne@0	422 seq1 = "" + sequenceFileName[0]
jpayne@0	423 seq2 = "" + sequenceFileName[1]
jpayne@0	424
jpayne@0	425 return run_serotype_usingseqsero(
jpayne@0	426 seq1, seq2,
jpayne@0	427 sequencingType,
jpayne@0	428 Oantigen="Undetermined",
jpayne@0	429 H1antigen="Undetermined",
jpayne@0	430 H2antigen="Undetermined",
jpayne@0	431 AntigenicProfile="Undetermined",
jpayne@0	432 PredictedSerotype="Undetermined")
jpayne@0	433
jpayne@0	434 return root

Mercurial > repos > jpayne > cfsan_qaqc

annotate qa_core/gims_qa_tests.py @ 0:dafec28bd37e