view qa_core/gims_qa_plot.py @ 0:dafec28bd37e

planemo upload
author jpayne
date Tue, 08 May 2018 18:54:20 -0400
parents
children
line wrap: on
line source
import subprocess
from qa_core import new_output_file, qa_params
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")
import pandas
from pandas import DataFrame
import matplotlib.pyplot as plot
from numpy import zeros, add, mean, array, percentile, arange
import traceback
from gims_qa_tests import cluster
import logging

bases = ("A", "T", "G", "C", "N")

def qa_plots(df, annotation, log=logging.getLogger('qa.plots'), **kwargs):
	#log.debug(df)
	#log.debug(annotation)
	plot_quality_histo(df['qualityHisto'], annotation, **kwargs)
	plot_nucleotide_distribution_histo(df['nucleotideHisto'], annotation)
	plot_quality_distribution_histo(df['baseQScoreHisto'], annotation)
	plot_base_balance(df, annotation, **kwargs)
	#plot.show()
	
def plot_base_balance(df, 
					  annotation=qa_params.get('annotation', ''),
					  threshold=qa_params.get('permitted base balance variance', .2), 
					  **kwargs):
	try:
		plot.figure()
		plot.axis((0,2,0,2))
		plot.suptitle("{} Base Balance".format(annotation))
		plot.xlabel("G/C")
		plot.ylabel("A/T")
		thres_min = 1 - threshold
		thres_max = 1 + threshold
		for coord in (thres_min, thres_max):
			for linef in (plot.axvline, plot.axhline):
				linef(coord, linestyle='dashed', label=str(coord))
				
		plot.annotate(thres_max, xy=(0, thres_max))
		plot.annotate(thres_min, xy=(0, thres_min))
		plot.annotate(thres_max, xy=(thres_max, 0))
		plot.annotate(thres_min, xy=(thres_min, 0))
				
		a, t, g, c = [sum(df['nucleotideHisto'][b]) for b in ('A', 'T', 'G', 'C')]
		y, x = a / float(t), g / float(c)
		plot.plot((x,), (y,), marker="+")
		plot.annotate("({:.2},{:.2})".format(x, y), xy=(x+.05, y+.05), textcoords='data')
		with new_output_file("base_balance.png") as base_bal:
			plot.savefig(base_bal, format='png', dpi=92)
	except Exception:
		traceback.print_exc()
	
	
def plot_quality_histo(histo,
					   annotation=qa_params.get('annotation', ''),
					   q=qa_params.get('user Q threshold', 27)):
	try:
		plot.figure()
		labels = range(max(histo.keys()) + 1)
 		plot.bar(left=labels,
 				 height=[histo[n] for n in labels],
 				 color='y',
 				 linewidth=1)
# 		plot.hist(histo, bins=range(max(histo)))
		plot.xticks()
		plot.axvline(x=q, label="Q {}".format(q))
		#plot.show()
		plot.xlabel(r"""$Q (-10 log_(10) \rho)$""")
		plot.ylabel('Number of Bases')
		plot.suptitle('{} Q-Score Histogram'.format(annotation))
		plot.legend(loc='upper left', fontsize='x-small')
		with new_output_file("qscore_histogram.png") as qual_histogram:
			plot.savefig(qual_histogram, format='png', dpi=92)
# 	except ValueError:
# 		pass	
	except Exception:
		traceback.print_exc()
	
def plot_nucleotide_distribution_histo(dataframe,
									   annotation=qa_params.get('annotation', '')):
	try:
		#dataframe = defaultdict(list)
		colors = iter(('r', 'y', 'g', 'b', 'pink'))
		plot.figure()
		last_base = zeros((len(dataframe.values()[0]),)) #set all zeros for bottom of first bar row
		for base in bases:
			try:
				plot.bar(left=range(len(dataframe[base])), 
									height=dataframe[base], 
									width=2, 
									bottom=last_base, 
									color=colors.next(), 
									label=base, 
									linewidth=0)
				last_base = add(last_base, dataframe[base])
			except:
				#print [len(d) for d in dataframe.data.values()]
				raise
		plot.autoscale(tight=True)
		plot.xlabel('Base Position')
		plot.ylabel('Nucleotide Count')
		plot.suptitle('{} Nucleotide Distribution Plot'.format(annotation))
		plot.legend(loc='upper right', fontsize='xx-small')
		with new_output_file("nucleotide_distribution.png") as nuc_histogram:
			plot.savefig(nuc_histogram, format='png', dpi=108)
	except Exception:
		traceback.print_exc()
	
def plot_quality_distribution_histo(histo, 
									annotation=qa_params.get('annotation', ''),
									q=qa_params.get('user Q threshold', 27)
									):
	try:
		binning = 20
		plot.figure(figsize=(12, 6))
		vec = [tuple(c.elements()) for c in histo]
		means = [mean(t) for t in vec]
		upper_iqr = array([percentile(t, 75) for t in vec])
		lower_iqr = array([percentile(t, 25) for t in vec])
# 		plot.boxplot(vec, 
# 					 widths=.000005,
# 					 whis=(5,95), 
# 					 showfliers=False, 
# 					 showbox=True, 
# 					 showmeans=True,
# 					 meanline=True, 
# 					 showcaps=True,
# 					 whiskerprops=dict(linestyle='-', color='0.60')
# 					 )
		
		plot.fill_between(arange(len(upper_iqr)), upper_iqr, lower_iqr, facecolor='0.50', alpha=0.5, label='interquartile range')
		plot.plot(means, 'r-', label='Mean')
		plot.plot(upper_iqr, 'b-', label='interquartile range', alpha=0.25)
		plot.autoscale(tight=True)
		plot.axhline(y=q, label="Q {}".format(q))
		plot.xlabel('Base Position')
		plot.ylabel(r"""$Q (-10 log_(10) \rho)$""")
		plot.xticks(range(0, len(histo), binning), range(0, len(histo), binning), rotation='vertical')
		plot.suptitle('{} Q-Score Distribution Plot'.format(annotation))
		plot.legend(loc='lower left', fontsize='x-small')
		plot.grid(True)
		with new_output_file("qscore_distribution.png") as q_distro:
			plot.savefig(q_distro, format='png', dpi=108)
		
	except Exception:
		traceback.print_exc()