diff qa_core/gims_qa_kraken.py @ 0:dafec28bd37e

planemo upload
author jpayne
date Tue, 08 May 2018 18:54:20 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_kraken.py	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,217 @@
+import csv
+import datetime
+import os.path
+import re
+import tempfile
+from collections import Counter
+from hashlib import md5
+from subprocess import check_call, check_output
+
+import pandas
+
+from gims_qa_tests import TaxTuple
+from qa_core import present
+
+def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
+	"kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
+
+	#hash the main kraken db file so results can be traced back to the database build used
+	md5hash = md5()
+	with open(os.path.join(database, 'database.kdb'), 'rb') as db_file:
+		md5hash.update(db_file.read())
+
+	db_hash = md5hash.hexdigest()
+
+
+	with tempfile.NamedTemporaryFile('rU') as kraken_out:
+		with tempfile.NamedTemporaryFile() as seq_data:
+			#cat all the files together
+			check_call("cat {files} > {seq_data.name}".format(
+				files = " ".join([present(f) for f in sequenceFiles]),
+				seq_data = seq_data
+				), 
+			shell=True)
+
+			def make_option(key, value):
+				if value is not None:
+					return "--{} {}".format(key, value)
+				return "--{}".format(key)
+
+			#build options
+			options = " ".join([make_option(k,v) for k,v in kraken_options.items()])
+
+			#call kraken
+			check_call("kraken {options} "
+					"--db {database} "
+					"--gzip-compressed "
+					"--fastq {seq_data.name} "
+					"--output {kraken_out.name}".format(**locals()), shell=True)
+
+			counts = Counter()
+
+			with tempfile.NamedTemporaryFile('rU') as kraken_report:
+				check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
+				rows = csv.reader(kraken_report, delimiter='\t')
+				for seq_id, taxon in rows:
+					counts[taxon] += 1.0
+					counts["total"] += 1.0
+
+				#clean up all the temp files by releasing contexts
+	
+	top_results = counts.most_common(6) #total, plus five most common taxa
+	total, total_count = top_results.pop(0)
+	top_hit, top_hit_count = top_results.pop(0)
+	next_hits = [h[0].replace(';', ' ') for h in top_results]
+	next_hit_counts = [h[1] for h in top_results]
+
+	top_hit_frac = top_hit_count / total_count
+	next_hit_frac = [h / total_count for h in next_hit_counts]
+
+	return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
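+
+# Example use of run_kraken (a sketch; the read file names and db path are hypothetical):
+#
+#   tax = run_kraken(["sample_R1.fastq.gz", "sample_R2.fastq.gz"],
+#                    "/path/to/kraken_db",
+#                    kraken_options=dict(preload=None, threads=4))
+#
+# The result is a TaxTuple carrying the top hit, its share of the translated reads,
+# the next four hits and their shares, and the md5 of the database.kdb that was searched.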
+
+def runKrakenSeqtest(sequenceFiles, database):
+
+	krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
+
+	w = datetime.datetime.now()
+
+	krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
+	krakenreport = krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
+
+	command = "kraken --preload "
+	command += "--db " + database + ' '
+	#if len(fastqPaths) > 1:
+	command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1])) 
+	#  command += ' '.join(fastqPaths)
+	command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout
+	
+	print command
+
+	check_call(command, shell=True)
+	print "%s -- Kraken called..." % datetime.datetime.now()
+
+
+	kraken_report_cmd = "kraken-report --db "
+	kraken_report_cmd += database
+	kraken_report_cmd += " " + krakenoutoutpath + krakenout
+	kraken_report_cmd += " > " + krakenreport
+	
+	print kraken_report_cmd
+
+	check_call(kraken_report_cmd, shell=True)
+
+
+
+
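+	# kraken-report lines have six tab-separated fields: percentage of reads, reads in the
+	# clade, reads assigned directly, rank code, NCBI tax id, and scientific name. Keep only
+	# species-level lines (rank code "S"), drop any line containing a "-", and take the first five.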
+	parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
+
+	kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
+
+	split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
+
+	print "=====================" + krakenreport + "=========================="
+	print split_kraken_rpt_line
+	split_kraken_rpt_length = len(split_kraken_rpt_line)
+	print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4]
+	taxLen = 6
+	taxIdIdx = 4
+	outputFile = krakenoutoutpath + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv'
+
+	# Metadata header fields (tab-separated); the top kraken hits are spliced into this list below.
+	metadata = "\t".join(["Platform", "Run", "asm_stats_contig_n50", "asm_stats_length_bp",
+		"asm_stats_n_contig", "attribute_package", "bioproject_acc", "bioproject_center",
+		"bioproject_title", "biosample_acc", "collected_by", "sample_name", "scientific_name",
+		"serovar", "sra_center", "strain", "sub_species", "tax-id"])
+
+	splitMetadata = metadata.split("\t")
+	for i in range(0, len(split_kraken_rpt_line)):
+		if split_kraken_rpt_line[i] == '':
+			print "empty field at index %d; not added to the metadata" % i
+		elif i % taxLen == taxLen - 1:
+			# organism name column
+			splitMetadata.insert(len(splitMetadata) - 3, split_kraken_rpt_line[i].strip())
+		elif i % taxLen == taxIdIdx:
+			# NCBI tax id column
+			splitMetadata.insert(len(splitMetadata) - 1, split_kraken_rpt_line[i].strip())
+		elif i % taxLen == 0:
+			# percentage-of-reads column
+			splitMetadata.insert(len(splitMetadata) - 1, split_kraken_rpt_line[i].strip())
+
+	splitMetadataLength = len(splitMetadata)
+	first_kraken_percentage = splitMetadataLength - 15
+	with open(outputFile, 'wb') as f:
+			# Reorder the inserted kraken hits by percentage with a simple exchange sort over
+			# (organism, percentage, tax id) triples.
+			for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+					print "Split metadata of i "+splitMetadata[i]
+					for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
+							if float(splitMetadata[j]) > float(splitMetadata[i]):
+									print "i: "+str(i)+" and j: "+str(j)
+									print "swapping "+splitMetadata[i]+" with "+splitMetadata[j]
+									old_hit_org = splitMetadata[i-1]
+									old_hit_percentage = splitMetadata[i]
+									old_hit_tx_id = splitMetadata[i+1]
+
+									splitMetadata[i-1] = splitMetadata[j-1]
+									splitMetadata[i] = splitMetadata[j]
+									splitMetadata[i+1] = splitMetadata[j+1]
+
+									splitMetadata[j-1] = old_hit_org
+									splitMetadata[j] = old_hit_percentage
+									splitMetadata[j+1] = old_hit_tx_id
+
+	df = pandas.DataFrame(splitMetadata)
+
+	print df
+
+
+	# Pull the organism names and percentages out of the single-column frame by position.
+
+	print df.iat[0,0]
+	print df.iat[1,0]
+	print df.iat[3,0]
+	print df.iat[4,0]
+	print df.iat[6,0]
+	print df.iat[7,0]
+	print df.iat[9,0]
+	print df.iat[10,0]
+
+	OrganismOne = df.iat[0,0]
+	OrganismTwo =  df.iat[3,0]
+	OrganismThree = df.iat[6,0]
+	OrganismFour =  df.iat[9,0]
+
+
+	OrganismOnePer = df.iat[1,0]
+	OrganismTwoPer = df.iat[4,0]
+	OrganismThreePer = df.iat[7,0]
+	OrganismFourPer = df.iat[10,0]
+
+
+	results = {'OrganismOne': OrganismOne,
+			'OrganismTwo': OrganismTwo,
+			'OrganismThree': OrganismThree,
+			'OrganismFour': OrganismFour,
+			'OrganismOne%': OrganismOnePer,
+			'OrganismTwo%': OrganismTwoPer,
+			'OrganismThree%': OrganismThreePer,
+			'OrganismFour%': OrganismFourPer}
+
+
+	#Clean up files
+
+	command = ""
+	command += "rm " + krakenoutoutpath + krakenout
+
+	print command
+
+	check_call(command, shell=True)
+
+
+
+	command = ""
+	command += "rm " + outputFile
+
+	print command
+
+	return pandas.DataFrame(df, index=[0])
\ No newline at end of file