diff qa_core/gims_qa_kraken.py @ 0:dafec28bd37e
planemo upload
author    jpayne
date      Tue, 08 May 2018 18:54:20 -0400
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qa_core/gims_qa_kraken.py	Tue May 08 18:54:20 2018 -0400
@@ -0,0 +1,217 @@
+import csv
+import datetime
+import re
+import os.path
+import tempfile
+from collections import Counter, OrderedDict
+from hashlib import md5
+from subprocess import check_call, check_output
+
+import pandas
+
+from gims_qa_tests import TaxTuple
+from qa_core import present  # present() is referenced in run_kraken below
+
+def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
+    "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
+
+    # hash the kraken db file so the result records which database produced it
+    md5hash = md5()
+    #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None)
+    md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read())
+
+    db_hash = md5hash.hexdigest()
+
+    with tempfile.NamedTemporaryFile('rU') as kraken_out:
+        with tempfile.NamedTemporaryFile() as seq_data:
+            # cat all the sequence files together
+            check_call("cat {files} > {seq_data.name}".format(
+                files=" ".join([present(f) for f in sequenceFiles]),
+                seq_data=seq_data),
+                shell=True)
+
+            def make_option(key, value):
+                "Render a dict entry as a CLI flag; a None value means a valueless flag."
+                if value is not None:
+                    return "--{} {}".format(key, value)
+                return "--{}".format(key)
+
+            # build options
+            options = " ".join([make_option(k, v) for k, v in kraken_options.items()])
+
+            # call kraken
+            check_call("kraken {options} "
+                       "--db {database} "
+                       "--gzip-compressed "
+                       "--fastq {seq_data.name} "
+                       "--output {kraken_out.name}".format(**locals()), shell=True)
+
+        counts = Counter()
+
+        with tempfile.NamedTemporaryFile('rU') as kraken_report:
+            check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
+            rows = csv.reader(kraken_report, delimiter='\t')
+            for seq_id, taxon in rows:
+                counts[taxon] += 1.0
+                counts["total"] += 1.0
+
+    # temp files are cleaned up when the contexts are released
+
+    top_results = counts.most_common(6)  # total, plus five most common taxa
+    total, total_count = top_results.pop(0)
+    top_hit, top_hit_count = top_results.pop(0)
+    next_hits = [h[0].replace(';', ' ') for h in top_results]
+    next_hit_counts = [h[1] for h in top_results]
+
+    top_hit_frac = top_hit_count / total_count
+    next_hit_frac = [h / total_count for h in next_hit_counts]
+
+    return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
+
+
+def runKrakenSeqtest(sequenceFiles, database):
+
+    krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
+
+    w = datetime.datetime.now()
+
+    krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
+    krakenreport = krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
+
+    # build the kraken command for a paired-end run
+    command = "kraken --preload "
+    command += "--db " + database + ' '
+    #if len(fastqPaths) > 1:
+    command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1]))
+    # command += ' '.join(fastqPaths)
+    command += " --output " + krakenoutoutpath + krakenout
+
+    print command
+
+    check_call(command, shell=True)
+    print "%s -- Kraken called..." % datetime.datetime.now()
+
+    # generate the standard kraken report
+    kraken_report_cmd = "kraken-report --db "
+    kraken_report_cmd += database
+    kraken_report_cmd += " " + krakenoutoutpath + krakenout
+    kraken_report_cmd += " > " + krakenreport
+
+    print kraken_report_cmd
+
+    check_call(kraken_report_cmd, shell=True)
+
+    # keep the five most abundant species-level lines of the report
+    parsed_kraken_rpt = "cat " + krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
+
+    kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
+
+    split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
+
+    print "=====================" + krakenreport + "=========================="
+    print split_kraken_rpt_line
+    split_kraken_rpt_length = len(split_kraken_rpt_line)
+    print split_kraken_rpt_line[5] + " " + split_kraken_rpt_line[0] + " " + split_kraken_rpt_line[4]
+    taxLen = 6    # fields per kraken-report line after splitting
+    taxIdIdx = 4  # index of the NCBI tax id within each report line
+    outputFile = krakenoutoutpath + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv'
+
+    # tab-delimited metadata header row
+    metadata = ("Platform\tRun\tasm_stats_contig_n50\tasm_stats_length_bp\tasm_stats_n_contig\t"
+                "attribute_package\tbioproject_acc\tbioproject_center\tbioproject_title\tbiosample_acc\t"
+                "collected_by\tsample_name\tscientific_name\tserovar\tsra_center\tstrain\tsub_species\ttax-id")
+
+    splitMetadata = metadata.split("\t")
+    for i in range(0, len(split_kraken_rpt_line)):
+        if split_kraken_rpt_line[i] == '':
+            print "don't add to the metadata...split_kraken_rpt_line.pop(i)"
+        elif i % taxLen == taxLen - 1:
+            print "i is " + str(i) + " under this condition: i%(taxLen) == taxLen-1"
+            splitMetadata.insert(len(splitMetadata) - 3, split_kraken_rpt_line[i].strip())
+        elif i % taxLen == taxIdIdx:
+            print "i is " + str(i) + " under this condition: i%(taxLen) == taxIdIdx"
+            splitMetadata.insert(len(splitMetadata) - 1, split_kraken_rpt_line[i].strip())
+        elif i % taxLen == 0:
+            print "i is " + str(i) + " under this condition: i%(taxLen) == 0"
+            splitMetadata.insert(len(splitMetadata) - 1, split_kraken_rpt_line[i].strip())
+
+    splitMetadataLength = len(splitMetadata)
+    first_kraken_percentage = splitMetadataLength - 15
+    with open(outputFile, 'wb') as f:
+        #writer = csv.writer(f, delimiter="\t")
+
+        # reorder the (organism, percentage, tax id) triples by percentage
+        # print splitMetadata
+        for i in xrange(splitMetadataLength - 3, first_kraken_percentage - 1, -3):
+            print "Split metadata of i " + splitMetadata[i]
+            for j in xrange(splitMetadataLength - 3, first_kraken_percentage - 1, -3):
+                if float(splitMetadata[j]) > float(splitMetadata[i]):
+                    print "i: " + str(i) + " and j: " + str(j)
+                    print "swapping " + splitMetadata[i] + " with " + splitMetadata[j]
+                    old_hit_org = splitMetadata[i - 1]
+                    old_hit_percentage = splitMetadata[i]
+                    old_hit_tx_id = splitMetadata[i + 1]
+
+                    splitMetadata[i - 1] = splitMetadata[j - 1]
+                    splitMetadata[i] = splitMetadata[j]
+                    splitMetadata[i + 1] = splitMetadata[j + 1]
+
+                    splitMetadata[j - 1] = old_hit_org
+                    splitMetadata[j] = old_hit_percentage
+                    splitMetadata[j + 1] = old_hit_tx_id
+
+    df = pandas.DataFrame(splitMetadata)
+
+    print df
+
+    # Dataframe cells holding the top Kraken hits and their percentages
+
+    print df.iat[0, 0]
+    print df.iat[1, 0]
+    print df.iat[3, 0]
+    print df.iat[4, 0]
+    print df.iat[6, 0]
+    print df.iat[7, 0]
+    print df.iat[9, 0]
+    print df.iat[10, 0]
+
+    OrganismOne = df.iat[0, 0]
+    OrganismTwo = df.iat[3, 0]
+    OrganismThree = df.iat[6, 0]
+    OrganismFour = df.iat[9, 0]
+
+    OrganismOnePer = df.iat[1, 0]
+    OrganismTwoPer = df.iat[4, 0]
+    OrganismThreePer = df.iat[7, 0]
+    OrganismFourPer = df.iat[10, 0]
+
+    # collect the four top hits and their percentages into a one-row frame
+    df = {'OrganismOne': OrganismOne,
+          'OrganismTwo': OrganismTwo,
+          'OrganismThree': OrganismThree,
+          'OrganismFour': OrganismFour,
+          'OrganismOne%': OrganismOnePer,
+          'OrganismTwo%': OrganismTwoPer,
+          'OrganismThree%': OrganismThreePer,
+          'OrganismFour%': OrganismFourPer}
+
+    # clean up intermediate files
+    command = "rm " + krakenoutoutpath + krakenout
+
+    print command
+
+    check_call(command, shell=True)
+
+    command = "rm " + outputFile
+
+    print command
+
+    return pandas.DataFrame(df, index=[0])
\ No newline at end of file
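
For reference, a minimal sketch of how run_kraken above might be called. The file names, database path, and thread count are illustrative assumptions, not values taken from this repository; the options dict follows the convention stated in the function's docstring (None for valueless flags):

    from gims_qa_kraken import run_kraken

    # hypothetical paired gzipped FASTQ files and a local Kraken database directory
    reads = ["sample_R1.fastq.gz", "sample_R2.fastq.gz"]
    kraken_db = "/path/to/kraken_db"

    # valueless flags (e.g. --preload) map to None; valued flags map to their value
    opts = dict(preload=None, threads=4)

    tax = run_kraken(reads, kraken_db, kraken_options=opts)
    # per the code above, tax is a TaxTuple (from gims_qa_tests) carrying the top taxon,
    # its fraction of classified reads, the next four taxa with their fractions,
    # and an MD5 digest of the database.kdb file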