jpayne@0: import csv jpayne@0: from collections import Counter, OrderedDict jpayne@0: from hashlib import md5 jpayne@0: import os.path jpayne@0: import tempfile jpayne@0: from subprocess import check_call jpayne@0: from gims_qa_tests import TaxTuple jpayne@0: #from qa_core import present jpayne@0: jpayne@0: def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)): jpayne@0: "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options" jpayne@0: jpayne@0: #hash the kraken db files jpayne@0: md5hash = md5() jpayne@0: #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None) jpayne@0: md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read()) jpayne@0: jpayne@0: db_hash = md5hash.hexdigest() jpayne@0: jpayne@0: jpayne@0: with tempfile.NamedTemporaryFile('rU') as kraken_out: jpayne@0: with tempfile.NamedTemporaryFile() as seq_data: jpayne@0: #cat all the files together jpayne@0: check_call("cat {files} > {seq_data.name}".format( jpayne@0: files = " ".join([present(f) for f in sequenceFiles]), jpayne@0: seq_data = seq_data jpayne@0: ), jpayne@0: shell=True) jpayne@0: jpayne@0: def make_option(key, value): jpayne@0: if value is not None: jpayne@0: return "--{} {}".format(key, value) jpayne@0: return "--{}".format(key) jpayne@0: jpayne@0: #build options jpayne@0: options = " ".join([make_option(k,v) for k,v in kraken_options.items()]) jpayne@0: jpayne@0: #call kraken jpayne@0: check_call("kraken {options} " jpayne@0: "--db {database} " jpayne@0: "--gzip-compressed " jpayne@0: "--fastq {seq_data.name} " jpayne@0: "--output {kraken_out.name}".format(**locals()), shell=True) jpayne@0: jpayne@0: counts = Counter() jpayne@0: jpayne@0: with tempfile.NamedTemporaryFile('rU') as kraken_report: jpayne@0: check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True) jpayne@0: rows = csv.reader(kraken_report, delimiter='\t') jpayne@0: for seq_id, taxon in rows: jpayne@0: counts[taxon] += 1.0 jpayne@0: counts["total"] += 1.0 jpayne@0: jpayne@0: #clean up all the temp files by releasing contexts jpayne@0: jpayne@0: top_results = counts.most_common(6) #total, plus five most common taxa jpayne@0: total, total_count = top_results.pop(0) jpayne@0: top_hit, top_hit_count = top_results.pop(0) jpayne@0: next_hits = [h[0].replace(';', ' ') for h in top_results] jpayne@0: next_hit_counts = [h[1] for h in top_results] jpayne@0: jpayne@0: top_hit_frac = top_hit_count / total_count jpayne@0: next_hit_frac = [h / total_count for h in next_hit_counts] jpayne@0: jpayne@0: return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash) jpayne@0: jpayne@0: def runKrakenSeqtest(sequenceFiles,database): jpayne@0: jpayne@0: krakenoutoutpath = "/home/charles.strittmatter/output/kraken/" jpayne@0: jpayne@0: w = datetime.datetime.now() jpayne@0: jpayne@0: krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M") jpayne@0: krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M") jpayne@0: jpayne@0: sequenceFiles[0] jpayne@0: sequenceFiles[1] jpayne@0: jpayne@0: os.path.abspath(os.path.join("Input", sequenceFileName[0])) jpayne@0: jpayne@0: command = "kraken --preload " jpayne@0: command += "--db " + database + ' ' jpayne@0: #if len(fastqPaths) > 1: jpayne@0: command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1])) jpayne@0: # command += ' '.join(fastqPaths) jpayne@0: command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout jpayne@0: jpayne@0: print command jpayne@0: jpayne@0: check_call(command, shell=True) jpayne@0: print "%s -- Kraken called..." % datetime.datetime.now() jpayne@0: jpayne@0: jpayne@0: kraken_report_cmd = "kraken-report --db " jpayne@0: kraken_report_cmd += database jpayne@0: kraken_report_cmd += " " + krakenoutoutpath + krakenout jpayne@0: kraken_report_cmd += " > " + krakenreport jpayne@0: jpayne@0: print kraken_report_cmd jpayne@0: jpayne@0: check_call(kraken_report_cmd, shell=True) jpayne@0: jpayne@0: jpayne@0: jpayne@0: jpayne@0: parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5" jpayne@0: jpayne@0: kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True) jpayne@0: jpayne@0: split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines) jpayne@0: jpayne@0: print "=====================" + krakenreport + "==========================" jpayne@0: print split_kraken_rpt_line jpayne@0: split_kraken_rpt_length = len(split_kraken_rpt_line) jpayne@0: print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4] jpayne@0: taxLen = 6; jpayne@0: taxIdIdx = 4; jpayne@0: outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv' jpayne@0: jpayne@0: metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id" jpayne@0: jpayne@0: jpayne@0: jpayne@0: jpayne@0: splitMetadata = metadata.split("\t") jpayne@0: for i in range(0, len(split_kraken_rpt_line)): jpayne@0: if split_kraken_rpt_line[i] == '': jpayne@0: print "don't add to the metadata...split_kraken_rpt_line.pop(i)" jpayne@0: elif i%(taxLen) == taxLen-1: jpayne@0: print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1" jpayne@0: splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip()) jpayne@0: elif i%(taxLen) == taxIdIdx: jpayne@0: print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx" jpayne@0: splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) jpayne@0: elif i%(taxLen) == 0: jpayne@0: print "i is "+str(i)+"under this condition: i%(taxLen) == 0" jpayne@0: splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) jpayne@0: jpayne@0: splitMetadataLength = len(splitMetadata) jpayne@0: first_kraken_percentage = splitMetadataLength-15 jpayne@0: with open(outputFile, 'wb') as f: jpayne@0: #writer = csv.writer(f, delimiter="\t") jpayne@0: jpayne@0: # print splitMetadata jpayne@0: for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): jpayne@0: print "Split metadata of i "+splitMetadata[i] jpayne@0: for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): jpayne@0: if float(splitMetadata[j]) > float(splitMetadata[i]): jpayne@0: print "i: "+str(i)+" and j: "+str(j) jpayne@0: print "swapping "+splitMetadata[i]+" with "+splitMetadata[j] jpayne@0: old_hit_org = splitMetadata[i-1] jpayne@0: old_hit_percentage = splitMetadata[i] jpayne@0: old_hit_tx_id = splitMetadata[i+1] jpayne@0: jpayne@0: splitMetadata[i-1] = splitMetadata[j-1] jpayne@0: splitMetadata[i] = splitMetadata[j] jpayne@0: splitMetadata[i+1] = splitMetadata[j+1] jpayne@0: jpayne@0: splitMetadata[j-1] = old_hit_org jpayne@0: splitMetadata[j] = old_hit_percentage jpayne@0: splitMetadata[j+1] = old_hit_tx_id jpayne@0: jpayne@0: df = pandas.DataFrame(splitMetadata) jpayne@0: jpayne@0: print df jpayne@0: jpayne@0: jpayne@0: # Here aree dataframe collumns for ! jpayne@0: jpayne@0: print df.iat[0,0] jpayne@0: print df.iat[1,0] jpayne@0: print df.iat[3,0] jpayne@0: print df.iat[4,0] jpayne@0: print df.iat[6,0] jpayne@0: print df.iat[7,0] jpayne@0: print df.iat[9,0] jpayne@0: print df.iat[10,0] jpayne@0: jpayne@0: OrganismOne = df.iat[0,0] jpayne@0: OrganismTwo = df.iat[3,0] jpayne@0: OrganismThree = df.iat[6,0] jpayne@0: OrganismFour = df.iat[9,0] jpayne@0: jpayne@0: jpayne@0: OrganismOnePer = df.iat[1,0] jpayne@0: OrganismTwoPer = df.iat[4,0] jpayne@0: OrganismThreePer = df.iat[7,0] jpayne@0: OrganismFourPer = df.iat[10,0] jpayne@0: jpayne@0: jpayne@0: df = {'OrganismOne':OrganismOne, jpayne@0: 'OrganismTwo':OrganismTwo, jpayne@0: 'OrganismThree':OrganismThree, jpayne@0: 'OrganismFour':OrganismFour, jpayne@0: 'OrganismOne%':OrganismOnePer, jpayne@0: 'OrganismTwo%':OrganismTwoPer, jpayne@0: 'OrganismThree%':OrganismThreePer, jpayne@0: 'OrganismFour%':OrganismFourPer } jpayne@0: jpayne@0: jpayne@0: #Clean up files jpayne@0: jpayne@0: command = "" jpayne@0: command += "rm " + krakenoutoutpath + krakenout jpayne@0: jpayne@0: print command jpayne@0: jpayne@0: check_call(command, shell=True) jpayne@0: jpayne@0: jpayne@0: jpayne@0: command = "" jpayne@0: command += "rm " + outputFile jpayne@0: jpayne@0: print command jpayne@0: jpayne@0: return pandas.DataFrame(df, index=[0])