annotate qa_core/gims_qa_kraken.py @ 2:021864b5a0a5 tip

planemo upload
author jpayne
date Fri, 11 May 2018 10:25:14 -0400
parents dafec28bd37e
children
rev   line source
jpayne@0 1 import csv
jpayne@0 2 from collections import Counter, OrderedDict
jpayne@0 3 from hashlib import md5
jpayne@0 4 import os.path
jpayne@0 5 import tempfile
jpayne@0 6 from subprocess import check_call
jpayne@0 7 from gims_qa_tests import TaxTuple
jpayne@0 8 #from qa_core import present
jpayne@0 9
jpayne@0 10 def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
jpayne@0 11 "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
jpayne@0 12
jpayne@0 13 #hash the kraken db files
jpayne@0 14 md5hash = md5()
jpayne@0 15 #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None)
jpayne@0 16 md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read())
jpayne@0 17
jpayne@0 18 db_hash = md5hash.hexdigest()
jpayne@0 19
jpayne@0 20
jpayne@0 21 with tempfile.NamedTemporaryFile('rU') as kraken_out:
jpayne@0 22 with tempfile.NamedTemporaryFile() as seq_data:
jpayne@0 23 #cat all the files together
jpayne@0 24 check_call("cat {files} > {seq_data.name}".format(
jpayne@0 25 files = " ".join([present(f) for f in sequenceFiles]),
jpayne@0 26 seq_data = seq_data
jpayne@0 27 ),
jpayne@0 28 shell=True)
jpayne@0 29
jpayne@0 30 def make_option(key, value):
jpayne@0 31 if value is not None:
jpayne@0 32 return "--{} {}".format(key, value)
jpayne@0 33 return "--{}".format(key)
jpayne@0 34
jpayne@0 35 #build options
jpayne@0 36 options = " ".join([make_option(k,v) for k,v in kraken_options.items()])
jpayne@0 37
jpayne@0 38 #call kraken
jpayne@0 39 check_call("kraken {options} "
jpayne@0 40 "--db {database} "
jpayne@0 41 "--gzip-compressed "
jpayne@0 42 "--fastq {seq_data.name} "
jpayne@0 43 "--output {kraken_out.name}".format(**locals()), shell=True)
jpayne@0 44
jpayne@0 45 counts = Counter()
jpayne@0 46
jpayne@0 47 with tempfile.NamedTemporaryFile('rU') as kraken_report:
jpayne@0 48 check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
jpayne@0 49 rows = csv.reader(kraken_report, delimiter='\t')
jpayne@0 50 for seq_id, taxon in rows:
jpayne@0 51 counts[taxon] += 1.0
jpayne@0 52 counts["total"] += 1.0
jpayne@0 53
jpayne@0 54 #clean up all the temp files by releasing contexts
jpayne@0 55
jpayne@0 56 top_results = counts.most_common(6) #total, plus five most common taxa
jpayne@0 57 total, total_count = top_results.pop(0)
jpayne@0 58 top_hit, top_hit_count = top_results.pop(0)
jpayne@0 59 next_hits = [h[0].replace(';', ' ') for h in top_results]
jpayne@0 60 next_hit_counts = [h[1] for h in top_results]
jpayne@0 61
jpayne@0 62 top_hit_frac = top_hit_count / total_count
jpayne@0 63 next_hit_frac = [h / total_count for h in next_hit_counts]
jpayne@0 64
jpayne@0 65 return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
jpayne@0 66
jpayne@0 67 def runKrakenSeqtest(sequenceFiles,database):
jpayne@0 68
jpayne@0 69 krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
jpayne@0 70
jpayne@0 71 w = datetime.datetime.now()
jpayne@0 72
jpayne@0 73 krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
jpayne@0 74 krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
jpayne@0 75
jpayne@0 76 sequenceFiles[0]
jpayne@0 77 sequenceFiles[1]
jpayne@0 78
jpayne@0 79 os.path.abspath(os.path.join("Input", sequenceFileName[0]))
jpayne@0 80
jpayne@0 81 command = "kraken --preload "
jpayne@0 82 command += "--db " + database + ' '
jpayne@0 83 #if len(fastqPaths) > 1:
jpayne@0 84 command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1]))
jpayne@0 85 # command += ' '.join(fastqPaths)
jpayne@0 86 command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout
jpayne@0 87
jpayne@0 88 print command
jpayne@0 89
jpayne@0 90 check_call(command, shell=True)
jpayne@0 91 print "%s -- Kraken called..." % datetime.datetime.now()
jpayne@0 92
jpayne@0 93
jpayne@0 94 kraken_report_cmd = "kraken-report --db "
jpayne@0 95 kraken_report_cmd += database
jpayne@0 96 kraken_report_cmd += " " + krakenoutoutpath + krakenout
jpayne@0 97 kraken_report_cmd += " > " + krakenreport
jpayne@0 98
jpayne@0 99 print kraken_report_cmd
jpayne@0 100
jpayne@0 101 check_call(kraken_report_cmd, shell=True)
jpayne@0 102
jpayne@0 103
jpayne@0 104
jpayne@0 105
jpayne@0 106 parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
jpayne@0 107
jpayne@0 108 kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
jpayne@0 109
jpayne@0 110 split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
jpayne@0 111
jpayne@0 112 print "=====================" + krakenreport + "=========================="
jpayne@0 113 print split_kraken_rpt_line
jpayne@0 114 split_kraken_rpt_length = len(split_kraken_rpt_line)
jpayne@0 115 print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4]
jpayne@0 116 taxLen = 6;
jpayne@0 117 taxIdIdx = 4;
jpayne@0 118 outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv'
jpayne@0 119
jpayne@0 120 metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id"
jpayne@0 121
jpayne@0 122
jpayne@0 123
jpayne@0 124
jpayne@0 125 splitMetadata = metadata.split("\t")
jpayne@0 126 for i in range(0, len(split_kraken_rpt_line)):
jpayne@0 127 if split_kraken_rpt_line[i] == '':
jpayne@0 128 print "don't add to the metadata...split_kraken_rpt_line.pop(i)"
jpayne@0 129 elif i%(taxLen) == taxLen-1:
jpayne@0 130 print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1"
jpayne@0 131 splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip())
jpayne@0 132 elif i%(taxLen) == taxIdIdx:
jpayne@0 133 print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx"
jpayne@0 134 splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
jpayne@0 135 elif i%(taxLen) == 0:
jpayne@0 136 print "i is "+str(i)+"under this condition: i%(taxLen) == 0"
jpayne@0 137 splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
jpayne@0 138
jpayne@0 139 splitMetadataLength = len(splitMetadata)
jpayne@0 140 first_kraken_percentage = splitMetadataLength-15
jpayne@0 141 with open(outputFile, 'wb') as f:
jpayne@0 142 #writer = csv.writer(f, delimiter="\t")
jpayne@0 143
jpayne@0 144 # print splitMetadata
jpayne@0 145 for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
jpayne@0 146 print "Split metadata of i "+splitMetadata[i]
jpayne@0 147 for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
jpayne@0 148 if float(splitMetadata[j]) > float(splitMetadata[i]):
jpayne@0 149 print "i: "+str(i)+" and j: "+str(j)
jpayne@0 150 print "swapping "+splitMetadata[i]+" with "+splitMetadata[j]
jpayne@0 151 old_hit_org = splitMetadata[i-1]
jpayne@0 152 old_hit_percentage = splitMetadata[i]
jpayne@0 153 old_hit_tx_id = splitMetadata[i+1]
jpayne@0 154
jpayne@0 155 splitMetadata[i-1] = splitMetadata[j-1]
jpayne@0 156 splitMetadata[i] = splitMetadata[j]
jpayne@0 157 splitMetadata[i+1] = splitMetadata[j+1]
jpayne@0 158
jpayne@0 159 splitMetadata[j-1] = old_hit_org
jpayne@0 160 splitMetadata[j] = old_hit_percentage
jpayne@0 161 splitMetadata[j+1] = old_hit_tx_id
jpayne@0 162
jpayne@0 163 df = pandas.DataFrame(splitMetadata)
jpayne@0 164
jpayne@0 165 print df
jpayne@0 166
jpayne@0 167
jpayne@0 168 # Here aree dataframe collumns for !
jpayne@0 169
jpayne@0 170 print df.iat[0,0]
jpayne@0 171 print df.iat[1,0]
jpayne@0 172 print df.iat[3,0]
jpayne@0 173 print df.iat[4,0]
jpayne@0 174 print df.iat[6,0]
jpayne@0 175 print df.iat[7,0]
jpayne@0 176 print df.iat[9,0]
jpayne@0 177 print df.iat[10,0]
jpayne@0 178
jpayne@0 179 OrganismOne = df.iat[0,0]
jpayne@0 180 OrganismTwo = df.iat[3,0]
jpayne@0 181 OrganismThree = df.iat[6,0]
jpayne@0 182 OrganismFour = df.iat[9,0]
jpayne@0 183
jpayne@0 184
jpayne@0 185 OrganismOnePer = df.iat[1,0]
jpayne@0 186 OrganismTwoPer = df.iat[4,0]
jpayne@0 187 OrganismThreePer = df.iat[7,0]
jpayne@0 188 OrganismFourPer = df.iat[10,0]
jpayne@0 189
jpayne@0 190
jpayne@0 191 df = {'OrganismOne':OrganismOne,
jpayne@0 192 'OrganismTwo':OrganismTwo,
jpayne@0 193 'OrganismThree':OrganismThree,
jpayne@0 194 'OrganismFour':OrganismFour,
jpayne@0 195 'OrganismOne%':OrganismOnePer,
jpayne@0 196 'OrganismTwo%':OrganismTwoPer,
jpayne@0 197 'OrganismThree%':OrganismThreePer,
jpayne@0 198 'OrganismFour%':OrganismFourPer }
jpayne@0 199
jpayne@0 200
jpayne@0 201 #Clean up files
jpayne@0 202
jpayne@0 203 command = ""
jpayne@0 204 command += "rm " + krakenoutoutpath + krakenout
jpayne@0 205
jpayne@0 206 print command
jpayne@0 207
jpayne@0 208 check_call(command, shell=True)
jpayne@0 209
jpayne@0 210
jpayne@0 211
jpayne@0 212 command = ""
jpayne@0 213 command += "rm " + outputFile
jpayne@0 214
jpayne@0 215 print command
jpayne@0 216
jpayne@0 217 return pandas.DataFrame(df, index=[0])