Mercurial > repos > jpayne > cfsan_qaqc
comparison qa_core/gims_qa_kraken.py @ 0:dafec28bd37e
planemo upload
author | jpayne |
---|---|
date | Tue, 08 May 2018 18:54:20 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dafec28bd37e |
---|---|
1 import csv | |
2 from collections import Counter, OrderedDict | |
3 from hashlib import md5 | |
4 import os.path | |
5 import tempfile | |
6 from subprocess import check_call | |
7 from gims_qa_tests import TaxTuple | |
8 #from qa_core import present | |
9 | |
10 def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)): | |
11 "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options" | |
12 | |
13 #hash the kraken db files | |
14 md5hash = md5() | |
15 #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None) | |
16 md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read()) | |
17 | |
18 db_hash = md5hash.hexdigest() | |
19 | |
20 | |
21 with tempfile.NamedTemporaryFile('rU') as kraken_out: | |
22 with tempfile.NamedTemporaryFile() as seq_data: | |
23 #cat all the files together | |
24 check_call("cat {files} > {seq_data.name}".format( | |
25 files = " ".join([present(f) for f in sequenceFiles]), | |
26 seq_data = seq_data | |
27 ), | |
28 shell=True) | |
29 | |
30 def make_option(key, value): | |
31 if value is not None: | |
32 return "--{} {}".format(key, value) | |
33 return "--{}".format(key) | |
34 | |
35 #build options | |
36 options = " ".join([make_option(k,v) for k,v in kraken_options.items()]) | |
37 | |
38 #call kraken | |
39 check_call("kraken {options} " | |
40 "--db {database} " | |
41 "--gzip-compressed " | |
42 "--fastq {seq_data.name} " | |
43 "--output {kraken_out.name}".format(**locals()), shell=True) | |
44 | |
45 counts = Counter() | |
46 | |
47 with tempfile.NamedTemporaryFile('rU') as kraken_report: | |
48 check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True) | |
49 rows = csv.reader(kraken_report, delimiter='\t') | |
50 for seq_id, taxon in rows: | |
51 counts[taxon] += 1.0 | |
52 counts["total"] += 1.0 | |
53 | |
54 #clean up all the temp files by releasing contexts | |
55 | |
56 top_results = counts.most_common(6) #total, plus five most common taxa | |
57 total, total_count = top_results.pop(0) | |
58 top_hit, top_hit_count = top_results.pop(0) | |
59 next_hits = [h[0].replace(';', ' ') for h in top_results] | |
60 next_hit_counts = [h[1] for h in top_results] | |
61 | |
62 top_hit_frac = top_hit_count / total_count | |
63 next_hit_frac = [h / total_count for h in next_hit_counts] | |
64 | |
65 return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash) | |
66 | |
67 def runKrakenSeqtest(sequenceFiles,database): | |
68 | |
69 krakenoutoutpath = "/home/charles.strittmatter/output/kraken/" | |
70 | |
71 w = datetime.datetime.now() | |
72 | |
73 krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M") | |
74 krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M") | |
75 | |
76 sequenceFiles[0] | |
77 sequenceFiles[1] | |
78 | |
79 os.path.abspath(os.path.join("Input", sequenceFileName[0])) | |
80 | |
81 command = "kraken --preload " | |
82 command += "--db " + database + ' ' | |
83 #if len(fastqPaths) > 1: | |
84 command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1])) | |
85 # command += ' '.join(fastqPaths) | |
86 command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout | |
87 | |
88 print command | |
89 | |
90 check_call(command, shell=True) | |
91 print "%s -- Kraken called..." % datetime.datetime.now() | |
92 | |
93 | |
94 kraken_report_cmd = "kraken-report --db " | |
95 kraken_report_cmd += database | |
96 kraken_report_cmd += " " + krakenoutoutpath + krakenout | |
97 kraken_report_cmd += " > " + krakenreport | |
98 | |
99 print kraken_report_cmd | |
100 | |
101 check_call(kraken_report_cmd, shell=True) | |
102 | |
103 | |
104 | |
105 | |
106 parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5" | |
107 | |
108 kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True) | |
109 | |
110 split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines) | |
111 | |
112 print "=====================" + krakenreport + "==========================" | |
113 print split_kraken_rpt_line | |
114 split_kraken_rpt_length = len(split_kraken_rpt_line) | |
115 print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4] | |
116 taxLen = 6; | |
117 taxIdIdx = 4; | |
118 outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv' | |
119 | |
120 metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id" | |
121 | |
122 | |
123 | |
124 | |
125 splitMetadata = metadata.split("\t") | |
126 for i in range(0, len(split_kraken_rpt_line)): | |
127 if split_kraken_rpt_line[i] == '': | |
128 print "don't add to the metadata...split_kraken_rpt_line.pop(i)" | |
129 elif i%(taxLen) == taxLen-1: | |
130 print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1" | |
131 splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip()) | |
132 elif i%(taxLen) == taxIdIdx: | |
133 print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx" | |
134 splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) | |
135 elif i%(taxLen) == 0: | |
136 print "i is "+str(i)+"under this condition: i%(taxLen) == 0" | |
137 splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip()) | |
138 | |
139 splitMetadataLength = len(splitMetadata) | |
140 first_kraken_percentage = splitMetadataLength-15 | |
141 with open(outputFile, 'wb') as f: | |
142 #writer = csv.writer(f, delimiter="\t") | |
143 | |
144 # print splitMetadata | |
145 for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): | |
146 print "Split metadata of i "+splitMetadata[i] | |
147 for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3): | |
148 if float(splitMetadata[j]) > float(splitMetadata[i]): | |
149 print "i: "+str(i)+" and j: "+str(j) | |
150 print "swapping "+splitMetadata[i]+" with "+splitMetadata[j] | |
151 old_hit_org = splitMetadata[i-1] | |
152 old_hit_percentage = splitMetadata[i] | |
153 old_hit_tx_id = splitMetadata[i+1] | |
154 | |
155 splitMetadata[i-1] = splitMetadata[j-1] | |
156 splitMetadata[i] = splitMetadata[j] | |
157 splitMetadata[i+1] = splitMetadata[j+1] | |
158 | |
159 splitMetadata[j-1] = old_hit_org | |
160 splitMetadata[j] = old_hit_percentage | |
161 splitMetadata[j+1] = old_hit_tx_id | |
162 | |
163 df = pandas.DataFrame(splitMetadata) | |
164 | |
165 print df | |
166 | |
167 | |
168 # Here aree dataframe collumns for ! | |
169 | |
170 print df.iat[0,0] | |
171 print df.iat[1,0] | |
172 print df.iat[3,0] | |
173 print df.iat[4,0] | |
174 print df.iat[6,0] | |
175 print df.iat[7,0] | |
176 print df.iat[9,0] | |
177 print df.iat[10,0] | |
178 | |
179 OrganismOne = df.iat[0,0] | |
180 OrganismTwo = df.iat[3,0] | |
181 OrganismThree = df.iat[6,0] | |
182 OrganismFour = df.iat[9,0] | |
183 | |
184 | |
185 OrganismOnePer = df.iat[1,0] | |
186 OrganismTwoPer = df.iat[4,0] | |
187 OrganismThreePer = df.iat[7,0] | |
188 OrganismFourPer = df.iat[10,0] | |
189 | |
190 | |
191 df = {'OrganismOne':OrganismOne, | |
192 'OrganismTwo':OrganismTwo, | |
193 'OrganismThree':OrganismThree, | |
194 'OrganismFour':OrganismFour, | |
195 'OrganismOne%':OrganismOnePer, | |
196 'OrganismTwo%':OrganismTwoPer, | |
197 'OrganismThree%':OrganismThreePer, | |
198 'OrganismFour%':OrganismFourPer } | |
199 | |
200 | |
201 #Clean up files | |
202 | |
203 command = "" | |
204 command += "rm " + krakenoutoutpath + krakenout | |
205 | |
206 print command | |
207 | |
208 check_call(command, shell=True) | |
209 | |
210 | |
211 | |
212 command = "" | |
213 command += "rm " + outputFile | |
214 | |
215 print command | |
216 | |
217 return pandas.DataFrame(df, index=[0]) |