jpayne@0
|
1 import csv
|
jpayne@0
|
2 from collections import Counter, OrderedDict
|
jpayne@0
|
3 from hashlib import md5
|
jpayne@0
|
4 import os.path
|
jpayne@0
|
5 import tempfile
|
jpayne@0
|
6 from subprocess import check_call
|
jpayne@0
|
7 from gims_qa_tests import TaxTuple
|
jpayne@0
|
8 #from qa_core import present
|
jpayne@0
|
9
|
jpayne@0
|
10 def run_kraken(sequenceFiles, database, kraken_options=dict(preload=None, threads=1)):
|
jpayne@0
|
11 "kraken options should be supplied as a dict to kraken_options; give a None value for valueless options"
|
jpayne@0
|
12
|
jpayne@0
|
13 #hash the kraken db files
|
jpayne@0
|
14 md5hash = md5()
|
jpayne@0
|
15 #os.path.walk(database, lambda arg, dirname, files: [md5hash.update(open(os.path.join(database, dirname, f), 'rb').read()) for f in files], None)
|
jpayne@0
|
16 md5hash.update(open(os.path.join(database, 'database.kdb'), 'rb').read())
|
jpayne@0
|
17
|
jpayne@0
|
18 db_hash = md5hash.hexdigest()
|
jpayne@0
|
19
|
jpayne@0
|
20
|
jpayne@0
|
21 with tempfile.NamedTemporaryFile('rU') as kraken_out:
|
jpayne@0
|
22 with tempfile.NamedTemporaryFile() as seq_data:
|
jpayne@0
|
23 #cat all the files together
|
jpayne@0
|
24 check_call("cat {files} > {seq_data.name}".format(
|
jpayne@0
|
25 files = " ".join([present(f) for f in sequenceFiles]),
|
jpayne@0
|
26 seq_data = seq_data
|
jpayne@0
|
27 ),
|
jpayne@0
|
28 shell=True)
|
jpayne@0
|
29
|
jpayne@0
|
30 def make_option(key, value):
|
jpayne@0
|
31 if value is not None:
|
jpayne@0
|
32 return "--{} {}".format(key, value)
|
jpayne@0
|
33 return "--{}".format(key)
|
jpayne@0
|
34
|
jpayne@0
|
35 #build options
|
jpayne@0
|
36 options = " ".join([make_option(k,v) for k,v in kraken_options.items()])
|
jpayne@0
|
37
|
jpayne@0
|
38 #call kraken
|
jpayne@0
|
39 check_call("kraken {options} "
|
jpayne@0
|
40 "--db {database} "
|
jpayne@0
|
41 "--gzip-compressed "
|
jpayne@0
|
42 "--fastq {seq_data.name} "
|
jpayne@0
|
43 "--output {kraken_out.name}".format(**locals()), shell=True)
|
jpayne@0
|
44
|
jpayne@0
|
45 counts = Counter()
|
jpayne@0
|
46
|
jpayne@0
|
47 with tempfile.NamedTemporaryFile('rU') as kraken_report:
|
jpayne@0
|
48 check_call("kraken-translate --db {database} {kraken_out.name} > {kraken_report.name}".format(**locals()), shell=True)
|
jpayne@0
|
49 rows = csv.reader(kraken_report, delimiter='\t')
|
jpayne@0
|
50 for seq_id, taxon in rows:
|
jpayne@0
|
51 counts[taxon] += 1.0
|
jpayne@0
|
52 counts["total"] += 1.0
|
jpayne@0
|
53
|
jpayne@0
|
54 #clean up all the temp files by releasing contexts
|
jpayne@0
|
55
|
jpayne@0
|
56 top_results = counts.most_common(6) #total, plus five most common taxa
|
jpayne@0
|
57 total, total_count = top_results.pop(0)
|
jpayne@0
|
58 top_hit, top_hit_count = top_results.pop(0)
|
jpayne@0
|
59 next_hits = [h[0].replace(';', ' ') for h in top_results]
|
jpayne@0
|
60 next_hit_counts = [h[1] for h in top_results]
|
jpayne@0
|
61
|
jpayne@0
|
62 top_hit_frac = top_hit_count / total_count
|
jpayne@0
|
63 next_hit_frac = [h / total_count for h in next_hit_counts]
|
jpayne@0
|
64
|
jpayne@0
|
65 return TaxTuple(top_hit.replace(';', ' '), top_hit_frac, next_hits, next_hit_frac, db_hash)
|
jpayne@0
|
66
|
jpayne@0
|
67 def runKrakenSeqtest(sequenceFiles,database):
|
jpayne@0
|
68
|
jpayne@0
|
69 krakenoutoutpath = "/home/charles.strittmatter/output/kraken/"
|
jpayne@0
|
70
|
jpayne@0
|
71 w = datetime.datetime.now()
|
jpayne@0
|
72
|
jpayne@0
|
73 krakenout = "krakenOut" + w.strftime("%Y-%m-%d-%H-%M")
|
jpayne@0
|
74 krakenreport = "" + krakenoutoutpath + "kraken_report_" + w.strftime("%Y-%m-%d-%H-%M")
|
jpayne@0
|
75
|
jpayne@0
|
76 sequenceFiles[0]
|
jpayne@0
|
77 sequenceFiles[1]
|
jpayne@0
|
78
|
jpayne@0
|
79 os.path.abspath(os.path.join("Input", sequenceFileName[0]))
|
jpayne@0
|
80
|
jpayne@0
|
81 command = "kraken --preload "
|
jpayne@0
|
82 command += "--db " + database + ' '
|
jpayne@0
|
83 #if len(fastqPaths) > 1:
|
jpayne@0
|
84 command += "--paired " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[0])) + " " + os.path.abspath(os.path.join("sample/Input", sequenceFiles[1]))
|
jpayne@0
|
85 # command += ' '.join(fastqPaths)
|
jpayne@0
|
86 command += " --output " + "/home/charles.strittmatter/output/kraken/" + krakenout
|
jpayne@0
|
87
|
jpayne@0
|
88 print command
|
jpayne@0
|
89
|
jpayne@0
|
90 check_call(command, shell=True)
|
jpayne@0
|
91 print "%s -- Kraken called..." % datetime.datetime.now()
|
jpayne@0
|
92
|
jpayne@0
|
93
|
jpayne@0
|
94 kraken_report_cmd = "kraken-report --db "
|
jpayne@0
|
95 kraken_report_cmd += database
|
jpayne@0
|
96 kraken_report_cmd += " " + krakenoutoutpath + krakenout
|
jpayne@0
|
97 kraken_report_cmd += " > " + krakenreport
|
jpayne@0
|
98
|
jpayne@0
|
99 print kraken_report_cmd
|
jpayne@0
|
100
|
jpayne@0
|
101 check_call(kraken_report_cmd, shell=True)
|
jpayne@0
|
102
|
jpayne@0
|
103
|
jpayne@0
|
104
|
jpayne@0
|
105
|
jpayne@0
|
106 parsed_kraken_rpt = "cat "+ krakenreport + " | grep \"$(printf '\tS\t')\" | grep -v '-' | head -n 5"
|
jpayne@0
|
107
|
jpayne@0
|
108 kraken_rpt_lines = check_output(parsed_kraken_rpt, shell=True)
|
jpayne@0
|
109
|
jpayne@0
|
110 split_kraken_rpt_line = re.split("\t|\n", kraken_rpt_lines)
|
jpayne@0
|
111
|
jpayne@0
|
112 print "=====================" + krakenreport + "=========================="
|
jpayne@0
|
113 print split_kraken_rpt_line
|
jpayne@0
|
114 split_kraken_rpt_length = len(split_kraken_rpt_line)
|
jpayne@0
|
115 print split_kraken_rpt_line[5]+" "+split_kraken_rpt_line[0]+" "+split_kraken_rpt_line[4]
|
jpayne@0
|
116 taxLen = 6;
|
jpayne@0
|
117 taxIdIdx = 4;
|
jpayne@0
|
118 outputFile = "/home/charles.strittmatter/output/kraken/" + w.strftime("%Y-%m-%d-%H-%M") + '_metadata_kraken.tsv'
|
jpayne@0
|
119
|
jpayne@0
|
120 metadata = "Platform Run asm_stats_contig_n50 asm_stats_length_bp asm_stats_n_contig attribute_package bioproject_acc bioproject_center bioproject_title biosample_acc collected_by sample_name scientific_name serovar sra_center strain sub_species tax-id"
|
jpayne@0
|
121
|
jpayne@0
|
122
|
jpayne@0
|
123
|
jpayne@0
|
124
|
jpayne@0
|
125 splitMetadata = metadata.split("\t")
|
jpayne@0
|
126 for i in range(0, len(split_kraken_rpt_line)):
|
jpayne@0
|
127 if split_kraken_rpt_line[i] == '':
|
jpayne@0
|
128 print "don't add to the metadata...split_kraken_rpt_line.pop(i)"
|
jpayne@0
|
129 elif i%(taxLen) == taxLen-1:
|
jpayne@0
|
130 print "i is "+str(i)+"under this condition: i%(taxLen) == taxLen-1"
|
jpayne@0
|
131 splitMetadata.insert(len(splitMetadata)-3, split_kraken_rpt_line[i].strip())
|
jpayne@0
|
132 elif i%(taxLen) == taxIdIdx:
|
jpayne@0
|
133 print "i is "+str(i)+"under this condition: i%(taxLen) == taxIdIdx"
|
jpayne@0
|
134 splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
|
jpayne@0
|
135 elif i%(taxLen) == 0:
|
jpayne@0
|
136 print "i is "+str(i)+"under this condition: i%(taxLen) == 0"
|
jpayne@0
|
137 splitMetadata.insert(len(splitMetadata)-1, split_kraken_rpt_line[i].strip())
|
jpayne@0
|
138
|
jpayne@0
|
139 splitMetadataLength = len(splitMetadata)
|
jpayne@0
|
140 first_kraken_percentage = splitMetadataLength-15
|
jpayne@0
|
141 with open(outputFile, 'wb') as f:
|
jpayne@0
|
142 #writer = csv.writer(f, delimiter="\t")
|
jpayne@0
|
143
|
jpayne@0
|
144 # print splitMetadata
|
jpayne@0
|
145 for i in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
|
jpayne@0
|
146 print "Split metadata of i "+splitMetadata[i]
|
jpayne@0
|
147 for j in xrange(splitMetadataLength-3, first_kraken_percentage-1, -3):
|
jpayne@0
|
148 if float(splitMetadata[j]) > float(splitMetadata[i]):
|
jpayne@0
|
149 print "i: "+str(i)+" and j: "+str(j)
|
jpayne@0
|
150 print "swapping "+splitMetadata[i]+" with "+splitMetadata[j]
|
jpayne@0
|
151 old_hit_org = splitMetadata[i-1]
|
jpayne@0
|
152 old_hit_percentage = splitMetadata[i]
|
jpayne@0
|
153 old_hit_tx_id = splitMetadata[i+1]
|
jpayne@0
|
154
|
jpayne@0
|
155 splitMetadata[i-1] = splitMetadata[j-1]
|
jpayne@0
|
156 splitMetadata[i] = splitMetadata[j]
|
jpayne@0
|
157 splitMetadata[i+1] = splitMetadata[j+1]
|
jpayne@0
|
158
|
jpayne@0
|
159 splitMetadata[j-1] = old_hit_org
|
jpayne@0
|
160 splitMetadata[j] = old_hit_percentage
|
jpayne@0
|
161 splitMetadata[j+1] = old_hit_tx_id
|
jpayne@0
|
162
|
jpayne@0
|
163 df = pandas.DataFrame(splitMetadata)
|
jpayne@0
|
164
|
jpayne@0
|
165 print df
|
jpayne@0
|
166
|
jpayne@0
|
167
|
jpayne@0
|
168 # Here aree dataframe collumns for !
|
jpayne@0
|
169
|
jpayne@0
|
170 print df.iat[0,0]
|
jpayne@0
|
171 print df.iat[1,0]
|
jpayne@0
|
172 print df.iat[3,0]
|
jpayne@0
|
173 print df.iat[4,0]
|
jpayne@0
|
174 print df.iat[6,0]
|
jpayne@0
|
175 print df.iat[7,0]
|
jpayne@0
|
176 print df.iat[9,0]
|
jpayne@0
|
177 print df.iat[10,0]
|
jpayne@0
|
178
|
jpayne@0
|
179 OrganismOne = df.iat[0,0]
|
jpayne@0
|
180 OrganismTwo = df.iat[3,0]
|
jpayne@0
|
181 OrganismThree = df.iat[6,0]
|
jpayne@0
|
182 OrganismFour = df.iat[9,0]
|
jpayne@0
|
183
|
jpayne@0
|
184
|
jpayne@0
|
185 OrganismOnePer = df.iat[1,0]
|
jpayne@0
|
186 OrganismTwoPer = df.iat[4,0]
|
jpayne@0
|
187 OrganismThreePer = df.iat[7,0]
|
jpayne@0
|
188 OrganismFourPer = df.iat[10,0]
|
jpayne@0
|
189
|
jpayne@0
|
190
|
jpayne@0
|
191 df = {'OrganismOne':OrganismOne,
|
jpayne@0
|
192 'OrganismTwo':OrganismTwo,
|
jpayne@0
|
193 'OrganismThree':OrganismThree,
|
jpayne@0
|
194 'OrganismFour':OrganismFour,
|
jpayne@0
|
195 'OrganismOne%':OrganismOnePer,
|
jpayne@0
|
196 'OrganismTwo%':OrganismTwoPer,
|
jpayne@0
|
197 'OrganismThree%':OrganismThreePer,
|
jpayne@0
|
198 'OrganismFour%':OrganismFourPer }
|
jpayne@0
|
199
|
jpayne@0
|
200
|
jpayne@0
|
201 #Clean up files
|
jpayne@0
|
202
|
jpayne@0
|
203 command = ""
|
jpayne@0
|
204 command += "rm " + krakenoutoutpath + krakenout
|
jpayne@0
|
205
|
jpayne@0
|
206 print command
|
jpayne@0
|
207
|
jpayne@0
|
208 check_call(command, shell=True)
|
jpayne@0
|
209
|
jpayne@0
|
210
|
jpayne@0
|
211
|
jpayne@0
|
212 command = ""
|
jpayne@0
|
213 command += "rm " + outputFile
|
jpayne@0
|
214
|
jpayne@0
|
215 print command
|
jpayne@0
|
216
|
jpayne@0
|
217 return pandas.DataFrame(df, index=[0]) |