comparison SeqSero2/SeqSero2_package.py @ 1:fae43708974d

planemo upload
author jpayne
date Fri, 09 Nov 2018 11:30:45 -0500
parents
children b82e5f3c9187
comparison
equal deleted inserted replaced
0:4ff2aee11e5b 1:fae43708974d
1 #!/usr/bin/env python3
2
3 import sys
4 import time
5 import random
6 import os
7 import subprocess
8 import gzip
9 import io
10 import pickle
11 import argparse
12 import itertools
13 from distutils.version import LooseVersion
14
15 from collections import OrderedDict
16 from csv import DictWriter
17
18 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
19
20 ### SeqSero Kmer
def parse_args():
    "Parse the input arguments, use '-h' for help."
    usage = 'SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com'
    arg_parser = argparse.ArgumentParser(usage=usage)#add "-m <data_type>" in future
    arg_parser.add_argument(
        "-i", nargs="+",
        help="<string>: path/to/input_data")
    arg_parser.add_argument(
        "-t", choices=['1','2','3','4','5','6'],
        help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
    arg_parser.add_argument(
        "-b", choices=['sam','mem'], default="mem",
        help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
    arg_parser.add_argument(
        "-p", default="1",
        help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1")
    arg_parser.add_argument(
        "-m", choices=['k','a'], default="k",
        help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k")
    arg_parser.add_argument(
        "-d",
        help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
    arg_parser.add_argument(
        "-c", action="store_true",
        help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
    return arg_parser.parse_args()
32
def reverse_complement(sequence):
    """Return the reverse complement of *sequence*.

    Supports the IUPAC ambiguity codes used elsewhere in this file; an
    unrecognized character raises KeyError, as before.
    """
    pairing = dict(zip("ACGTNMRWSYKVHDB", "TGCANKYWSRMBDHV"))
    return "".join(pairing[base] for base in sequence[::-1])
52
53
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer of length *kmer* across *list_of_strings*.

    Each k-mer is upper-cased and counted together with its reverse
    complement (a palindromic k-mer is therefore counted twice per
    occurrence, matching the original behaviour).
    """
    counts = {}
    for raw in list_of_strings:
        sequence = raw.strip('\n')
        for pos in range(len(sequence) - kmer + 1):
            mer = sequence[pos:pos + kmer].upper()
            rc_mer = reverse_complement(mer)
            counts[mer] = counts.get(mer, 0) + 1
            counts[rc_mer] = counts.get(rc_mer, 0) + 1
    return counts
70
71
def multifasta_dict(multifasta):
    """Parse a multi-FASTA file into {header_without_'>': sequence}.

    Sequence lines belonging to one header are concatenated; blank lines are
    skipped; any sequence text before the first header is ignored.

    Fixes vs the original: the file handle is now closed (was leaked), and
    the O(n^2) `list.index` rescan per header is replaced by a single
    streaming pass. Repeated identical headers still have their sequence
    text appended under one key.
    """
    records = {}
    current_header = None
    with open(multifasta, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line[0] == '>':
                current_header = line[1:]
            elif current_header is not None:
                records[current_header] = records.get(current_header, '') + line
    return records
89
90
def multifasta_single_string(multifasta):
    """Concatenate every sequence line of a FASTA file into one string.

    Header lines ('>') and blank lines are skipped. Fix vs the original:
    the file handle is closed via a context manager (was leaked).
    """
    with open(multifasta, 'r') as handle:
        return ''.join(
            line.strip() for line in handle
            if line.strip() and line.strip()[0] != '>'
        )
97
98
def chunk_a_long_sequence(long_sequence, chunk_size=60):
    """Split *long_sequence* into consecutive chunks of *chunk_size*.

    Bug fix: the chunk count was computed with a hard-coded 60 instead of
    *chunk_size*, so any non-default chunk size produced wrong chunks.

    Note: a trailing remainder chunk is always appended, so a sequence whose
    length divides evenly ends with an empty chunk — kept to preserve the
    original default-size behaviour for downstream callers.
    """
    chunk_list = []
    steps = len(long_sequence) // chunk_size  # how many full chunks (was // 60)
    for i in range(steps):
        chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
    chunk_list.append(long_sequence[steps * chunk_size:])
    return chunk_list
106
107
def target_multifasta_kmerizer(multifasta, k, kmerDict):
    """Harvest k-mers from assembly regions that seed-match the target set.

    The assembly is flattened to one long string and cut into fixed-size
    chunks; whenever a chunk's central k-mer is found in *kmerDict*, the
    surrounding window (~300 bases upstream, ~2200 downstream) is k-merized
    and its k-mers collected. Returns the set of collected k-mers.

    Bug fix: `reverse_lines` was computed from `forward_length`, so the
    downstream window was ~300 bases instead of the intended ~2200.
    """
    forward_length = 300 #if find the target, put forward 300 bases
    reverse_length = 2200 #if find the target, put backward 2200 bases
    chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
    target_mers = []
    long_single_string = multifasta_single_string(multifasta)
    multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
    unit_length = len(multifasta_list[0])
    forward_lines = int(forward_length / unit_length) + 1
    reverse_lines = int(reverse_length / unit_length) + 1  # fixed: was forward_length
    start_num = 0
    end_num = 0
    for i in range(len(multifasta_list)):
        if i not in range(start_num, end_num): #avoid computational repetition
            line = multifasta_list[i]
            start = int((len(line) - k) // 2)
            s1 = line[start:k + start]
            if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
                start_num = max(i - forward_lines, 0)
                end_num = min(i + reverse_lines, len(multifasta_list) - 1)
                target_list = [
                    x.strip() for x in multifasta_list[start_num:end_num]
                ]
                target_line = "".join(target_list)
                target_mers += [
                    k1 for k1 in createKmerDict_reads([str(target_line)], k)
                ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
    return set(target_mers)
144
145
def target_read_kmerizer(file, k, kmerDict):
    """Collect k-mers from fastq reads whose central k-mer hits *kmerDict*.

    Handles plain or gzipped fastq; the gzip branch yields bytes that are
    decoded line by line. Stops early once ~4 Mbases of matching reads have
    been accumulated. Returns the set of k-mers from all matching reads.
    """
    i = 1  # 1-based line counter; fastq sequence lines satisfy i % 4 == 2
    n_reads = 0
    total_coverage = 0  # total length of matching reads; early-stop budget
    target_mers = []
    if file.endswith(".gz"):
        file_content = io.BufferedReader(gzip.open(file))
    else:
        # NOTE(review): reads the whole file into memory and never closes the
        # handle; acceptable for one-shot CLI use but worth tightening.
        file_content = open(file, "r").readlines()
    for line in file_content:
        start = int((len(line) - k) // 2)  # position of the read's middle k-mer
        if i % 4 == 2:
            if file.endswith(".gz"):
                # gz lines are bytes: slice first, then decode both pieces
                s1 = line[start:k + start].decode()
                line = line.decode()
            else:
                s1 = line[start:k + start]
            if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
                n_reads += 1
                total_coverage += len(line)
                target_mers += [
                    k1 for k1 in createKmerDict_reads([str(line)], k)
                ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
        i += 1
        if total_coverage >= 4000000:
            break
    return set(target_mers)
173
174
def minion_fasta_kmerizer(file, k, kmerDict):
    """Count k-mers (forward and reverse complement) from nanopore fasta
    sequence lines that share at least one k-mer with *kmerDict*; return the
    set of k-mers seen. Assumes 2-line records: headers on odd lines."""
    hit_counts = {}
    for line_number, line in enumerate(open(file), start=1):
        if line_number % 2 != 0:
            continue  # header line
        for fwd_mer, rc_mer in kmers(line.strip().upper(), k):
            if fwd_mer in kmerDict or rc_mer in kmerDict:
                hit_counts[fwd_mer] = hit_counts.get(fwd_mer, 0) + 1
                hit_counts[rc_mer] = hit_counts.get(rc_mer, 0) + 1
    return set(hit_counts)
194
195
def minion_fastq_kmerizer(file, k, kmerDict):
    """Count k-mers (forward and reverse complement) from nanopore fastq
    sequence lines that share at least one k-mer with *kmerDict*; return the
    set of k-mers seen. Sequence lines sit at line_number % 4 == 2."""
    hit_counts = {}
    for line_number, line in enumerate(open(file), start=1):
        if line_number % 4 != 2:
            continue  # header/plus/quality line
        for fwd_mer, rc_mer in kmers(line.strip().upper(), k):
            if fwd_mer in kmerDict or rc_mer in kmerDict:
                hit_counts[fwd_mer] = hit_counts.get(fwd_mer, 0) + 1
                hit_counts[rc_mer] = hit_counts.get(rc_mer, 0) + 1
    return set(hit_counts)
215
216
def multifasta_single_string2(multifasta):
    """Concatenate every sequence line of a FASTA file into one string.

    Fixes vs the original: blank lines no longer raise IndexError
    (`line.strip()[0]` on an empty string), the quadratic `+=` string build
    is replaced by a join, and the file handle was already managed by
    `with` — kept.
    """
    pieces = []
    with open(multifasta, 'r') as f:
        for line in f:
            stripped = line.strip()
            if stripped and not stripped.startswith('>'):
                pieces.append(stripped)
    return ''.join(pieces)
226
227
def kmers(seq, k):
    """Yield (kmer, reverse_complement_of_that_kmer) for every k-mer of *seq*.

    The reverse complement of seq[start:start+k] is the matching slice of the
    reverse complement of the whole sequence.

    Bug fix: iteration previously started at index 1 — the first k-mer was
    silently dropped because the negative-index slice `[-(start+k):-start]`
    cannot express start == 0. Positive slice bounds are used instead.
    """
    rev_comp = reverse_complement(seq)
    n = len(seq)
    for start in range(n - k + 1):
        yield seq[start:start + k], rev_comp[n - start - k:n - start]
232
233
def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
    """Return {fasta_header: set of k-mers of length *k_size*} for every
    record in *multifasta*."""
    sequences = multifasta_dict(multifasta)
    return {
        header: set(createKmerDict_reads([sequence], k_size))
        for header, sequence in sequences.items()
    }
241
242
def Combine(b, c):
    """Expand an antigen formula that contains optional factors.

    *c* holds the mandatory factors, *b* the optional (originally bracketed)
    ones. The first entry is the mandatory-only combination; every non-empty
    subset of the optional factors is then merged with the mandatory ones and
    the merged factor list is sorted alphabetically, e.g. f,[g],t ->
    ['f,t', 'f,g,t'].
    """
    expanded = [",".join(c)]
    optional_subsets = []
    for size in range(1, len(b) + 1):
        for subset in itertools.combinations(b, size):
            optional_subsets.append(",".join(subset))
    for subset in optional_subsets:
        factors = list(c)
        factors.append(subset)
        factors = ",".join(factors).split(",")
        factors.sort()
        expanded.append(",".join(factors))
    return expanded
261
262
def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
    """Translate an antigenic formula (O : fliC : fljB) plus detected special
    genes into serotype name(s) via the White-Kauffmann-Le Minor (KW) scheme.

    Parameters: Otype/fliC/fljB are antigen strings (comma-separated
    factors); special_gene_list maps detected gene names to scores;
    subspecies is matched against the KW subspecies column.

    Returns (predict_form, predict_sero, star, star_line, claim) where
    star ('*') and star_line carry caveats for ambiguous or special-case
    predictions.
    """
    #like test_output_06012017.txt
    #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
    # Project-local KW lookup tables: parallel lists indexed together.
    from Initial_Conditions import phase1
    from Initial_Conditions import phase2
    from Initial_Conditions import phaseO
    from Initial_Conditions import sero
    from Initial_Conditions import subs
    seronames = []
    seronames_none_subspecies=[]  # matches found while ignoring subspecies (fallback)
    for i in range(len(phase1)):
        fliC_combine = []
        fljB_combine = []
        if phaseO[i] == Otype: # no VII in KW, but it's there
            ### for fliC, detect every possible combinations to avoid the effect of "["
            if phase1[i].count("[") == 0:
                fliC_combine.append(phase1[i])
            elif phase1[i].count("[") >= 1:
                c = []
                b = []
                # whole formula bracketed, e.g. "[f,g]": the unit is optional as a whole
                if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
                        "[") == 1:
                    content = phase1[i].replace("[", "").replace("]", "")
                    fliC_combine.append(content)
                    fliC_combine.append("-")
                else:
                    # b: optional (bracketed) factors, c: mandatory factors
                    for x in phase1[i].split(","):
                        if "[" in x:
                            b.append(x.replace("[", "").replace("]", ""))
                        else:
                            c.append(x)
                    fliC_combine = Combine(
                        b, c
                    ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
            ### end of fliC "[" detect
            ### for fljB, detect every possible combinations to avoid the effect of "["
            if phase2[i].count("[") == 0:
                fljB_combine.append(phase2[i])
            elif phase2[i].count("[") >= 1:
                d = []
                e = []
                if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
                        "[") == 1:
                    content = phase2[i].replace("[", "").replace("]", "")
                    fljB_combine.append(content)
                    fljB_combine.append("-")
                else:
                    for x in phase2[i].split(","):
                        if "[" in x:
                            d.append(x.replace("[", "").replace("]", ""))
                        else:
                            e.append(x)
                    fljB_combine = Combine(d, e)
            ### end of fljB "[" detect
            new_fliC = fliC.split(
                ","
            ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
            new_fliC.sort()
            new_fliC = ",".join(new_fliC)
            new_fljB = fljB.split(",")
            new_fljB.sort()
            new_fljB = ",".join(new_fljB)
            # accept a match on either the sorted or the original antigen order
            if (new_fliC in fliC_combine
                    or fliC in fliC_combine) and (new_fljB in fljB_combine
                                                  or fljB in fljB_combine):
                if subs[i] == subspecies:
                    seronames.append(sero[i])
                seronames_none_subspecies.append(sero[i])
    #analyze seronames
    subspecies_pointer=""
    if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
        # fall back to matches that ignored subspecies, and flag it
        seronames=seronames_none_subspecies
        subspecies_pointer="1"
    if len(seronames) == 0:
        seronames = [
            "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
        ]
    star = ""
    star_line = ""
    if len(seronames) > 1: #there are two possible predictions for serotypes
        star = "*"
        star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
    if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
        star="*"
        star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
    if Otype=="":
        Otype="-"
    predict_form = Otype + ":" + fliC + ":" + fljB
    predict_sero = (" or ").join(seronames)
    ###special test for Enteritidis
    if predict_form == "9:g,m:-":
        sdf = "-"
        for x in special_gene_list:
            if x.startswith("sdf"):
                sdf = "+"
        predict_form = predict_form + " Sdf prediction:" + sdf
        if sdf == "-":
            star = "*"
            star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
            predict_sero = "Gallinarum/Enteritidis sdf -"
    ###end of special test for Enteritidis
    elif predict_form == "4:i:-":
        predict_sero = "potential monophasic variant of Typhimurium"
    elif predict_form == "4:r:-":
        predict_sero = "potential monophasic variant of Heidelberg"
    elif predict_form == "4:b:-":
        predict_sero = "potential monophasic variant of Paratyphi B"
    elif predict_form == "8:e,h:1,2":
        predict_sero = "Newport"
        star = "*"
        star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
    claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
    if "N/A" in predict_sero:
        claim = ""
    #special test for Typhimurium
    if "Typhimurium" in predict_sero or predict_form == "4:i:-":
        normal = 0
        mutation = 0
        for x in special_gene_list:
            if "oafA-O-4_full" in x:
                normal = float(special_gene_list[x])
            elif "oafA-O-4_5-" in x:
                mutation = float(special_gene_list[x])
        if normal > mutation:
            pass
        elif normal < mutation:
            # O5- deletion variant detected
            predict_sero = predict_sero.strip() + "(O5-)"
            star = "*"
            star_line = "Detected the deletion of O5-."
        else:
            pass
    #special test for Paratyphi B
    if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
        normal = 0
        mutation = 0
        for x in special_gene_list:
            if "gntR-family-regulatory-protein_dt-positive" in x:
                normal = float(special_gene_list[x])
            elif "gntR-family-regulatory-protein_dt-negative" in x:
                mutation = float(special_gene_list[x])
        #print(normal,mutation)
        if normal > mutation:
            predict_sero = predict_sero.strip() + "(dt+)"
            star = "*"
            star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
        elif normal < mutation:
            predict_sero = predict_sero.strip() + "(dt-)"
            star = "*"
            star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
        else:
            star = "*"
            star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
    #special test for O13,22 and O13,23
    if Otype=="13":
        #ex_dir = os.path.dirname(os.path.realpath(__file__))
        # pickle file ships with the package (trusted input); holds the
        # O22/O23 serotype disambiguation table
        f = open(dirpath + '/special.pickle', 'rb')
        special = pickle.load(f)
        O22_O23=special['O22_O23']
        if predict_sero.split(" or ")[0] in O22_O23[-1]:
            O22_score=0
            O23_score=0
            for x in special_gene_list:
                if "O:22" in x:
                    O22_score = O22_score+float(special_gene_list[x])
                elif "O:23" in x:
                    O23_score = O23_score+float(special_gene_list[x])
            #print(O22_score,O23_score)
            for z in O22_O23[0]:
                if predict_sero.split(" or ")[0] in z:
                    # z is a (O22-serotype, O23-serotype) pair
                    if O22_score > O23_score:
                        star = "*"
                        star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'."
                        predict_sero = z[0]
                    elif O22_score < O23_score:
                        star = "*"
                        star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'."
                        predict_sero = z[1]
                    else:
                        star = "*"
                        star_line = "Fail to detect O22 and O23 differences."
    #special test for O6,8
    merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"]
    for x in merge_O68_list:
        if x in predict_sero:
            predict_sero=x
            star=""
            star_line=""
    #special test for Montevideo; most of them are monophasic
    if "Montevideo" in predict_sero and "1,2,7" in predict_form:
        star="*"
        star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
    return predict_form, predict_sero, star, star_line, claim
455 ### End of SeqSero Kmer part
456
457 ### Begin of SeqSero2 allele prediction and output
def xml_parse_score_comparision_seqsero(xmlfile):
    """Parse a BLAST XML report and score every query/hit pair.

    For each alignment, HSP bit-scores and identity fractions are accumulated
    with a weight that discounts query positions already covered by an
    earlier HSP (prevents overlapping HSPs from pushing coverage above 100%).

    Returns a list of (query___hit_def, score, identity_fraction,
    covered_query_positions) tuples sorted by score, highest first.

    Fixes vs the original: the XML file handle is now closed, and the
    "last"/"first" special-case branch — whose body was identical to the
    else branch — was collapsed.
    """
    from Bio.Blast import NCBIXML  # third-party project dependency
    with open(xmlfile) as handle:
        records = list(NCBIXML.parse(handle))
    List = []
    List_score = []
    List_ids = []
    List_query_region = []
    for record in records:
        for alignment in record.alignments:
            score = 0
            ids = 0
            cover_region = set()  # query positions already credited
            List.append(record.query.strip() + "___" + alignment.hit_def)
            for hsp in alignment.hsps:
                hsp_span = set(range(hsp.query_start, hsp.query_end))
                if len(cover_region) == 0:
                    fraction = 1
                else:
                    # weight = share of this HSP not already covered
                    fraction = 1 - len(cover_region & hsp_span) / float(len(hsp_span))
                cover_region = cover_region | hsp_span
                score += hsp.bits * fraction
                ids += float(hsp.identities) / record.query_length * fraction
            List_score.append(score)
            List_ids.append(ids)
            List_query_region.append(cover_region)
    Final_list = sorted(
        zip(List, List_score, List_ids, List_query_region),
        key=lambda d: d[1], reverse=True)
    return Final_list
496
497
def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
    """Return (unique_items, counts) for list *L*.

    Sorts *L* in place (preserving the original side effect), de-duplicates
    it, and counts each item's occurrences. When sort_on_fre != "none" the
    (item, count) pairs are re-ordered by ascending frequency.

    Bug fix: the sort_on_fre branch subscripted a zip object, which raises
    TypeError on Python 3; the pairs are now materialized into lists.
    """
    Old = L
    L.sort()
    uniq = []
    for item in L:
        if not uniq or uniq[-1] != item:  # L is sorted, so duplicates are adjacent
            uniq.append(item)
    L = uniq
    count = [Old.count(item) for item in L]
    if sort_on_fre != "none":
        pairs = sorted(zip(count, L))
        L = [item for _, item in pairs]
        count = [freq for freq, _ in pairs]
    return (L, count)
514
def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
    """Decide whether one contig is fliC or fljB from its head/tail hit scores.

    Sums the blast scores of hits whose titles contain "fliC" vs "fljB" and
    picks the larger total (ties go to fliC). Returns (role, score_gap); a
    small gap means low confidence and callers fall back to whole-contig
    ranking.
    """
    fliC_total = 0
    fljB_total = 0
    for hit in nodes_vs_score_list:
        if "fliC" in hit[0]:
            fliC_total += hit[1]
        elif "fljB" in hit[0]:
            fljB_total += hit[1]
    role = "fliC" if fliC_total >= fljB_total else "fljB"
    return (role, abs(fliC_total - fljB_total))
531
def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
    """Fallback phase classifier: return the phase prefix ("fliC"/"fljB") of
    the best-ranked whole-contig hit whose title contains *node_name*, or ""
    when no hit matches. Final_list is unused but kept for interface
    compatibility with callers.
    """
    for hit in Final_list_passed:
        if node_name in hit[0]:
            return hit[0].split("_")[0]
    return ""
541
def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
    """Assign a phase role (fliC/fljB) to each candidate H-antigen contig.

    Counts the head/tail ("first"/"last") blast hits per contig and judges
    from their accumulated scores; whenever the score gap is small (<20) the
    decision falls back to whole-contig blast ranking. If exactly two contigs
    end up in the same phase, both are re-judged by ranking alone.
    Returns a list of (role, contig_name) tuples.
    """
    #nodes_list is the c created by c,d=Uniq(nodes) in below function
    first_target=""  # NOTE(review): never used; kept as-is
    role_list=[]
    for x in nodes_list:
        a=[]  # head/tail hits that belong to this contig
        role=""
        for y in tail_head_list:
            if x in y[0]:
                a.append(y)
        if len(a)==4:
            # head and tail matched for both phases: judge by accumulated score
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
        elif len(a)==3:
            ###however, if the one with highest score is the fewer one, compare their accumulation score
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
            ###end of above score comparison
        elif len(a)==2:
            #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
            temp=[]
            for z in a:
                temp.append(z[0].split("_")[0])
            m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
            if len(m)==1:
                pass
            else:
                pass
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
            ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
        elif len(a)==1:
            #that one
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
            #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
        else:#a==0
            #use Final_list_passed best match
            for z in Final_list_passed:
                if x in z[0]:
                    role=z[0].split("_")[0]
                    break
        #print x,role,len(a)
        role_list.append((role,x))
    if len(role_list)==2:
        if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
            #just use score to do a final test
            role_list=[]
            for x in nodes_list:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
                role_list.append((role,x))
    return role_list
598
def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
    """Decide which assembled contig carries fliC and which carries fljB.

    Collects the contig names of whole-gene flagellin hits (titles starting
    with "fl", excluding the "first"/"last" head-tail markers), de-duplicates
    them, and delegates the per-contig phase call to
    fliC_or_fljB_judge_from_head_tail_sequence. Returns its
    (role, contig_name) list.
    """
    node_names=[]
    for hit in Final_list_passed:
        title=hit[0]
        if title.startswith("fl") and "last" not in title and "first" not in title:
            node_names.append(title.split("___")[1].strip())
    uniq_nodes,_counts=Uniq(node_names)
    #print uniq_nodes
    head_tail_hits=[hit for hit in Final_list if ("last" in hit[0] or "first" in hit[0])]
    return fliC_or_fljB_judge_from_head_tail_sequence(uniq_nodes,head_tail_hits,Final_list,Final_list_passed)
611
def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
    """Pick the O-antigen type from blast hits and collect special genes.

    Hit entries are (title, score, identity, region) tuples from
    xml_parse_score_comparision_seqsero; titles encode gene__length___contig
    information separated by "__"/"___" and (for assemblies) SPAdes-style
    "length_..._cov_..." contig names.

    Returns (O_choice, O_nodes_list, special_genes, final_O, contamination_O).
    """
    #decide O based on Final_list
    O_choice="?"
    O_list=[]
    special_genes={}
    nodes=[]
    for x in Final_list_passed:
        if x[0].startswith("O-"):
            nodes.append(x[0].split("___")[1].strip())  # contig-name part of the title
        elif not x[0].startswith("fl"):
            # anything that is neither O- nor flagellin is a "special gene"
            special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
    #print "special_genes:",special_genes
    c,d=Uniq(nodes)
    #print "potential O antigen contig",c
    final_O=[]
    O_nodes_list=[]
    for x in c:#c is the list for contigs
        temp=0
        # keep only the best-ranked O hit per contig
        for y in Final_list_passed:
            if x in y[0] and y[0].startswith("O-"):
                final_O.append(y)
                break
    ### O contig has the problem of two genes on same contig, so do additional test
    potenial_new_gene=""
    for x in final_O:
        pointer=0 #for genes merged or not
        #not consider O-1,3,19_not_in_3,10, too short compared with others
        if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
            pointer=x[0].split("___")[1].strip()#store the contig name
            print(pointer)
        if pointer!=0:#it has potential merge event
            for y in Final_list:
                if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
                    potenial_new_gene=y
                    #print(potenial_new_gene)
                    break
    if potenial_new_gene!="":
        print("two differnt genes in same contig, fix it for O antigen")
        print(potenial_new_gene[:3])
        final_O.append(potenial_new_gene)
    ### end of the two genes on same contig test
    final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
    if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
        #print "$$$No Otype, due to no hit"#may need to be changed
        O_choice="-"
    else:
        highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
        O_list=[]
        O_list_less_contamination=[]
        for x in final_O:
            if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
                O_list.append(x[0].split("__")[0])
                O_nodes_list.append(x[0].split("___")[1])
                if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:
                    O_list_less_contamination.append(x[0].split("__")[0])
        ### special test for O9,46 and O3,10 family
        if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
            if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
                O_choice="O-9,46"
                #print "$$$Most possilble Otype: O-9,46"
            elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
                O_choice="O-9,46,27"
                #print "$$$Most possilble Otype: O-9,46,27"
            else:
                O_choice="O-9"#next, detect O9 vs O2?
                O2=0
                O9=0
                for z in special_genes:
                    if "tyr-O-9" in z:
                        O9=special_genes[z]
                    elif "tyr-O-2" in z:
                        O2=special_genes[z]
                if O2>O9:
                    O_choice="O-2"
                elif O2<O9:
                    pass
                else:
                    pass
                    #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
        elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
            if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
                O_choice="O-3,10"
                #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
            else:
                O_choice="O-1,3,19"
                #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
        ### end of special test for O9,46 and O3,10 family
        else:
            # generic case: best coverage-identity hit above the contamination cut-off
            try:
                max_score=0
                for x in final_O:
                    if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
                        max_score=x[2]#change from x[-1] to x[2],08172018
                        O_choice=x[0].split("_")[0]
                if O_choice=="O-1,3,19":
                    O_choice=final_O[1][0].split("_")[0]
                #print "$$$Most possilble Otype: ",O_choice
            except:
                pass
                #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
    #print "O:",O_choice,O_nodes_list
    # contamination check: count how many distinct O groups were seen
    Otypes=[]
    for x in O_list:
        if x!="O-1,3,19_not_in_3,10":
            if "O-9,46_" not in x:
                Otypes.append(x.split("_")[0])
            else:
                Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
    #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
    Otypes_uniq,Otypes_fre=Uniq(Otypes)
    contamination_O=""
    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
        if len(Otypes_uniq)>2:
            contamination_O="potential contamination from O antigen signals"
    else:
        if len(Otypes_uniq)>1:
            if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
                contamination_O=""
            elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
                contamination_O=""
            else:
                contamination_O="potential contamination from O antigen signals"
    return O_choice,O_nodes_list,special_genes,final_O,contamination_O
735 ### End of SeqSero2 allele prediction and output
736
def get_input_files(make_dir,input_file,data_type,dirpath):
    """Resolve forward/reverse fastq file names inside *make_dir* for a data type,
    extracting .sra archives and de-interleaving paired reads where needed.

    data_type: '1' interleaved PE, '2' separated PE, '3' SE, '4' assembly,
    '5' nanopore fasta, '6' nanopore fastq. *input_file* is the list from
    argparse -i. Returns (for_fq, rev_fq); rev_fq is "" for single-file
    types. Shell commands run with the CWD switched to *make_dir*.
    """
    #tell input files from datatype
    #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
    for_fq=""
    rev_fq=""
    old_dir = os.getcwd()
    os.chdir(make_dir)  # subprocess commands below operate relative to make_dir
    if data_type=="1":
        input_file=input_file[0].split("/")[-1]  # basename; file expected inside make_dir
        if input_file.endswith(".sra"):
            # NOTE(review): shell=True with string-concatenated file names —
            # safe only for trusted paths; consider subprocess with a list.
            subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
            for_fq=input_file.replace(".sra","_1.fastq")
            rev_fq=input_file.replace(".sra","_2.fastq")
        else:
            core_id=input_file.split(".fastq")[0].split(".fq")[0]
            for_fq=core_id+"_1.fastq"
            rev_fq=core_id+"_2.fastq"
            # split the interleaved file into _1/_2 via the bundled shell helper
            if input_file.endswith(".gz"):
                subprocess.check_output("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
            else:
                subprocess.check_output("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
    elif data_type=="2":
        for_fq=input_file[0].split("/")[-1]
        rev_fq=input_file[1].split("/")[-1]
    elif data_type=="3":
        input_file=input_file[0].split("/")[-1]
        if input_file.endswith(".sra"):
            subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
            for_fq=input_file.replace(".sra","_1.fastq")
        else:
            for_fq=input_file
    elif data_type in ["4","5","6"]:
        for_fq=input_file[0].split("/")[-1]
    os.chdir(old_dir)
    return for_fq,rev_fq
772
def predict_O_and_H_types(Final_list,Final_list_passed):
    #get O and H types from Final_list from blast parsing; allele mode
    """Predict the O, fliC (H1) and fljB (H2) antigen types in allele mode.

    Parameters
    ----------
    Final_list : parsed BLAST hits, best-first.  From the indexing below each
        entry looks like
        [allele__threshold___contig_name, blast_score, identity_fraction,
         covered_position_set]
        -- layout inferred from usage; confirm against
        xml_parse_score_comparision_seqsero, which produces it.
    Final_list_passed : subset of Final_list that passed coverage/score filters.

    Returns
    -------
    (O_choice, fliC_choice, fljB_choice, special_gene_list,
     contamination_O, contamination_H) -- antigen calls are "-" when no call
    could be made.

    Side effects: prints a report and appends it to SeqSero_log.txt in the
    current working directory.
    """
    fliC_choice="-"
    fljB_choice="-"
    fliC_contig="NA"
    fljB_contig="NA"
    fliC_region=set([0])
    fljB_region=set([0,])
    fliC_length=0 #can be changed to coverage in future
    fljB_length=0 #can be changed to coverage in future
    O_choice="-"#no need to decide O contig for now, should be only one
    O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
    O_choice=O_choice.split("-")[-1].strip()
    # a lone O-1,3,19 marker hit (or an empty string) is not a confident O call
    if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
        O_choice="-"
    H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
    log_file=open("SeqSero_log.txt","a")
    print("O_contigs:")
    log_file.write("O_contigs:\n")
    for x in O_nodes_roles:
        if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
            print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
            log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"% "+str(min(x[-1]))+" to "+str(max(x[-1]))+"\n")
    if len(H_contig_roles)!=0:
        # contig coverage is parsed from the SPAdes-style node name ("..._cov_N")
        highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
    else:
        highest_H_coverage=0
    for x in H_contig_roles:
        #if multiple choices, temporately select the one with longest length for now, will revise in further change
        if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
            fliC_contig=x[1]
            fliC_length=int(x[1].split("_")[3])
        elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:
            fljB_contig=x[1]
            fljB_length=int(x[1].split("_")[3])
    # take the allele name from the first (best-scoring) passed hit on each chosen contig
    for x in Final_list_passed:
        if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
            fliC_choice=x[0].split("_")[1]
        elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
            fljB_choice=x[0].split("_")[1]
        elif fliC_choice!="-" and fljB_choice!="-":
            break
    #now remove contigs not in middle core part
    first_allele="NA"
    first_allele_percentage=0
    for x in Final_list:
        if x[0].startswith("fliC") or x[0].startswith("fljB"):
            first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
            first_allele_percentage=x[2]
            break
    additional_contigs=[]
    for x in Final_list:
        if first_allele in x[0]:
            if (fliC_contig == x[0].split("___")[-1]):
                fliC_region=x[3]
            elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
                fljB_region=x[3]
            else:
                if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
                    additional_contigs.append(x)
                #else:
                    #print x[:3]
    #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
    if first_allele_percentage>0.9:
        if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
            target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
        elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
            target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
        else:
            target_region=set()#doesn't do anything
    else:
        target_region=set()#doesn't do anything
    #print(target_region)
    #print(additional_contigs)
    target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
    target_region=target_region2|target_region
    for x in additional_contigs:
        removal=0
        contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
        if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
            removal=1
        else:
            if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
                removal=1
            else:
                pass
        if removal==1:
            # NOTE(review): removing elements from H_contig_roles while
            # iterating it can skip the element that follows each removal --
            # existing behavior, deliberately left untouched here
            for y in H_contig_roles:
                if y[1] in x[0]:
                    H_contig_roles.remove(y)
        else:
            pass
            #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
    #end of removing none-middle contigs
    print("H_contigs:")
    log_file.write("H_contigs:\n")
    H_contig_stat=[] # currently unused
    H1_cont_stat={}
    H2_cont_stat={}
    for i in range(len(H_contig_roles)):
        x=H_contig_roles[i]
        a=0 # currently unused
        for y in Final_list_passed:
            if x[1] in y[0] and y[0].startswith(x[0]):
                if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
                    # shadowing y on purpose: re-scan every passed hit for this
                    # contig so all candidate alleles are reported in the log
                    for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
                        if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
                            print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
                            log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
                    H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
                    break
                else:
                    print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
                    log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
                    # accumulate per-allele identity sums for contamination detection
                    if x[0]=="fliC":
                        if y[0].split("_")[1] not in H1_cont_stat:
                            H1_cont_stat[y[0].split("_")[1]]=y[2]
                        else:
                            H1_cont_stat[y[0].split("_")[1]]+=y[2]
                    if x[0]=="fljB":
                        if y[0].split("_")[1] not in H2_cont_stat:
                            H2_cont_stat[y[0].split("_")[1]]=y[2]
                        else:
                            H2_cont_stat[y[0].split("_")[1]]+=y[2]
                break
    #detect contaminations
    #print(H1_cont_stat)
    #print(H2_cont_stat)
    H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
    H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
    contamination_H=""
    if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
        contamination_H="potential contamination from H antigen signals"
    elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
        contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
    print(contamination_O)
    print(contamination_H)
    log_file.write(contamination_O+"\n")
    log_file.write(contamination_H+"\n")
    log_file.close()
    return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H
914
def get_input_K(input_file,lib_dict,data_type,k_size):
    #kmer mode; get input_Ks from dict and data_type
    """Kmerize the input data and keep only k-mers present in the libraries.

    input_file : path to the (already staged) input reads/assembly.
    lib_dict   : mapping of library name -> set of target k-mers.
    data_type  : '1'/'2'/'3' raw reads, '4' assembly, '5' nanopore fasta,
                 '6' nanopore fastq (same codes as the '-t' CLI option).
    k_size     : k-mer length.

    Returns the set of target k-mers observed in the input.
    Raises ValueError for an unrecognized data_type (the original fell through
    and crashed with an UnboundLocalError instead).
    """
    # pool every k-mer from all antigen libraries into one lookup set
    kmers = set()
    for h in lib_dict:
        kmers.update(lib_dict[h])
    if data_type == '4':                                # assembly
        input_Ks = target_multifasta_kmerizer(input_file, k_size, kmers)
    elif data_type in ('1', '2', '3'):                  # raw reads; set it for now, will change later
        input_Ks = target_read_kmerizer(input_file, k_size, kmers)
    elif data_type == '5':                              # minion_2d_fasta
        input_Ks = minion_fasta_kmerizer(input_file, k_size, kmers)
    elif data_type == '6':                              # minion_2d_fastq
        input_Ks = minion_fastq_kmerizer(input_file, k_size, kmers)
    else:
        raise ValueError("unsupported data_type: " + str(data_type))
    return input_Ks
929
def get_kmer_dict(lib_dict,input_Ks):
    """Score each antigen k-mer library against the observed input k-mers.

    Score = percentage of a library's k-mers found in input_Ks.  Libraries are
    routed by name prefix: 'O-*' (score > 25) into O_dict, 'fl*' (score > 40)
    into H_dict, and everything else (score > 1) into Special_dict.

    Returns (O_dict, H_dict, Special_dict), each mapping library name -> score.
    """
    O_dict = {}
    H_dict = {}
    Special_dict = {}
    for name in lib_dict:
        lib_kmers = lib_dict[name]
        score = (len(lib_kmers & input_Ks) / len(lib_kmers)) * 100
        if score <= 1:
            # arbitrary cut-off: very low, but needed to detect O-3,10 in some cases
            continue
        prefix = name[:2]
        if prefix == 'O-' and score > 25:
            O_dict[name] = score
        if prefix == 'fl' and score > 40:
            H_dict[name] = score
        if prefix != 'fl' and prefix != 'O-':
            Special_dict[name] = score
    return O_dict,H_dict,Special_dict
945
def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
    """Call the O, fliC (H1) and fljB (H2) antigens from k-mer library scores.

    O_dict / H_dict / Special_dict map library names (e.g.
    'O-9,46_wbaV__1002', 'fliC_a_...') to percent scores, as produced by
    get_kmer_dict.  make_dir is accepted but never used in this function.

    Returns (highest_O, highest_fliC, highest_fljB); each is '-' when no
    confident call was made.
    Side effect: appends all scores to SeqSero_log.txt in the current
    working directory.
    """
    log_file=open("SeqSero_log.txt","a")
    log_file.write("O_scores:\n")
    #call O:
    highest_O = '-'
    if len(O_dict) == 0:
        pass
    else:
        for x in O_dict:
            log_file.write(x+"\t"+str(O_dict[x])+"\n")
        # special-case disambiguation for the O-9,46 family (strong wbaV signal)
        if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
            if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
                highest_O = "O-9,46"
            elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
                highest_O = "O-9,46,27"
            else:
                highest_O = "O-9" # next, detect O9 vs O2?
                O2 = 0
                O9 = 0
                # tyrosine-marker scores distinguish O-9 from O-2
                for z in Special_dict:
                    if "tyr-O-9" in z:
                        O9 = float(Special_dict[z])
                    if "tyr-O-2" in z:
                        O2 = float(Special_dict[z])
                if O2 > O9:
                    highest_O = "O-2"
        elif ("O-3,10_wzx__1539" in O_dict) and (
            "O-9,46_wzy__1191" in O_dict
        ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
            if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
                highest_O = "O-3,10"
            else:
                highest_O = "O-1,3,19"
        ### end of special test for O9,46 and O3,10 family
        else:
            # default: take the highest-scoring library (>= keeps the last tie).
            # O-1,3,19 alone is ambiguous, so if it wins, re-score while
            # ignoring its small marker library.
            # NOTE(review): the bare except below silently swallows any error
            # in this scoring loop -- existing behavior, left as-is
            try:
                max_score = 0
                for x in O_dict:
                    if float(O_dict[x]) >= max_score:
                        max_score = float(O_dict[x])
                        highest_O = x.split("_")[0]
                if highest_O == "O-1,3,19":
                    highest_O = '-'
                    max_score = 0
                    for x in O_dict:
                        if x == 'O-1,3,19_not_in_3,10__130':
                            pass
                        else:
                            if float(O_dict[x]) >= max_score:
                                max_score = float(O_dict[x])
                                highest_O = x.split("_")[0]
            except:
                pass
    #call_fliC:
    if len(H_dict)!=0:
        highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
    else:
        highest_H_score_both_BC=0
    highest_fliC = '-'
    highest_fliC_raw = '-'
    highest_Score = 0
    log_file.write("\nH_scores:\n")
    for s in H_dict:
        log_file.write(s+"\t"+str(H_dict[s])+"\n")
        if s.startswith('fliC'):
            if float(H_dict[s]) > highest_Score:
                highest_fliC = s.split('_')[1]
                highest_fliC_raw = s
                highest_Score = float(H_dict[s])
    #call_fljB
    highest_fljB = '-'
    highest_fljB_raw = '-'
    highest_Score = 0
    for s in H_dict:
        if s.startswith('fljB'):
            if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
                highest_fljB = s.split('_')[1]
                highest_fljB_raw = s
                highest_Score = float(H_dict[s])
    log_file.write("\nSpecial_scores:\n")
    for s in Special_dict:
        log_file.write(s+"\t"+str(Special_dict[s])+"\n")
    log_file.close()
    return highest_O,highest_fliC,highest_fljB
1030
def get_temp_file_names(for_fq,rev_fq):
    """Derive the temporary file names used by the allele-mode ('-m a') run.

    All names are keyed off the forward/reverse fastq file names.  Returns
    (sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq,
    for_sai, rev_sai).
    """
    sam         = "{}.sam".format(for_fq)
    bam         = "{}.bam".format(for_fq)
    sorted_bam  = "{}_sorted.bam".format(for_fq)
    mapped_fq1  = "{}_mapped.fq".format(for_fq)
    mapped_fq2  = "{}_mapped.fq".format(rev_fq)
    combined_fq = "{}_combined.fq".format(for_fq)
    for_sai     = "{}.sai".format(for_fq)
    rev_sai     = "{}.sai".format(rev_fq)
    return sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq, for_sai, rev_sai
1042
def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
    #seqsero2 -a; do mapping and sort
    """Map the reads against the antigen database with bwa and name-sort the hits.

    threads      : thread count as a string, substituted into shell commands.
    database     : reference fasta; 'bwa index' is (re)built here.
    fnameA/fnameB: forward / reverse fastq; fnameB == "" means single-end.
    sam, bam, for_sai, rev_sai, sorted_bam : intermediate/output file names.
    mapping_mode : "mem" (bwa mem) or "sam" (bwa aln + sampe/samse).

    Side effects only: writes the bwa index, sam/bam files and the
    name-sorted bam into the current working directory.
    """
    print("building database...")
    subprocess.check_output("bwa index "+database,shell=True, stderr=subprocess.STDOUT)
    print("mapping...")
    if mapping_mode=="mem":
        subprocess.check_output("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
    elif mapping_mode=="sam":
        if fnameB!="":
            subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
            subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai,shell=True, stderr=subprocess.STDOUT)
            subprocess.check_output("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
        else:
            subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
            # bug fix: the original referenced an undefined global 'for_fq' and
            # omitted shell=True, so single-end 'sam' mode always crashed; use
            # fnameA and run through the shell like every other command here.
            subprocess.check_output("bwa samse "+database+" "+for_sai+" "+fnameA+" > "+sam,shell=True, stderr=subprocess.STDOUT)
    # keep only mapped reads (-F 4); output is still SAM-formatted despite the .bam name
    subprocess.check_output("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True, stderr=subprocess.STDOUT)
    ### check the version of samtools then use differnt commands
    # 'samtools' with no args prints its version to stderr; parse "Version: X.Y"
    samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, err = samtools_version.communicate()
    version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
    print("check samtools version:",version)
    ### end of samtools version check and its analysis
    if LooseVersion(version)<=LooseVersion("1.2"):
        # old-style sort: takes an output prefix and appends ".bam" itself
        subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True, stderr=subprocess.STDOUT)
    else:
        subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True, stderr=subprocess.STDOUT)
1069
def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
    #seqsero2 -a; extract, assembly and blast
    """Extract mapped reads, micro-assemble them with SPAdes, and BLAST the
    antigen database against the assembly.

    fnameB == "" marks single-end data (no paired fastq extraction).
    Returns the BLAST XML file name ("blasted_output.xml"), or "NA" when too
    few reads mapped to attempt an assembly.
    Side effects: creates and removes a '<current_time>_temp' SPAdes directory
    and writes the assembly fasta, BLAST db and XML into the current directory.
    """
    # dump every mapped read into one single-end fastq
    subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True, stderr=subprocess.STDOUT)
    if fnameB!="":
        # paired data: also extract proper read pairs
        subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 ,shell=True, stderr=subprocess.STDOUT)#2> /dev/null if want no output
    else:
        pass
    outdir=current_time+"_temp"
    print("assembling...")
    # cap SPAdes at 4 threads regardless of the requested thread count
    if int(threads)>4:
        t="4"
    else:
        t=threads
    # NOTE(review): in single-end runs mapped_fq1 is never created above, so
    # this getsize call looks like it would raise -- confirm the single-end
    # allele-mode path against the caller
    if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-"
        if fnameB!="":
            subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
        else:
            subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
        new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
        subprocess.check_output("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
        #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
        subprocess.check_output("rm -rf "+outdir+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
        print("blasting...","\n")
        xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
        subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
        # query = antigen database, subject = the new assembly; XML (outfmt 5) for downstream parsing
        subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
    else:
        xmlfile="NA"
    return xmlfile
1099
def judge_subspecies(fnameA):
    #seqsero2 -a; judge subspecies on just forward raw reads fastq
    """Predict the Salmonella subspecies by running SalmID on the forward reads.

    NOTE(review): SalmID.py is invoked via a hardcoded relative path
    ('../SalmID/SalmID.py'), so this only works from the expected working
    directory -- confirm deployment layout.

    Parses SalmID's tab-separated stdout: one row of labels and one row of
    scores; columns 6+ appear to hold per-subspecies results and columns 4/5
    the bongori vs enterica scores (inferred from the indexing below).
    Appends the raw SalmID output to data_log.txt.
    Returns a subspecies label (e.g. 'I'), 'bongori', or '-' when the best
    score is below 10.
    """
    salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT)
    #out, err = salmID_output.communicate()
    #out=out.decode("utf-8")
    out = salmID_output.decode("utf-8")
    file=open("data_log.txt","a")
    file.write(out)
    file.close()
    salm_species_scores=out.split("\n")[1].split("\t")[6:]
    salm_species_results=out.split("\n")[0].split("\t")[6:]
    max_score=0
    max_score_index=1 #default is 1, means "I"
    # pick the subspecies column with the highest score
    for i in range(len(salm_species_scores)):
        if max_score<float(salm_species_scores[i]):
            max_score=float(salm_species_scores[i])
            max_score_index=i
    prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
    if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
        prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
    if max_score<10:
        # too weak to call anything
        prediction="-"
    return prediction
1123
def judge_subspecies_Kmer(Special_dict):
    """Predict the Salmonella subspecies from k-mer marker scores ('-m k').

    Scans Special_dict for libraries whose name contains 'mer' and returns the
    subspecies suffix (text after the last '_') of the best-scoring one, or
    '-' when no marker is present.  A bongori marker scoring above 95 settles
    the call immediately.
    """
    best_score = 0
    call = "-" #default should be I
    for marker in Special_dict:
        if "mer" not in marker:
            continue
        score = float(Special_dict[marker])
        label = marker.split("_")[-1].strip()
        if best_score < score:
            best_score = score
            call = label
        # a strong bongori signal needs no further enterica comparison
        if label == "bongori" and score > 95:
            call = "bongori"
            break
    return call
1137
def main():
    #combine SeqSeroK and SeqSero2, also with SalmID
    """Entry point: run the SeqSero2 serotyping pipeline.

    Parses the CLI (see parse_args), creates/reuses a working directory, links
    the database and inputs into it, then dispatches on '-m':
      'a' -> allele mode (raw reads only): bwa mapping, SPAdes micro-assembly,
             BLAST, antigen prediction, plus SalmID subspecies call.
      'k' -> k-mer mode: scores the pre-built antigen k-mer libraries
             (antigens.pickle next to this script) against the input.
    Writes Seqsero_result.tsv inside the working directory (unless '-c'
    removed it), prints a human-readable summary, and returns 0.
    """
    args = parse_args()
    input_file = args.i
    data_type = args.t
    analysis_mode = args.m
    mapping_mode=args.b
    threads=args.p
    make_dir=args.d
    clean_mode=args.c
    k_size=27 #will change for bug fixing
    database="H_and_O_and_specific_genes.fasta"

    if len(sys.argv)==1:
        # invoked with no arguments at all: just show the help text
        subprocess.check_output(dirpath+"/SeqSero2_package.py -h",shell=True, stderr=subprocess.STDOUT)#change name of python file
    else:
        # unique default output-directory name: timestamp + random suffix
        request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
        request_id += str(random.randint(1, 10000000))
        if make_dir is None:
            make_dir="SeqSero_result_"+request_id
        if os.path.isdir(make_dir):
            pass
        else:
            subprocess.check_output(["mkdir",make_dir])
        # symlink (relative, forced) the database and all inputs into the working directory
        subprocess.check_output("ln -srf "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
        ############################begin the real analysis
        if analysis_mode=="a":
            if data_type in ["1","2","3"]:#use allele mode
                for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
                os.chdir(make_dir)
                fnameA=for_fq.split("/")[-1]
                fnameB=rev_fq.split("/")[-1]
                current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
                sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
                map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
                xmlfile=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
                if xmlfile=="NA":
                    # too few mapped reads for assembly: report "-:-:-"
                    O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
                else:
                    Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
                    # bug fix: use a context manager so the log handle is
                    # closed deterministically (the original relied on GC)
                    with open("data_log.txt","a") as data_log:
                        for x in Final_list:
                            data_log.write("\t".join(str(y) for y in x)+"\n")
                    Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
                    O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=predict_O_and_H_types(Final_list,Final_list_passed) #predict O, fliC and fljB
                subspecies=judge_subspecies(fnameA) #predict subspecies
                ###output
                predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
                contamination_report=""
                if contamination_O!="" and contamination_H=="":
                    contamination_report="#detected potential contamination of mixed serotypes from O antigen signals.\n"
                elif contamination_O=="" and contamination_H!="":
                    contamination_report="#detected potential contamination of mixed serotypes or potential thrid H phase from H antigen signals.\n"
                elif contamination_O!="" and contamination_H!="":
                    contamination_report="#detected potential contamination of mixed serotypes from both O and H antigen signals.\n"
                if clean_mode:
                    subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
                    make_dir="none-output-directory due to '-c' flag"
                else:
                    if O_choice=="":
                        O_choice="-"
                    result = OrderedDict(
                        sample_name = input_file[0],
                        O_antigen_prediction = O_choice,
                        H1_antigen_prediction = fliC_choice,
                        H2_antigen_prediction = fljB_choice,
                        predicted_antigenic_profile = predict_form,
                        predicted_subspecies = subspecies,
                        predicted_serotype = predict_sero,
                        note=claim.replace('\n','')
                    )
                    with open("Seqsero_result.tsv","w") as new_file:
                        wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
                        wrtr.writeheader()
                        wrtr.writerow(result)
                # remove intermediate files, then print the human-readable summary
                subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
                print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
            else:
                print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
        elif analysis_mode=="k":
            ex_dir = os.path.dirname(os.path.realpath(__file__))
            for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
            input_file = for_fq #-k will just use forward because not all reads were used
            os.chdir(make_dir)
            # bug fix: the original called 'f.close' without parentheses, so
            # the pickle handle was never explicitly closed; use 'with' instead
            with open(ex_dir + '/antigens.pickle', 'rb') as f:
                lib_dict = pickle.load(f)
            input_Ks = get_input_K(input_file,lib_dict,data_type,k_size)
            O_dict,H_dict,Special_dict = get_kmer_dict(lib_dict,input_Ks)
            highest_O,highest_fliC,highest_fljB = call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
            subspecies = judge_subspecies_Kmer(Special_dict)
            if subspecies=="IIb" or subspecies=="IIa":
                subspecies="II"
            predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
                highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
            if clean_mode:
                subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
                make_dir="none-output-directory due to '-c' flag"
            else:
                if highest_O.split('-')[-1]=="":
                    O_choice="-"
                else:
                    O_choice=highest_O.split('-')[-1]
                result = OrderedDict(
                    sample_name = input_file,
                    O_antigen_prediction = O_choice,
                    H1_antigen_prediction = highest_fliC,
                    H2_antigen_prediction = highest_fljB,
                    predicted_antigenic_profile = predict_form,
                    predicted_subspecies = subspecies,
                    predicted_serotype = predict_sero,
                    note=claim.replace('\n','')
                )
                with open("Seqsero_result.tsv","w") as new_file:
                    wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
                    wrtr.writeheader()
                    wrtr.writerow(result)
                subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
                print("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
    return 0
1274
# Script entry point.  Failures of any external tool (bwa, samtools, SPAdes,
# blast, SalmID, ...) surface as CalledProcessError; print the captured output
# and propagate the tool's exit code.
if __name__ == '__main__':
    try:
        # bug fix: use sys.exit instead of quit() -- quit() is an interactive
        # helper added by the 'site' module and is absent under 'python -S'
        sys.exit(main())
    except subprocess.CalledProcessError as e:
        # decode the captured bytes for readability (str(bytes) prints the b'...' repr)
        output = e.output.decode(errors="replace") if isinstance(e.output, bytes) else str(e.output)
        print(output)
        print(e)
        sys.exit(e.returncode)