comparison SeqSero2/bin/SeqSero2_package.py @ 17:03f7b358d57f

planemo upload
author jpayne
date Tue, 25 Mar 2025 23:22:38 -0400
parents
children
comparison
equal deleted inserted replaced
16:3b6d5b60968f 17:03f7b358d57f
1 #!/usr/bin/env python3
2
3 import sys
4 import time
5 import random
6 import os
7 import subprocess
8 import gzip
9 import io
10 import pickle
11 import argparse
12 import itertools
13 from distutils.version import LooseVersion
14 from distutils.spawn import find_executable
15 sys.path.insert(1,sys.path[0]+'/..')
16
17 try:
18 from .version import SeqSero2_version
19 except Exception: #ImportError
20 from version import SeqSero2_version
21
22 ### SeqSero Kmer
23 def parse_args():
24 "Parse the input arguments, use '-h' for help."
25 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\nVersion: v1.3.1')#add "-m <data_type>" in future
26 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### add 'type=os.path.abspath' to generate absolute path of input data.
27 parser.add_argument("-t",choices=['1','2','3','4','5'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore reads (fasta/fastq)")
28 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
29 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
30 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
31 parser.add_argument("-n",help="<string>: optional, to specify a sample name in the report output")
32 parser.add_argument("-d",help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
33 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
34 parser.add_argument("-s",action="store_true",help="<flag>: if '-s' was flagged, SeqSero2 will not output header in SeqSero_result.tsv")
35 parser.add_argument("--phred_offset",choices=['33','64','auto'],default='auto',help="<33|64|auto>: offset for FASTQ file quality scores, default=auto")
36 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies")
37 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version)
38 return parser.parse_args()
39
40 ### check paths of dependencies
41 check_dependencies = parse_args().check
42 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
43 if check_dependencies:
44 for item in dependencies:
45 ext_path = find_executable(item)
46 if ext_path is not None:
47 print ("Using "+item+" - "+ext_path)
48 else:
49 print ("ERROR: can not find "+item+" in PATH")
50 sys.exit()
51 ### end of --check
52
53 def reverse_complement(sequence):
54 complement = {
55 'A': 'T',
56 'C': 'G',
57 'G': 'C',
58 'T': 'A',
59 'N': 'N',
60 'M': 'K',
61 'R': 'Y',
62 'W': 'W',
63 'S': 'S',
64 'Y': 'R',
65 'K': 'M',
66 'V': 'B',
67 'H': 'D',
68 'D': 'H',
69 'B': 'V'
70 }
71 return "".join(complement[base] for base in reversed(sequence))
72
73
74 def createKmerDict_reads(list_of_strings, kmer):
75 kmer_table = {}
76 for string in list_of_strings:
77 sequence = string.strip('\n')
78 for i in range(len(sequence) - kmer + 1):
79 new_mer = sequence[i:i + kmer].upper()
80 new_mer_rc = reverse_complement(new_mer)
81 if new_mer in kmer_table:
82 kmer_table[new_mer.upper()] += 1
83 else:
84 kmer_table[new_mer.upper()] = 1
85 if new_mer_rc in kmer_table:
86 kmer_table[new_mer_rc.upper()] += 1
87 else:
88 kmer_table[new_mer_rc.upper()] = 1
89 return kmer_table
90
91
92 def multifasta_dict(multifasta):
93 multifasta_list = [
94 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
95 ]
96 headers = [i for i in multifasta_list if i[0] == '>']
97 multifasta_dict = {}
98 for h in headers:
99 start = multifasta_list.index(h)
100 for element in multifasta_list[start + 1:]:
101 if element[0] == '>':
102 break
103 else:
104 if h[1:] in multifasta_dict:
105 multifasta_dict[h[1:]] += element
106 else:
107 multifasta_dict[h[1:]] = element
108 return multifasta_dict
109
110
111 def multifasta_single_string(multifasta):
112 multifasta_list = [
113 line.strip() for line in open(multifasta, 'r')
114 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
115 ]
116 return ''.join(multifasta_list)
117
118
119 def chunk_a_long_sequence(long_sequence, chunk_size=60):
120 chunk_list = []
121 steps = len(long_sequence) // 60 #how many chunks
122 for i in range(steps):
123 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
124 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
125 return chunk_list
126
127
128 def target_multifasta_kmerizer(multifasta, k, kmerDict):
129 forward_length = 300 #if find the target, put forward 300 bases
130 reverse_length = 2200 #if find the target, put backward 2200 bases
131 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
132 target_mers = []
133 long_single_string = multifasta_single_string(multifasta)
134 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
135 unit_length = len(multifasta_list[0])
136 forward_lines = int(forward_length / unit_length) + 1
137 reverse_lines = int(forward_length / unit_length) + 1
138 start_num = 0
139 end_num = 0
140 for i in range(len(multifasta_list)):
141 if i not in range(start_num, end_num): #avoid computational repetition
142 line = multifasta_list[i]
143 start = int((len(line) - k) // 2)
144 s1 = line[start:k + start]
145 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
146 if i - forward_lines >= 0:
147 start_num = i - forward_lines
148 else:
149 start_num = 0
150 if i + reverse_lines <= len(multifasta_list) - 1:
151 end_num = i + reverse_lines
152 else:
153 end_num = len(multifasta_list) - 1
154 target_list = [
155 x.strip() for x in multifasta_list[start_num:end_num]
156 ]
157 target_line = "".join(target_list)
158 target_mers += [
159 k1 for k1 in createKmerDict_reads([str(target_line)], k)
160 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
161 else:
162 pass
163 return set(target_mers)
164
165
166 def target_read_kmerizer(file, k, kmerDict):
167 i = 1
168 n_reads = 0
169 total_coverage = 0
170 target_mers = []
171 if file.endswith(".gz"):
172 file_content = io.BufferedReader(gzip.open(file))
173 else:
174 file_content = open(file, "r").readlines()
175 for line in file_content:
176 start = int((len(line) - k) // 2)
177 if i % 4 == 2:
178 if file.endswith(".gz"):
179 s1 = line[start:k + start].decode()
180 line = line.decode()
181 else:
182 s1 = line[start:k + start]
183 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
184 n_reads += 1
185 total_coverage += len(line)
186 target_mers += [
187 k1 for k1 in createKmerDict_reads([str(line)], k)
188 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
189 i += 1
190 if total_coverage >= 4000000:
191 break
192 return set(target_mers)
193
194
195 def minion_fasta_kmerizer(file, k, kmerDict):
196 i = 1
197 n_reads = 0
198 total_coverage = 0
199 target_mers = {}
200 for line in open(file):
201 if i % 2 == 0:
202 for kmer, rc_kmer in kmers(line.strip().upper(), k):
203 if (kmer in kmerDict) or (rc_kmer in kmerDict):
204 if kmer in target_mers:
205 target_mers[kmer] += 1
206 else:
207 target_mers[kmer] = 1
208 if rc_kmer in target_mers:
209 target_mers[rc_kmer] += 1
210 else:
211 target_mers[rc_kmer] = 1
212 i += 1
213 return set([h for h in target_mers])
214
215
216 def minion_fastq_kmerizer(file, k, kmerDict):
217 i = 1
218 n_reads = 0
219 total_coverage = 0
220 target_mers = {}
221 for line in open(file):
222 if i % 4 == 2:
223 for kmer, rc_kmer in kmers(line.strip().upper(), k):
224 if (kmer in kmerDict) or (rc_kmer in kmerDict):
225 if kmer in target_mers:
226 target_mers[kmer] += 1
227 else:
228 target_mers[kmer] = 1
229 if rc_kmer in target_mers:
230 target_mers[rc_kmer] += 1
231 else:
232 target_mers[rc_kmer] = 1
233 i += 1
234 return set([h for h in target_mers])
235
236
237 def multifasta_single_string2(multifasta):
238 single_string = ''
239 with open(multifasta, 'r') as f:
240 for line in f:
241 if line.strip()[0] == '>':
242 pass
243 else:
244 single_string += line.strip()
245 return single_string
246
247
248 def kmers(seq, k):
249 rev_comp = reverse_complement(seq)
250 for start in range(1, len(seq) - k + 1):
251 yield seq[start:start + k], rev_comp[-(start + k):-start]
252
253
254 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
255 multi_seq_dict = multifasta_dict(multifasta)
256 lib_dict = {}
257 for h in multi_seq_dict:
258 lib_dict[h] = set(
259 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
260 return lib_dict
261
262
263 def Combine(b, c):
264 fliC_combinations = []
265 fliC_combinations.append(",".join(c))
266 temp_combinations = []
267 for i in range(len(b)):
268 for x in itertools.combinations(b, i + 1):
269 temp_combinations.append(",".join(x))
270 for x in temp_combinations:
271 temp = []
272 for y in c:
273 temp.append(y)
274 temp.append(x)
275 temp = ",".join(temp)
276 temp = temp.split(",")
277 temp.sort()
278 temp = ",".join(temp)
279 fliC_combinations.append(temp)
280 return fliC_combinations
281
282
283 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
284 #like test_output_06012017.txt
285 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
286 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
287 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
288 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
289 seronames = []
290 seronames_none_subspecies=[]
291 for i in range(len(phase1)):
292 fliC_combine = []
293 fljB_combine = []
294 if phaseO[i] == Otype: # no VII in KW, but it's there
295 ### for fliC, detect every possible combinations to avoid the effect of "["
296 if phase1[i].count("[") == 0:
297 fliC_combine.append(phase1[i])
298 elif phase1[i].count("[") >= 1:
299 c = []
300 b = []
301 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
302 "[") == 1:
303 content = phase1[i].replace("[", "").replace("]", "")
304 fliC_combine.append(content)
305 fliC_combine.append("-")
306 else:
307 for x in phase1[i].split(","):
308 if "[" in x:
309 b.append(x.replace("[", "").replace("]", ""))
310 else:
311 c.append(x)
312 fliC_combine = Combine(
313 b, c
314 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
315 ### end of fliC "[" detect
316 ### for fljB, detect every possible combinations to avoid the effect of "["
317 if phase2[i].count("[") == 0:
318 fljB_combine.append(phase2[i])
319 elif phase2[i].count("[") >= 1:
320 d = []
321 e = []
322 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
323 "[") == 1:
324 content = phase2[i].replace("[", "").replace("]", "")
325 fljB_combine.append(content)
326 fljB_combine.append("-")
327 else:
328 for x in phase2[i].split(","):
329 if "[" in x:
330 d.append(x.replace("[", "").replace("]", ""))
331 else:
332 e.append(x)
333 fljB_combine = Combine(d, e)
334 ### end of fljB "[" detect
335 new_fliC = fliC.split(
336 ","
337 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
338 new_fliC.sort()
339 new_fliC = ",".join(new_fliC)
340 new_fljB = fljB.split(",")
341 new_fljB.sort()
342 new_fljB = ",".join(new_fljB)
343 if (new_fliC in fliC_combine
344 or fliC in fliC_combine) and (new_fljB in fljB_combine
345 or fljB in fljB_combine):
346 ######start, remove_list,rename_dict, added on 11/11/2018
347 if sero[i] not in remove_list:
348 temp_sero=sero[i]
349 if temp_sero in rename_dict:
350 temp_sero=rename_dict[temp_sero] #rename if in the rename list
351 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
352 if subs[i] == subspecies:
353 seronames.append(temp_sero)
354 seronames_none_subspecies.append(temp_sero)
355 else:
356 pass
357 else:
358 pass
359 ######end, added on 11/11/2018
360 #analyze seronames
361 subspecies_pointer=""
362 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
363 ## ed_SL_06062020: for the subspecies mismatch between KW and SalmID
364 seronames=seronames_none_subspecies
365 #seronames=["N/A"]
366 subspecies_pointer="1"
367 #subspecies_pointer="0"
368 ##
369 if len(seronames) == 0:
370 seronames = [
371 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
372 ]
373 star = ""
374 star_line = ""
375 if len(seronames) > 1: #there are two possible predictions for serotypes
376 star = "*"
377 #changed 04072019
378 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
379 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
380 star="*"
381 star_line = "This antigenic profile has been associated with serotype '"+(" or ").join(seronames)+"' in the Kauffman-White scheme. The existence of the same antigenic formula in multiple species or subspecies is well documented in the Kauffman-White Scheme. " + star_line ## ed_SL_03202021: changed for new output format
382 #star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line
383 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
384 seronames=["N/A"] ## ed_SL_06062020
385 if Otype=="":
386 Otype="-"
387 predict_form = Otype + ":" + fliC + ":" + fljB
388 predict_sero = (" or ").join(seronames)
389 ###special test for Enteritidis
390 if predict_form == "9:g,m:-":
391 sdf = "-"
392 for x in special_gene_list:
393 if x.startswith("sdf"):
394 sdf = "+"
395 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
396 #star_line="sdf gene detected. "
397 star_line = "Detected Sdf I that is characteristic of commonly circulating strains of serotype Enteritidis. "
398 #predict_form = predict_form + " Sdf prediction:" + sdf
399 predict_form = predict_form #changed 04072019
400 if sdf == "-":
401 star = "*"
402 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
403 #star_line="sdf gene not detected. "
404 star_line = "Sdf I that is characteristic of commonly circulating strains of serotype Enteritidis was not detected. "
405 #changed in 04072019, for new output
406 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
407 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
408 predict_sero = "Gallinarum or Enteritidis"
409 ###end of special test for Enteritidis
410 elif predict_form == "4:i:-":
411 predict_sero = "I 4,[5],12:i:-" # change serotype name
412 elif predict_form == "4:r:-":
413 predict_sero = "N/A (4:r:-)"
414 elif predict_form == "4:b:-":
415 predict_sero = "N/A (4:b:-)"
416 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
417 #predict_sero = "Newport"
418 #star = "*"
419 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
420 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
421 if "N/A" in predict_sero:
422 claim = ""
423 #special test for Typhimurium
424 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
425 normal = 0
426 mutation = 0
427 for x in special_gene_list:
428 if "oafA-O-4_full" in x:
429 normal = float(special_gene_list[x])
430 elif "oafA-O-4_5-" in x:
431 mutation = float(special_gene_list[x])
432 if normal > mutation:
433 pass
434 elif normal < mutation:
435 #predict_sero = predict_sero.strip() + "(O5-)"
436 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
437 star = "*"
438 #star_line = "Detected the deletion of O5-."
439 star_line = "Detected a deletion in gene oafA that causes O5- variant of Typhimurium. "
440 else:
441 pass
442 #special test for Paratyphi B
443 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
444 normal = 0
445 mutation = 0
446 for x in special_gene_list:
447 if "gntR-family-regulatory-protein_dt-positive" in x:
448 normal = float(special_gene_list[x])
449 elif "gntR-family-regulatory-protein_dt-negative" in x:
450 mutation = float(special_gene_list[x])
451 #print(normal,mutation)
452 if normal > mutation:
453 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
454 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip()
455 star = "*"
456 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
457 star_line = "The SNP in gene STM3356 that is associated with the d-Tartrate nonfermenting phenotype characteristic of the typhoidal pathotype was not detected. "
458 elif normal < mutation:
459 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
460 predict_sero = predict_sero.strip()
461 star = "*"
462 #star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. "
463 star_line = "Detected the SNP in gene STM3356 that is associated with the d-Tartrate nonfermenting phenotype characteristic of the typhoidal pathotype. "
464 else:
465 star = "*"
466 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
467 star_line = " " ## ed_SL_05152019: do not report this situation.
468 #special test for O13,22 and O13,23
469 if Otype=="13":
470 #ex_dir = os.path.dirname(os.path.realpath(__file__))
471 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019
472 f = open(ex_dir + '/special.pickle', 'rb')
473 special = pickle.load(f)
474 O22_O23=special['O22_O23']
475 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
476 O22_score=0
477 O23_score=0
478 for x in special_gene_list:
479 if "O:22" in x:
480 O22_score = O22_score+float(special_gene_list[x])
481 elif "O:23" in x:
482 O23_score = O23_score+float(special_gene_list[x])
483 #print(O22_score,O23_score)
484 for z in O22_O23[0]:
485 if predict_sero.split(" or ")[0] in z:
486 if O22_score > O23_score:
487 star = "*"
488 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
489 predict_sero = z[0]
490 elif O22_score < O23_score:
491 star = "*"
492 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
493 predict_sero = z[1]
494 else:
495 star = "*"
496 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
497 if " or " in predict_sero:
498 star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + " and can be differentiated by additional analysis. "
499 #special test for O6,8
500 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
501 #for x in merge_O68_list:
502 # if x in predict_sero:
503 # predict_sero=x
504 # star=""
505 # star_line=""
506 #special test for Montevideo; most of them are monophasic
507 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
508 #star="*"
509 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
510 return predict_form, predict_sero, star, star_line, claim
511 ### End of SeqSero Kmer part
512
513 ### Begin of SeqSero2 allele prediction and output
514 def xml_parse_score_comparision_seqsero(xmlfile):
515 #used to do seqsero xml analysis
516 from Bio.Blast import NCBIXML
517 handle=open(xmlfile)
518 handle=NCBIXML.parse(handle)
519 handle=list(handle)
520 List=[]
521 List_score=[]
522 List_ids=[]
523 List_query_region=[]
524 for i in range(len(handle)):
525 if len(handle[i].alignments)>0:
526 for j in range(len(handle[i].alignments)):
527 score=0
528 ids=0
529 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
530 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
531 for z in range(len(handle[i].alignments[j].hsps)):
532 hsp=handle[i].alignments[j].hsps[z]
533 temp=set(range(hsp.query_start,hsp.query_end))
534 if len(cover_region)==0:
535 cover_region=cover_region|temp
536 fraction=1
537 else:
538 fraction=1-len(cover_region&temp)/float(len(temp))
539 cover_region=cover_region|temp
540 if "last" in handle[i].query or "first" in handle[i].query:
541 score+=hsp.bits*fraction
542 ids+=float(hsp.identities)/handle[i].query_length*fraction
543 else:
544 score+=hsp.bits*fraction
545 ids+=float(hsp.identities)/handle[i].query_length*fraction
546 List_score.append(score)
547 List_ids.append(ids)
548 List_query_region.append(cover_region)
549 temp=zip(List,List_score,List_ids,List_query_region)
550 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
551 return Final_list
552
553
554 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
555 Old=L
556 L.sort()
557 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
558 count=[]
559 for j in range(len(L)):
560 y=0
561 for x in Old:
562 if L[j]==x:
563 y+=1
564 count.append(y)
565 if sort_on_fre!="none":
566 d=zip(*sorted(zip(count, L)))
567 L=d[1]
568 count=d[0]
569 return (L,count)
570
571 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
572 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
573 #this is mainly used for
574 a=nodes_vs_score_list
575 fliC_score=0
576 fljB_score=0
577 for z in a:
578 if "fliC" in z[0]:
579 fliC_score+=z[1]
580 elif "fljB" in z[0]:
581 fljB_score+=z[1]
582 if fliC_score>=fljB_score:
583 role="fliC"
584 else:
585 role="fljB"
586 return (role,abs(fliC_score-fljB_score))
587
588 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
589 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
590 #also used when no head or tail got blasted score for the contig
591 role=""
592 for z in Final_list_passed:
593 if node_name in z[0]:
594 role=z[0].split("_")[0]
595 break
596 return role
597
598 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
599 #nodes_list is the c created by c,d=Uniq(nodes) in below function
600 first_target=""
601 role_list=[]
602 for x in nodes_list:
603 a=[]
604 role=""
605 for y in tail_head_list:
606 if x in y[0]:
607 a.append(y)
608 if len(a)==4:
609 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
610 if diff<20:
611 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
612 elif len(a)==3:
613 ###however, if the one with highest score is the fewer one, compare their accumulation score
614 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
615 if diff<20:
616 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
617 ###end of above score comparison
618 elif len(a)==2:
619 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
620 temp=[]
621 for z in a:
622 temp.append(z[0].split("_")[0])
623 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
624 if len(m)==1:
625 pass
626 else:
627 pass
628 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
629 if diff<20:
630 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
631 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
632 elif len(a)==1:
633 #that one
634 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
635 if diff<20:
636 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
637 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
638 else:#a==0
639 #use Final_list_passed best match
640 for z in Final_list_passed:
641 if x in z[0]:
642 role=z[0].split("_")[0]
643 break
644 #print x,role,len(a)
645 role_list.append((role,x))
646 if len(role_list)==2:
647 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
648 #just use score to do a final test
649 role_list=[]
650 for x in nodes_list:
651 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
652 role_list.append((role,x))
653 return role_list
654
655 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
656 #used to decide which contig is FliC and which one is fljB
657 contigs=[]
658 nodes=[]
659 for x in Final_list_passed:
660 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
661 nodes.append(x[0].split("___")[1].strip())
662 c,d=Uniq(nodes)#c is node_list
663 #print c
664 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
665 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
666 return roles
667
668 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
669 #decide O based on Final_list
670 O_choice="?"
671 O_list=[]
672 special_genes={}
673 nodes=[]
674 for x in Final_list_passed:
675 if x[0].startswith("O-"):
676 nodes.append(x[0].split("___")[1].strip())
677 elif not x[0].startswith("fl"):
678 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
679 #print "special_genes:",special_genes
680 c,d=Uniq(nodes)
681 #print "potential O antigen contig",c
682 final_O=[]
683 O_nodes_list=[]
684 for x in c:#c is the list for contigs
685 temp=0
686 for y in Final_list_passed:
687 if x in y[0] and y[0].startswith("O-"):
688 final_O.append(y)
689 break
690 ### O contig has the problem of two genes on same contig, so do additional test
691 potenial_new_gene=""
692 for x in final_O:
693 pointer=0 #for genes merged or not
694 #not consider O-1,3,19_not_in_3,10, too short compared with others
695 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
696 pointer=x[0].split("___")[1].strip()#store the contig name
697 print(pointer)
698 if pointer!=0:#it has potential merge event
699 for y in Final_list:
700 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
701 potenial_new_gene=y
702 #print(potenial_new_gene)
703 break
704 if potenial_new_gene!="":
705 print("two differnt genes in same contig, fix it for O antigen")
706 print(potenial_new_gene[:3])
707 pointer=0
708 for y in final_O:
709 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
710 pointer=1
711 if pointer!=0: #changed to consider two genes in same contig
712 final_O.append(potenial_new_gene)
713 ### end of the two genes on same contig test
714 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
715 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
716 #print "$$$No Otype, due to no hit"#may need to be changed
717 O_choice="-"
718 else:
719 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
720 O_list=[]
721 O_list_less_contamination=[]
722 for x in final_O:
723 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
724 O_list.append(x[0].split("__")[0])
725 O_nodes_list.append(x[0].split("___")[1])
726 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
727 O_list_less_contamination.append(x[0].split("__")[0])
728 ### special test for O9,46 and O3,10 family
729 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
730 if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
731 O_choice="O-9,46"
732 #print "$$$Most possilble Otype: O-9,46"
733 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
734 O_choice="O-9,46,27"
735 #print "$$$Most possilble Otype: O-9,46,27"
736 else:
737 O_choice="O-9"#next, detect O9 vs O2?
738 O2=0
739 O9=0
740 for z in special_genes:
741 if "tyr-O-9" in z and special_genes[z] > O9: ##20240322, add "special_genes[z] > O9" to avoid misidentification of O9 to O2 that caused by multiple tyr-O-9 contigs
742 O9=special_genes[z]
743 elif "tyr-O-2" in z and special_genes[z] > O2: ##20240322, add "special_genes[z] > O2"
744 O2=special_genes[z]
745 if O2>O9:
746 O_choice="O-2"
747 elif O2<O9:
748 pass
749 else:
750 pass
751 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
752 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
753 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
754 O_choice="O-3,10"
755 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
756 else:
757 O_choice="O-1,3,19"
758 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
759 ### end of special test for O9,46 and O3,10 family
760 else:
761 try:
762 max_score=0
763 for x in final_O:
764 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
765 max_score=x[2]#change from x[-1] to x[2],08172018
766 O_choice=x[0].split("_")[0]
767 if O_choice=="O-1,3,19":
768 O_choice=final_O[1][0].split("_")[0]
769 #print "$$$Most possilble Otype: ",O_choice
770 except:
771 pass
772 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
773 if O_choice=="O-9,46,27" and len(O_list)==2 and "O-4_wzx" in O_list: #special for very low chance sitatuion between O4 and O9,27,46, this is for serotypes like Bredeney and Schwarzengrund (normallly O-4 will have higher score, but sometimes sequencing quality may affect the prediction)
774 O_choice="O-4"
775 #print "O:",O_choice,O_nodes_list
776 Otypes=[]
777 for x in O_list:
778 if x!="O-1,3,19_not_in_3,10":
779 if "O-9,46_" not in x:
780 Otypes.append(x.split("_")[0])
781 else:
782 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
783 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
784 Otypes_uniq,Otypes_fre=Uniq(Otypes)
785 contamination_O=""
786 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
787 if len(Otypes_uniq)>2:
788 contamination_O="potential contamination from O antigen signals"
789 else:
790 if len(Otypes_uniq)>1:
791 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
792 contamination_O=""
793 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
794 contamination_O=""
795 else:
796 contamination_O="potential contamination from O antigen signals"
797 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
798 ### End of SeqSero2 allele prediction and output
799
800 def get_input_files(make_dir,input_file,data_type,dirpath):
801 #tell input files from datatype
802 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
803 for_fq=""
804 rev_fq=""
805 os.chdir(make_dir)
806 if data_type=="1":
807 input_file=input_file[0].split("/")[-1]
808 if input_file.endswith(".sra"):
809 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
810 for_fq=input_file.replace(".sra","_1.fastq")
811 rev_fq=input_file.replace(".sra","_2.fastq")
812 else:
813 core_id=input_file.split(".fastq")[0].split(".fq")[0]
814 for_fq=core_id+"_1.fastq"
815 rev_fq=core_id+"_2.fastq"
816 if input_file.endswith(".gz"):
817 subprocess.check_call("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
818 else:
819 subprocess.check_call("cat "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
820 elif data_type=="2":
821 for_fq=input_file[0].split("/")[-1]
822 rev_fq=input_file[1].split("/")[-1]
823 elif data_type=="3":
824 input_file=input_file[0].split("/")[-1]
825 if input_file.endswith(".sra"):
826 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
827 for_fq=input_file.replace(".sra","_1.fastq")
828 else:
829 for_fq=input_file
830 elif data_type in ["4","5","6"]:
831 for_fq=input_file[0].split("/")[-1]
832 os.chdir("..")
833 return for_fq,rev_fq
834
835 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
836 #get O and H types from Final_list from blast parsing; allele mode
837 from Bio import SeqIO
838 fliC_choice="-"
839 fljB_choice="-"
840 fliC_contig="NA"
841 fljB_contig="NA"
842 fliC_region=set([0])
843 fljB_region=set([0,])
844 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
845 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
846 O_choice="-"#no need to decide O contig for now, should be only one
847 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
848 O_choice=O_choice.split("-")[-1].strip()
849 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
850 O_choice="-"
851 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
852 #add alignment locations, used for further selection, 03312019
853 for i in range(len(H_contig_roles)):
854 x=H_contig_roles[i]
855 for y in Final_list_passed:
856 if x[1] in y[0] and y[0].startswith(x[0]):
857 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
858 break
859 log_file=open("SeqSero_log.txt","a")
860 extract_file=open("Extracted_antigen_alleles.fasta","a")
861 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
862
863 #print("O_contigs:")
864 log_file.write("O_contigs:\n")
865 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
866 extract_file.write("#O_contigs:\n")
867 for x in O_nodes_roles:
868 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
869 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
870 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
871 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
872 seqs=""
873 for z in handle_fasta:
874 if x[0].split("___")[-1]==z.description:
875 seqs=str(z.seq)
876 extract_file.write(title+seqs+"\n")
877 if len(H_contig_roles)!=0:
878 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
879 else:
880 highest_H_coverage=0
881 for x in H_contig_roles:
882 #if multiple choices, temporately select the one with longest length for now, will revise in further change
883 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
884 fliC_contig=x[1]
885 fliC_length=len(x[-1])
886 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
887 fljB_contig=x[1]
888 fljB_length=len(x[-1])
889 for x in Final_list_passed:
890 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
891 fliC_choice=x[0].split("_")[1]
892 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
893 fljB_choice=x[0].split("_")[1]
894 elif fliC_choice!="-" and fljB_choice!="-":
895 break
896 #now remove contigs not in middle core part
897 first_allele="NA"
898 first_allele_percentage=0
899 for x in Final_list:
900 if x[0].startswith("fliC") or x[0].startswith("fljB"):
901 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
902 first_allele_percentage=x[2]
903 break
904 additional_contigs=[]
905 for x in Final_list:
906 if first_allele in x[0]:
907 if (fliC_contig == x[0].split("___")[-1]):
908 fliC_region=x[3]
909 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
910 fljB_region=x[3]
911 else:
912 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
913 additional_contigs.append(x)
914 #else:
915 #print x[:3]
916 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
917 if first_allele_percentage>0.9:
918 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
919 target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
920 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
921 target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
922 else:
923 target_region=set()#doesn't do anything
924 else:
925 target_region=set()#doesn't do anything
926 #print(target_region)
927 #print(additional_contigs)
928 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
929 target_region=target_region2|target_region
930 for x in additional_contigs:
931 removal=0
932 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
933 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
934 removal=1
935 else:
936 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
937 removal=1
938 else:
939 pass
940 if removal==1:
941 for y in H_contig_roles:
942 if y[1] in x[0]:
943 H_contig_roles.remove(y)
944 else:
945 pass
946 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
947 #end of removing none-middle contigs
948 #print("H_contigs:")
949 log_file.write("H_contigs:\n")
950 extract_file.write("#H_contigs:\n")
951 H_contig_stat=[]
952 H1_cont_stat={}
953 H2_cont_stat={}
954 for i in range(len(H_contig_roles)):
955 x=H_contig_roles[i]
956 a=0
957 for y in Final_list_passed:
958 if x[1] in y[0] and y[0].startswith(x[0]):
959 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
960 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
961 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
962 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
963 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
964 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
965 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
966 seqs=""
967 for z in handle_fasta:
968 if x[1]==z.description:
969 seqs=str(z.seq)
970 extract_file.write(title+seqs+"\n")
971 break
972 else:
973 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
974 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
975 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
976 seqs=""
977 for z in handle_fasta:
978 if x[1]==z.description:
979 seqs=str(z.seq)
980 extract_file.write(title+seqs+"\n")
981 if x[0]=="fliC":
982 if y[0].split("_")[1] not in H1_cont_stat:
983 H1_cont_stat[y[0].split("_")[1]]=y[2]
984 else:
985 H1_cont_stat[y[0].split("_")[1]]+=y[2]
986 if x[0]=="fljB":
987 if y[0].split("_")[1] not in H2_cont_stat:
988 H2_cont_stat[y[0].split("_")[1]]=y[2]
989 else:
990 H2_cont_stat[y[0].split("_")[1]]+=y[2]
991 break
992 #detect contaminations
993 #print(H1_cont_stat)
994 #print(H2_cont_stat)
995 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
996 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
997 contamination_H=""
998 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
999 contamination_H="potential contamination from H antigen signals"
1000 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
1001 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
1002 #get additional antigens
1003 """
1004 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
1005 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
1006 O_choice="O-9,46"
1007 #print "$$$Most possilble Otype: O-9,46"
1008 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
1009 O_choice="O-9,46,27"
1010 #print "$$$Most possilble Otype: O-9,46,27"
1011 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
1012 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
1013 O_choice="O-3,10"
1014 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
1015 else:
1016 O_choice="O-1,3,19"
1017 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
1018 ### end of special test for O9,46 and O3,10 family
1019
1020 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
1021 if len(Otypes_uniq)>2:
1022 contamination_O="potential contamination from O antigen signals"
1023 else:
1024 if len(Otypes_uniq)>1:
1025 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
1026 contamination_O=""
1027 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
1028 contamination_O=""
1029 """
1030 additonal_antigents=[]
1031 #print(contamination_O)
1032 #print(contamination_H)
1033 log_file.write(contamination_O+"\n")
1034 log_file.write(contamination_H+"\n")
1035 log_file.close()
1036 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
1037
1038 def get_input_K(input_file,lib_dict,data_type,k_size):
1039 #kmer mode; get input_Ks from dict and data_type
1040 kmers = []
1041 for h in lib_dict:
1042 kmers += lib_dict[h]
1043 if data_type == '4':
1044 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
1045 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
1046 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
1047 elif data_type == '5':#minion_2d_fasta
1048 #input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
1049 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers)) #ed_SL_08172020: change for nanopore workflow
1050 if data_type == '6':#minion_2d_fastq
1051 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
1052 return input_Ks
1053
1054 def get_kmer_dict(lib_dict,input_Ks):
1055 #kmer mode; get predicted types
1056 O_dict = {}
1057 H_dict = {}
1058 Special_dict = {}
1059 for h in lib_dict:
1060 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
1061 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
1062 if h.startswith('O-') and score > 25:
1063 O_dict[h] = score
1064 if h.startswith('fl') and score > 40:
1065 H_dict[h] = score
1066 if (h[:2] != 'fl') and (h[:2] != 'O-'):
1067 Special_dict[h] = score
1068 return O_dict,H_dict,Special_dict
1069
1070 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
1071 log_file=open("SeqSero_log.txt","a")
1072 log_file.write("O_scores:\n")
1073 #call O:
1074 highest_O = '-'
1075 if len(O_dict) == 0:
1076 pass
1077 else:
1078 for x in O_dict:
1079 log_file.write(x+"\t"+str(O_dict[x])+"\n")
1080 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
1081 #if 'O-9,46_wzy__1191' in O_dict or "O-9,46_wzy_partial__216" in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
1082 #modified to fix miscall of O-9,46
1083 if ('O-9,46_wzy__1191' in O_dict and O_dict['O-9,46_wzy__1191']>40) or ("O-9,46_wzy_partial__216" in O_dict and O_dict["O-9,46_wzy_partial__216"]>40): # and float(O946_wzy)/float(num_1) > 0.1
1084 highest_O = "O-9,46"
1085 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
1086 highest_O = "O-9,46,27"
1087 else:
1088 highest_O = "O-9" # next, detect O9 vs O2?
1089 O2 = 0
1090 O9 = 0
1091 for z in Special_dict:
1092 if "tyr-O-9" in z:
1093 O9 = float(Special_dict[z])
1094 if "tyr-O-2" in z:
1095 O2 = float(Special_dict[z])
1096 if O2 > O9:
1097 highest_O = "O-2"
1098 elif ("O-3,10_wzx__1539" in O_dict) and (
1099 "O-9,46_wzy__1191" in O_dict
1100 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
1101 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
1102 highest_O = "O-3,10"
1103 else:
1104 highest_O = "O-1,3,19"
1105 ### end of special test for O9,46 and O3,10 family
1106 else:
1107 try:
1108 max_score = 0
1109 for x in O_dict:
1110 if float(O_dict[x]) >= max_score:
1111 max_score = float(O_dict[x])
1112 #highest_O = x.split("_")[0]
1113 # ed_SL_12182019: modified to fix the O-9,46 error example1
1114 if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
1115 highest_O = "O-9"
1116 else:
1117 highest_O = x.split("_")[0]
1118 if highest_O == "O-1,3,19":
1119 highest_O = '-'
1120 max_score = 0
1121 for x in O_dict:
1122 if x == 'O-1,3,19_not_in_3,10__130':
1123 pass
1124 else:
1125 if float(O_dict[x]) >= max_score:
1126 max_score = float(O_dict[x])
1127 #highest_O = x.split("_")[0]
1128 # ed_SL_12182019: modified to fix the O-9,46 error example1
1129 if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
1130 highest_O = "O-9"
1131 else:
1132 highest_O = x.split("_")[0]
1133 except:
1134 pass
1135 #call_fliC:
1136 if len(H_dict)!=0:
1137 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
1138 else:
1139 highest_H_score_both_BC=0
1140 highest_fliC = '-'
1141 highest_fliC_raw = '-'
1142 highest_Score = 0
1143 log_file.write("\nH_scores:\n")
1144 for s in H_dict:
1145 log_file.write(s+"\t"+str(H_dict[s])+"\n")
1146 if s.startswith('fliC'):
1147 if float(H_dict[s]) > highest_Score:
1148 highest_fliC = s.split('_')[1]
1149 highest_fliC_raw = s
1150 highest_Score = float(H_dict[s])
1151 #call_fljB
1152 highest_fljB = '-'
1153 highest_fljB_raw = '-'
1154 highest_Score = 0
1155 for s in H_dict:
1156 if s.startswith('fljB'):
1157 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
1158 #highest_fljB = s.split('_')[1]
1159 #highest_fljB_raw = s
1160 #highest_Score = float(H_dict[s])
1161 if s.split('_')[1]!=highest_fliC:
1162 highest_fljB = s.split('_')[1]
1163 highest_fljB_raw = s
1164 highest_Score = float(H_dict[s])
1165 log_file.write("\nSpecial_scores:\n")
1166 for s in Special_dict:
1167 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
1168 log_file.close()
1169 return highest_O,highest_fliC,highest_fljB
1170
1171 def get_temp_file_names(for_fq,rev_fq):
1172 #seqsero2 -a; get temp file names
1173 sam=for_fq+".sam"
1174 bam=for_fq+".bam"
1175 sorted_bam=for_fq+"_sorted.bam"
1176 mapped_fq1=for_fq+"_mapped.fq"
1177 mapped_fq2=rev_fq+"_mapped.fq"
1178 combined_fq=for_fq+"_combined.fq"
1179 for_sai=for_fq+".sai"
1180 rev_sai=rev_fq+".sai"
1181 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
1182
1183 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
1184 #seqsero2 -a; do mapping and sort
1185 print("building database...")
1186 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
1187 print("mapping...")
1188 if mapping_mode=="mem":
1189 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
1190 elif mapping_mode=="sam":
1191 if fnameB!="":
1192 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
1193 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
1194 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
1195 else:
1196 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
1197 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam)
1198 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
1199 ### check the version of samtools then use differnt commands
1200 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1201 out, err = samtools_version.communicate()
1202 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
1203 print("check samtools version:",version)
1204 ### end of samtools version check and its analysis
1205 if LooseVersion(version)<=LooseVersion("1.2"):
1206 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
1207 else:
1208 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
1209
1210 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode,phred_offset):
1211 #seqsero2 -a; extract, assembly and blast
1212 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
1213 #print("fnameA:",fnameA)
1214 #print("fnameB:",fnameB)
1215 if fnameB!="":
1216 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
1217 else:
1218 pass
1219 outdir=current_time+"_temp"
1220 print("assembling...")
1221 if int(threads)>4:
1222 t="4"
1223 else:
1224 t=threads
1225 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
1226 if phred_offset == 'auto':
1227 phred_offset = ''
1228 else:
1229 phred_offset = '--phred-offset ' + phred_offset
1230
1231 if fnameB!="":
1232 #print("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1")
1233 subprocess.check_call("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
1234 else:
1235 subprocess.check_call("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
1236 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
1237 #new_fasta=fnameA+"_"+database.split('/')[-1]+"_"+mapping_mode+".fasta" # change path to databse for packaging
1238 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
1239 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
1240 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
1241 print("blasting...","\n")
1242 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
1243 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
1244 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
1245 else:
1246 xmlfile="NA"
1247 return xmlfile,new_fasta
1248
1249 def judge_subspecies(fnameA):
1250 #seqsero2 -a; judge subspecies on just forward raw reads fastq
1251 salmID_output=subprocess.Popen("SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
1252 out, err = salmID_output.communicate()
1253 out=out.decode("utf-8")
1254 file=open("data_log.txt","a")
1255 file.write(out)
1256 file.close()
1257 salm_species_scores=out.split("\n")[1].split("\t")[6:]
1258 salm_species_results=out.split("\n")[0].split("\t")[6:]
1259 max_score=0
1260 max_score_index=1 #default is 1, means "I"
1261 for i in range(len(salm_species_scores)):
1262 if max_score<float(salm_species_scores[i]):
1263 max_score=float(salm_species_scores[i])
1264 max_score_index=i
1265 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
1266 #if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
1267 if float(out.split("\n")[1].split("\t")[4]) > 10 and float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): ## ed_SL_0318: change SalmID_ssp_threshold
1268 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
1269 #if max_score<10: ## ed_SL_0318: change SalmID_ssp_threshold
1270 if max_score<60:
1271 prediction="-"
1272 ## ed_SL_0818: add for enterica
1273 if float(out.split("\n")[1].split("\t")[5]) > 10 and float(out.split("\n")[1].split("\t")[5]) > float(out.split("\n")[1].split("\t")[4]):
1274 prediction="enterica"
1275 ##
1276 return prediction
1277
1278 def judge_subspecies_Kmer(Special_dict):
1279 #seqsero2 -k;
1280 max_score=0
1281 prediction="-" #default should be I
1282 for x in Special_dict:
1283 #if "mer" in x: ## ed_SL_0318: change ssp_threshold
1284 if "mer" in x and float(Special_dict[x]) > 60:
1285 if max_score<float(Special_dict[x]):
1286 max_score=float(Special_dict[x])
1287 prediction=x.split("_")[-1].strip()
1288 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
1289 prediction="bongori"
1290 break
1291 return prediction
1292
1293 ## ed_SL_11232019: add notes for missing antigen
1294 def check_antigens(ssp,O_antigen,H1_antigen,H2_antigen,NA_note):
1295 antigen_note = ''
1296 if ssp != '-':
1297 if O_antigen != '-' and H1_antigen == '-' and H2_antigen == '-': # O:-:-
1298 antigen_note = 'H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. '
1299 NA_note = ''
1300 elif O_antigen != '-' and H1_antigen == '-' and H2_antigen != '-': # O:-:H2
1301 antigen_note = 'fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. '
1302 NA_note = ''
1303 elif O_antigen == '-' and H1_antigen != '-': # -:H1:X
1304 antigen_note = 'O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. '
1305 NA_note = ''
1306 elif O_antigen == '-' and H1_antigen == '-' and H2_antigen == '-': # -:-:-
1307 antigen_note = 'No serotype antigens were detected. This is an atypical result that should be further investigated. '
1308 NA_note = ''
1309 else:
1310 antigen_note = 'The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. '
1311 NA_note = ''
1312 if ssp == 'enterica':
1313 antigen_note += 'Subspecies identification of the input genome cannot be definitively determined. '
1314 NA_note = ''
1315 # if [O_antigen, H1_antigen, H2_antigen].count('-') >= 2:
1316 # antigen_note = 'No subspecies marker was detected and less than 2 serotype antigens were detected; further, this genome was not identified as Salmonella. This is an atypical result that should be further investigated. '
1317 # else:
1318 # antigen_note = 'No subspecies marker was detected. This genome may not be Salmonella. This is an atypical result that should be further investigated. '
1319 return (antigen_note,NA_note)
1320
1321 ## ed_SL_06062020: rename subspecies ID
1322 subspecies_ID_dir = {'I': 'Salmonella enterica subspecies enterica (subspecies I)',
1323 'II': 'Salmonella enterica subspecies salamae (subspecies II)',
1324 'IIIa': 'Salmonella enterica subspecies arizonae (subspecies IIIa)',
1325 'IIIb': 'Salmonella enterica subspecies diarizonae (subspecies IIIb)',
1326 'IV': 'Salmonella enterica subspecies houtenae (subspecies IV)',
1327 'VI': 'Salmonella enterica subspecies indica (subspecies VI)',
1328 'VII': 'Salmonella enterica subspecies VII (subspecies VII)',
1329 'bongori': 'Salmonella bongori',
1330 'enterica': 'Salmonella enterica',
1331 '-': '-'}
1332 ##
1333
1334 ## ed_SL_08172020: format check for fasta or fastq in nanopore workflow, convert raw reads fastq to fasta
1335 def format_check(input_file):
1336 line=open(input_file,'r').readline()
1337 if line.startswith('>'):
1338 output_file = input_file
1339 elif line.startswith('@'):
1340 input_file_fa = input_file + '.fasta'
1341 subprocess.check_call("seqtk seq -A "+input_file+" > "+input_file_fa,shell=True)
1342 output_file = input_file_fa
1343 else:
1344 print ('please check the format of input files')
1345 return (output_file)
1346 ##
1347
1348 def main():
1349 #combine SeqSeroK and SeqSero2, also with SalmID
1350 args = parse_args()
1351 input_file = args.i
1352 data_type = args.t
1353 analysis_mode = args.m
1354 mapping_mode=args.b
1355 threads=args.p
1356 make_dir=args.d
1357 clean_mode=args.c
1358 sample_name=args.n
1359 ingore_header=args.s
1360 phred_offset=args.phred_offset
1361 k_size=27 #will change for bug fixing
1362 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
1363 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging
1364 seqsero2_db=ex_dir+"/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging
1365 database="H_and_O_and_specific_genes.fasta"
1366 note="Note: "
1367 NA_note="This predicted serotype is not in the Kauffman-White scheme. " # ed_SL_09272019: add for new output format
1368 if len(sys.argv)==1:
1369 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
1370 else:
1371 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
1372 request_id += str(random.randint(1, 10000000))
1373 if make_dir is None:
1374 make_dir="SeqSero_result_"+request_id
1375 make_dir=os.path.abspath(make_dir)
1376 if os.path.isdir(make_dir):
1377 pass
1378 else:
1379 subprocess.check_call("mkdir -p "+make_dir,shell=True)
1380 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
1381 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
1382 subprocess.check_call("ln -f -s "+seqsero2_db+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_11092019: change path to database for packaging
1383 #subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
1384 ############################begin the real analysis
1385 if analysis_mode=="a":
1386 if data_type in ["1","2","3"]:#use allele mode
1387 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
1388 os.chdir(make_dir)
1389 ###add a function to tell input files
1390 fnameA=for_fq.split("/")[-1]
1391 fnameB=rev_fq.split("/")[-1]
1392 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
1393 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
1394 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
1395
1396 ### avoid error out when micro assembly fails. ed_SL_03172020
1397 try:
1398 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode,phred_offset) #extract the mapped reads and do micro assembly and blast
1399 except (UnboundLocalError, subprocess.CalledProcessError):
1400 xmlfile="NA"
1401 H1_cont_stat_list=[]
1402 H2_cont_stat_list=[]
1403 ###
1404 if xmlfile=="NA":
1405 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
1406 else:
1407 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
1408 file=open("data_log.txt","a")
1409 for x in Final_list:
1410 file.write("\t".join(str(y) for y in x)+"\n")
1411 file.close()
1412 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
1413 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
1414 subspecies=judge_subspecies(fnameA) #predict subspecies
1415 ### ed_SL_06062020: correction VIII -> II
1416 if subspecies == 'VIII':
1417 subspecies = 'II'
1418 ### ed_SL_08132020: correction VII -> IV, according to CDC's suggestion
1419 if subspecies == 'VII':
1420 subspecies = 'IV'
1421 note+='SalmID reports this as ssp VII, which has not been formally recognized. '
1422 ###
1423 ### ed_SL_08182020: change serotype ouput for genome without definitive subspecies ID
1424 ssp_pointer = subspecies
1425 if subspecies == 'enterica':
1426 subspecies = '-'
1427 ###
1428 ###output
1429 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
1430 claim="" #04132019, disable claim for new report requirement
1431 contamination_report=""
1432 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
1433 if contamination_O!="" and contamination_H=="":
1434 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
1435 elif contamination_O=="" and contamination_H!="":
1436 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
1437 elif contamination_O!="" and contamination_H!="":
1438 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
1439 if contamination_report!="":
1440 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
1441 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. "
1442 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
1443 #if contamination_report+star_line+claim=="": #0413, new output style
1444 # note=""
1445 #else:
1446 # note="Note:"
1447
1448 ### ed_SL_11232019: add notes for missing antigen
1449 if O_choice=="":
1450 O_choice="-"
1451 antigen_note,NA_note=check_antigens(ssp_pointer,O_choice,fliC_choice,fljB_choice,NA_note)
1452 if sample_name:
1453 print ("Sample name:\t"+sample_name)
1454 ###
1455 if clean_mode:
1456 subprocess.check_call("rm -rf "+make_dir,shell=True)
1457 make_dir="none-output-directory due to '-c' flag"
1458 else:
1459 new_file=open("SeqSero_result.txt","w")
1460 ### ed_SL_01152020: add new output
1461 conta_note="yes" if "inter-serotype contamination" in contamination_report else "no"
1462 tsv_file=open("SeqSero_result.tsv","w")
1463 if ingore_header:
1464 pass
1465 else:
1466 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted identification\tPredicted antigenic profile\tPredicted serotype\tPotential inter-serotype contamination\tNote\n")
1467 if sample_name:
1468 new_file.write("Sample name:\t"+sample_name+"\n")
1469 tsv_file.write(sample_name+'\t')
1470 else:
1471 tsv_file.write(input_file[0].split('/')[-1]+'\t')
1472 ###
1473 if "N/A" not in predict_sero:
1474 new_file.write("Output directory:\t"+make_dir+"\n"+
1475 "Input files:\t"+"\t".join(input_file)+"\n"+
1476 "O antigen prediction:\t"+O_choice+"\n"+
1477 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1478 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1479 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1480 "Predicted antigenic profile:\t"+predict_form+"\n"+
1481 "Predicted serotype:\t"+predict_sero+"\n"+
1482 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
1483 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+predict_sero+"\t"+conta_note+"\t"+contamination_report+star_line+claim+antigen_note+"\n")
1484 else:
1485 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
1486 #star_line="" #04132019, for new output requirement, diable star_line if "NA" in output
1487 new_file.write("Output directory:\t"+make_dir+"\n"+
1488 "Input files:\t"+"\t".join(input_file)+"\n"+
1489 "O antigen prediction:\t"+O_choice+"\n"+
1490 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1491 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1492 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1493 "Predicted antigenic profile:\t"+predict_form+"\n"+
1494 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, add subspecies
1495 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")#+##
1496 tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+conta_note+"\t"+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
1497 new_file.close()
1498 tsv_file.close()
1499 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
1500 #subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt *.xml "+fnameA+"*_db* 2> /dev/null",shell=True)
1501 subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
1502 if "N/A" not in predict_sero:
1503 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
1504 print("Output directory:\t"+make_dir+"\n"+
1505 "Input files:\t"+"\t".join(input_file)+"\n"+
1506 "O antigen prediction:\t"+O_choice+"\n"+
1507 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1508 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1509 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1510 "Predicted antigenic profile:\t"+predict_form+"\n"+
1511 "Predicted serotype:\t"+predict_sero+"\n"+
1512 note+contamination_report+star_line+claim+antigen_note+"\n")#+##
1513 else:
1514 print("Output directory:\t"+make_dir+"\n"+
1515 "Input files:\t"+"\t".join(input_file)+"\n"+
1516 "O antigen prediction:\t"+O_choice+"\n"+
1517 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
1518 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
1519 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1520 "Predicted antigenic profile:\t"+predict_form+"\n"+
1521 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
1522 note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
1523 else:
1524 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
1525 elif analysis_mode=="k":
1526 #ex_dir = os.path.dirname(os.path.realpath(__file__))
1527 ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging
1528 #output_mode = args.mode
1529 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
1530 input_file = for_fq #-k will just use forward because not all reads were used
1531 os.chdir(make_dir)
1532 ### ed_SL_08182020: use assembly workflow for nanopore fastq, convert fastq to fasta
1533 if data_type == "5":
1534 input_file = format_check(for_fq)
1535 ###
1536 f = open(ex_dir + '/antigens.pickle', 'rb')
1537 lib_dict = pickle.load(f)
1538 f.close
1539 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
1540 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
1541 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
1542 subspecies=judge_subspecies_Kmer(Special_dict)
1543 if subspecies=="IIb" or subspecies=="IIa":
1544 subspecies="II"
1545 ### ed_SL_06062020: correction VIII -> II
1546 if subspecies == 'VIII':
1547 subspecies = 'II'
1548 ### ed_SL_08132020: correction VII -> IV, according to CDC's suggestion
1549 if subspecies == 'VII':
1550 subspecies = 'IV'
1551 note+='SalmID reports this as ssp VII, which has not been formally recognized. '
1552 ###
1553 ### ed_SL_08182020: change serotype ouput for genome without definitive subspecies ID
1554 ssp_pointer = subspecies
1555 if subspecies == 'enterica':
1556 subspecies = '-'
1557 ###
1558 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
1559 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
1560 claim="" #no claim any more based on new output requirement
1561 #if star_line+claim=="": #0413, new output style
1562 # note=""
1563 #else:
1564 # note="Note:"
1565
1566 ### ed_SL_11232019: add notes for missing antigen
1567 if highest_O.split('-')[-1]=="":
1568 O_choice="-"
1569 else:
1570 O_choice=highest_O.split('-')[-1]
1571 antigen_note,NA_note=check_antigens(ssp_pointer,O_choice,highest_fliC,highest_fljB,NA_note)
1572 if sample_name:
1573 print ("Sample name:\t"+sample_name)
1574 ###
1575
1576 if clean_mode:
1577 subprocess.check_call("rm -rf "+make_dir,shell=True)
1578 make_dir="none-output-directory due to '-c' flag"
1579 else:
1580 new_file=open("SeqSero_result.txt","w")
1581 tsv_file=open("SeqSero_result.tsv","w")
1582 # ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
1583 # if highest_O.split('-')[-1]=="":
1584 # O_choice="-"
1585 # else:
1586 # O_choice=highest_O.split('-')[-1]
1587 # ###
1588 # else:
1589 # if highest_O.split('-')[-1]=="":
1590 # O_choice="-"
1591 # else:
1592 # O_choice=highest_O.split('-')[-1]
1593 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
1594 # new_file=open("SeqSero_result.txt","w")
1595 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
1596 ### ed_SL_01152020: add new output
1597 # tsv_file=open("SeqSero_result.tsv","w")
1598 if ingore_header:
1599 pass
1600 else:
1601 tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted identification\tPredicted antigenic profile\tPredicted serotype\tNote\n")
1602 if sample_name:
1603 new_file.write("Sample name:\t"+sample_name+"\n")
1604 tsv_file.write(sample_name+'\t')
1605 else:
1606 tsv_file.write(input_file.split('/')[-1]+'\t')
1607 ###
1608 if "N/A" not in predict_sero:
1609 new_file.write("Output directory:\t"+make_dir+"\n"+
1610 "Input files:\t"+input_file+"\n"+
1611 "O antigen prediction:\t"+O_choice+"\n"+
1612 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1613 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1614 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1615 "Predicted antigenic profile:\t"+predict_form+"\n"+
1616 "Predicted serotype:\t"+predict_sero+"\n"+
1617 note+star_line+claim+antigen_note+"\n")#+##
1618 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+predict_sero+"\t"+star_line+claim+antigen_note+"\n")
1619 else:
1620 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
1621 #star_line = "" #changed for new output requirement, 04132019
1622 new_file.write("Output directory:\t"+make_dir+"\n"+
1623 "Input files:\t"+input_file+"\n"+
1624 "O antigen prediction:\t"+O_choice+"\n"+
1625 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1626 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1627 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1628 "Predicted antigenic profile:\t"+predict_form+"\n"+
1629 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
1630 note+NA_note+star_line+claim+antigen_note+"\n")#+##
1631 tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+NA_note+star_line+claim+antigen_note+"\n")
1632 new_file.close()
1633 tsv_file.close()
1634 subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
1635 if "N/A" not in predict_sero:
1636 print("Output directory:\t"+make_dir+"\n"+
1637 "Input files:\t"+input_file+"\n"+
1638 "O antigen prediction:\t"+O_choice+"\n"+
1639 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1640 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1641 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1642 "Predicted antigenic profile:\t"+predict_form+"\n"+
1643 "Predicted serotype:\t"+predict_sero+"\n"+
1644 note+star_line+claim+antigen_note+"\n")#+##
1645 else:
1646 print("Output directory:\t"+make_dir+"\n"+
1647 "Input files:\t"+input_file+"\n"+
1648 "O antigen prediction:\t"+O_choice+"\n"+
1649 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
1650 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
1651 "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
1652 "Predicted antigenic profile:\t"+predict_form+"\n"+
1653 "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
1654 note+NA_note+star_line+claim+antigen_note+"\n")#+##
1655
1656 if __name__ == '__main__':
1657 main()