Mercurial > repos > jpayne > seqsero_v2
comparison SeqSero2/bin/SeqSero2_package.py @ 17:03f7b358d57f
planemo upload
author | jpayne |
---|---|
date | Tue, 25 Mar 2025 23:22:38 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
16:3b6d5b60968f | 17:03f7b358d57f |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import sys | |
4 import time | |
5 import random | |
6 import os | |
7 import subprocess | |
8 import gzip | |
9 import io | |
10 import pickle | |
11 import argparse | |
12 import itertools | |
13 from distutils.version import LooseVersion | |
14 from distutils.spawn import find_executable | |
15 sys.path.insert(1,sys.path[0]+'/..') | |
16 | |
17 try: | |
18 from .version import SeqSero2_version | |
19 except Exception: #ImportError | |
20 from version import SeqSero2_version | |
21 | |
22 ### SeqSero Kmer | |
def parse_args():
    """Parse the input arguments, use '-h' for help.

    Returns the argparse.Namespace with attributes i, t, b, p, m, n, d, c,
    s, phred_offset, and check as described in the per-option help below.
    """
    # NOTE(review): the usage banner used to hard-code "Version: v1.3.1",
    # which could silently drift from the SeqSero2_version reported by -v;
    # it now interpolates the same constant. Also fixed the "Developper" typo.
    parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDeveloper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\nVersion: ' + SeqSero2_version)#add "-m <data_type>" in future
    parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### add 'type=os.path.abspath' to generate absolute path of input data.
    parser.add_argument("-t",choices=['1','2','3','4','5'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore reads (fasta/fastq)")
    parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
    parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
    parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
    parser.add_argument("-n",help="<string>: optional, to specify a sample name in the report output")
    parser.add_argument("-d",help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
    parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
    parser.add_argument("-s",action="store_true",help="<flag>: if '-s' was flagged, SeqSero2 will not output header in SeqSero_result.tsv")
    parser.add_argument("--phred_offset",choices=['33','64','auto'],default='auto',help="<33|64|auto>: offset for FASTQ file quality scores, default=auto")
    parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies")
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version)
    return parser.parse_args()
39 | |
### check paths of dependencies
# Runs at import time: parse the CLI once and, when '--check' was flagged,
# report the resolved path of every external tool SeqSero2 shells out to.
check_dependencies = parse_args().check
# External executables required by the workflows (bwa/samtools/bedtools for
# mapping, blastn for allele typing, fastq-dump for SRA input, spades.py for
# micro-assembly, SalmID.py for species/subspecies confirmation).
dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
if check_dependencies:
    for item in dependencies:
        # find_executable searches PATH like a shell would
        # (distutils.spawn; deprecated since Python 3.10 — shutil.which is the successor)
        ext_path = find_executable(item)
        if ext_path is not None:
            print ("Using "+item+" - "+ext_path)
        else:
            print ("ERROR: can not find "+item+" in PATH")
    # NOTE(review): exits unconditionally after --check, even when every
    # dependency was found — the flag is report-and-quit by design.
    sys.exit()
### end of --check
52 | |
53 def reverse_complement(sequence): | |
54 complement = { | |
55 'A': 'T', | |
56 'C': 'G', | |
57 'G': 'C', | |
58 'T': 'A', | |
59 'N': 'N', | |
60 'M': 'K', | |
61 'R': 'Y', | |
62 'W': 'W', | |
63 'S': 'S', | |
64 'Y': 'R', | |
65 'K': 'M', | |
66 'V': 'B', | |
67 'H': 'D', | |
68 'D': 'H', | |
69 'B': 'V' | |
70 } | |
71 return "".join(complement[base] for base in reversed(sequence)) | |
72 | |
73 | |
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer (and its reverse complement) across the given sequences.

    Parameters
    ----------
    list_of_strings : iterable of str
        Raw sequence lines; trailing newlines are stripped.
    kmer : int
        k-mer length.

    Returns
    -------
    dict
        Upper-cased k-mer -> occurrence count. A k-mer and its reverse
        complement are counted under separate keys (a palindromic k-mer is
        therefore incremented twice per position, as before).
    """
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip('\n')
        for i in range(len(sequence) - kmer + 1):
            new_mer = sequence[i:i + kmer].upper()
            new_mer_rc = reverse_complement(new_mer)
            # dict.get replaces the original membership-test/else increment
            # chains; the redundant repeated .upper() calls were dropped
            # (new_mer is already upper-cased above).
            kmer_table[new_mer] = kmer_table.get(new_mer, 0) + 1
            kmer_table[new_mer_rc] = kmer_table.get(new_mer_rc, 0) + 1
    return kmer_table
90 | |
91 | |
def multifasta_dict(multifasta):
    """Read a (multi-)FASTA file into {header-without-'>': concatenated sequence}.

    Blank lines are ignored; the sequence lines of one record are joined.
    The original per-header scan (index of first occurrence, read until the
    next '>') is kept so behaviour is unchanged even for duplicated headers.
    """
    # Load once through a context manager so the handle is closed (the
    # original called open() and leaked the file object).
    with open(multifasta, 'r') as handle:
        multifasta_list = [
            line.strip() for line in handle if len(line.strip()) > 0
        ]
    headers = [entry for entry in multifasta_list if entry[0] == '>']
    record_dict = {}
    for h in headers:
        start = multifasta_list.index(h)  # first occurrence of this header
        for element in multifasta_list[start + 1:]:
            if element[0] == '>':
                break  # next record begins
            if h[1:] in record_dict:
                record_dict[h[1:]] += element
            else:
                record_dict[h[1:]] = element
    return record_dict
109 | |
110 | |
def multifasta_single_string(multifasta):
    """Concatenate all sequence lines of a FASTA file into one long string.

    Header lines (starting with '>') and blank lines are skipped.
    """
    # Context manager closes the handle (the original leaked it).
    with open(multifasta, 'r') as handle:
        multifasta_list = [
            line.strip() for line in handle
            if (len(line.strip()) > 0) and (line.strip()[0] != '>')
        ]
    return ''.join(multifasta_list)
117 | |
118 | |
def chunk_a_long_sequence(long_sequence, chunk_size=60):
    """Split a sequence into consecutive chunks of length chunk_size.

    The final remainder chunk (possibly shorter, possibly empty when the
    length is an exact multiple) is always appended, matching the original
    behaviour.
    """
    # BUG FIX: the number of full chunks was computed with a hard-coded 60
    # ("len(long_sequence) // 60"), which broke any call passing a different
    # chunk_size; it now uses chunk_size.
    steps = len(long_sequence) // chunk_size  #how many chunks
    chunk_list = []
    for i in range(steps):
        chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
    chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
    return chunk_list
126 | |
127 | |
def target_multifasta_kmerizer(multifasta, k, kmerDict):
    """Harvest k-mers from regions of an assembly that hit the target k-mer set.

    The assembly is flattened into one string, cut into chunk_size-base
    lines, and each line's central k-mer is probed against kmerDict; on a
    hit, a window of surrounding lines is k-merized and collected.

    Returns the set of collected k-mers (forward and reverse complement).
    """
    forward_length = 300  #if find the target, put forward 300 bases
    reverse_length = 2200  #if find the target, put backward 2200 bases
    chunk_size = 60  #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
    target_mers = []
    long_single_string = multifasta_single_string(multifasta)
    multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
    unit_length = len(multifasta_list[0])
    forward_lines = int(forward_length / unit_length) + 1
    # BUG FIX: reverse_lines was computed from forward_length, which left the
    # reverse_length constant above silently unused and limited the window
    # after a hit to ~300 bases instead of the ~2200 described in its comment.
    reverse_lines = int(reverse_length / unit_length) + 1
    start_num = 0
    end_num = 0
    for i in range(len(multifasta_list)):
        if i not in range(start_num, end_num):  #avoid computational repetition
            line = multifasta_list[i]
            start = int((len(line) - k) // 2)
            s1 = line[start:k + start]
            if s1 in kmerDict:  #detect it is a potential read or not (use the middle part)
                if i - forward_lines >= 0:
                    start_num = i - forward_lines
                else:
                    start_num = 0
                if i + reverse_lines <= len(multifasta_list) - 1:
                    end_num = i + reverse_lines
                else:
                    end_num = len(multifasta_list) - 1
                # NOTE(review): the slice excludes index end_num itself —
                # preserved from the original.
                target_list = [
                    x.strip() for x in multifasta_list[start_num:end_num]
                ]
                target_line = "".join(target_list)
                target_mers += [
                    k1 for k1 in createKmerDict_reads([str(target_line)], k)
                ]  ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
    return set(target_mers)
164 | |
165 | |
def target_read_kmerizer(file, k, kmerDict):
    """Collect k-mers from fastq reads whose central k-mer hits kmerDict.

    Works on plain or gzip-compressed fastq; stops once roughly 4 Mb of
    matching read sequence has been accumulated (coverage cap).

    Returns the set of collected k-mers (forward and reverse complement).
    """
    i = 1
    total_coverage = 0
    target_mers = []
    # Open compressed input in text mode so both branches iterate over str
    # lines (the original iterated raw bytes and decoded ad hoc); the
    # context manager also closes the handle, which was previously leaked.
    if file.endswith(".gz"):
        file_content = gzip.open(file, "rt")
    else:
        file_content = open(file, "r")
    with file_content:
        for line in file_content:
            if i % 4 == 2:  # the sequence line of each 4-line fastq record
                start = int((len(line) - k) // 2)
                s1 = line[start:k + start]
                if s1 in kmerDict:  #detect it is a potential read or not (use the middle part)
                    total_coverage += len(line)
                    target_mers += [
                        k1 for k1 in createKmerDict_reads([str(line)], k)
                    ]  #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
            i += 1
            if total_coverage >= 4000000:
                break
    return set(target_mers)
193 | |
194 | |
def minion_fasta_kmerizer(file, k, kmerDict):
    """Return the set of k-mers shared between a 2-line-per-record fasta of
    nanopore reads and kmerDict.

    Every sequence line (even line numbers, 1-based) is scanned with kmers();
    a k-mer or its reverse complement matching kmerDict adds both forms to
    the result.
    """
    hit_mers = {}
    # Context manager closes the handle (previously leaked); the unused
    # n_reads/total_coverage counters were removed.
    with open(file) as handle:
        for line_no, line in enumerate(handle, start=1):
            if line_no % 2 == 0:  # sequence lines in a strictly 2-line fasta
                for kmer, rc_kmer in kmers(line.strip().upper(), k):
                    if (kmer in kmerDict) or (rc_kmer in kmerDict):
                        hit_mers[kmer] = hit_mers.get(kmer, 0) + 1
                        hit_mers[rc_kmer] = hit_mers.get(rc_kmer, 0) + 1
    return set(hit_mers)
214 | |
215 | |
def minion_fastq_kmerizer(file, k, kmerDict):
    """Return the set of k-mers shared between a fastq of nanopore reads
    and kmerDict.

    Every sequence line (line 2 of each 4-line record) is scanned with
    kmers(); a k-mer or its reverse complement matching kmerDict adds both
    forms to the result.
    """
    hit_mers = {}
    # Context manager closes the handle (previously leaked); the unused
    # n_reads/total_coverage counters were removed.
    with open(file) as handle:
        for line_no, line in enumerate(handle, start=1):
            if line_no % 4 == 2:  # fastq sequence line
                for kmer, rc_kmer in kmers(line.strip().upper(), k):
                    if (kmer in kmerDict) or (rc_kmer in kmerDict):
                        hit_mers[kmer] = hit_mers.get(kmer, 0) + 1
                        hit_mers[rc_kmer] = hit_mers.get(rc_kmer, 0) + 1
    return set(hit_mers)
235 | |
236 | |
def multifasta_single_string2(multifasta):
    """Concatenate all sequence lines of a FASTA file into one string (streaming).

    Unlike multifasta_single_string, this reads the file line by line.
    Blank lines are now skipped instead of raising IndexError (the original
    indexed line.strip()[0] on empty lines), and the quadratic string
    concatenation was replaced with a join.
    """
    parts = []
    with open(multifasta, 'r') as f:
        for line in f:
            stripped = line.strip()
            # guard against blank lines before inspecting the first character
            if stripped and not stripped.startswith('>'):
                parts.append(stripped)
    return ''.join(parts)
246 | |
247 | |
def kmers(seq, k):
    """Yield (k-mer, reverse-complement k-mer) pairs along seq.

    NOTE(review): iteration starts at offset 1, so the k-mer at position 0
    is never yielded — preserved as-is, since the negative-index slice of
    the reverse complement would be empty at offset 0.
    """
    rc = reverse_complement(seq)
    last_offset = len(seq) - k
    for offset in range(1, last_offset + 1):
        forward = seq[offset:offset + k]
        backward = rc[-(offset + k):-offset]
        yield forward, backward
252 | |
253 | |
def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
    """Build {fasta header: set of k-mers} for every record in multifasta.

    Each record's sequence is k-merized with createKmerDict_reads and only
    the k-mer keys are kept (counts are discarded).
    """
    sequences = multifasta_dict(multifasta)
    return {
        header: set(createKmerDict_reads([sequences[header]], k_size))
        for header in sequences
    }
261 | |
262 | |
def Combine(b, c):
    """Enumerate antigen-formula variants for optional (bracketed) factors.

    c holds the always-present antigens, b the optional ones. The first
    result is c alone (comma-joined); then, for every non-empty subset of b
    in itertools.combinations order, the subset is merged with c and the
    antigens are reported comma-joined in sorted order.
    """
    fliC_combinations = [",".join(c)]
    optional_subsets = []
    for size in range(1, len(b) + 1):
        for combo in itertools.combinations(b, size):
            optional_subsets.append(",".join(combo))
    for subset in optional_subsets:
        merged = [antigen for antigen in c]
        merged.append(subset)
        antigens = ",".join(merged).split(",")
        antigens.sort()
        fliC_combinations.append(",".join(antigens))
    return fliC_combinations
281 | |
282 | |
def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
    """Translate a predicted antigenic formula into White-Kauffmann-Le Minor serotype name(s).

    Parameters: Otype, fliC, fljB are the predicted O / phase-1 H / phase-2 H
    antigens as comma-joined strings; special_gene_list maps marker-gene names
    to scores (values are converted with float() below); subspecies is the
    SalmID subspecies call compared against the scheme's subs table.

    Returns (predict_form, predict_sero, star, star_line, claim) where star is
    "*" when the call needs the explanatory star_line, and claim is the generic
    disclaimer (emptied for N/A calls).

    The matching tables come from the project module Initial_Conditions; the
    O13 disambiguation additionally loads seqsero2_db/special.pickle.
    """
    #like test_output_06012017.txt
    #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
    from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
    # rename_dict maps old serotype names to merged/new ones; collecting both
    # keys and values lets later code detect "already merged" names.
    rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
    rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
    seronames = []
    seronames_none_subspecies=[]
    # Scan every row of the scheme; a row matches when its O group equals
    # Otype and both H phases match one of the bracket-expanded variants.
    for i in range(len(phase1)):
        fliC_combine = []
        fljB_combine = []
        if phaseO[i] == Otype: # no VII in KW, but it's there
            ### for fliC, detect every possible combinations to avoid the effect of "["
            if phase1[i].count("[") == 0:
                fliC_combine.append(phase1[i])
            elif phase1[i].count("[") >= 1:
                c = []
                b = []
                # a formula that is entirely bracketed, e.g. "[f,g]": either
                # the whole content is present or the phase is absent ("-")
                if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
                        "[") == 1:
                    content = phase1[i].replace("[", "").replace("]", "")
                    fliC_combine.append(content)
                    fliC_combine.append("-")
                else:
                    # mixed formula: b = optional (bracketed) antigens,
                    # c = mandatory antigens
                    for x in phase1[i].split(","):
                        if "[" in x:
                            b.append(x.replace("[", "").replace("]", ""))
                        else:
                            c.append(x)
                    fliC_combine = Combine(
                        b, c
                    ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
            ### end of fliC "[" detect
            ### for fljB, detect every possible combinations to avoid the effect of "["
            if phase2[i].count("[") == 0:
                fljB_combine.append(phase2[i])
            elif phase2[i].count("[") >= 1:
                d = []
                e = []
                if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
                        "[") == 1:
                    content = phase2[i].replace("[", "").replace("]", "")
                    fljB_combine.append(content)
                    fljB_combine.append("-")
                else:
                    for x in phase2[i].split(","):
                        if "[" in x:
                            d.append(x.replace("[", "").replace("]", ""))
                        else:
                            e.append(x)
                    fljB_combine = Combine(d, e)
            ### end of fljB "[" detect
            new_fliC = fliC.split(
                ","
            ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
            new_fliC.sort()
            new_fliC = ",".join(new_fliC)
            new_fljB = fljB.split(",")
            new_fljB.sort()
            new_fljB = ",".join(new_fljB)
            if (new_fliC in fliC_combine
                    or fliC in fliC_combine) and (new_fljB in fljB_combine
                                                  or fljB in fljB_combine):
                ######start, remove_list,rename_dict, added on 11/11/2018
                if sero[i] not in remove_list:
                    temp_sero=sero[i]
                    if temp_sero in rename_dict:
                        temp_sero=rename_dict[temp_sero] #rename if in the rename list
                    if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
                        if subs[i] == subspecies:
                            seronames.append(temp_sero)
                        seronames_none_subspecies.append(temp_sero)
                    else:
                        pass
                else:
                    pass
                ######end, added on 11/11/2018
    #analyze seronames
    subspecies_pointer=""
    if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
        ## ed_SL_06062020: for the subspecies mismatch between KW and SalmID
        seronames=seronames_none_subspecies
        #seronames=["N/A"]
        subspecies_pointer="1"
        #subspecies_pointer="0"
        ##
    if len(seronames) == 0:
        seronames = [
            "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
        ]
    star = ""
    star_line = ""
    if len(seronames) > 1: #there are two possible predictions for serotypes
        star = "*"
        #changed 04072019
        #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
    if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
        star="*"
        star_line = "This antigenic profile has been associated with serotype '"+(" or ").join(seronames)+"' in the Kauffman-White scheme. The existence of the same antigenic formula in multiple species or subspecies is well documented in the Kauffman-White Scheme. " + star_line ## ed_SL_03202021: changed for new output format
        #star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line
        #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
        seronames=["N/A"] ## ed_SL_06062020
    if Otype=="":
        Otype="-"
    predict_form = Otype + ":" + fliC + ":" + fljB
    predict_sero = (" or ").join(seronames)
    ###special test for Enteritidis
    if predict_form == "9:g,m:-":
        sdf = "-"
        for x in special_gene_list:
            if x.startswith("sdf"):
                sdf = "+"
                #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
                #star_line="sdf gene detected. "
                star_line = "Detected Sdf I that is characteristic of commonly circulating strains of serotype Enteritidis. "
        #predict_form = predict_form + " Sdf prediction:" + sdf
        predict_form = predict_form #changed 04072019
        if sdf == "-":
            star = "*"
            #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
            #star_line="sdf gene not detected. "
            star_line = "Sdf I that is characteristic of commonly circulating strains of serotype Enteritidis was not detected. "
            #changed in 04072019, for new output
            #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
            #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
            predict_sero = "Gallinarum or Enteritidis"
    ###end of special test for Enteritidis
    elif predict_form == "4:i:-":
        predict_sero = "I 4,[5],12:i:-" # change serotype name
    elif predict_form == "4:r:-":
        predict_sero = "N/A (4:r:-)"
    elif predict_form == "4:b:-":
        predict_sero = "N/A (4:b:-)"
    #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
        #predict_sero = "Newport"
        #star = "*"
        #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
    claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
    if "N/A" in predict_sero:
        claim = ""
    #special test for Typhimurium
    # O5 variant call: compare blast scores of intact oafA vs the deletion allele
    if "Typhimurium" in predict_sero or predict_form == "4:i:-":
        normal = 0
        mutation = 0
        for x in special_gene_list:
            if "oafA-O-4_full" in x:
                normal = float(special_gene_list[x])
            elif "oafA-O-4_5-" in x:
                mutation = float(special_gene_list[x])
        if normal > mutation:
            pass
        elif normal < mutation:
            #predict_sero = predict_sero.strip() + "(O5-)"
            predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
            star = "*"
            #star_line = "Detected the deletion of O5-."
            star_line = "Detected a deletion in gene oafA that causes O5- variant of Typhimurium. "
        else:
            pass
    #special test for Paratyphi B
    # dt+/dt- (d-Tartrate) variant call from the STM3356 marker alleles
    if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
        normal = 0
        mutation = 0
        for x in special_gene_list:
            if "gntR-family-regulatory-protein_dt-positive" in x:
                normal = float(special_gene_list[x])
            elif "gntR-family-regulatory-protein_dt-negative" in x:
                mutation = float(special_gene_list[x])
        #print(normal,mutation)
        if normal > mutation:
            #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
            predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip()
            star = "*"
            #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
            star_line = "The SNP in gene STM3356 that is associated with the d-Tartrate nonfermenting phenotype characteristic of the typhoidal pathotype was not detected. "
        elif normal < mutation:
            #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
            predict_sero = predict_sero.strip()
            star = "*"
            #star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. "
            star_line = "Detected the SNP in gene STM3356 that is associated with the d-Tartrate nonfermenting phenotype characteristic of the typhoidal pathotype. "
        else:
            star = "*"
            #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
            star_line = " " ## ed_SL_05152019: do not report this situation.
    #special test for O13,22 and O13,23
    if Otype=="13":
        #ex_dir = os.path.dirname(os.path.realpath(__file__))
        ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2_db')) # ed_SL_09152019
        # NOTE(review): this pickle handle is never closed — preserved as-is.
        f = open(ex_dir + '/special.pickle', 'rb')
        special = pickle.load(f)
        O22_O23=special['O22_O23']
        if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
            O22_score=0
            O23_score=0
            for x in special_gene_list:
                if "O:22" in x:
                    O22_score = O22_score+float(special_gene_list[x])
                elif "O:23" in x:
                    O23_score = O23_score+float(special_gene_list[x])
            #print(O22_score,O23_score)
            for z in O22_O23[0]:
                if predict_sero.split(" or ")[0] in z:
                    if O22_score > O23_score:
                        star = "*"
                        #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
                        predict_sero = z[0]
                    elif O22_score < O23_score:
                        star = "*"
                        #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
                        predict_sero = z[1]
                    else:
                        star = "*"
                        #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
    if " or " in predict_sero:
        star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + " and can be differentiated by additional analysis. "
    #special test for O6,8
    #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
    #for x in merge_O68_list:
    #    if x in predict_sero:
    #        predict_sero=x
    #        star=""
    #        star_line=""
    #special test for Montevideo; most of them are monophasic
    #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
        #star="*"
        #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
    return predict_form, predict_sero, star, star_line, claim
511 ### End of SeqSero Kmer part | |
512 | |
513 ### Begin of SeqSero2 allele prediction and output | |
def xml_parse_score_comparision_seqsero(xmlfile):
    """Parse a BLAST XML report and rank hits for SeqSero analysis.

    For every query/alignment pair, accumulates a bit score and an identity
    fraction over its HSPs, down-weighting each HSP by the portion of the
    query it covers that was already counted (fixes coverage fractions > 1).

    Returns a list of (query___hit_def, score, identity_fraction,
    covered_query_positions) tuples sorted by score, descending.
    """
    #used to do seqsero xml analysis
    from Bio.Blast import NCBIXML
    # Close the XML handle when done (the original leaked it).
    with open(xmlfile) as xml_handle:
        records = list(NCBIXML.parse(xml_handle))
    List = []
    List_score = []
    List_ids = []
    List_query_region = []
    for record in records:
        for alignment in record.alignments:
            score = 0
            ids = 0
            cover_region = set()  #fixed problem that repeated calculation leading percentage > 1
            List.append(record.query.strip() + "___" + alignment.hit_def)
            for hsp in alignment.hsps:
                temp = set(range(hsp.query_start, hsp.query_end))
                if len(cover_region) == 0:
                    fraction = 1
                else:
                    # weight by the portion of this HSP not already covered
                    fraction = 1 - len(cover_region & temp) / float(len(temp))
                cover_region = cover_region | temp
                # NOTE(review): the original branched on "last"/"first" in the
                # query name, but both branches were byte-identical — collapsed.
                score += hsp.bits * fraction
                ids += float(hsp.identities) / record.query_length * fraction
            List_score.append(score)
            List_ids.append(ids)
            List_query_region.append(cover_region)
    Final_list = sorted(
        zip(List, List_score, List_ids, List_query_region),
        key=lambda d: d[1],
        reverse=True)
    return Final_list
552 | |
553 | |
def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
    """Return (unique_values, counts) for list L.

    unique_values is sorted; counts[j] is the multiplicity of unique_values[j]
    in the input. When sort_on_fre is anything but "none", both outputs are
    reordered by ascending frequency and returned as tuples (matching the
    historical Python 2 zip behaviour).

    NOTE: like the original, this sorts the caller's list in place as a
    side effect (Old and L alias the same object before the dedup step).
    """
    Old=L
    L.sort()
    L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
    count=[]
    for j in range(len(L)):
        y=0
        for x in Old:
            if L[j]==x:
                y+=1
        count.append(y)
    if sort_on_fre!="none":
        # BUG FIX (Python 3): zip() returns an iterator, so the original
        # d[1]/d[0] indexing raised TypeError; materialize the transpose first.
        d=list(zip(*sorted(zip(count, L))))
        L=d[1]
        count=d[0]
    return (L,count)
570 | |
def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
    """Decide whether one contig is fliC or fljB from its head/tail blast scores.

    Sums scores of entries whose name contains "fliC" or "fljB" and returns
    (role, score_gap); ties go to "fliC". A small gap means the call is weak
    and the caller should fall back to whole-contig blast ranking.
    """
    fliC_total = 0
    fljB_total = 0
    for entry in nodes_vs_score_list:
        name, points = entry[0], entry[1]
        if "fliC" in name:
            fliC_total += points
        elif "fljB" in name:
            fljB_total += points
    role = "fliC" if fliC_total >= fljB_total else "fljB"
    return (role, abs(fliC_total - fljB_total))
587 | |
def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
    """Return the flagellin role of node_name from the ranked passed hits.

    Final_list_passed is ordered by score descending, so the first entry
    mentioning the contig wins; its gene-name prefix (text before the first
    '_', i.e. "fliC" or "fljB") is the role. Returns "" when no passed hit
    mentions the contig. Final_list is unused but kept for call compatibility.
    """
    for entry in Final_list_passed:
        if node_name in entry[0]:
            return entry[0].split("_")[0]
    return ""
597 | |
def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
    """Assign a fliC/fljB role to every contig in nodes_list.

    For each contig, gathers the head/tail ("first"/"last") blast hits that
    mention it; the primary call comes from the head/tail score sums, and
    whenever that score gap is small (< 20) the call falls back to the
    whole-contig blast ranking. With no head/tail hit at all, the best
    passed whole-contig hit decides. Returns a list of (role, contig_name)
    tuples; when exactly two contigs were both assigned the same phase, all
    calls are redone from the whole-contig ranking alone.
    """
    #nodes_list is the c created by c,d=Uniq(nodes) in below function
    first_target=""
    role_list=[]
    for x in nodes_list:
        a=[]
        role=""
        # collect the head/tail hits that mention this contig (up to 4:
        # fliC/fljB x first/last)
        for y in tail_head_list:
            if x in y[0]:
                a.append(y)
        if len(a)==4:
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
        elif len(a)==3:
            ###however, if the one with highest score is the fewer one, compare their accumulation score
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
            ###end of above score comparison
        elif len(a)==2:
            #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
            temp=[]
            for z in a:
                temp.append(z[0].split("_")[0])
            m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
            # NOTE(review): both branches below are no-ops — the uniqueness
            # check is informational only and the score-based call follows.
            if len(m)==1:
                pass
            else:
                pass
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
            ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
        elif len(a)==1:
            #that one
            role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
            if diff<20:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
            #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
        else:#a==0
            #use Final_list_passed best match
            for z in Final_list_passed:
                if x in z[0]:
                    role=z[0].split("_")[0]
                    break
        #print x,role,len(a)
        role_list.append((role,x))
    if len(role_list)==2:
        if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
            #just use score to do a final test
            role_list=[]
            for x in nodes_list:
                role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
                role_list.append((role,x))
    return role_list
654 | |
def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
    """Decide which contig carries fliC and which carries fljB.

    Collects the contig names of all passed whole-gene flagellin hits
    (names starting with "fl", excluding the artificial "first"/"last"
    head/tail queries), dedupes them with Uniq, and delegates the
    per-contig phase call to fliC_or_fljB_judge_from_head_tail_sequence.
    Returns the (role, contig) list produced by that helper.
    """
    flagellin_nodes = []
    for hit in Final_list_passed:
        name = hit[0]
        if name.startswith("fl") and "last" not in name and "first" not in name:
            flagellin_nodes.append(name.split("___")[1].strip())
    unique_nodes, _counts = Uniq(flagellin_nodes)  # unique contig names
    head_tail_hits = [hit for hit in Final_list if ("last" in hit[0] or "first" in hit[0])]
    roles = fliC_or_fljB_judge_from_head_tail_sequence(unique_nodes, head_tail_hits, Final_list, Final_list_passed)
    return roles
667 | |
def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
    #decide O based on Final_list
    """Call the O-antigen type from parsed BLAST hits (allele mode).

    Final_list / Final_list_passed entries are tuples whose [0] is the
    allele___contig name string, [1] the blast score, [2] the
    coverage*identity fraction.

    Returns (O_choice, O_nodes_list, special_genes, final_O,
    contamination_O, Otypes_uniq).
    """
    O_choice="?"
    O_list=[]
    special_genes={}
    nodes=[]
    # split passed hits: O-antigen hits contribute their contig node name,
    # anything that is neither "O-" nor "fl*" is kept as a special gene score
    for x in Final_list_passed:
        if x[0].startswith("O-"):
            nodes.append(x[0].split("___")[1].strip())
        elif not x[0].startswith("fl"):
            special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
    #print "special_genes:",special_genes
    c,d=Uniq(nodes)
    #print "potential O antigen contig",c
    final_O=[]
    O_nodes_list=[]
    # keep only the best (first) O hit per contig
    for x in c:#c is the list for contigs
        temp=0
        for y in Final_list_passed:
            if x in y[0] and y[0].startswith("O-"):
                final_O.append(y)
                break
    ### O contig has the problem of two genes on same contig, so do additional test
    potenial_new_gene=""
    for x in final_O:
        pointer=0 #for genes merged or not
        #not consider O-1,3,19_not_in_3,10, too short compared with others
        if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 300*2 (for secureity can use 400*2) as flank region
            pointer=x[0].split("___")[1].strip()#store the contig name
            print(pointer)
        if pointer!=0:#it has potential merge event
            for y in Final_list:
                if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
                    potenial_new_gene=y
                    #print(potenial_new_gene)
                    break
    if potenial_new_gene!="":
        print("two differnt genes in same contig, fix it for O antigen")
        print(potenial_new_gene[:3])
        pointer=0
        # only add the second gene if its contig is already represented in final_O
        for y in final_O:
            if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
                pointer=1
        if pointer!=0: #changed to consider two genes in same contig
            final_O.append(potenial_new_gene)
    ### end of the two genes on same contig test
    final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
    if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
        #print "$$$No Otype, due to no hit"#may need to be changed
        O_choice="-"
    else:
        # contamination guard: signals below 15% of the strongest O coverage are
        # logged but not trusted for the special-family decisions below
        highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
        O_list=[]
        O_list_less_contamination=[]
        for x in final_O:
            if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
                O_list.append(x[0].split("__")[0])
                O_nodes_list.append(x[0].split("___")[1])
                if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
                    O_list_less_contamination.append(x[0].split("__")[0])
        ### special test for O9,46 and O3,10 family
        if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
            if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
                O_choice="O-9,46"
                #print "$$$Most possilble Otype: O-9,46"
            elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
                O_choice="O-9,46,27"
                #print "$$$Most possilble Otype: O-9,46,27"
            else:
                O_choice="O-9"#next, detect O9 vs O2?
                O2=0
                O9=0
                # tyr marker scores discriminate O-9 from O-2; keep the max per marker
                for z in special_genes:
                    if "tyr-O-9" in z and special_genes[z] > O9: ##20240322, add "special_genes[z] > O9" to avoid misidentification of O9 to O2 that caused by multiple tyr-O-9 contigs
                        O9=special_genes[z]
                    elif "tyr-O-2" in z and special_genes[z] > O2: ##20240322, add "special_genes[z] > O2"
                        O2=special_genes[z]
                if O2>O9:
                    O_choice="O-2"
                elif O2<O9:
                    pass
                else:
                    pass
                    #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
        elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
            if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
                O_choice="O-3,10"
                #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
            else:
                O_choice="O-1,3,19"
                #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
        ### end of special test for O9,46 and O3,10 family
        else:
            # generic fallback: best-scoring O allele above the coverage cut-off
            try:
                max_score=0
                for x in final_O:
                    if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
                        max_score=x[2]#change from x[-1] to x[2],08172018
                        O_choice=x[0].split("_")[0]
                if O_choice=="O-1,3,19":
                    O_choice=final_O[1][0].split("_")[0]
                #print "$$$Most possilble Otype: ",O_choice
            except:
                pass
                #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
    if O_choice=="O-9,46,27" and len(O_list)==2 and "O-4_wzx" in O_list: #special for very low chance sitatuion between O4 and O9,27,46, this is for serotypes like Bredeney and Schwarzengrund (normallly O-4 will have higher score, but sometimes sequencing quality may affect the prediction)
        O_choice="O-4"
    #print "O:",O_choice,O_nodes_list
    # normalize allele names to O-group names for contamination analysis
    Otypes=[]
    for x in O_list:
        if x!="O-1,3,19_not_in_3,10":
            if "O-9,46_" not in x:
                Otypes.append(x.split("_")[0])
            else:
                Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
    #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
    Otypes_uniq,Otypes_fre=Uniq(Otypes)
    contamination_O=""
    # multi-gene O groups legitimately produce two distinct signals, so they
    # get a looser (>2) contamination threshold than single-gene groups (>1)
    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
        if len(Otypes_uniq)>2:
            contamination_O="potential contamination from O antigen signals"
    else:
        if len(Otypes_uniq)>1:
            if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
                contamination_O=""
            elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
                contamination_O=""
            else:
                contamination_O="potential contamination from O antigen signals"
    return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
798 ### End of SeqSero2 allele prediction and output | |
799 | |
def get_input_files(make_dir,input_file,data_type,dirpath):
    """Stage raw input files inside make_dir and return their local names.

    data_type: '1' interleaved paired-end, '2' separated paired-end,
    '3' single-end, '4' assembly, '5' nanopore fasta, '6' nanopore fastq.
    The cwd is changed into make_dir for the staging commands and then
    moved up one level ("..") before returning.
    Returns (for_fq, rev_fq); rev_fq is "" when there is no reverse file.
    """
    for_fq = ""
    rev_fq = ""
    os.chdir(make_dir)
    if data_type == "1":
        # interleaved pairs: split into _1/_2 fastq files
        base = input_file[0].split("/")[-1]
        if base.endswith(".sra"):
            subprocess.check_call("fastq-dump --split-files "+base,shell=True)
            for_fq = base.replace(".sra","_1.fastq")
            rev_fq = base.replace(".sra","_2.fastq")
        else:
            core_id = base.split(".fastq")[0].split(".fq")[0]
            for_fq = core_id+"_1.fastq"
            rev_fq = core_id+"_2.fastq"
            if base.endswith(".gz"):
                subprocess.check_call("gzip -dc "+base+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
            else:
                subprocess.check_call("cat "+base+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
    elif data_type == "2":
        # already-separated pairs: just use the file names
        for_fq = input_file[0].split("/")[-1]
        rev_fq = input_file[1].split("/")[-1]
    elif data_type == "3":
        base = input_file[0].split("/")[-1]
        if base.endswith(".sra"):
            subprocess.check_call("fastq-dump --split-files "+base,shell=True)
            for_fq = base.replace(".sra","_1.fastq")
        else:
            for_fq = base
    elif data_type in ("4","5","6"):
        # assembly / nanopore input: single file, used as-is
        for_fq = input_file[0].split("/")[-1]
    os.chdir("..")
    return for_fq,rev_fq
834 | |
def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
    #get O and H types from Final_list from blast parsing; allele mode
    """Predict O, fliC and fljB antigen types from BLAST hits (allele mode).

    Appends detail to SeqSero_log.txt and Extracted_antigen_alleles.fasta
    in the current directory. Returns (O_choice, fliC_choice, fljB_choice,
    special_gene_list, contamination_O, contamination_H, Otypes_uniq,
    H1_cont_stat_list, H2_cont_stat_list).
    """
    from Bio import SeqIO
    fliC_choice="-"
    fljB_choice="-"
    fliC_contig="NA"
    fljB_contig="NA"
    fliC_region=set([0])
    fljB_region=set([0,])
    fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
    fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
    O_choice="-"#no need to decide O contig for now, should be only one
    O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
    O_choice=O_choice.split("-")[-1].strip()
    if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
        O_choice="-"
    H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
    #add alignment locations, used for further selection, 03312019
    for i in range(len(H_contig_roles)):
        x=H_contig_roles[i]
        for y in Final_list_passed:
            if x[1] in y[0] and y[0].startswith(x[0]):
                # NOTE(review): `+=` here appends the existing (role, contig)
                # pair again before the location set, producing
                # (role, contig, role, contig, locations). Downstream code only
                # reads [0], [1] and [-1], so this is harmless, but plain `=`
                # was probably intended — confirm before changing.
                H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
                break
    log_file=open("SeqSero_log.txt","a")
    extract_file=open("Extracted_antigen_alleles.fasta","a")
    handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))

    #print("O_contigs:")
    log_file.write("O_contigs:\n")
    extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
    extract_file.write("#O_contigs:\n")
    # log each O hit and dump the matching contig sequence
    for x in O_nodes_roles:
        if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
            #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
            log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
            title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
            seqs=""
            for z in handle_fasta:
                if x[0].split("___")[-1]==z.description:
                    seqs=str(z.seq)
            extract_file.write(title+seqs+"\n")
    if len(H_contig_roles)!=0:
        highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
    else:
        highest_H_coverage=0
    # pick the fliC and fljB contigs: longest alignment, not an O contig,
    # and above 13% of the strongest H coverage
    for x in H_contig_roles:
        #if multiple choices, temporately select the one with longest length for now, will revise in further change
        if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
            fliC_contig=x[1]
            fliC_length=len(x[-1])
        elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
            fljB_contig=x[1]
            fljB_length=len(x[-1])
    # translate chosen contigs into allele names (first matching passed hit wins)
    for x in Final_list_passed:
        if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
            fliC_choice=x[0].split("_")[1]
        elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
            fljB_choice=x[0].split("_")[1]
        elif fliC_choice!="-" and fljB_choice!="-":
            break
    #now remove contigs not in middle core part
    first_allele="NA"
    first_allele_percentage=0
    for x in Final_list:
        if x[0].startswith("fliC") or x[0].startswith("fljB"):
            first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
            first_allele_percentage=x[2]
            break
    additional_contigs=[]
    for x in Final_list:
        if first_allele in x[0]:
            if (fliC_contig == x[0].split("___")[-1]):
                fliC_region=x[3]
            elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
                fljB_region=x[3]
            else:
                if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
                    additional_contigs.append(x)
                #else:
                    #print x[:3]
    #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
    if first_allele_percentage>0.9:
        if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
            target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
        elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
            target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
        else:
            target_region=set()#doesn't do anything
    else:
        target_region=set()#doesn't do anything
    #print(target_region)
    #print(additional_contigs)
    target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
    target_region=target_region2|target_region
    # drop H candidates whose alignment is mostly flanking (non-middle) region
    for x in additional_contigs:
        removal=0
        contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
        if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
            removal=1
        else:
            if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
                removal=1
            else:
                pass
        if removal==1:
            # NOTE(review): removing from H_contig_roles while iterating it in
            # this inner loop can skip elements after a removal — confirm intent.
            for y in H_contig_roles:
                if y[1] in x[0]:
                    H_contig_roles.remove(y)
        else:
            pass
            #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
    #end of removing none-middle contigs
    #print("H_contigs:")
    log_file.write("H_contigs:\n")
    extract_file.write("#H_contigs:\n")
    H_contig_stat=[]
    H1_cont_stat={}
    H2_cont_stat={}
    # log each H hit; accumulate per-allele identity sums for contamination check
    for i in range(len(H_contig_roles)):
        x=H_contig_roles[i]
        a=0
        for y in Final_list_passed:
            if x[1] in y[0] and y[0].startswith(x[0]):
                if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
                    for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
                        if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
                            #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
                            log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
                            H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
                            title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
                            seqs=""
                            for z in handle_fasta:
                                if x[1]==z.description:
                                    seqs=str(z.seq)
                            extract_file.write(title+seqs+"\n")
                            break
                else:
                    #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
                    log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
                    title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
                    seqs=""
                    for z in handle_fasta:
                        if x[1]==z.description:
                            seqs=str(z.seq)
                    extract_file.write(title+seqs+"\n")
                    if x[0]=="fliC":
                        if y[0].split("_")[1] not in H1_cont_stat:
                            H1_cont_stat[y[0].split("_")[1]]=y[2]
                        else:
                            H1_cont_stat[y[0].split("_")[1]]+=y[2]
                    if x[0]=="fljB":
                        if y[0].split("_")[1] not in H2_cont_stat:
                            H2_cont_stat[y[0].split("_")[1]]=y[2]
                        else:
                            H2_cont_stat[y[0].split("_")[1]]+=y[2]
                break
    #detect contaminations
    #print(H1_cont_stat)
    #print(H2_cont_stat)
    H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
    H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
    contamination_H=""
    if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
        contamination_H="potential contamination from H antigen signals"
    elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
        contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
    #get additional antigens
    """
    if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
      if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
        O_choice="O-9,46"
        #print "$$$Most possilble Otype: O-9,46"
      elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
        O_choice="O-9,46,27"
        #print "$$$Most possilble Otype: O-9,46,27"
    elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
      if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
        O_choice="O-3,10"
        #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
      else:
        O_choice="O-1,3,19"
        #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
    ### end of special test for O9,46 and O3,10 family

    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
      if len(Otypes_uniq)>2:
        contamination_O="potential contamination from O antigen signals"
    else:
      if len(Otypes_uniq)>1:
        if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
          contamination_O=""
        elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
          contamination_O=""
    """
    additonal_antigents=[]
    #print(contamination_O)
    #print(contamination_H)
    log_file.write(contamination_O+"\n")
    log_file.write(contamination_H+"\n")
    log_file.close()
    return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
1037 | |
def get_input_K(input_file,lib_dict,data_type,k_size):
    """Extract library-matching kmers from the input (kmer mode).

    Builds the union of every allele's kmers in lib_dict, then kmerizes
    the input with the extractor that matches data_type.
    """
    library_kmers = set()
    for allele in lib_dict:
        library_kmers.update(lib_dict[allele])
    if data_type == '4':                        # genome assembly
        input_Ks = target_multifasta_kmerizer(input_file, k_size, library_kmers)
    elif data_type in ('1', '2', '3'):          # illumina reads; set it for now, will change later
        input_Ks = target_read_kmerizer(input_file, k_size, library_kmers)
    elif data_type == '5':                      # minion_2d_fasta
        #input_Ks = minion_fasta_kmerizer(input_file, k_size, library_kmers)
        input_Ks = target_multifasta_kmerizer(input_file, k_size, library_kmers)  # ed_SL_08172020: change for nanopore workflow
    if data_type == '6':                        # minion_2d_fastq
        input_Ks = minion_fastq_kmerizer(input_file, k_size, library_kmers)
    return input_Ks
1053 | |
def get_kmer_dict(lib_dict,input_Ks):
    """Score each library allele against the input kmer set (kmer mode).

    Score = percentage of the allele's kmers found in input_Ks. Returns
    (O_dict, H_dict, Special_dict) keeping alleles above per-category
    thresholds: >25 for "O-", >40 for "fl", >1 for everything else.
    """
    O_dict = {}
    H_dict = {}
    Special_dict = {}
    for allele, allele_kmers in lib_dict.items():
        pct = (len(allele_kmers & input_Ks) / len(allele_kmers)) * 100
        # Arbitrary cut-off for similarity score very low but seems necessary
        # to detect O-3,10 in some cases
        if pct <= 1:
            continue
        if allele.startswith('O-') and pct > 25:
            O_dict[allele] = pct
        if allele.startswith('fl') and pct > 40:
            H_dict[allele] = pct
        if allele[:2] != 'fl' and allele[:2] != 'O-':
            Special_dict[allele] = pct
    return O_dict, H_dict, Special_dict
1069 | |
def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
    """Call O, fliC and fljB types from kmer-mode score dicts.

    Appends all scores to SeqSero_log.txt in the current directory and
    returns (highest_O, highest_fliC, highest_fljB).
    make_dir is currently unused by this function.
    """
    log_file=open("SeqSero_log.txt","a")
    log_file.write("O_scores:\n")
    #call O:
    highest_O = '-'
    if len(O_dict) == 0:
        pass
    else:
        for x in O_dict:
            log_file.write(x+"\t"+str(O_dict[x])+"\n")
        # special-case the O-9,46 family first (wbaV signal above 70)
        if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
            #if 'O-9,46_wzy__1191' in O_dict or "O-9,46_wzy_partial__216" in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
            #modified to fix miscall of O-9,46
            if ('O-9,46_wzy__1191' in O_dict and O_dict['O-9,46_wzy__1191']>40) or ("O-9,46_wzy_partial__216" in O_dict and O_dict["O-9,46_wzy_partial__216"]>40): # and float(O946_wzy)/float(num_1) > 0.1
                highest_O = "O-9,46"
            elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
                highest_O = "O-9,46,27"
            else:
                highest_O = "O-9" # next, detect O9 vs O2?
                O2 = 0
                O9 = 0
                # tyr marker scores discriminate O-9 from O-2
                for z in Special_dict:
                    if "tyr-O-9" in z:
                        O9 = float(Special_dict[z])
                    if "tyr-O-2" in z:
                        O2 = float(Special_dict[z])
                if O2 > O9:
                    highest_O = "O-2"
        elif ("O-3,10_wzx__1539" in O_dict) and (
            "O-9,46_wzy__1191" in O_dict
        ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
            if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
                highest_O = "O-3,10"
            else:
                highest_O = "O-1,3,19"
        ### end of special test for O9,46 and O3,10 family
        else:
            # generic fallback: highest-scoring O allele
            try:
                max_score = 0
                for x in O_dict:
                    if float(O_dict[x]) >= max_score:
                        max_score = float(O_dict[x])
                        #highest_O = x.split("_")[0]
                        # ed_SL_12182019: modified to fix the O-9,46 error example1
                        if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
                            highest_O = "O-9"
                        else:
                            highest_O = x.split("_")[0]
                if highest_O == "O-1,3,19":
                    # re-run the scan excluding the small O-1,3,19 marker allele
                    highest_O = '-'
                    max_score = 0
                    for x in O_dict:
                        if x == 'O-1,3,19_not_in_3,10__130':
                            pass
                        else:
                            if float(O_dict[x]) >= max_score:
                                max_score = float(O_dict[x])
                                #highest_O = x.split("_")[0]
                                # ed_SL_12182019: modified to fix the O-9,46 error example1
                                if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
                                    highest_O = "O-9"
                                else:
                                    highest_O = x.split("_")[0]
            except:
                pass
    #call_fliC:
    if len(H_dict)!=0:
        highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
    else:
        highest_H_score_both_BC=0
    highest_fliC = '-'
    highest_fliC_raw = '-'
    highest_Score = 0
    log_file.write("\nH_scores:\n")
    for s in H_dict:
        log_file.write(s+"\t"+str(H_dict[s])+"\n")
        if s.startswith('fliC'):
            if float(H_dict[s]) > highest_Score:
                highest_fliC = s.split('_')[1]
                highest_fliC_raw = s
                highest_Score = float(H_dict[s])
    #call_fljB
    highest_fljB = '-'
    highest_fljB_raw = '-'
    highest_Score = 0
    for s in H_dict:
        if s.startswith('fljB'):
            if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
                #highest_fljB = s.split('_')[1]
                #highest_fljB_raw = s
                #highest_Score = float(H_dict[s])
                if s.split('_')[1]!=highest_fliC:
                    highest_fljB = s.split('_')[1]
                    highest_fljB_raw = s
                    highest_Score = float(H_dict[s])
    log_file.write("\nSpecial_scores:\n")
    for s in Special_dict:
        log_file.write(s+"\t"+str(Special_dict[s])+"\n")
    log_file.close()
    return highest_O,highest_fliC,highest_fljB
1170 | |
def get_temp_file_names(for_fq,rev_fq):
    """Derive the intermediate file names used by allele mode (-a).

    Returns (sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq,
    for_sai, rev_sai), all built from the forward/reverse fastq names.
    """
    return (
        for_fq + ".sam",           # bwa alignment output
        for_fq + ".bam",
        for_fq + "_sorted.bam",
        for_fq + "_mapped.fq",     # mapped forward reads
        rev_fq + "_mapped.fq",     # mapped reverse reads
        for_fq + "_combined.fq",
        for_fq + ".sai",           # bwa aln index files (samse/sampe mode)
        rev_fq + ".sai",
    )
1182 | |
def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
    #seqsero2 -a; do mapping and sort
    """Index the allele database, map reads with bwa, and name-sort the hits.

    mapping_mode "mem" uses `bwa mem`; "sam" uses `bwa aln` + sampe (paired,
    fnameB != "") or samse (single-end, fnameB == ""). Writes sam/bam/
    sorted_bam in the cwd; command stderr is appended to data_log.txt.
    """
    print("building database...")
    subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
    print("mapping...")
    if mapping_mode=="mem":
        subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
    elif mapping_mode=="sam":
        if fnameB!="":
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
            subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
        else:
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
            # BUG FIX: this command previously referenced the undefined global
            # `for_fq` (NameError) and lacked shell=True, so single-end "sam"
            # mode could never run; use fnameA and match the sibling calls.
            subprocess.check_call("bwa samse "+database+" "+for_sai+" "+fnameA+" > "+sam+ " 2>> data_log.txt",shell=True)
    subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
    ### check the version of samtools then use differnt commands
    # samtools changed its sort CLI at 1.3: old versions take an output
    # prefix, new versions write to stdout
    samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, err = samtools_version.communicate()
    version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
    print("check samtools version:",version)
    ### end of samtools version check and its analysis
    if LooseVersion(version)<=LooseVersion("1.2"):
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
    else:
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
1209 | |
def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode,phred_offset):
    #seqsero2 -a; extract, assembly and blast
    """Extract mapped reads, micro-assemble them with SPAdes, and BLAST the
    database against the resulting contigs.

    Returns (xmlfile, new_fasta). When too few reads mapped (combined fastq
    <= 100 bytes, or paired fastq missing), xmlfile is "NA" and new_fasta
    names a file that was never created.
    """
    subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
    #print("fnameA:",fnameA)
    #print("fnameB:",fnameB)
    if fnameB!="":
        subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
    outdir=current_time+"_temp"
    print("assembling...")
    # extracted read sets are small, so cap assembly at 4 threads
    if int(threads)>4:
        t="4"
    else:
        t=threads
    # BUG FIX: compute new_fasta unconditionally — previously it was only
    # assigned inside the success branch, so the "-:-:-" (too few reads)
    # path raised UnboundLocalError at the return statement.
    new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
    if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
        if phred_offset == 'auto':
            phred_offset = ''
        else:
            phred_offset = '--phred-offset ' + phred_offset
        if fnameB!="":
            subprocess.check_call("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
        else:
            subprocess.check_call("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
        subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
        subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
        print("blasting...","\n")
        xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
        subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
        subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
    else:
        xmlfile="NA"
    return xmlfile,new_fasta
1248 | |
def judge_subspecies(fnameA):
    #seqsero2 -a; judge subspecies on just forward raw reads fastq
    """Predict the Salmonella subspecies by running SalmID on the forward reads.

    Parses SalmID's tab-separated stdout: row 0 holds the subspecies names,
    row 1 the scores; columns 4/5 are the bongori/enterica species scores and
    columns 6+ the per-subspecies scores. Returns the subspecies label
    (e.g. "I"), "bongori", "enterica", or "-" when no score reaches 60.
    """
    salmID_output=subprocess.Popen("SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, err = salmID_output.communicate()
    out=out.decode("utf-8")
    file=open("data_log.txt","a")
    file.write(out)
    file.close()
    salm_species_scores=out.split("\n")[1].split("\t")[6:]
    salm_species_results=out.split("\n")[0].split("\t")[6:]
    max_score=0
    max_score_index=1 #default is 1, means "I"
    # pick the highest-scoring subspecies column
    for i in range(len(salm_species_scores)):
        if max_score<float(salm_species_scores[i]):
            max_score=float(salm_species_scores[i])
            max_score_index=i
    prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
    #if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
    if float(out.split("\n")[1].split("\t")[4]) > 10 and float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): ## ed_SL_0318: change SalmID_ssp_threshold
        prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
    #if max_score<10: ## ed_SL_0318: change SalmID_ssp_threshold
    if max_score<60:
        prediction="-"
    ## ed_SL_0818: add for enterica
    if float(out.split("\n")[1].split("\t")[5]) > 10 and float(out.split("\n")[1].split("\t")[5]) > float(out.split("\n")[1].split("\t")[4]):
        prediction="enterica"
    ##
    return prediction
1277 | |
def judge_subspecies_Kmer(Special_dict):
    """Predict subspecies from k-mer marker hits (seqsero2 -k mode).

    Scans keys containing 'mer' whose score exceeds 60, keeps the
    highest-scoring subspecies suffix, and short-circuits to 'bongori'
    as soon as a bongori marker scores above 95. Returns '-' when no
    marker passes the threshold.
    """
    best_score = 0
    prediction = "-"  # default should be I
    for marker, raw_score in Special_dict.items():
        if "mer" not in marker:
            continue
        score = float(raw_score)
        if score <= 60:  ## ed_SL_0318: change ssp_threshold
            continue
        ssp = marker.split("_")[-1].strip()
        if best_score < score:
            best_score = score
            prediction = ssp
        # A strong bongori hit is definitive; no need to test enterica markers.
        if ssp == "bongori" and score > 95:
            prediction = "bongori"
            break
    return prediction
1292 | |
1293 ## ed_SL_11232019: add notes for missing antigen | |
## ed_SL_11232019: add notes for missing antigen
def check_antigens(ssp, O_antigen, H1_antigen, H2_antigen, NA_note):
    """Build a cautionary note describing missing serotype antigens.

    ``ssp`` is the subspecies pointer ('-' = not Salmonella, 'enterica' =
    genus-only call); each antigen argument is '-' when undetected.
    Returns a tuple ``(antigen_note, NA_note)``; ``NA_note`` is cleared
    whenever any special note applies.
    """
    antigen_note = ''
    has_O = O_antigen != '-'
    has_H1 = H1_antigen != '-'
    has_H2 = H2_antigen != '-'
    if ssp == '-':
        # No subspecies marker at all: likely not Salmonella.
        antigen_note = 'The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. '
        NA_note = ''
    elif has_O and not has_H1 and not has_H2:  # O:-:-
        antigen_note = 'H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. '
        NA_note = ''
    elif has_O and not has_H1 and has_H2:  # O:-:H2
        antigen_note = 'fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. '
        NA_note = ''
    elif not has_O and has_H1:  # -:H1:X
        antigen_note = 'O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. '
        NA_note = ''
    elif not has_O and not has_H1 and not has_H2:  # -:-:-
        antigen_note = 'No serotype antigens were detected. This is an atypical result that should be further investigated. '
        NA_note = ''
    if ssp == 'enterica':
        antigen_note += 'Subspecies identification of the input genome cannot be definitively determined. '
        NA_note = ''
    return (antigen_note, NA_note)
1320 | |
1321 ## ed_SL_06062020: rename subspecies ID | |
# Map short subspecies codes (as produced by judge_subspecies /
# judge_subspecies_Kmer) to the full display names printed in reports.
# '-' maps to itself so undetermined calls pass through unchanged.
subspecies_ID_dir = {'I': 'Salmonella enterica subspecies enterica (subspecies I)',
                     'II': 'Salmonella enterica subspecies salamae (subspecies II)',
                     'IIIa': 'Salmonella enterica subspecies arizonae (subspecies IIIa)',
                     'IIIb': 'Salmonella enterica subspecies diarizonae (subspecies IIIb)',
                     'IV': 'Salmonella enterica subspecies houtenae (subspecies IV)',
                     'VI': 'Salmonella enterica subspecies indica (subspecies VI)',
                     'VII': 'Salmonella enterica subspecies VII (subspecies VII)',
                     'bongori': 'Salmonella bongori',
                     'enterica': 'Salmonella enterica',
                     '-': '-'}
1332 ## | |
1333 | |
1334 ## ed_SL_08172020: format check for fasta or fastq in nanopore workflow, convert raw reads fastq to fasta | |
def format_check(input_file):
    """Detect fasta/fastq format for the nanopore workflow.

    Returns the path to a fasta file: ``input_file`` itself when it already
    is fasta, or a seqtk-converted copy named ``<input_file>.fasta`` when it
    is fastq.

    Raises:
        ValueError: when the first line is neither fasta ('>') nor fastq ('@').
            (The original printed a warning and then crashed with
            UnboundLocalError on the return; raise explicitly instead.)
        subprocess.CalledProcessError: when the seqtk conversion fails.
    """
    # Use a context manager so the handle is closed (original leaked it).
    with open(input_file, 'r') as handle:
        line = handle.readline()
    if line.startswith('>'):
        output_file = input_file
    elif line.startswith('@'):
        # Convert fastq reads to fasta with seqtk for the assembly workflow.
        input_file_fa = input_file + '.fasta'
        subprocess.check_call("seqtk seq -A " + input_file + " > " + input_file_fa, shell=True)
        output_file = input_file_fa
    else:
        print('please check the format of input files')
        raise ValueError('please check the format of input files')
    return (output_file)
1346 ## | |
1347 | |
def main():
    """SeqSero2 command-line entry point.

    Combines the microassembly workflow ('-m a'), the k-mer workflow
    ('-m k'), and SalmID-based subspecies calling; writes
    SeqSero_result.txt and SeqSero_result.tsv into the output directory
    (unless '-c' clean mode removes it) and prints the same report.

    Fixes relative to the original:
      * the antigens.pickle handle was never closed ('f.close' without
        parentheses) -- now a context manager;
      * H1/H2 contamination lists were only initialized in the 'except'
        branch, so a normal 'NA' return from micro-assembly raised
        NameError at the H_list comprehension -- now initialized up front;
      * data_log.txt is written via a context manager.
    """
    #combine SeqSeroK and SeqSero2, also with SalmID
    args = parse_args()
    input_file = args.i
    data_type = args.t
    analysis_mode = args.m
    mapping_mode = args.b
    threads = args.p
    make_dir = args.d
    clean_mode = args.c
    sample_name = args.n
    ingore_header = args.s
    phred_offset = args.phred_offset
    k_size = 27 #will change for bug fixing
    dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'seqsero2_db')) # ed_SL_09152019: add ex_dir for packaging
    seqsero2_db = ex_dir + "/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging
    database = "H_and_O_and_specific_genes.fasta"
    note = "Note: "
    NA_note = "This predicted serotype is not in the Kauffman-White scheme. " # ed_SL_09272019: add for new output format
    if len(sys.argv) == 1:
        # No arguments at all: show the usage text and exit.
        subprocess.check_call(dirpath + "/SeqSero2_package.py -h", shell=True)#change name of python file
    else:
        # Unique output directory name when '-d' was not given.
        request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
        request_id += str(random.randint(1, 10000000))
        if make_dir is None:
            make_dir = "SeqSero_result_" + request_id
        make_dir = os.path.abspath(make_dir)
        if os.path.isdir(make_dir):
            pass
        else:
            subprocess.check_call("mkdir -p " + make_dir, shell=True)
        # Symlink the database and the inputs into the work directory.
        subprocess.check_call("ln -f -s " + seqsero2_db + " " + " ".join(input_file) + " " + make_dir, shell=True) # ed_SL_11092019: change path to database for packaging
        ############################begin the real analysis
        if analysis_mode == "a":
            if data_type in ["1", "2", "3"]:#use allele mode
                for_fq, rev_fq = get_input_files(make_dir, input_file, data_type, dirpath)
                os.chdir(make_dir)
                fnameA = for_fq.split("/")[-1]
                fnameB = rev_fq.split("/")[-1]
                current_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
                sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq, for_sai, rev_sai = get_temp_file_names(fnameA, fnameB) #get temp files id
                map_and_sort(threads, database, fnameA, fnameB, sam, bam, for_sai, rev_sai, sorted_bam, mapping_mode) #do mapping and sort
                ### avoid error out when micro assembly fails. ed_SL_03172020
                # BUGFIX: initialize up front; previously these were only set in the
                # except branch, so a normal "NA" return from micro-assembly raised
                # NameError at the H_list comprehension below.
                H1_cont_stat_list = []
                H2_cont_stat_list = []
                try:
                    xmlfile, new_fasta = extract_mapped_reads_and_do_assembly_and_blast(current_time, sorted_bam, combined_fq, mapped_fq1, mapped_fq2, threads, fnameA, fnameB, database, mapping_mode, phred_offset) #extract the mapped reads and do micro assembly and blast
                except (UnboundLocalError, subprocess.CalledProcessError):
                    xmlfile = "NA"
                ###
                if xmlfile == "NA":
                    O_choice, fliC_choice, fljB_choice, special_gene_list, contamination_O, contamination_H = ("-", "-", "-", [], "", "")
                else:
                    Final_list = xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
                    with open("data_log.txt", "a") as log_file:
                        for x in Final_list:
                            log_file.write("\t".join(str(y) for y in x) + "\n")
                    Final_list_passed = [x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0]) >= 0.9 and (x[1] >= int(x[0].split("__")[1]) or x[1] >= int(x[0].split("___")[1].split("_")[3]) or x[1] > 1000)]
                    O_choice, fliC_choice, fljB_choice, special_gene_list, contamination_O, contamination_H, Otypes_uniq, H1_cont_stat_list, H2_cont_stat_list = predict_O_and_H_types(Final_list, Final_list_passed, new_fasta) #predict O, fliC and fljB
                subspecies = judge_subspecies(fnameA) #predict subspecies
                ### ed_SL_06062020: correction VIII -> II
                if subspecies == 'VIII':
                    subspecies = 'II'
                ### ed_SL_08132020: correction VII -> IV, according to CDC's suggestion
                if subspecies == 'VII':
                    subspecies = 'IV'
                    note += 'SalmID reports this as ssp VII, which has not been formally recognized. '
                ###
                ### ed_SL_08182020: change serotype ouput for genome without definitive subspecies ID
                ssp_pointer = subspecies
                if subspecies == 'enterica':
                    subspecies = '-'
                ###
                ###output
                predict_form, predict_sero, star, star_line, claim = seqsero_from_formula_to_serotypes(O_choice, fliC_choice, fljB_choice, special_gene_list, subspecies)
                claim = "" #04132019, disable claim for new report requirement
                contamination_report = ""
                H_list = ["fliC_" + x for x in H1_cont_stat_list if len(x) > 0] + ["fljB_" + x for x in H2_cont_stat_list if len(x) > 0]
                if contamination_O != "" and contamination_H == "":
                    contamination_report = "#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:" + "\t".join(Otypes_uniq) + "."
                elif contamination_O == "" and contamination_H != "":
                    contamination_report = "#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:" + "\t".join(H_list) + "."
                elif contamination_O != "" and contamination_H != "":
                    contamination_report = "#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:" + "\t".join(Otypes_uniq) + ". All H-antigens detected:" + "\t".join(H_list) + "."
                if contamination_report != "":
                    # The detailed reports above are kept for debugging; the public
                    # message points at Extracted_antigen_alleles.fasta instead.
                    contamination_report = "Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. "
                ### ed_SL_11232019: add notes for missing antigen
                if O_choice == "":
                    O_choice = "-"
                antigen_note, NA_note = check_antigens(ssp_pointer, O_choice, fliC_choice, fljB_choice, NA_note)
                if sample_name:
                    print("Sample name:\t" + sample_name)
                ###
                if clean_mode:
                    subprocess.check_call("rm -rf " + make_dir, shell=True)
                    make_dir = "none-output-directory due to '-c' flag"
                else:
                    new_file = open("SeqSero_result.txt", "w")
                    ### ed_SL_01152020: add new output
                    conta_note = "yes" if "inter-serotype contamination" in contamination_report else "no"
                    tsv_file = open("SeqSero_result.tsv", "w")
                    if ingore_header:
                        pass
                    else:
                        tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted identification\tPredicted antigenic profile\tPredicted serotype\tPotential inter-serotype contamination\tNote\n")
                    if sample_name:
                        new_file.write("Sample name:\t" + sample_name + "\n")
                        tsv_file.write(sample_name + '\t')
                    else:
                        tsv_file.write(input_file[0].split('/')[-1] + '\t')
                    ###
                    if "N/A" not in predict_sero:
                        new_file.write("Output directory:\t" + make_dir + "\n" +
                                       "Input files:\t" + "\t".join(input_file) + "\n" +
                                       "O antigen prediction:\t" + O_choice + "\n" +
                                       "H1 antigen prediction(fliC):\t" + fliC_choice + "\n" +
                                       "H2 antigen prediction(fljB):\t" + fljB_choice + "\n" +
                                       "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                                       "Predicted antigenic profile:\t" + predict_form + "\n" +
                                       "Predicted serotype:\t" + predict_sero + "\n" +
                                       note + contamination_report + star_line + claim + antigen_note + "\n")#+##
                        tsv_file.write(make_dir + "\t" + " ".join(input_file) + "\t" + O_choice + "\t" + fliC_choice + "\t" + fljB_choice + "\t" + subspecies_ID_dir[ssp_pointer] + "\t" + predict_form + "\t" + predict_sero + "\t" + conta_note + "\t" + contamination_report + star_line + claim + antigen_note + "\n")
                    else:
                        new_file.write("Output directory:\t" + make_dir + "\n" +
                                       "Input files:\t" + "\t".join(input_file) + "\n" +
                                       "O antigen prediction:\t" + O_choice + "\n" +
                                       "H1 antigen prediction(fliC):\t" + fliC_choice + "\n" +
                                       "H2 antigen prediction(fljB):\t" + fljB_choice + "\n" +
                                       "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                                       "Predicted antigenic profile:\t" + predict_form + "\n" +
                                       "Predicted serotype:\t" + subspecies + ' ' + predict_form + "\n" + # add serotype output for "N/A" prediction, add subspecies
                                       note + NA_note + contamination_report + star_line + claim + antigen_note + "\n")#+##
                        tsv_file.write(make_dir + "\t" + " ".join(input_file) + "\t" + O_choice + "\t" + fliC_choice + "\t" + fljB_choice + "\t" + subspecies_ID_dir[ssp_pointer] + "\t" + predict_form + "\t" + subspecies + ' ' + predict_form + "\t" + conta_note + "\t" + NA_note + contamination_report + star_line + claim + antigen_note + "\n")
                    new_file.close()
                    tsv_file.close()
                    # Best-effort cleanup of intermediate files; errors ignored.
                    subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt " + fnameA + "*_db* 2> /dev/null", shell=True)
                if "N/A" not in predict_sero:
                    print("Output directory:\t" + make_dir + "\n" +
                          "Input files:\t" + "\t".join(input_file) + "\n" +
                          "O antigen prediction:\t" + O_choice + "\n" +
                          "H1 antigen prediction(fliC):\t" + fliC_choice + "\n" +
                          "H2 antigen prediction(fljB):\t" + fljB_choice + "\n" +
                          "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                          "Predicted antigenic profile:\t" + predict_form + "\n" +
                          "Predicted serotype:\t" + predict_sero + "\n" +
                          note + contamination_report + star_line + claim + antigen_note + "\n")#+##
                else:
                    print("Output directory:\t" + make_dir + "\n" +
                          "Input files:\t" + "\t".join(input_file) + "\n" +
                          "O antigen prediction:\t" + O_choice + "\n" +
                          "H1 antigen prediction(fliC):\t" + fliC_choice + "\n" +
                          "H2 antigen prediction(fljB):\t" + fljB_choice + "\n" +
                          "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                          "Predicted antigenic profile:\t" + predict_form + "\n" +
                          "Predicted serotype:\t" + subspecies + ' ' + predict_form + "\n" + # add serotype output for "N/A" prediction, subspecies
                          note + NA_note + contamination_report + star_line + claim + antigen_note + "\n")
            else:
                print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
        elif analysis_mode == "k":
            ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'seqsero2_db')) # ed_SL_09152019: change ex_dir for packaging
            for_fq, rev_fq = get_input_files(make_dir, input_file, data_type, dirpath)
            input_file = for_fq #-k will just use forward because not all reads were used
            os.chdir(make_dir)
            ### ed_SL_08182020: use assembly workflow for nanopore fastq, convert fastq to fasta
            if data_type == "5":
                input_file = format_check(for_fq)
            ###
            # BUGFIX: the original called 'f.close' without parentheses, so the
            # pickle handle was never closed; use a context manager instead.
            with open(ex_dir + '/antigens.pickle', 'rb') as f:
                lib_dict = pickle.load(f)
            input_Ks = get_input_K(input_file, lib_dict, data_type, k_size)
            O_dict, H_dict, Special_dict = get_kmer_dict(lib_dict, input_Ks)
            highest_O, highest_fliC, highest_fljB = call_O_and_H_type(O_dict, H_dict, Special_dict, make_dir)
            subspecies = judge_subspecies_Kmer(Special_dict)
            if subspecies == "IIb" or subspecies == "IIa":
                subspecies = "II"
            ### ed_SL_06062020: correction VIII -> II
            if subspecies == 'VIII':
                subspecies = 'II'
            ### ed_SL_08132020: correction VII -> IV, according to CDC's suggestion
            if subspecies == 'VII':
                subspecies = 'IV'
                note += 'SalmID reports this as ssp VII, which has not been formally recognized. '
            ###
            ### ed_SL_08182020: change serotype ouput for genome without definitive subspecies ID
            ssp_pointer = subspecies
            if subspecies == 'enterica':
                subspecies = '-'
            ###
            predict_form, predict_sero, star, star_line, claim = seqsero_from_formula_to_serotypes(
                highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict, subspecies)
            claim = "" #no claim any more based on new output requirement
            ### ed_SL_11232019: add notes for missing antigen
            if highest_O.split('-')[-1] == "":
                O_choice = "-"
            else:
                O_choice = highest_O.split('-')[-1]
            antigen_note, NA_note = check_antigens(ssp_pointer, O_choice, highest_fliC, highest_fljB, NA_note)
            if sample_name:
                print("Sample name:\t" + sample_name)
            ###
            if clean_mode:
                subprocess.check_call("rm -rf " + make_dir, shell=True)
                make_dir = "none-output-directory due to '-c' flag"
            else:
                new_file = open("SeqSero_result.txt", "w")
                tsv_file = open("SeqSero_result.tsv", "w")
                ### ed_SL_01152020: add new output
                if ingore_header:
                    pass
                else:
                    tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted identification\tPredicted antigenic profile\tPredicted serotype\tNote\n")
                if sample_name:
                    new_file.write("Sample name:\t" + sample_name + "\n")
                    tsv_file.write(sample_name + '\t')
                else:
                    tsv_file.write(input_file.split('/')[-1] + '\t')
                ###
                if "N/A" not in predict_sero:
                    new_file.write("Output directory:\t" + make_dir + "\n" +
                                   "Input files:\t" + input_file + "\n" +
                                   "O antigen prediction:\t" + O_choice + "\n" +
                                   "H1 antigen prediction(fliC):\t" + highest_fliC + "\n" +
                                   "H2 antigen prediction(fljB):\t" + highest_fljB + "\n" +
                                   "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                                   "Predicted antigenic profile:\t" + predict_form + "\n" +
                                   "Predicted serotype:\t" + predict_sero + "\n" +
                                   note + star_line + claim + antigen_note + "\n")#+##
                    tsv_file.write(make_dir + "\t" + input_file + "\t" + O_choice + "\t" + highest_fliC + "\t" + highest_fljB + "\t" + subspecies_ID_dir[ssp_pointer] + "\t" + predict_form + "\t" + predict_sero + "\t" + star_line + claim + antigen_note + "\n")
                else:
                    new_file.write("Output directory:\t" + make_dir + "\n" +
                                   "Input files:\t" + input_file + "\n" +
                                   "O antigen prediction:\t" + O_choice + "\n" +
                                   "H1 antigen prediction(fliC):\t" + highest_fliC + "\n" +
                                   "H2 antigen prediction(fljB):\t" + highest_fljB + "\n" +
                                   "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                                   "Predicted antigenic profile:\t" + predict_form + "\n" +
                                   "Predicted serotype:\t" + subspecies + ' ' + predict_form + "\n" + # add serotype output for "N/A" prediction, subspecies
                                   note + NA_note + star_line + claim + antigen_note + "\n")#+##
                    tsv_file.write(make_dir + "\t" + input_file + "\t" + O_choice + "\t" + highest_fliC + "\t" + highest_fljB + "\t" + subspecies_ID_dir[ssp_pointer] + "\t" + predict_form + "\t" + subspecies + ' ' + predict_form + "\t" + NA_note + star_line + claim + antigen_note + "\n")
                new_file.close()
                tsv_file.close()
                # Best-effort cleanup of intermediate files; errors ignored.
                subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null", shell=True)
            if "N/A" not in predict_sero:
                print("Output directory:\t" + make_dir + "\n" +
                      "Input files:\t" + input_file + "\n" +
                      "O antigen prediction:\t" + O_choice + "\n" +
                      "H1 antigen prediction(fliC):\t" + highest_fliC + "\n" +
                      "H2 antigen prediction(fljB):\t" + highest_fljB + "\n" +
                      "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                      "Predicted antigenic profile:\t" + predict_form + "\n" +
                      "Predicted serotype:\t" + predict_sero + "\n" +
                      note + star_line + claim + antigen_note + "\n")#+##
            else:
                print("Output directory:\t" + make_dir + "\n" +
                      "Input files:\t" + input_file + "\n" +
                      "O antigen prediction:\t" + O_choice + "\n" +
                      "H1 antigen prediction(fliC):\t" + highest_fliC + "\n" +
                      "H2 antigen prediction(fljB):\t" + highest_fljB + "\n" +
                      "Predicted identification:\t" + subspecies_ID_dir[ssp_pointer] + "\n" +
                      "Predicted antigenic profile:\t" + predict_form + "\n" +
                      "Predicted serotype:\t" + subspecies + ' ' + predict_form + "\n" + # add serotype output for "N/A" prediction, subspecies
                      note + NA_note + star_line + claim + antigen_note + "\n")#+##
1655 | |
# Script entry point: run the combined SeqSero2 pipeline when executed directly.
if __name__ == '__main__':
    main()