Mercurial > repos > jpayne > seqsero2s
comparison SeqSero2S/bin/SeqSero2S.py @ 19:cfc91e1d2c9b draft
planemo upload commit 936a627c4fc706080f07ec678f89e8256a7e7895
| author | jpayne |
|---|---|
| date | Fri, 15 May 2026 17:50:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 18:6ae6c7a51b22 | 19:cfc91e1d2c9b |
|---|---|
#!/usr/bin/env python3

import sys
import time
import random
import os
import subprocess
import gzip
import io
import pickle
import argparse
import itertools
import json
import shutil
from distutils.version import LooseVersion  # NOTE: distutils is deprecated and removed in Python 3.12
from distutils.spawn import find_executable
sys.path.insert(1,sys.path[0]+'/..')
| 17 | |
| 18 __version__ = "1.1.4" | |
| 19 | |
| 20 ### SeqSero Kmer | |
def parse_args():
    "Parse the input arguments, use '-h' for help."
    usage_text = (
        'SeqSero2S.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] '
        '[-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang '
        '(zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu '
        'Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\n'
    )
    parser = argparse.ArgumentParser(usage=usage_text)  # add "-m <data_type>" in future
    # type=os.path.abspath turns every input path into an absolute path
    parser.add_argument("-i", nargs="+", type=os.path.abspath,
                        help="<string>: path/to/input_data")
    parser.add_argument("-t", choices=['1', '2', '3', '4', '5'],
                        help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore reads (fasta/fastq)")
    parser.add_argument("-b", choices=['sam', 'mem'], default="mem",
                        help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
    parser.add_argument("-p", default="1",
                        help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
    parser.add_argument("-m", choices=['k', 'a'], default="a",
                        help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
    parser.add_argument("-n",
                        help="<string>: optional, to specify a sample name in the report output")
    parser.add_argument("-d",
                        help="<string>: optional, to specify an output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
    parser.add_argument("-c", action="store_true",
                        help="<flag>: if '-c' was flagged, SeqSero2S will only output serotype prediction without the directory containing log files")
    parser.add_argument("-s", action="store_true",
                        help="<flag>: if '-s' was flagged, SeqSero2S will not output header in SeqSero_result.tsv")
    parser.add_argument("--phred_offset", choices=['33', '64', 'auto'], default='auto',
                        help="<33|64|auto>: offset for FASTQ file quality scores, default=auto")
    parser.add_argument("--check", action="store_true",
                        help="<flag>: use '--check' flag to check the required dependencies")
    parser.add_argument('-v', '--version', action='version',
                        version=f"%(prog)s {__version__}")
    return parser.parse_args()
| 37 | |
### check paths of dependencies
check_dependencies = parse_args().check
dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py','mlst','stringMLST.py']
if check_dependencies:
    missing = False
    for item in dependencies:
        # shutil.which replaces distutils.spawn.find_executable, which is
        # deprecated and removed entirely in Python 3.12
        ext_path = shutil.which(item)
        if ext_path is not None:
            print("Using " + item + " - " + ext_path)
        else:
            print("ERROR: can not find " + item + " in PATH")
            missing = True
    # BUGFIX: previously always exited with status 0, even when a
    # dependency was missing; report failure to calling scripts
    sys.exit(1 if missing else 0)
### end of --check
| 50 | |
def reverse_complement(sequence):
    """Return the reverse complement of *sequence*, honouring IUPAC ambiguity codes.

    Raises KeyError on any character outside the supported alphabet.
    """
    complement = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
        'M': 'K', 'R': 'Y', 'W': 'W', 'S': 'S', 'Y': 'R',
        'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V',
    }
    rc_bases = []
    for base in sequence[::-1]:
        rc_bases.append(complement[base])
    return "".join(rc_bases)
| 70 | |
def mlst(assembly):
    """Run the `mlst` tool on an assembly and return (sequence_type, alleles).

    Calls mlst with the senterica_achtman_2 scheme, appending tool output to
    data_log.txt, then reads the JSON report it writes to mlst.json.
    """
    subprocess.check_call(
        "mlst -q --json mlst.json --scheme senterica_achtman_2 " + assembly + " >> data_log.txt 2>&1",
        shell=True)
    with open("mlst.json", 'r') as handle:
        mlst_result = json.load(handle)
    record = mlst_result[0]
    return (record['sequence_type'], record['alleles'])
| 79 | |
def stringmlst(f1, f2, t, d):
    """Run stringMLST on raw reads and return (sequence_type, alleles dict).

    Args:
        f1, f2: read files; f2 is only used for paired-end input.
        t: input type, '1'/'2' = paired-end (f1, f2), '3' = single-end (f1).
        d: SeqSero2S database directory holding the kmer/salmonella profile.

    Tool output is appended to data_log.txt; the tabular result is read back
    from stringMLST.txt.
    """
    if t in ['1', '2']:
        subprocess.check_call("stringMLST.py --predict -P "+d+"/kmer/salmonella -1 "+f1+" -2 "+f2+" -a /dev/stdout -o stringMLST.txt >> data_log.txt 2>&1", shell=True)
    elif t == '3':
        subprocess.check_call("stringMLST.py --predict -P "+d+"/kmer/salmonella -1 "+f1+" -s -a /dev/stdout -o stringMLST.txt >> data_log.txt 2>&1", shell=True)
    # BUGFIX: the result file was previously opened without ever being closed
    with open("stringMLST.txt") as handle:
        mlst_result = handle.readlines()[1].strip().split('\t')
    st = mlst_result[-1]
    if st == '0':  # stringMLST reports an unknown sequence type as 0
        st = '-'
    loci = ['aroC', 'dnaN', 'hemD', 'hisD', 'purE', 'sucA', 'thrA']
    # columns 1..7 are the allele calls, in locus order
    alleles = dict(zip(loci, mlst_result[1:8]))
    return (st, alleles)
| 94 | |
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer and its reverse complement across the given strings.

    Args:
        list_of_strings: iterable of sequence strings (trailing newlines OK).
        kmer: k-mer length.

    Returns:
        dict mapping uppercase k-mer -> occurrence count. Each position
        contributes both the forward k-mer and its reverse complement; a
        k-mer equal to its own reverse complement is therefore counted twice
        per occurrence (historical behaviour, preserved).
    """
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip('\n')
        for i in range(len(sequence) - kmer + 1):
            # already uppercased here; the original re-uppercased on every
            # dict access, which was redundant
            new_mer = sequence[i:i + kmer].upper()
            new_mer_rc = reverse_complement(new_mer)
            kmer_table[new_mer] = kmer_table.get(new_mer, 0) + 1
            kmer_table[new_mer_rc] = kmer_table.get(new_mer_rc, 0) + 1
    return kmer_table
| 111 | |
| 112 | |
def multifasta_dict(multifasta):
    """Parse a multi-FASTA file into {header (without '>'): sequence}.

    Multi-line sequences are concatenated; if the same header occurs more
    than once, its sequences are concatenated in file order. Headers with no
    sequence lines are omitted (matching the original behaviour).

    BUGFIX: the original located each header with list.index(), so a
    duplicated header re-read the FIRST occurrence's sequence instead of its
    own; it also rescanned the file per header (O(n^2)) and never closed the
    file handle. This single-pass version fixes all three.
    """
    seqs = {}
    current = None
    with open(multifasta, 'r') as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                continue
            if line[0] == '>':
                current = line[1:]
            elif current is not None:  # ignore sequence before any header
                seqs[current] = seqs.get(current, '') + line
    return seqs
| 130 | |
| 131 | |
def multifasta_single_string(multifasta):
    """Concatenate every sequence line of a FASTA file into one string.

    Header ('>') lines and blank lines are dropped. The original opened the
    file inside a comprehension and never closed it; fixed with a context
    manager.
    """
    pieces = []
    with open(multifasta, 'r') as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped and stripped[0] != '>':
                pieces.append(stripped)
    return ''.join(pieces)
| 138 | |
| 139 | |
def chunk_a_long_sequence(long_sequence, chunk_size=60):
    """Split *long_sequence* into consecutive chunks of *chunk_size* bases.

    The final element holds the remainder; it is an empty string when the
    length divides evenly (historical behaviour, preserved for callers that
    rely on the list length).

    BUGFIX: the chunk count was computed with a hard-coded 60
    (len(long_sequence) // 60), silently ignoring any non-default chunk_size.
    """
    steps = len(long_sequence) // chunk_size  # number of full chunks
    chunk_list = [
        long_sequence[i * chunk_size:(i + 1) * chunk_size] for i in range(steps)
    ]
    chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
    return chunk_list
| 147 | |
| 148 | |
def target_multifasta_kmerizer(multifasta, k, kmerDict):
    """Collect k-mers from regions of an assembly that hit the database.

    The assembly is flattened to one string and re-chunked into fixed-size
    lines. Whenever the centre k-mer of a line is present in kmerDict, the
    window from ~forward_length bases before the hit to ~reverse_length bases
    after it is k-merized (both strands, via createKmerDict_reads) and
    collected. Already-harvested windows are skipped.

    Returns a set of k-mer strings.
    """
    forward_length = 300   # if find the target, put forward 300 bases
    reverse_length = 2200  # if find the target, put backward 2200 bases
    chunk_size = 60        # size of the smaller sequences the long string is chunked into
    target_mers = []
    long_single_string = multifasta_single_string(multifasta)
    multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
    unit_length = len(multifasta_list[0])
    forward_lines = int(forward_length / unit_length) + 1
    # BUGFIX: reverse_lines was computed from forward_length, so
    # reverse_length (2200) was never used and the downstream window was
    # only ~300 bases instead of the intended ~2200
    reverse_lines = int(reverse_length / unit_length) + 1
    start_num = 0
    end_num = 0
    for i in range(len(multifasta_list)):
        if i not in range(start_num, end_num):  # avoid computational repetition
            line = multifasta_list[i]
            start = int((len(line) - k) // 2)
            s1 = line[start:k + start]
            if s1 in kmerDict:  # detect it is a potential read or not (use the middle part)
                start_num = max(i - forward_lines, 0)
                end_num = min(i + reverse_lines, len(multifasta_list) - 1)
                target_line = "".join(x.strip() for x in multifasta_list[start_num:end_num])
                target_mers += list(createKmerDict_reads([str(target_line)], k))
    return set(target_mers)
| 185 | |
| 186 | |
def target_read_kmerizer(file, k, kmerDict):
    """Collect k-mers from FASTQ reads whose centre k-mer hits the database.

    Scans every sequence line (line 2 of each 4-line FASTQ record); when the
    middle k-mer of the read is present in kmerDict, all k-mers of that read
    (both strands, via createKmerDict_reads) are accumulated. Scanning stops
    once ~4 Mb of matching read sequence has been gathered.

    Handles plain or gzip-compressed input. Opening gz files in text mode
    ('rt') removes the per-line bytes/str special-casing the original needed,
    and the original also read the whole plain file into memory with
    readlines() and never closed either handle.

    Returns a set of k-mer strings.
    """
    total_coverage = 0
    target_mers = []
    opener = gzip.open(file, "rt") if file.endswith(".gz") else open(file, "r")
    with opener as handle:
        for i, line in enumerate(handle, start=1):
            if i % 4 == 2:  # FASTQ sequence line
                start = int((len(line) - k) // 2)
                s1 = line[start:k + start]
                if s1 in kmerDict:  # detect it is a potential read or not (use the middle part)
                    total_coverage += len(line)
                    target_mers += list(createKmerDict_reads([str(line)], k))
            if total_coverage >= 4000000:  # cap extracted data at ~4 Mb
                break
    return set(target_mers)
| 214 | |
| 215 | |
def minion_fasta_kmerizer(file, k, kmerDict):
    """Count database-matching k-mers (both strands) in a 2-line FASTA file.

    Every second line is treated as sequence; a k-mer is kept when either it
    or its reverse complement occurs in kmerDict, and both forms are tallied.

    Returns the set of matching k-mers. (Removed the unused n_reads /
    total_coverage counters and closed the file handle the original leaked.)
    """
    target_mers = {}
    with open(file) as handle:
        for lineno, line in enumerate(handle, start=1):
            if lineno % 2 == 0:  # sequence line of a header/sequence pair
                for kmer, rc_kmer in kmers(line.strip().upper(), k):
                    if (kmer in kmerDict) or (rc_kmer in kmerDict):
                        target_mers[kmer] = target_mers.get(kmer, 0) + 1
                        target_mers[rc_kmer] = target_mers.get(rc_kmer, 0) + 1
    return set(target_mers)
| 235 | |
| 236 | |
def minion_fastq_kmerizer(file, k, kmerDict):
    """Count database-matching k-mers (both strands) in a FASTQ file.

    Scans line 2 of every 4-line FASTQ record; a k-mer is kept when either it
    or its reverse complement occurs in kmerDict, and both forms are tallied.

    Returns the set of matching k-mers. (Removed the unused n_reads /
    total_coverage counters and closed the file handle the original leaked.)
    """
    target_mers = {}
    with open(file) as handle:
        for lineno, line in enumerate(handle, start=1):
            if lineno % 4 == 2:  # FASTQ sequence line
                for kmer, rc_kmer in kmers(line.strip().upper(), k):
                    if (kmer in kmerDict) or (rc_kmer in kmerDict):
                        target_mers[kmer] = target_mers.get(kmer, 0) + 1
                        target_mers[rc_kmer] = target_mers.get(rc_kmer, 0) + 1
    return set(target_mers)
| 256 | |
| 257 | |
def multifasta_single_string2(multifasta):
    """Concatenate every sequence (non-header) line of a FASTA file.

    BUGFIX: the original indexed line.strip()[0], which raises IndexError on
    a blank line; blank lines are now skipped. Also replaces the quadratic
    string += accumulation with a join.
    """
    parts = []
    with open(multifasta, 'r') as f:
        for line in f:
            stripped = line.strip()
            if stripped and stripped[0] != '>':
                parts.append(stripped)
    return ''.join(parts)
| 267 | |
| 268 | |
def kmers(seq, k):
    """Yield (kmer, reverse_complement_kmer) pairs along *seq*.

    NOTE(review): the range starts at 1, so the k-mer at position 0 is never
    yielded. The rc slice rev_comp[-(start + k):-start] would be empty for
    start == 0, which is presumably why position 0 is skipped — confirm this
    is intentional before changing.
    """
    rev_comp = reverse_complement(seq)
    for start in range(1, len(seq) - k + 1):
        yield seq[start:start + k], rev_comp[-(start + k):-start]
| 273 | |
| 274 | |
def multifasta_to_kmers_dict(multifasta, k_size):
    """Build {header: set of k-mers} for a database multi-FASTA file.

    Used to create the database k-mer sets; k-mers of both strands are
    produced by createKmerDict_reads.
    """
    sequences = multifasta_dict(multifasta)
    return {
        header: set(createKmerDict_reads([sequence], k_size))
        for header, sequence in sequences.items()
    }
| 282 | |
| 283 | |
def Combine(b, c):
    """Enumerate every antigenic formula obtainable from optional antigens.

    Args:
        b: optional (bracketed) antigens, e.g. the 'g' in "f,[g],t".
        c: antigens that are always present.

    Returns a list of comma-joined formulas: first the one with no optional
    antigen, then one alphabetically sorted formula for every non-empty
    subset of *b* combined with *c* (so "f,[g],t" yields "f,t" and "f,g,t").
    """
    fliC_combinations = [",".join(c)]
    optional_subsets = []
    for size in range(1, len(b) + 1):
        for subset in itertools.combinations(b, size):
            optional_subsets.append(",".join(subset))
    for subset in optional_subsets:
        antigens = list(c)
        antigens.append(subset)
        antigens = ",".join(antigens).split(",")
        antigens.sort()
        fliC_combinations.append(",".join(antigens))
    return fliC_combinations
| 302 | |
| 303 | |
def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies,ss):
    """Translate a predicted antigenic formula (O:H1:H2) into serotype name(s).

    The formula is matched against the White-Kauffmann-Le Minor tables bundled
    with the selected database ('ss2' -> Initial_Conditions_SS2, 'ss2s' ->
    Initial_Conditions_SS2S); optional antigens written as "[...]" are
    expanded to every possible combination before matching. A series of
    special-gene tests then refines the call (Sdf I for Enteritidis vs
    Gallinarum, oafA for the Typhimurium O5- variant, the STM3356 SNP for
    Paratyphi B d-tartrate, galE markers for ancillary O22/O23 in group O13).

    Args:
        Otype: predicted O-antigen group (string, may be "").
        fliC: predicted phase-1 H antigen (comma-joined string).
        fljB: predicted phase-2 H antigen (comma-joined string).
        special_gene_list: dict mapping special marker-gene names to scores.
        subspecies: subspecies call (from SalmID) used to filter candidates.
        ss: database flavour, 'ss2' or 'ss2s'.

    Returns:
        (predict_form, predict_sero, star, star_line, claim); star is "*"
        when the prediction carries a caveat described in star_line.
    """
    #like test_output_06012017.txt
    #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
    #from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
    if ss == 'ss2':
        from Initial_Conditions_SS2 import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
    elif ss == 'ss2s':
        from Initial_Conditions_SS2S import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
    rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
    rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
    seronames = []
    seronames_none_subspecies=[]
    # scan every KW-table row whose O group matches, expanding "[...]"
    # optional antigens in both H phases before comparing
    for i in range(len(phase1)):
        fliC_combine = []
        fljB_combine = []
        if phaseO[i] == Otype: # no VII in KW, but it's there
            ### for fliC, detect every possible combinations to avoid the effect of "["
            if phase1[i].count("[") == 0:
                fliC_combine.append(phase1[i])
            elif phase1[i].count("[") >= 1:
                c = []
                b = []
                if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
                        "[") == 1:
                    content = phase1[i].replace("[", "").replace("]", "")
                    fliC_combine.append(content)
                    fliC_combine.append("-")
                else:
                    for x in phase1[i].split(","):
                        if "[" in x:
                            b.append(x.replace("[", "").replace("]", ""))
                        else:
                            c.append(x)
                    fliC_combine = Combine(
                        b, c
                    ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
            ### end of fliC "[" detect
            ### for fljB, detect every possible combinations to avoid the effect of "["
            if phase2[i].count("[") == 0:
                fljB_combine.append(phase2[i])
            elif phase2[i].count("[") >= 1:
                d = []
                e = []
                if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
                        "[") == 1:
                    content = phase2[i].replace("[", "").replace("]", "")
                    fljB_combine.append(content)
                    fljB_combine.append("-")
                else:
                    for x in phase2[i].split(","):
                        if "[" in x:
                            d.append(x.replace("[", "").replace("]", ""))
                        else:
                            e.append(x)
                    fljB_combine = Combine(d, e)
            ### end of fljB "[" detect
            new_fliC = fliC.split(
                ","
            ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
            new_fliC.sort()
            new_fliC = ",".join(new_fliC)
            new_fljB = fljB.split(",")
            new_fljB.sort()
            new_fljB = ",".join(new_fljB)
            if (new_fliC in fliC_combine
                    or fliC in fliC_combine) and (new_fljB in fljB_combine
                                                  or fljB in fljB_combine):
                ######start, remove_list,rename_dict, added on 11/11/2018
                if sero[i] not in remove_list:
                    temp_sero=sero[i]
                    if temp_sero in rename_dict:
                        temp_sero=rename_dict[temp_sero] #rename if in the rename list
                    if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
                        if subs[i] == subspecies:
                            seronames.append(temp_sero)
                        seronames_none_subspecies.append(temp_sero)
                    else:
                        pass
                else:
                    pass
                ######end, added on 11/11/2018
    #analyze seronames
    subspecies_pointer=""
    if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
        ## ed_SL_06062020: for the subspecies mismatch between KW and SalmID
        seronames=seronames_none_subspecies
        #seronames=["N/A"]
        subspecies_pointer="1"
        #subspecies_pointer="0"
        ##
    if len(seronames) == 0:
        seronames = [
            "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
        ]
    star = ""
    star_line = ""
    if len(seronames) > 1: #there are two possible predictions for serotypes
        star = "*"
        #changed 04072019
        #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
    if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
        star="*"
        star_line = "This antigenic profile has been associated with serotype '"+(" or ").join(seronames)+"' in the Kauffman-White scheme. The existence of the same antigenic formula in multiple species or subspecies is well documented in the Kauffman-White Scheme. " + star_line ## ed_SL_03202021: changed for new output format
        #star_line="The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme. " + star_line
        #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
        seronames=["N/A"] ## ed_SL_06062020
    if Otype=="":
        Otype="-"
    predict_form = Otype + ":" + fliC + ":" + fljB
    predict_sero = (" or ").join(seronames)
    ###special test for Enteritidis
    if predict_form == "9:g,m:-":
        sdf = "-"
        for x in special_gene_list:
            if x.startswith("sdf"):
                sdf = "+"
                #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
                #star_line="sdf gene detected. "
                star_line = "Detected Sdf I that is characteristic of commonly circulating strains of serotype Enteritidis. "
        #predict_form = predict_form + " Sdf prediction:" + sdf
        predict_form = predict_form #changed 04072019
        if sdf == "-":
            star = "*"
            #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
            #star_line="sdf gene not detected. "
            star_line = "Sdf I that is characteristic of commonly circulating strains of serotype Enteritidis was not detected. "
            #changed in 04072019, for new output
            #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
            #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
            predict_sero = "Gallinarum or Enteritidis"
    ###end of special test for Enteritidis
    elif predict_form == "4:i:-":
        predict_sero = "I 4,[5],12:i:-" # change serotype name
    elif predict_form == "4:r:-":
        predict_sero = "N/A (4:r:-)"
    elif predict_form == "4:b:-":
        predict_sero = "N/A (4:b:-)"
    #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
        #predict_sero = "Newport"
        #star = "*"
        #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
    claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
    if "N/A" in predict_sero:
        claim = ""
    #special test for Typhimurium
    if "Typhimurium" in predict_sero or predict_form == "4:i:-":
        normal = 0
        mutation = 0
        for x in special_gene_list:
            if "oafA-O-4_full" in x:
                normal = float(special_gene_list[x])
            elif "oafA-O-4_5-" in x:
                mutation = float(special_gene_list[x])
        if normal > mutation:
            pass
        elif normal < mutation:
            #predict_sero = predict_sero.strip() + "(O5-)"
            predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
            star = "*"
            #star_line = "Detected the deletion of O5-."
            star_line = "Detected a deletion in gene oafA that causes O5- variant of Typhimurium. "
        else:
            pass
    #special test for Paratyphi B
    if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
        normal = 0
        mutation = 0
        for x in special_gene_list:
            if "gntR-family-regulatory-protein_dt-positive" in x:
                normal = float(special_gene_list[x])
            elif "gntR-family-regulatory-protein_dt-negative" in x:
                mutation = float(special_gene_list[x])
        #print(normal,mutation)
        if normal > mutation:
            #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
            predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip()
            star = "*"
            #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
            star_line = "The SNP in gene STM3356 that is associated with the d-Tartrate nonfermenting phenotype characteristic of the typhoidal pathotype was not detected. "
        elif normal < mutation:
            #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
            predict_sero = predict_sero.strip()
            star = "*"
            #star_line = "Detected the SNP for d-Tartrate nonfermenting phenotype of Paratyphi B. "
            star_line = "Detected the SNP in gene STM3356 that is associated with the d-Tartrate nonfermenting phenotype characteristic of the typhoidal pathotype. "
        else:
            star = "*"
            #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
            star_line = " " ## ed_SL_05152019: do not report this situation.
    #special test for O13,22 and O13,23
    ### add comment for any O2 call. 06052024
    if Otype=='2' and predict_sero not in ['Nitra','Kiel','Koessen']:
        star_line = 'O2 is typically a O9 rfb serotype with a mutation that results in a different sugar being placed in O antigen. SS2S detects only one group O2 serotype, Paratyphi A. This genome may be a variant of a group O9 serotype.'
    ###
    if Otype=="13":
        #ex_dir = os.path.dirname(os.path.realpath(__file__))
        ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2s_db')) # ed_SL_09152019
        f = open(ex_dir + '/special.pickle', 'rb')
        special = pickle.load(f)
        O22_O23=special['O22_O23']
        if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
        #if predict_sero.split(" or ")[0] in O22_O23[-1]: # Report O22 vs O23 result for O13 serotypes. 12232024
            O22_score=0
            O23_score=0
            for x in special_gene_list:
                if "O:22" in x:
                    O22_score = O22_score+float(special_gene_list[x])
                elif "O:23" in x:
                    O23_score = O23_score+float(special_gene_list[x])
            #print(O22_score,O23_score)
            for z in O22_O23[0]:
                if predict_sero.split(" or ")[0] in z:
                    if O22_score > O23_score:
                        star = "*"
                        star_line = "Detected a genetic marker (galE allele) for ancillary O22."
                        #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
                        predict_sero = z[0]
                    elif O22_score < O23_score:
                        star = "*"
                        star_line = "Detected a genetic marker (galE allele) for ancillary O23."
                        #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
                        predict_sero = z[1]
                    else:
                        star = "*"
                        star_line = "Fail to detect genetic markers (galE alleles) for ancillary O22 and O23."
                        #star_line = "Fail to detect O22/O23 specific genes." #diabled for new output requirement, 04132019
    if " or " in predict_sero:
        star_line = star_line + "The predicted serotypes share the same general formula: " + Otype + ":" + fliC + ":" + fljB + " and can be differentiated by additional analysis. "
    #special test for O6,8
    #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
    #for x in merge_O68_list:
    #    if x in predict_sero:
    #        predict_sero=x
    #        star=""
    #        star_line=""
    #special test for Montevideo; most of them are monophasic
    #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
        #star="*"
        #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
    return predict_form, predict_sero, star, star_line, claim
| 544 ### End of SeqSero Kmer part | |
| 545 | |
| 546 ### Begin of SeqSero2 allele prediction and output | |
def xml_parse_score_comparision_seqsero(xmlfile):
    """Parse a BLAST XML report and rank query/hit pairs by score.

    For every (query, alignment) pair, HSP bit scores and identity fractions
    are accumulated. Overlapping HSP query regions are down-weighted by the
    uncovered fraction so repeated coverage cannot push a percentage above 1.

    Returns a list of (query___hit_def, score, identity_fraction,
    covered_query_positions) tuples sorted by score, descending.
    """
    from Bio.Blast import NCBIXML
    with open(xmlfile) as fh:  # original never closed this handle
        records = list(NCBIXML.parse(fh))
    List = []
    List_score = []
    List_ids = []
    List_query_region = []
    for record in records:
        for alignment in record.alignments:
            score = 0
            ids = 0
            cover_region = set()  # fixed problem that repeated calculation leading percentage > 1
            List.append(record.query.strip() + "___" + alignment.hit_def)
            for hsp in alignment.hsps:
                temp = set(range(hsp.query_start, hsp.query_end))
                if len(cover_region) == 0:
                    fraction = 1
                else:
                    fraction = 1 - len(cover_region & temp) / float(len(temp))
                cover_region = cover_region | temp
                # NOTE: the original branched on "last"/"first" in the query
                # name but both branches were byte-identical; collapsed here
                score += hsp.bits * fraction
                ids += float(hsp.identities) / record.query_length * fraction
            List_score.append(score)
            List_ids.append(ids)
            List_query_region.append(cover_region)
    temp = zip(List, List_score, List_ids, List_query_region)
    Final_list = sorted(temp, key=lambda d: d[1], reverse=True)
    return Final_list
| 585 | |
| 586 | |
def Uniq(L, sort_on_fre="none"):
    """Return (unique_items, counts) for list *L*.

    *L* is sorted in place (historical side effect some callers observe).
    unique_items holds each distinct value in sorted order; counts is the
    index-aligned number of occurrences. When sort_on_fre is anything other
    than "none", both lists are reordered by ascending frequency.

    BUGFIX: the frequency-sort branch previously subscripted a zip() object
    (d[1]), which raises TypeError on Python 3 — a Python 2 leftover. Also
    replaces the O(n^2) membership-test dedup with itertools.groupby.
    """
    L.sort()
    uniq = []
    count = []
    for value, group in itertools.groupby(L):
        uniq.append(value)
        count.append(sum(1 for _ in group))
    if sort_on_fre != "none":
        pairs = sorted(zip(count, uniq))
        uniq = [p[1] for p in pairs]
        count = [p[0] for p in pairs]
    return (uniq, count)
| 603 | |
def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
    """Decide fliC vs fljB for one contig from its head/tail blast scores.

    Sums the scores of entries whose name contains "fliC" (or, failing that,
    "fljB"); ties go to fliC. Returns (role, score_gap) — a small gap means
    the call is unreliable and the caller should fall back to whole-contig
    blast ranking.
    """
    fliC_score = 0
    fljB_score = 0
    for entry in nodes_vs_score_list:
        label, value = entry[0], entry[1]
        if "fliC" in label:
            fliC_score += value
        elif "fljB" in label:
            fljB_score += value
    role = "fliC" if fliC_score >= fljB_score else "fljB"
    return (role, abs(fliC_score - fljB_score))
| 620 | |
def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
    """Assign fliC/fljB from the best whole-contig blast hit.

    Used when the head/tail score gap is too small (<10-20) to be trusted,
    or when no head/tail hit exists for the contig. Final_list_passed is
    ordered by score descending, so the first entry containing *node_name*
    is the top-ranked assignment; its name prefix (before the first '_') is
    the role. Returns "" when no entry matches.
    """
    for entry in Final_list_passed:
        if node_name in entry[0]:
            return entry[0].split("_")[0]
    return ""
| 630 | |
def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
    """Assign a role ('fliC'/'fljB') to each candidate contig.

    Args:
        nodes_list: unique contig names (the c from c,d=Uniq(nodes)).
        tail_head_list: blast entries for the "first"/"last" probe sequences.
        Final_list / Final_list_passed: full blast rankings used as fallback.

    For each contig, the head/tail hits decide the role; when the head/tail
    score gap is small (<20), or no head/tail hit exists, the whole-contig
    blast ranking is used instead. If exactly two contigs end up with the
    same role (the most common error mode), both are re-assigned purely from
    whole-contig ranking. Returns a list of (role, contig_name) tuples.

    NOTE(review): the original had four branches for len(hits) in {4,3,2,1}
    whose executed logic was identical (the len==2 branch also computed a
    Uniq over hit prefixes whose result was discarded); they are collapsed
    here with no behaviour change. The unused first_target local was removed.
    """
    role_list=[]
    for node in nodes_list:
        hits = [entry for entry in tail_head_list if node in entry[0]]
        role = ""
        if hits:
            role, diff = judge_fliC_or_fljB_from_head_tail_for_one_contig(hits)
            if diff < 20:  # gap too small to trust head/tail evidence
                role = judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node, Final_list, Final_list_passed)
        else:
            # no head/tail hit at all: take the best passed whole-contig match
            for entry in Final_list_passed:
                if node in entry[0]:
                    role = entry[0].split("_")[0]
                    break
        role_list.append((role, node))
    if len(role_list) == 2 and role_list[0][0] == role_list[1][0]:
        # two contigs assigned to the same phase: redo both from ranking alone
        role_list = [
            (judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node, Final_list, Final_list_passed), node)
            for node in nodes_list
        ]
    return role_list
| 687 | |
def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
    """Decide which micro-assembled contig carries fliC and which carries fljB.

    Collects the contig names of all passed whole-allele H hits (names starting
    with "fl", excluding the "first"/"last" head-tail flanking markers),
    de-duplicates them with Uniq, then delegates the fliC/fljB phase call to
    fliC_or_fljB_judge_from_head_tail_sequence.

    Returns the list of (role, contig_name) tuples produced by the judge.
    """
    # Fix: removed the unused `contigs` local; build `nodes` with a comprehension.
    nodes=[x[0].split("___")[1].strip() for x in Final_list_passed
           if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]]
    c,d=Uniq(nodes)  # c is the unique contig-name list
    # head/tail marker hits are judged separately from whole-allele hits
    tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
    roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
    return roles
| 700 | |
def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
    """Call the O-antigen group from the ranked blast hit lists (allele mode).

    Walks Final_list_passed to collect O-antigen contigs and the scores of the
    "special" marker genes (anything not starting with "O-" or "fl"), keeps the
    best O hit per contig, repairs the two-genes-on-one-contig case, then applies
    the family-specific decision rules (O-9,46 / O-3,10 groups) or falls back to
    the highest-scoring hit.

    Returns (O_choice, O_nodes_list, special_genes, final_O, contamination_O,
    Otypes_uniq).
    """
    #decide O based on Final_list
    O_choice="?"
    O_list=[]
    special_genes={}
    nodes=[]
    for x in Final_list_passed:
        if x[0].startswith("O-"):
            nodes.append(x[0].split("___")[1].strip())
        elif not x[0].startswith("fl"):
            special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
    ##print("special_genes:",special_genes)
    c,d=Uniq(nodes)  # c: unique O-antigen contig names
    #print "potential O antigen contig",c
    final_O=[]
    O_nodes_list=[]
    for x in c:#c is the list for contigs
        temp=0  # NOTE(review): unused — appears vestigial
        # keep only the best (first, list is ranked) O hit per contig
        for y in Final_list_passed:
            if x in y[0] and y[0].startswith("O-"):
                final_O.append(y)
                break
    ### O contig has the problem of two genes on same contig, so do additional test
    potenial_new_gene=""
    for x in final_O:
        pointer=0 #for genes merged or not
        #not consider O-1,3,19_not_in_3,10, too short compared with others
        # gene length << contig length suggests a second gene may share the contig;
        # 850 ~ 2x flank region (300*2, up to 400*2 for safety)
        if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):
            pointer=x[0].split("___")[1].strip()#store the contig name
            print(pointer)  # NOTE(review): looks like a leftover debug print — consider removing
        if pointer!=0:#it has potential merge event
            for y in Final_list:
                # relatively strict filter; if passed, a second gene is on the contig
                if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):
                    potenial_new_gene=y
                    #print(potenial_new_gene)
                    break
    if potenial_new_gene!="":
        print("two differnt genes in same contig, fix it for O antigen")
        print(potenial_new_gene[:3])
        pointer=0
        # only append when the extra gene's contig is already represented in final_O
        for y in final_O:
            if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
                pointer=1
        if pointer!=0: #changed to consider two genes in same contig
            final_O.append(potenial_new_gene)
    ### end of the two genes on same contig test
    final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sort by coverage*identity score
    if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
        #print "$$$No Otype, due to no hit"#may need to be changed
        O_choice="-"
    else:
        # 0.15 of the highest coverage is the contamination cut-off
        highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
        O_list=[]
        O_list_less_contamination=[]
        for x in final_O:
            if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis
                O_list.append(x[0].split("__")[0])
                O_nodes_list.append(x[0].split("___")[1])
                if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
                    O_list_less_contamination.append(x[0].split("__")[0])
        ### special test for O9,46 and O3,10 family
        if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
            if "O-9,46_wzy" in O_list or "O-9,46_wzy_partial" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
                O_choice="O-9,46"
                #print "$$$Most possilble Otype: O-9,46"
            elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
                O_choice="O-9,46,27"
                #print "$$$Most possilble Otype: O-9,46,27"
            else:
                O_choice="O-9"#next, detect O9 vs O2?
                O2=0
                O9=0
                for z in special_genes:
                    if "tyr-O-9" in z and special_genes[z] > O9: ##20240322, add "special_genes[z] > O9" to avoid misidentification of O9 to O2 that caused by multiple tyr-O-9 contigs
                        O9=special_genes[z]
                    elif "tyr-O-2" in z and special_genes[z] > O2: ##20240322, add "special_genes[z] > O2"
                        O2=special_genes[z]
                if O2>O9:
                    O_choice="O-2"
                elif O2<O9:
                    pass  # keep O-9
                else:
                    pass  # tie: keep O-9 as the more likely call
                    #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
        elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
            if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
                O_choice="O-3,10"
                #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
            else:
                O_choice="O-1,3,19"
                #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
        ### end of special test for O9,46 and O3,10 family
        else:
            # generic case: take the highest-scoring hit that also meets the coverage cut-off
            try:
                max_score=0
                for x in final_O:
                    if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
                        max_score=x[2]#change from x[-1] to x[2],08172018
                        O_choice=x[0].split("_")[0]
                if O_choice=="O-1,3,19":
                    O_choice=final_O[1][0].split("_")[0]
                #print "$$$Most possilble Otype: ",O_choice
            except:
                pass
                #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
    # special low-probability O-4 vs O-9,27,46 case (e.g. Bredeney, Schwarzengrund);
    # normally O-4 scores higher but sequencing quality may flip the call
    if O_choice=="O-9,46,27" and len(O_list)==2 and "O-4_wzx" in O_list:
        O_choice="O-4"
    #print "O:",O_choice,O_nodes_list
    Otypes=[]
    for x in O_list:
        if x!="O-1,3,19_not_in_3,10":
            if "O-9,46_" not in x:
                Otypes.append(x.split("_")[0])
            else:
                Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
    #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
    Otypes_uniq,Otypes_fre=Uniq(Otypes)
    contamination_O=""
    # mixed-group calls tolerate one extra O signal; everything else tolerates none
    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
        if len(Otypes_uniq)>2:
            contamination_O="potential contamination from O antigen signals"
    else:
        if len(Otypes_uniq)>1:
            if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
                contamination_O=""
            elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
                contamination_O=""
            else:
                contamination_O="potential contamination from O antigen signals"
    return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
| 831 ### End of SeqSero2 allele prediction and output | |
| 832 | |
def get_input_files(make_dir,input_file,data_type,dirpath):
    """Resolve forward/reverse read file names inside make_dir for each -t data type.

    data_type "1": interleaved paired-end — .sra files are split with
    fastq-dump, otherwise the file is piped through deinterleave_fastq.sh.
    "2": separated paired-end — both names taken as-is.  "3": single-end —
    .sra split, otherwise used directly.  "4"/"5"/"6": assembly or nanopore —
    single file used directly.  Temporarily chdirs into make_dir (and back up
    one level afterwards).  Returns (for_fq, rev_fq); rev_fq is "" when there
    is no reverse file.
    """
    for_fq=""
    rev_fq=""
    os.chdir(make_dir)
    if data_type=="1":
        name=input_file[0].split("/")[-1]
        if name.endswith(".sra"):
            subprocess.check_call("fastq-dump --split-files "+name,shell=True)
            for_fq=name.replace(".sra","_1.fastq")
            rev_fq=name.replace(".sra","_2.fastq")
        else:
            core_id=name.split(".fastq")[0].split(".fq")[0]
            for_fq=core_id+"_1.fastq"
            rev_fq=core_id+"_2.fastq"
            # de-interleave: decompress on the fly for .gz input
            reader="gzip -dc " if name.endswith(".gz") else "cat "
            subprocess.check_call(reader+name+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
    elif data_type=="2":
        for_fq=input_file[0].split("/")[-1]
        rev_fq=input_file[1].split("/")[-1]
    elif data_type=="3":
        name=input_file[0].split("/")[-1]
        if name.endswith(".sra"):
            subprocess.check_call("fastq-dump --split-files "+name,shell=True)
            for_fq=name.replace(".sra","_1.fastq")
        else:
            for_fq=name
    elif data_type in ("4","5","6"):
        for_fq=input_file[0].split("/")[-1]
    os.chdir("..")
    return for_fq,rev_fq
| 867 | |
def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
    """Predict O, fliC and fljB antigens from ranked blast hits (allele mode).

    Calls decide_O_type_and_get_special_genes for the O antigen, assigns H
    contigs to fliC/fljB, filters out contigs that only cover flanking regions,
    and detects potential O/H contamination.  Appends details to
    SeqSero_log.txt and the matched contig sequences (read from new_fasta) to
    Extracted_antigen_alleles.fasta in the current directory.

    Returns (O_choice, fliC_choice, fljB_choice, special_gene_list,
    contamination_O, contamination_H, Otypes_uniq, H1_cont_stat_list,
    H2_cont_stat_list).
    """
    #get O and H types from Final_list from blast parsing; allele mode
    from Bio import SeqIO
    fliC_choice="-"
    fljB_choice="-"
    fliC_contig="NA"
    fljB_contig="NA"
    fliC_region=set([0])
    fljB_region=set([0,])
    fliC_length=0 #can be changed to coverage in future; in 03292019, changed to aligned length
    fljB_length=0 #can be changed to coverage in future; in 03292019, changed to aligned length
    O_choice="-"#no need to decide O contig for now, should be only one
    O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
    O_choice=O_choice.split("-")[-1].strip()
    if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
        O_choice="-"
    H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
    #add alignment locations, used for further selection, 03312019
    for i in range(len(H_contig_roles)):
        x=H_contig_roles[i]
        for y in Final_list_passed:
            if x[1] in y[0] and y[0].startswith(x[0]):
                # NOTE(review): this duplicates the (role, contig) pair before
                # appending the alignment set (x += x + (loc,)); downstream code
                # only reads x[0], x[1] and x[-1] so it still works, but
                # `H_contig_roles[i]+=(y[-1],)` was presumably intended — confirm
                H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
                break
    log_file=open("SeqSero_log.txt","a")
    extract_file=open("Extracted_antigen_alleles.fasta","a")
    handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))

    #print("O_contigs:")
    log_file.write("O_contigs:\n")
    extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
    extract_file.write("#O_contigs:\n")
    for x in O_nodes_roles:
        if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
            #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
            log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
            title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
            seqs=""
            for z in handle_fasta:
                if x[0].split("___")[-1]==z.description:
                    seqs=str(z.seq)
            extract_file.write(title+seqs+"\n")
    if len(H_contig_roles)!=0:
        highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
    else:
        highest_H_coverage=0
    for x in H_contig_roles:
        #if multiple choices, temporately select the one with longest length for now, will revise in further change
        if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
            fliC_contig=x[1]
            fliC_length=len(x[-1])
        elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
            fljB_contig=x[1]
            fljB_length=len(x[-1])
    # first passed allele on each chosen contig gives the antigen call
    for x in Final_list_passed:
        if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
            fliC_choice=x[0].split("_")[1]
        elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
            fljB_choice=x[0].split("_")[1]
        elif fliC_choice!="-" and fljB_choice!="-":
            break
    #now remove contigs not in middle core part
    first_allele="NA"
    first_allele_percentage=0
    for x in Final_list:
        if x[0].startswith("fliC") or x[0].startswith("fljB"):
            first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
            first_allele_percentage=x[2]
            break
    additional_contigs=[]
    for x in Final_list:
        if first_allele in x[0]:
            if (fliC_contig == x[0].split("___")[-1]):
                fliC_region=x[3]
            elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
                fljB_region=x[3]
            else:
                if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
                    additional_contigs.append(x)
                #else:
                    #print x[:3]
    #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
    if first_allele_percentage>0.9:
        if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
            target_region=fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
        elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
            target_region=fliC_region|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
        else:
            target_region=set()#doesn't do anything
    else:
        target_region=set()#doesn't do anything
    #print(target_region)
    #print(additional_contigs)
    target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
    target_region=target_region2|target_region
    for x in additional_contigs:
        removal=0
        contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
        if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length*0.5<len(x[3])<contig_length*1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
            removal=1
        else:
            if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
                removal=1
            else:
                pass
        if removal==1:
            # NOTE(review): removing from H_contig_roles while iterating it —
            # standard list-mutation-during-iteration caveat applies; confirm intended
            for y in H_contig_roles:
                if y[1] in x[0]:
                    H_contig_roles.remove(y)
        else:
            pass
            #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length*0.5,len(x[3]),contig_length*1.5)
    #end of removing none-middle contigs
    #print("H_contigs:")
    log_file.write("H_contigs:\n")
    extract_file.write("#H_contigs:\n")
    H_contig_stat=[]  # NOTE(review): never populated — appears vestigial
    H1_cont_stat={}   # fliC allele -> accumulated identity, for contamination check
    H2_cont_stat={}   # fljB allele -> accumulated identity, for contamination check
    for i in range(len(H_contig_roles)):
        x=H_contig_roles[i]
        a=0
        for y in Final_list_passed:
            if x[1] in y[0] and y[0].startswith(x[0]):
                if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
                    for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
                        if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
                            #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
                            log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
                            H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
                            title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
                            seqs=""
                            for z in handle_fasta:
                                if x[1]==z.description:
                                    seqs=str(z.seq)
                            extract_file.write(title+seqs+"\n")
                            break
                else:
                    #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
                    log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
                    title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
                    seqs=""
                    for z in handle_fasta:
                        if x[1]==z.description:
                            seqs=str(z.seq)
                    extract_file.write(title+seqs+"\n")
                    if x[0]=="fliC":
                        if y[0].split("_")[1] not in H1_cont_stat:
                            H1_cont_stat[y[0].split("_")[1]]=y[2]
                        else:
                            H1_cont_stat[y[0].split("_")[1]]+=y[2]
                    if x[0]=="fljB":
                        if y[0].split("_")[1] not in H2_cont_stat:
                            H2_cont_stat[y[0].split("_")[1]]=y[2]
                        else:
                            H2_cont_stat[y[0].split("_")[1]]+=y[2]
                break
    #detect contaminations
    #print(H1_cont_stat)
    #print(H2_cont_stat)
    H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
    H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
    contamination_H=""
    if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
        contamination_H="potential contamination from H antigen signals"
    elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
        contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
    #get additional antigens
    # NOTE(review): the triple-quoted block below is dead code kept as a string
    # literal (a retired draft of the O-family rules) — left untouched
    """
    if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
      if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
        O_choice="O-9,46"
        #print "$$$Most possilble Otype: O-9,46"
      elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
        O_choice="O-9,46,27"
        #print "$$$Most possilble Otype: O-9,46,27"
    elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
      if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
        O_choice="O-3,10"
        #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
      else:
        O_choice="O-1,3,19"
        #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
    ### end of special test for O9,46 and O3,10 family

    if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
      if len(Otypes_uniq)>2:
        contamination_O="potential contamination from O antigen signals"
      else:
        if len(Otypes_uniq)>1:
          if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
            contamination_O=""
          elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
            contamination_O=""
    """
    additonal_antigents=[]  # NOTE(review): unused (and misspelled) — appears vestigial
    #print(contamination_O)
    #print(contamination_H)
    log_file.write(contamination_O+"\n")
    log_file.write(contamination_H+"\n")
    log_file.close()
    return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
| 1070 | |
def get_input_K(input_file,lib_dict,data_type,k_size):
    """Kmer mode: extract the input's k-mers that match the library.

    Pools every k-mer from all lib_dict entries into one target set, then
    dispatches on data_type: '4' (assembly) and '5' (nanopore fasta) use the
    multifasta kmerizer, '1'/'2'/'3' (reads) the read kmerizer, and '6'
    (nanopore fastq) the minion fastq kmerizer.  Returns the matched k-mer set.
    """
    targets=set()
    for name in lib_dict:
        targets.update(lib_dict[name])
    if data_type == '4':
        input_Ks = target_multifasta_kmerizer(input_file, k_size, targets)
    elif data_type in ('1','2','3'):#set it for now, will change later
        input_Ks = target_read_kmerizer(input_file, k_size, targets)
    elif data_type == '5':#minion_2d_fasta
        #input_Ks = minion_fasta_kmerizer(input_file, k_size, targets)
        input_Ks = target_multifasta_kmerizer(input_file, k_size, targets) #ed_SL_08172020: change for nanopore workflow
    if data_type == '6':#minion_2d_fastq
        input_Ks = minion_fastq_kmerizer(input_file, k_size, targets)
    return input_Ks
| 1086 | |
def get_kmer_dict(lib_dict,input_Ks):
    """Score every library entry against the input k-mer set and bucket the hits.

    The score is the percentage of an entry's k-mers present in input_Ks.
    Entries scoring above 1% are bucketed: "O-" names above 25% into O_dict,
    "fl" names above 40% into H_dict, and everything else (any score >1%) into
    Special_dict.  Returns (O_dict, H_dict, Special_dict).
    """
    O_dict,H_dict,Special_dict={},{},{}
    for name in lib_dict:
        lib_kmers=lib_dict[name]
        pct=100*len(lib_kmers & input_Ks)/len(lib_kmers)
        if pct<=1: # Arbitrary cut-off; very low but needed to detect O-3,10 in some cases
            continue
        if name.startswith('O-') and pct>25:
            O_dict[name]=pct
        if name.startswith('fl') and pct>40:
            H_dict[name]=pct
        if not (name.startswith('fl') or name.startswith('O-')):
            Special_dict[name]=pct
    return O_dict,H_dict,Special_dict
| 1102 | |
def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
    """Call O, fliC and fljB antigens from the k-mer score dictionaries.

    Applies the O-9,46 / O-3,10 family-specific rules first, otherwise takes
    the highest-scoring O entry; fliC is the best "fliC*" score and fljB the
    best "fljB*" score above 0.65x the overall best H score (fliC and fljB
    share sequence, so a high bar avoids echoes of the fliC call).  Appends
    all scores to SeqSero_log.txt in the current directory.

    Returns (highest_O, highest_fliC, highest_fljB).
    make_dir is currently unused — presumably kept for interface symmetry.
    """
    log_file=open("SeqSero_log.txt","a")
    log_file.write("O_scores:\n")
    #call O:
    highest_O = '-'
    if len(O_dict) == 0:
        pass
    else:
        for x in O_dict:
            log_file.write(x+"\t"+str(O_dict[x])+"\n")
        # wbaV above 70% marks the O-9,46 family
        if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
            #if 'O-9,46_wzy__1191' in O_dict or "O-9,46_wzy_partial__216" in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
            #modified to fix miscall of O-9,46
            if ('O-9,46_wzy__1191' in O_dict and O_dict['O-9,46_wzy__1191']>40) or ("O-9,46_wzy_partial__216" in O_dict and O_dict["O-9,46_wzy_partial__216"]>40): # and float(O946_wzy)/float(num_1) > 0.1
                highest_O = "O-9,46"
            elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
                highest_O = "O-9,46,27"
            else:
                highest_O = "O-9" # next, detect O9 vs O2?
                O2 = 0
                O9 = 0
                for z in Special_dict:
                    if "tyr-O-9" in z:
                        O9 = float(Special_dict[z])
                    if "tyr-O-2" in z:
                        O2 = float(Special_dict[z])
                if O2 > O9:
                    highest_O = "O-2"
        elif ("O-3,10_wzx__1539" in O_dict) and (
            "O-9,46_wzy__1191" in O_dict
        ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
            # O-3,10 vs O-1,3,19 disambiguated by the O-3,10-only marker
            if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
                highest_O = "O-3,10"
            else:
                highest_O = "O-1,3,19"
        ### end of special test for O9,46 and O3,10 family
        else:
            # generic case: highest-scoring entry wins; bare except keeps the
            # serotype call best-effort when the dict is malformed
            try:
                max_score = 0
                for x in O_dict:
                    if float(O_dict[x]) >= max_score:
                        max_score = float(O_dict[x])
                        #highest_O = x.split("_")[0]
                        # ed_SL_12182019: modified to fix the O-9,46 error example1
                        if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
                            highest_O = "O-9"
                        else:
                            highest_O = x.split("_")[0]
                if highest_O == "O-1,3,19":
                    # O-1,3,19 alone is too weak a marker: redo the ranking without it
                    highest_O = '-'
                    max_score = 0
                    for x in O_dict:
                        if x == 'O-1,3,19_not_in_3,10__130':
                            pass
                        else:
                            if float(O_dict[x]) >= max_score:
                                max_score = float(O_dict[x])
                                #highest_O = x.split("_")[0]
                                # ed_SL_12182019: modified to fix the O-9,46 error example1
                                if (x == 'O-9,46_wbaV__1002' or x == 'O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002') and ('O-9,46_wzy__1191' not in O_dict and 'O-9,46_wzy_partial__216' not in O_dict):
                                    highest_O = "O-9"
                                else:
                                    highest_O = x.split("_")[0]
            except:
                pass
    #call_fliC:
    if len(H_dict)!=0:
        highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
    else:
        highest_H_score_both_BC=0
    highest_fliC = '-'
    highest_fliC_raw = '-'
    highest_Score = 0
    log_file.write("\nH_scores:\n")
    for s in H_dict:
        log_file.write(s+"\t"+str(H_dict[s])+"\n")
        if s.startswith('fliC'):
            if float(H_dict[s]) > highest_Score:
                highest_fliC = s.split('_')[1]
                highest_fliC_raw = s
                highest_Score = float(H_dict[s])
    #call_fljB
    highest_fljB = '-'
    highest_fljB_raw = '-'
    highest_Score = 0
    for s in H_dict:
        if s.startswith('fljB'):
            if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
                #highest_fljB = s.split('_')[1]
                #highest_fljB_raw = s
                #highest_Score = float(H_dict[s])
                if s.split('_')[1]!=highest_fliC:
                    highest_fljB = s.split('_')[1]
                    highest_fljB_raw = s
                    highest_Score = float(H_dict[s])
    log_file.write("\nSpecial_scores:\n")
    for s in Special_dict:
        log_file.write(s+"\t"+str(Special_dict[s])+"\n")
    log_file.close()
    return highest_O,highest_fliC,highest_fljB
| 1203 | |
def get_temp_file_names(for_fq,rev_fq):
    """Derive the temporary file names used by allele mode (-a).

    All names are built by suffixing the forward read file name (and the
    reverse one for its mapped-reads/sai files).  Returns the tuple
    (sam, bam, sorted_bam, mapped_fq1, mapped_fq2, combined_fq,
    for_sai, rev_sai).
    """
    sam        = for_fq + ".sam"
    bam        = for_fq + ".bam"
    sorted_bam = for_fq + "_sorted.bam"
    mapped_fq1 = for_fq + "_mapped.fq"
    mapped_fq2 = rev_fq + "_mapped.fq"
    combined_fq = for_fq + "_combined.fq"
    for_sai = for_fq + ".sai"
    rev_sai = rev_fq + ".sai"
    return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
| 1215 | |
def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
    """seqsero2 -a; index the database, map reads with bwa, keep mapped reads, name-sort.

    mapping_mode "mem" runs bwa mem; "sam" runs bwa aln + sampe (paired,
    fnameB != "") or aln + samse (single-end).  Mapped reads (-F 4) are kept
    via samtools view, then name-sorted into sorted_bam; bwa/samtools stderr
    is appended to data_log.txt.  threads is passed through as a string.
    """
    print("building database...")
    subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
    print("mapping...")
    if mapping_mode=="mem":
        subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
    elif mapping_mode=="sam":
        if fnameB!="":
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
            subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
        else:
            subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
            # Fix: the single-end samse call referenced the undefined global
            # ``for_fq`` (NameError at runtime) and omitted shell=True plus the
            # stderr redirect used by every other call; use fnameA like the rest.
            subprocess.check_call("bwa samse "+database+" "+for_sai+" "+fnameA+" > "+sam+ " 2>> data_log.txt",shell=True)
    # keep only mapped reads (-F 4); output remains SAM text as in the original pipeline
    subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
    ### check the version of samtools then use different commands
    # samtools with no args prints "Version: x.y" on stderr
    samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
    out, err = samtools_version.communicate()
    version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
    print("check samtools version:",version)
    ### end of samtools version check and its analysis
    # samtools <=1.2 takes an output *prefix*; newer versions write to stdout
    if LooseVersion(version)<=LooseVersion("1.2"):
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
    else:
        subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
| 1242 | |
def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode,phred_offset):
    """Extract mapped reads, micro-assemble them with SPAdes, and blast the
    antigen database against the resulting contigs.

    Returns (xmlfile, new_fasta): the blast XML output name and the contig
    fasta name.  WARNING: when too few reads mapped, the 'else' branch below
    sets xmlfile="NA" but never assigns new_fasta, so the final return raises
    UnboundLocalError.  The caller in main() deliberately catches
    (UnboundLocalError, subprocess.CalledProcessError) and treats that as the
    "nothing assembled" case -- do not "fix" this return without also
    updating the caller's exception handling.
    """
    #seqsero2 -a; extract, assembly and blast
    #subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
    subprocess.check_call("samtools bam2fq "+sorted_bam+" > "+combined_fq+" 2>> data_log.txt",shell=True) ## change to samtools bam2fq. 202509
    #print("fnameA:",fnameA)
    #print("fnameB:",fnameB)
    if fnameB!="":
        # paired-end: split the mapped reads back into R1/R2 files for SPAdes
        #subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
        subprocess.check_call("samtools bam2fq -1 "+mapped_fq1+" -2 "+mapped_fq2+" -0 /dev/null -s /dev/null -n "+sorted_bam+" 2>> data_log.txt",shell=True) ## change to samtools bam2fq. 202509
    else:
        pass
    outdir=current_time+"_temp"
    print("assembling...")
    # cap assembly threads at 4 -- the extracted read set is small
    if int(threads)>4:
        t="4"
    else:
        t=threads
    # only assemble when enough reads mapped (file size > 100 bytes as proxy)
    if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
        if phred_offset == 'auto':
            phred_offset = ''
        else:
            phred_offset = '--phred-offset ' + phred_offset

        if fnameB!="":
            #print("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1")
            subprocess.check_call("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
        else:
            subprocess.check_call("spades.py --careful "+phred_offset+" --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
        new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
        #new_fasta=fnameA+"_"+database.split('/')[-1]+"_"+mapping_mode+".fasta" # change path to databse for packaging
        subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
        #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
        subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
        print("blasting...","\n")
        xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
        subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
        subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
    else:
        # new_fasta intentionally left unassigned here; see docstring warning
        xmlfile="NA"
    return xmlfile,new_fasta
| 1283 | |
def judge_subspecies(fnameA):
    """Predict the Salmonella subspecies from the forward raw reads.

    seqsero2 -a; runs SalmID.py on *fnameA*, appends its stdout to
    data_log.txt, and derives a subspecies label from the tab-separated
    score table.  Returns one of the subspecies codes, 'bongori',
    'enterica', or '-' when no score is convincing.
    """
    proc = subprocess.Popen("SalmID.py -i " + fnameA, shell=True,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    out = out.decode("utf-8")
    with open("data_log.txt", "a") as log_handle:
        log_handle.write(out)
    # row 0 holds the column names, row 1 the scores; columns 6+ are the
    # per-subspecies entries, columns 4/5 are the bongori/enterica rpoB scores
    name_fields = out.split("\n")[0].split("\t")
    score_fields = out.split("\n")[1].split("\t")
    subsp_names = name_fields[6:]
    subsp_scores = score_fields[6:]
    best_score = 0
    best_index = 1  # default index 1 corresponds to subspecies "I"
    for idx, raw in enumerate(subsp_scores):
        if float(raw) > best_score:
            best_score = float(raw)
            best_index = idx
    prediction = subsp_names[best_index].split(".")[1].strip().split(" ")[0]
    bongori_score = float(score_fields[4])
    enterica_score = float(score_fields[5])
    # bongori columns sit after enterica ones, so an explicit comparison is
    # needed or the call would always come out as enterica
    if bongori_score > 10 and bongori_score > enterica_score:  ## ed_SL_0318: change SalmID_ssp_threshold
        prediction = "bongori"
    #if max_score<10: ## ed_SL_0318: change SalmID_ssp_threshold
    if best_score < 60:
        prediction = "-"
    ## ed_SL_0818: add for enterica
    if enterica_score > 10 and enterica_score > bongori_score:
        prediction = "enterica"
    ##
    return prediction
| 1312 | |
def judge_subspecies_Kmer(Special_dict):
    """K-mer mode ('-k') subspecies call from the special-gene score table.

    Scans entries whose key contains 'mer' and whose score exceeds 60,
    keeping the highest-scoring subspecies label (last underscore-separated
    token of the key).  A bongori marker above 95 wins immediately.
    Returns '-' when nothing passes the threshold.
    """
    best_score = 0
    prediction = "-"  # default when no marker clears the 60 threshold
    for marker, raw_score in Special_dict.items():
        #if "mer" in x: ## ed_SL_0318: change ssp_threshold
        if "mer" not in marker or float(raw_score) <= 60:
            continue
        score = float(raw_score)
        label = marker.split("_")[-1].strip()
        if best_score < score:
            best_score = score
            prediction = label
        if label == "bongori" and score > 95:  # bongori already certain; skip enterica checks
            prediction = "bongori"
            break
    return prediction
| 1327 | |
| 1328 ## ed_SL_11232019: add notes for missing antigen | |
| 1329 def check_antigens(ssp,O_antigen,H1_antigen,H2_antigen,NA_note): | |
| 1330 antigen_note = '' | |
| 1331 if ssp != '-': | |
| 1332 if O_antigen != '-' and H1_antigen == '-' and H2_antigen == '-': # O:-:- | |
| 1333 antigen_note = 'H antigens were not detected. This is an atypical result that should be further investigated. Most Salmonella strains have at least fliC, encoding the Phase 1 H antigen, even if it is not expressed. ' | |
| 1334 NA_note = '' | |
| 1335 elif O_antigen != '-' and H1_antigen == '-' and H2_antigen != '-': # O:-:H2 | |
| 1336 antigen_note = 'fliC was not detected. This is an atypical result that should be further investigated. Most Salmonella strains have fliC, encoding the Phase 1 H antigen, even if it is not expressed. ' | |
| 1337 NA_note = '' | |
| 1338 elif O_antigen == '-' and H1_antigen != '-': # -:H1:X | |
| 1339 antigen_note = 'O antigen was not detected. This result may be due to a rough strain that has deleted the rfb region. For raw reads input, the k-mer workflow is sometimes more sensitive than the microassembly workflow in detecting O antigen. Caution should be used with this approach because the k-mer result may be due to low levels of contamination. ' | |
| 1340 NA_note = '' | |
| 1341 elif O_antigen == '-' and H1_antigen == '-' and H2_antigen == '-': # -:-:- | |
| 1342 antigen_note = 'No serotype antigens were detected. This is an atypical result that should be further investigated. ' | |
| 1343 NA_note = '' | |
| 1344 else: | |
| 1345 antigen_note = 'The input genome cannot be identified as Salmonella. Check the input for taxonomic ID, contamination, or sequencing quality. ' | |
| 1346 NA_note = '' | |
| 1347 if ssp == 'enterica': | |
| 1348 antigen_note += 'Subspecies identification of the input genome cannot be definitively determined. ' | |
| 1349 NA_note = '' | |
| 1350 # if [O_antigen, H1_antigen, H2_antigen].count('-') >= 2: | |
| 1351 # antigen_note = 'No subspecies marker was detected and less than 2 serotype antigens were detected; further, this genome was not identified as Salmonella. This is an atypical result that should be further investigated. ' | |
| 1352 # else: | |
| 1353 # antigen_note = 'No subspecies marker was detected. This genome may not be Salmonella. This is an atypical result that should be further investigated. ' | |
| 1354 return (antigen_note,NA_note) | |
| 1355 | |
## ed_SL_06062020: rename subspecies ID
# Lookup table translating the short internal subspecies codes into the full
# display names printed in reports; '-' (undetermined) maps to itself.
subspecies_ID_dir = {
    'I': 'Salmonella enterica subspecies enterica (subspecies I)',
    'II': 'Salmonella enterica subspecies salamae (subspecies II)',
    'IIIa': 'Salmonella enterica subspecies arizonae (subspecies IIIa)',
    'IIIb': 'Salmonella enterica subspecies diarizonae (subspecies IIIb)',
    'IV': 'Salmonella enterica subspecies houtenae (subspecies IV)',
    'VI': 'Salmonella enterica subspecies indica (subspecies VI)',
    'VII': 'Salmonella enterica subspecies VII (subspecies VII)',
    'bongori': 'Salmonella bongori',
    'enterica': 'Salmonella enterica',
    '-': '-',
}
##
| 1368 | |
| 1369 ## ed_SL_08172020: format check for fasta or fastq in nanopore workflow, convert raw reads fastq to fasta | |
## ed_SL_08172020: format check for fasta or fastq in nanopore workflow, convert raw reads fastq to fasta
def format_check(input_file):
    """Return a fasta path for *input_file*, converting fastq via seqtk.

    A file whose first line starts with '>' is already fasta and is returned
    unchanged; one starting with '@' is assumed to be fastq and converted to
    '<input_file>.fasta' with 'seqtk seq -A'.  Anything else raises
    ValueError (previously this path crashed with UnboundLocalError because
    output_file was never assigned).
    """
    with open(input_file, 'r') as handle:  # 'with' fixes the leaked handle
        line = handle.readline()
    if line.startswith('>'):
        output_file = input_file
    elif line.startswith('@'):
        input_file_fa = input_file + '.fasta'
        subprocess.check_call("seqtk seq -A "+input_file+" > "+input_file_fa,shell=True)
        output_file = input_file_fa
    else:
        print ('please check the format of input files')
        raise ValueError('unrecognized input format: expected fasta or fastq')
    return (output_file)
##
| 1382 | |
def main():
    """Command-line entry point.

    Combines the SeqSeroK (k-mer, '-m k') and SeqSero2 micro-assembly
    ('-m a') serotyping workflows with SalmID-based subspecies calling and
    7-gene MLST.  Writes SeqSero_result.txt and SeqSero_result.tsv into the
    output directory (unless '-c' clean mode removes it) and echoes the same
    report to stdout.
    """
    #combine SeqSeroK and SeqSero2, also with SalmID
    args = parse_args()
    input_file = args.i
    data_type = args.t
    analysis_mode = args.m
    mapping_mode=args.b
    threads=args.p
    make_dir=args.d
    clean_mode=args.c
    sample_name=args.n
    ingore_header=args.s
    phred_offset=args.phred_offset
    k_size=27 #will change for bug fixing
    dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2s_db')) # ed_SL_09152019: add ex_dir for packaging
    seqsero2_db=ex_dir+"/H_and_O_and_specific_genes.fasta" # ed_SL_11092019: change path to database for packaging
    database="H_and_O_and_specific_genes.fasta"
    mlst_count=pickle.load(open(ex_dir+"/mlst.pickle", "rb"))
    note="Note: "
    NA_note="This predicted serotype is not in the Kauffman-White scheme. " # ed_SL_09272019: add for new output format
    if len(sys.argv)==1:
        # invoked with no arguments at all: just print the help text
        subprocess.check_call(dirpath+"/SeqSero2S.py -h",shell=True)#change name of python file
    else:
        # unique output-directory suffix: timestamp plus a random integer
        request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
        request_id += str(random.randint(1, 10000000))
        if make_dir is None:
            make_dir="SeqSero_result_"+request_id
        make_dir=os.path.abspath(make_dir)
        if os.path.isdir(make_dir):
            pass
        else:
            subprocess.check_call("mkdir -p "+make_dir,shell=True)
        # symlink the database and the inputs into the output directory
        subprocess.check_call("ln -f -s "+seqsero2_db+" "+" ".join(input_file)+" "+make_dir,shell=True) # ed_SL_11092019: change path to database for packaging
        #subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
        ############################begin the real analysis
        if analysis_mode=="a":
            if data_type in ["1","2","3"]:#use allele mode
                for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
                os.chdir(make_dir)
                ###add a function to tell input files
                fnameA=for_fq.split("/")[-1]
                fnameB=rev_fq.split("/")[-1]
                current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
                sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
                map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort

                ### avoid error out when micro assembly fails. ed_SL_03172020
                # NOTE: extract_... deliberately raises UnboundLocalError when
                # too few reads mapped; that is treated as the "NA" outcome here.
                try:
                    xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode,phred_offset) #extract the mapped reads and do micro assembly and blast
                except (UnboundLocalError, subprocess.CalledProcessError):
                    xmlfile="NA"
                    H1_cont_stat_list=[]
                    H2_cont_stat_list=[]
                ###
                if xmlfile=="NA":
                    O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
                else:
                    Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
                    file=open("data_log.txt","a")
                    for x in Final_list:
                        file.write("\t".join(str(y) for y in x)+"\n")
                    file.close()
                    # keep hits with >=0.9 coverage and sufficient score/length
                    Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
                    O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
                subspecies=judge_subspecies(fnameA) #predict subspecies
                ### ed_SL_06062020: correction VIII -> II
                if subspecies == 'VIII':
                    subspecies = 'II'
                ### ed_SL_08132020: correction VII -> IV, according to CDC's suggestion
                if subspecies == 'VII':
                    subspecies = 'IV'
                    note+='SalmID reports this as ssp VII, which has not been formally recognized. '
                ###
                ### ed_SL_08182020: change serotype ouput for genome without definitive subspecies ID
                ssp_pointer = subspecies
                if subspecies == 'enterica':
                    subspecies = '-'
                ###
                ###MLST
                #print("MLST using https://github.com/jordanlab/stringMLST")
                #print("7-gene MLST...")
                mlst_result = stringmlst(for_fq,rev_fq,data_type,ex_dir)
                st = mlst_result[0]
                alleles = mlst_result[1]
                sorted_alleles = sorted(alleles.items())
                try:
                    st_count = str(mlst_count[st])
                except:
                    st_count = '0'  # ST not present in the EnteroBase count table
                subprocess.call("rm H_and_O_and_specific_genes.fasta* *.sra *.bam *.sam *.fastq *.gz *.fq temp.txt "+fnameA+"*_db* 2> /dev/null",shell=True)
                ###
                ###output
                predict_form_ss2,predict_sero_ss2,star_ss2,star_line_ss2,claim_ss2=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies,'ss2')
                predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies,'ss2s')
                claim="" #04132019, disable claim for new report requirement
                contamination_report=""
                H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
                if contamination_O!="" and contamination_H=="":
                    contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
                elif contamination_O=="" and contamination_H!="":
                    contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
                elif contamination_O!="" and contamination_H!="":
                    contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
                if contamination_report!="":
                    #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
                    contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles. "

                ### ed_SL_11232019: add notes for missing antigen
                if O_choice=="":
                    O_choice="-"
                antigen_note,NA_note=check_antigens(ssp_pointer,O_choice,fliC_choice,fljB_choice,NA_note)
                if sample_name:
                    print ("Sample name:\t"+sample_name)
                ###
                if clean_mode:
                    # '-c': remove the whole output directory, report to stdout only
                    subprocess.check_call("rm -rf "+make_dir,shell=True)
                    make_dir="none-output-directory due to '-c' flag"
                else:
                    new_file=open("SeqSero_result.txt","w")
                    ### ed_SL_01152020: add new output
                    conta_note="yes" if "inter-serotype contamination" in contamination_report else "no"
                    tsv_file=open("SeqSero_result.tsv","w")
                    if ingore_header:
                        pass
                    else:
                        tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted identification\tPredicted antigenic profile\tPredicted serotype\tPredicted serotype (SeqSero2 v1.3.2)\tPotential inter-serotype contamination\tNote\tST\n")
                    if sample_name:
                        new_file.write("Sample name:\t"+sample_name+"\n")
                        tsv_file.write(sample_name+'\t')
                    else:
                        tsv_file.write(input_file[0].split('/')[-1]+'\t')
                    ###
                    if "N/A" not in predict_sero:
                        new_file.write("Output directory:\t"+make_dir+"\n"+
                                       "Input files:\t"+"\t".join(input_file)+"\n"+
                                       "O antigen prediction:\t"+O_choice+"\n"+
                                       "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
                                       "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
                                       "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                                       "Predicted antigenic profile:\t"+predict_form+"\n"+
                                       "Predicted serotype:\t"+predict_sero+"\n"+
                                       "Predicted serotype (SeqSero2 v1.3.2):\t"+predict_sero_ss2+"\n"+
                                       note+contamination_report+star_line+claim+antigen_note+"\n")#+##
                        tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+predict_sero+"\t"+predict_sero_ss2+"\t"+conta_note+"\t"+contamination_report+star_line+claim+antigen_note+"\t"+st+"\n")
                    else:
                        new_file.write("Output directory:\t"+make_dir+"\n"+
                                       "Input files:\t"+"\t".join(input_file)+"\n"+
                                       "O antigen prediction:\t"+O_choice+"\n"+
                                       "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
                                       "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
                                       "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                                       "Predicted antigenic profile:\t"+predict_form+"\n"+
                                       "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, add subspecies
                                       "Predicted serotype (SeqSero2 v1.3.2):\t"+subspecies+' '+predict_form_ss2+"\n"+
                                       note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")#+##
                        tsv_file.write(make_dir+"\t"+" ".join(input_file)+"\t"+O_choice+"\t"+fliC_choice+"\t"+fljB_choice+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+subspecies+' '+predict_form_ss2+"\t"+conta_note+"\t"+NA_note+contamination_report+star_line+claim+antigen_note+"\t"+st+"\n")
                    ##MLST
                    new_file.write("Sequence type:\t"+st+"\n"+
                                   "Number of ST"+st+" strains in EnteroBase:\t"+st_count+"\n"+
                                   "\n".join([k+":\t"+v for k,v in sorted_alleles]))
                    ###
                    new_file.close()
                    tsv_file.close()
                # mirror the report on stdout (printed in clean mode too)
                if "N/A" not in predict_sero:
                    print("Output directory:\t"+make_dir+"\n"+
                          "Input files:\t"+"\t".join(input_file)+"\n"+
                          "O antigen prediction:\t"+O_choice+"\n"+
                          "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
                          "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
                          "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                          "Predicted antigenic profile:\t"+predict_form+"\n"+
                          "Predicted serotype:\t"+predict_sero+"\n"+
                          "Predicted serotype (SeqSero2 v1.3.2):\t"+predict_sero_ss2+"\n"+
                          note+contamination_report+star_line+claim+antigen_note+"\n")#+##
                else:
                    print("Output directory:\t"+make_dir+"\n"+
                          "Input files:\t"+"\t".join(input_file)+"\n"+
                          "O antigen prediction:\t"+O_choice+"\n"+
                          "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
                          "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
                          "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                          "Predicted antigenic profile:\t"+predict_form+"\n"+
                          "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
                          "Predicted serotype (SeqSero2 v1.3.2):\t"+subspecies+' '+predict_form_ss2+"\n"+
                          note+NA_note+contamination_report+star_line+claim+antigen_note+"\n")
                ###MLST
                print("Sequence type: "+st)
                print("Number of ST"+st+" strains in EnteroBase: "+st_count)
                #print("Allele profile...")
                for k,v in sorted_alleles:
                    print(k+': '+v)
                print('\n')
                ###
            else:
                print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
        elif analysis_mode=="k":
            #ex_dir = os.path.dirname(os.path.realpath(__file__))
            ex_dir = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)),'seqsero2s_db')) # ed_SL_09152019: change ex_dir for packaging
            for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
            input_file = for_fq #-k will just use forward because not all reads were used
            os.chdir(make_dir)
            ### ed_SL_08172020: use assembly workflow for nanopore fastq, convert fastq to fasta
            if data_type == "5":
                input_file = format_check(for_fq)
            ###
            f = open(ex_dir + '/antigens.pickle', 'rb')
            lib_dict = pickle.load(f)
            f.close # NOTE(review): missing '()' -- the pickle handle is never actually closed here
            input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
            O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
            highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
            subspecies=judge_subspecies_Kmer(Special_dict)
            if subspecies=="IIb" or subspecies=="IIa":
                subspecies="II"
            ### ed_SL_06062020: correction VIII -> II
            if subspecies == 'VIII':
                subspecies = 'II'
            ### ed_SL_08132020: correction VII -> IV, according to CDC's suggestion
            if subspecies == 'VII':
                subspecies = 'IV'
                note+='SalmID reports this as ssp VII, which has not been formally recognized. '
            ###
            ### ed_SL_08182020: change serotype ouput for genome without definitive subspecies ID
            ssp_pointer = subspecies
            if subspecies == 'enterica':
                subspecies = '-'
            ###
            predict_form_ss2,predict_sero_ss2,star_ss2,star_line_ss2,claim_ss2 = seqsero_from_formula_to_serotypes(
                highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies, 'ss2')
            predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
                highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies, 'ss2s')
            claim="" #no claim any more based on new output requirement

            ### ed_SL_11232019: add notes for missing antigen
            if highest_O.split('-')[-1]=="":
                O_choice="-"
            else:
                O_choice=highest_O.split('-')[-1]
            antigen_note,NA_note=check_antigens(ssp_pointer,O_choice,highest_fliC,highest_fljB,NA_note)
            if sample_name:
                print ("Sample name:\t"+sample_name)
            ###
            ###MLST
            if data_type in ["4","5"]:
                #print("MLST using https://github.com/tseemann/mlst")
                #print("7-gene MLST...")
                mlst_result = mlst(args.i[0])
            if data_type in ["1","2","3"]:
                #print("MLST using https://github.com/jordanlab/stringMLST")
                #print("7-gene MLST...")
                mlst_result = stringmlst(for_fq,rev_fq,data_type,ex_dir)
            st = mlst_result[0]
            alleles = mlst_result[1]
            sorted_alleles = sorted(alleles.items())
            try:
                st_count = str(mlst_count[st])
            except:
                st_count = '0'  # ST not present in the EnteroBase count table
            subprocess.call("rm *.fasta* *.fastq *.gz *.fq temp.txt *.sra 2> /dev/null",shell=True)
            ###
            ###output
            if clean_mode:
                subprocess.check_call("rm -rf "+make_dir,shell=True)
                make_dir="none-output-directory due to '-c' flag"
            else:
                new_file=open("SeqSero_result.txt","w")
                tsv_file=open("SeqSero_result.tsv","w")
                if ingore_header:
                    pass
                else:
                    tsv_file.write("Sample name\tOutput directory\tInput files\tO antigen prediction\tH1 antigen prediction(fliC)\tH2 antigen prediction(fljB)\tPredicted identification\tPredicted antigenic profile\tPredicted serotype\tPredicted serotype (SeqSero2 v1.3.2)\tNote\tST\n")
                if sample_name:
                    new_file.write("Sample name:\t"+sample_name+"\n")
                    tsv_file.write(sample_name+'\t')
                else:
                    tsv_file.write(input_file.split('/')[-1]+'\t')
                ###
                if "N/A" not in predict_sero:
                    new_file.write("Output directory:\t"+make_dir+"\n"+
                                   "Input files:\t"+input_file+"\n"+
                                   "O antigen prediction:\t"+O_choice+"\n"+
                                   "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
                                   "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
                                   "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                                   "Predicted antigenic profile:\t"+predict_form+"\n"+
                                   "Predicted serotype:\t"+predict_sero+"\n"+
                                   "Predicted serotype (SeqSero2 v1.3.2):\t"+predict_sero_ss2+"\n"+
                                   note+star_line+claim+antigen_note+"\n")#+##
                    tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+predict_sero+"\t"+predict_sero_ss2+"\t"+star_line+claim+antigen_note+"\t"+st+"\n")
                else:
                    new_file.write("Output directory:\t"+make_dir+"\n"+
                                   "Input files:\t"+input_file+"\n"+
                                   "O antigen prediction:\t"+O_choice+"\n"+
                                   "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
                                   "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
                                   "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                                   "Predicted antigenic profile:\t"+predict_form+"\n"+
                                   "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
                                   "Predicted serotype (SeqSero2 v1.3.2):\t"+subspecies+' '+predict_form_ss2+"\n"+
                                   note+NA_note+star_line+claim+antigen_note+"\n")#+##
                    tsv_file.write(make_dir+"\t"+input_file+"\t"+O_choice+"\t"+highest_fliC+"\t"+highest_fljB+"\t"+subspecies_ID_dir[ssp_pointer]+"\t"+predict_form+"\t"+subspecies+' '+predict_form+"\t"+subspecies+' '+predict_form_ss2+"\t"+NA_note+star_line+claim+antigen_note+"\t"+st+"\n")
                ###MLST
                new_file.write("Sequence type:\t"+st+"\n"+
                               "Number of ST"+st+" strains in EnteroBase:\t"+st_count+"\n"+
                               "\n".join([k+":\t"+v for k,v in sorted_alleles]))
                ###
                new_file.close()
                tsv_file.close()
            # mirror the report on stdout (printed in clean mode too)
            if "N/A" not in predict_sero:
                print("Output directory:\t"+make_dir+"\n"+
                      "Input files:\t"+input_file+"\n"+
                      "O antigen prediction:\t"+O_choice+"\n"+
                      "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
                      "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
                      "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                      "Predicted antigenic profile:\t"+predict_form+"\n"+
                      "Predicted serotype:\t"+predict_sero+"\n"+
                      "Predicted serotype (SeqSero2 v1.3.2):\t"+predict_sero_ss2+"\n"+
                      note+star_line+claim+antigen_note+"\n")#+##
            else:
                print("Output directory:\t"+make_dir+"\n"+
                      "Input files:\t"+input_file+"\n"+
                      "O antigen prediction:\t"+O_choice+"\n"+
                      "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
                      "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
                      "Predicted identification:\t"+subspecies_ID_dir[ssp_pointer]+"\n"+
                      "Predicted antigenic profile:\t"+predict_form+"\n"+
                      "Predicted serotype:\t"+subspecies+' '+predict_form+"\n"+ # add serotype output for "N/A" prediction, subspecies
                      "Predicted serotype (SeqSero2 v1.3.2):\t"+subspecies+' '+predict_form_ss2+"\n"+
                      note+NA_note+star_line+claim+antigen_note+"\n")#+##
            ###MLST
            print("Sequence type: "+st)
            print("Number of ST"+st+" strains in EnteroBase: "+st_count)
            #print("Allele profile...")
            for k,v in sorted_alleles:
                print(k+': '+v)
            print('\n')
            ###
# Script entry point: run the combined workflow when executed as a program.
if __name__ == '__main__':
    main()
