seqsero2_v1_0_1: SeqSero2_package.py annotate

annotate SeqSero2_package.py @ 7:a3accb97716a

Uploaded

author	estrain
date	Fri, 06 Sep 2019 22:31:16 -0400
parents
children	66277925a124

rev	line source
estrain@7	1 #!/usr/bin/env python3
estrain@7	2
estrain@7	3 import sys
estrain@7	4 import time
estrain@7	5 import random
estrain@7	6 import os
estrain@7	7 import subprocess
estrain@7	8 import gzip
estrain@7	9 import io
estrain@7	10 import pickle
estrain@7	11 import argparse
estrain@7	12 import itertools
estrain@7	13 from distutils.version import LooseVersion
estrain@7	14 from distutils.spawn import find_executable
estrain@7	15
estrain@7	16 try:
estrain@7	17 from .version import SeqSero2_version
estrain@7	18 except Exception: #ImportError
estrain@7	19 from version import SeqSero2_version
estrain@7	20
estrain@7	21 ### SeqSero Kmer
estrain@7	22 def parse_args():
estrain@7	23 "Parse the input arguments, use '-h' for help."
estrain@7	24 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-d <output_directory>] [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com\n\nVersion: v1.0.1')#add "-m <data_type>" in future
estrain@7	25 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data",type=os.path.abspath) ### ed_SL_05282019: add 'type=os.path.abspath' to generate absolute path of input data.
estrain@7	26 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1' for interleaved paired-end reads, '2' for separated paired-end reads, '3' for single reads, '4' for genome assembly, '5' for nanopore fasta, '6' for nanopore fastq")
estrain@7	27 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: algorithms for bwa mapping for allele mode; 'mem' for mem, 'sam' for samse/sampe; default=mem; optional; for now we only optimized for default 'mem' mode")
estrain@7	28 parser.add_argument("-p",default="1",help="<int>: number of threads for allele mode, if p >4, only 4 threads will be used for assembly since the amount of extracted reads is small, default=1")
estrain@7	29 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: which workflow to apply, 'a'(raw reads allele micro-assembly), 'k'(raw reads and genome assembly k-mer), default=a")
estrain@7	30 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
estrain@7	31 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will only output serotype prediction without the directory containing log files")
estrain@7	32 parser.add_argument("--check",action="store_true",help="<flag>: use '--check' flag to check the required dependencies")
estrain@7	33 parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SeqSero2_version)
estrain@7	34 return parser.parse_args()
estrain@7	35
estrain@7	36 ### ed_SL_05282019: check paths of dependencies
estrain@7	37 check_dependencies = parse_args().check
estrain@7	38 dependencies = ['bwa','samtools','blastn','fastq-dump','spades.py','bedtools','SalmID.py']
estrain@7	39 if check_dependencies:
estrain@7	40 for item in dependencies:
estrain@7	41 ext_path = find_executable(item)
estrain@7	42 if ext_path is not None:
estrain@7	43 print ("Using "+item+" - "+ext_path)
estrain@7	44 else:
estrain@7	45 print ("ERROR: can not find "+item+" in PATH")
estrain@7	46 sys.exit()
estrain@7	47 ### end of --check
estrain@7	48
estrain@7	49 def reverse_complement(sequence):
estrain@7	50 complement = {
estrain@7	51 'A': 'T',
estrain@7	52 'C': 'G',
estrain@7	53 'G': 'C',
estrain@7	54 'T': 'A',
estrain@7	55 'N': 'N',
estrain@7	56 'M': 'K',
estrain@7	57 'R': 'Y',
estrain@7	58 'W': 'W',
estrain@7	59 'S': 'S',
estrain@7	60 'Y': 'R',
estrain@7	61 'K': 'M',
estrain@7	62 'V': 'B',
estrain@7	63 'H': 'D',
estrain@7	64 'D': 'H',
estrain@7	65 'B': 'V'
estrain@7	66 }
estrain@7	67 return "".join(complement[base] for base in reversed(sequence))
estrain@7	68
estrain@7	69
estrain@7	70 def createKmerDict_reads(list_of_strings, kmer):
estrain@7	71 kmer_table = {}
estrain@7	72 for string in list_of_strings:
estrain@7	73 sequence = string.strip('\n')
estrain@7	74 for i in range(len(sequence) - kmer + 1):
estrain@7	75 new_mer = sequence[i:i + kmer].upper()
estrain@7	76 new_mer_rc = reverse_complement(new_mer)
estrain@7	77 if new_mer in kmer_table:
estrain@7	78 kmer_table[new_mer.upper()] += 1
estrain@7	79 else:
estrain@7	80 kmer_table[new_mer.upper()] = 1
estrain@7	81 if new_mer_rc in kmer_table:
estrain@7	82 kmer_table[new_mer_rc.upper()] += 1
estrain@7	83 else:
estrain@7	84 kmer_table[new_mer_rc.upper()] = 1
estrain@7	85 return kmer_table
estrain@7	86
estrain@7	87
estrain@7	88 def multifasta_dict(multifasta):
estrain@7	89 multifasta_list = [
estrain@7	90 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
estrain@7	91 ]
estrain@7	92 headers = [i for i in multifasta_list if i[0] == '>']
estrain@7	93 multifasta_dict = {}
estrain@7	94 for h in headers:
estrain@7	95 start = multifasta_list.index(h)
estrain@7	96 for element in multifasta_list[start + 1:]:
estrain@7	97 if element[0] == '>':
estrain@7	98 break
estrain@7	99 else:
estrain@7	100 if h[1:] in multifasta_dict:
estrain@7	101 multifasta_dict[h[1:]] += element
estrain@7	102 else:
estrain@7	103 multifasta_dict[h[1:]] = element
estrain@7	104 return multifasta_dict
estrain@7	105
estrain@7	106
estrain@7	107 def multifasta_single_string(multifasta):
estrain@7	108 multifasta_list = [
estrain@7	109 line.strip() for line in open(multifasta, 'r')
estrain@7	110 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
estrain@7	111 ]
estrain@7	112 return ''.join(multifasta_list)
estrain@7	113
estrain@7	114
estrain@7	115 def chunk_a_long_sequence(long_sequence, chunk_size=60):
estrain@7	116 chunk_list = []
estrain@7	117 steps = len(long_sequence) // 60 #how many chunks
estrain@7	118 for i in range(steps):
estrain@7	119 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
estrain@7	120 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
estrain@7	121 return chunk_list
estrain@7	122
estrain@7	123
estrain@7	124 def target_multifasta_kmerizer(multifasta, k, kmerDict):
estrain@7	125 forward_length = 300 #if find the target, put forward 300 bases
estrain@7	126 reverse_length = 2200 #if find the target, put backward 2200 bases
estrain@7	127 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
estrain@7	128 target_mers = []
estrain@7	129 long_single_string = multifasta_single_string(multifasta)
estrain@7	130 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
estrain@7	131 unit_length = len(multifasta_list[0])
estrain@7	132 forward_lines = int(forward_length / unit_length) + 1
estrain@7	133 reverse_lines = int(forward_length / unit_length) + 1
estrain@7	134 start_num = 0
estrain@7	135 end_num = 0
estrain@7	136 for i in range(len(multifasta_list)):
estrain@7	137 if i not in range(start_num, end_num): #avoid computational repetition
estrain@7	138 line = multifasta_list[i]
estrain@7	139 start = int((len(line) - k) // 2)
estrain@7	140 s1 = line[start:k + start]
estrain@7	141 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
estrain@7	142 if i - forward_lines >= 0:
estrain@7	143 start_num = i - forward_lines
estrain@7	144 else:
estrain@7	145 start_num = 0
estrain@7	146 if i + reverse_lines <= len(multifasta_list) - 1:
estrain@7	147 end_num = i + reverse_lines
estrain@7	148 else:
estrain@7	149 end_num = len(multifasta_list) - 1
estrain@7	150 target_list = [
estrain@7	151 x.strip() for x in multifasta_list[start_num:end_num]
estrain@7	152 ]
estrain@7	153 target_line = "".join(target_list)
estrain@7	154 target_mers += [
estrain@7	155 k1 for k1 in createKmerDict_reads([str(target_line)], k)
estrain@7	156 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
estrain@7	157 else:
estrain@7	158 pass
estrain@7	159 return set(target_mers)
estrain@7	160
estrain@7	161
estrain@7	162 def target_read_kmerizer(file, k, kmerDict):
estrain@7	163 i = 1
estrain@7	164 n_reads = 0
estrain@7	165 total_coverage = 0
estrain@7	166 target_mers = []
estrain@7	167 if file.endswith(".gz"):
estrain@7	168 file_content = io.BufferedReader(gzip.open(file))
estrain@7	169 else:
estrain@7	170 file_content = open(file, "r").readlines()
estrain@7	171 for line in file_content:
estrain@7	172 start = int((len(line) - k) // 2)
estrain@7	173 if i % 4 == 2:
estrain@7	174 if file.endswith(".gz"):
estrain@7	175 s1 = line[start:k + start].decode()
estrain@7	176 line = line.decode()
estrain@7	177 else:
estrain@7	178 s1 = line[start:k + start]
estrain@7	179 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
estrain@7	180 n_reads += 1
estrain@7	181 total_coverage += len(line)
estrain@7	182 target_mers += [
estrain@7	183 k1 for k1 in createKmerDict_reads([str(line)], k)
estrain@7	184 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
estrain@7	185 i += 1
estrain@7	186 if total_coverage >= 4000000:
estrain@7	187 break
estrain@7	188 return set(target_mers)
estrain@7	189
estrain@7	190
estrain@7	191 def minion_fasta_kmerizer(file, k, kmerDict):
estrain@7	192 i = 1
estrain@7	193 n_reads = 0
estrain@7	194 total_coverage = 0
estrain@7	195 target_mers = {}
estrain@7	196 for line in open(file):
estrain@7	197 if i % 2 == 0:
estrain@7	198 for kmer, rc_kmer in kmers(line.strip().upper(), k):
estrain@7	199 if (kmer in kmerDict) or (rc_kmer in kmerDict):
estrain@7	200 if kmer in target_mers:
estrain@7	201 target_mers[kmer] += 1
estrain@7	202 else:
estrain@7	203 target_mers[kmer] = 1
estrain@7	204 if rc_kmer in target_mers:
estrain@7	205 target_mers[rc_kmer] += 1
estrain@7	206 else:
estrain@7	207 target_mers[rc_kmer] = 1
estrain@7	208 i += 1
estrain@7	209 return set([h for h in target_mers])
estrain@7	210
estrain@7	211
estrain@7	212 def minion_fastq_kmerizer(file, k, kmerDict):
estrain@7	213 i = 1
estrain@7	214 n_reads = 0
estrain@7	215 total_coverage = 0
estrain@7	216 target_mers = {}
estrain@7	217 for line in open(file):
estrain@7	218 if i % 4 == 2:
estrain@7	219 for kmer, rc_kmer in kmers(line.strip().upper(), k):
estrain@7	220 if (kmer in kmerDict) or (rc_kmer in kmerDict):
estrain@7	221 if kmer in target_mers:
estrain@7	222 target_mers[kmer] += 1
estrain@7	223 else:
estrain@7	224 target_mers[kmer] = 1
estrain@7	225 if rc_kmer in target_mers:
estrain@7	226 target_mers[rc_kmer] += 1
estrain@7	227 else:
estrain@7	228 target_mers[rc_kmer] = 1
estrain@7	229 i += 1
estrain@7	230 return set([h for h in target_mers])
estrain@7	231
estrain@7	232
estrain@7	233 def multifasta_single_string2(multifasta):
estrain@7	234 single_string = ''
estrain@7	235 with open(multifasta, 'r') as f:
estrain@7	236 for line in f:
estrain@7	237 if line.strip()[0] == '>':
estrain@7	238 pass
estrain@7	239 else:
estrain@7	240 single_string += line.strip()
estrain@7	241 return single_string
estrain@7	242
estrain@7	243
estrain@7	244 def kmers(seq, k):
estrain@7	245 rev_comp = reverse_complement(seq)
estrain@7	246 for start in range(1, len(seq) - k + 1):
estrain@7	247 yield seq[start:start + k], rev_comp[-(start + k):-start]
estrain@7	248
estrain@7	249
estrain@7	250 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
estrain@7	251 multi_seq_dict = multifasta_dict(multifasta)
estrain@7	252 lib_dict = {}
estrain@7	253 for h in multi_seq_dict:
estrain@7	254 lib_dict[h] = set(
estrain@7	255 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
estrain@7	256 return lib_dict
estrain@7	257
estrain@7	258
estrain@7	259 def Combine(b, c):
estrain@7	260 fliC_combinations = []
estrain@7	261 fliC_combinations.append(",".join(c))
estrain@7	262 temp_combinations = []
estrain@7	263 for i in range(len(b)):
estrain@7	264 for x in itertools.combinations(b, i + 1):
estrain@7	265 temp_combinations.append(",".join(x))
estrain@7	266 for x in temp_combinations:
estrain@7	267 temp = []
estrain@7	268 for y in c:
estrain@7	269 temp.append(y)
estrain@7	270 temp.append(x)
estrain@7	271 temp = ",".join(temp)
estrain@7	272 temp = temp.split(",")
estrain@7	273 temp.sort()
estrain@7	274 temp = ",".join(temp)
estrain@7	275 fliC_combinations.append(temp)
estrain@7	276 return fliC_combinations
estrain@7	277
estrain@7	278
estrain@7	279 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
estrain@7	280 #like test_output_06012017.txt
estrain@7	281 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
estrain@7	282 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
estrain@7	283 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
estrain@7	284 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
estrain@7	285 seronames = []
estrain@7	286 seronames_none_subspecies=[]
estrain@7	287 for i in range(len(phase1)):
estrain@7	288 fliC_combine = []
estrain@7	289 fljB_combine = []
estrain@7	290 if phaseO[i] == Otype: # no VII in KW, but it's there
estrain@7	291 ### for fliC, detect every possible combinations to avoid the effect of "["
estrain@7	292 if phase1[i].count("[") == 0:
estrain@7	293 fliC_combine.append(phase1[i])
estrain@7	294 elif phase1[i].count("[") >= 1:
estrain@7	295 c = []
estrain@7	296 b = []
estrain@7	297 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
estrain@7	298 "[") == 1:
estrain@7	299 content = phase1[i].replace("[", "").replace("]", "")
estrain@7	300 fliC_combine.append(content)
estrain@7	301 fliC_combine.append("-")
estrain@7	302 else:
estrain@7	303 for x in phase1[i].split(","):
estrain@7	304 if "[" in x:
estrain@7	305 b.append(x.replace("[", "").replace("]", ""))
estrain@7	306 else:
estrain@7	307 c.append(x)
estrain@7	308 fliC_combine = Combine(
estrain@7	309 b, c
estrain@7	310 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
estrain@7	311 ### end of fliC "[" detect
estrain@7	312 ### for fljB, detect every possible combinations to avoid the effect of "["
estrain@7	313 if phase2[i].count("[") == 0:
estrain@7	314 fljB_combine.append(phase2[i])
estrain@7	315 elif phase2[i].count("[") >= 1:
estrain@7	316 d = []
estrain@7	317 e = []
estrain@7	318 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
estrain@7	319 "[") == 1:
estrain@7	320 content = phase2[i].replace("[", "").replace("]", "")
estrain@7	321 fljB_combine.append(content)
estrain@7	322 fljB_combine.append("-")
estrain@7	323 else:
estrain@7	324 for x in phase2[i].split(","):
estrain@7	325 if "[" in x:
estrain@7	326 d.append(x.replace("[", "").replace("]", ""))
estrain@7	327 else:
estrain@7	328 e.append(x)
estrain@7	329 fljB_combine = Combine(d, e)
estrain@7	330 ### end of fljB "[" detect
estrain@7	331 new_fliC = fliC.split(
estrain@7	332 ","
estrain@7	333 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
estrain@7	334 new_fliC.sort()
estrain@7	335 new_fliC = ",".join(new_fliC)
estrain@7	336 new_fljB = fljB.split(",")
estrain@7	337 new_fljB.sort()
estrain@7	338 new_fljB = ",".join(new_fljB)
estrain@7	339 if (new_fliC in fliC_combine
estrain@7	340 or fliC in fliC_combine) and (new_fljB in fljB_combine
estrain@7	341 or fljB in fljB_combine):
estrain@7	342 ######start, remove_list,rename_dict, added on 11/11/2018
estrain@7	343 if sero[i] not in remove_list:
estrain@7	344 temp_sero=sero[i]
estrain@7	345 if temp_sero in rename_dict:
estrain@7	346 temp_sero=rename_dict[temp_sero] #rename if in the rename list
estrain@7	347 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
estrain@7	348 if subs[i] == subspecies:
estrain@7	349 seronames.append(temp_sero)
estrain@7	350 seronames_none_subspecies.append(temp_sero)
estrain@7	351 else:
estrain@7	352 pass
estrain@7	353 else:
estrain@7	354 pass
estrain@7	355 ######end, added on 11/11/2018
estrain@7	356 #analyze seronames
estrain@7	357 subspecies_pointer=""
estrain@7	358 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
estrain@7	359 seronames=seronames_none_subspecies
estrain@7	360 subspecies_pointer="1"
estrain@7	361 if len(seronames) == 0:
estrain@7	362 seronames = [
estrain@7	363 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
estrain@7	364 ]
estrain@7	365 star = ""
estrain@7	366 star_line = ""
estrain@7	367 if len(seronames) > 1: #there are two possible predictions for serotypes
estrain@7	368 star = "*"
estrain@7	369 #changed 04072019
estrain@7	370 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
estrain@7	371 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
estrain@7	372 star="*"
estrain@7	373 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
estrain@7	374 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
estrain@7	375 if Otype=="":
estrain@7	376 Otype="-"
estrain@7	377 predict_form = Otype + ":" + fliC + ":" + fljB
estrain@7	378 predict_sero = (" or ").join(seronames)
estrain@7	379 ###special test for Enteritidis
estrain@7	380 if predict_form == "9:g,m:-":
estrain@7	381 sdf = "-"
estrain@7	382 for x in special_gene_list:
estrain@7	383 if x.startswith("sdf"):
estrain@7	384 sdf = "+"
estrain@7	385 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
estrain@7	386 star_line=" sdf gene detected." # ed_SL_04152019: new output format
estrain@7	387 #predict_form = predict_form + " Sdf prediction:" + sdf
estrain@7	388 predict_form = predict_form #changed 04072019
estrain@7	389 if sdf == "-":
estrain@7	390 star = "*"
estrain@7	391 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
estrain@7	392 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
estrain@7	393 #changed in 04072019, for new output
estrain@7	394 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
estrain@7	395 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
estrain@7	396 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
estrain@7	397 ###end of special test for Enteritidis
estrain@7	398 elif predict_form == "4:i:-":
estrain@7	399 predict_sero = "4:i:-"
estrain@7	400 elif predict_form == "4:r:-":
estrain@7	401 predict_sero = "4:r:-"
estrain@7	402 elif predict_form == "4:b:-":
estrain@7	403 predict_sero = "4:b:-"
estrain@7	404 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
estrain@7	405 #predict_sero = "Newport"
estrain@7	406 #star = "*"
estrain@7	407 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
estrain@7	408 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
estrain@7	409 if "N/A" in predict_sero:
estrain@7	410 claim = ""
estrain@7	411 #special test for Typhimurium
estrain@7	412 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
estrain@7	413 normal = 0
estrain@7	414 mutation = 0
estrain@7	415 for x in special_gene_list:
estrain@7	416 if "oafA-O-4_full" in x:
estrain@7	417 normal = float(special_gene_list[x])
estrain@7	418 elif "oafA-O-4_5-" in x:
estrain@7	419 mutation = float(special_gene_list[x])
estrain@7	420 if normal > mutation:
estrain@7	421 pass
estrain@7	422 elif normal < mutation:
estrain@7	423 #predict_sero = predict_sero.strip() + "(O5-)"
estrain@7	424 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
estrain@7	425 star = "*"
estrain@7	426 #star_line = "Detected the deletion of O5-."
estrain@7	427 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
estrain@7	428 else:
estrain@7	429 pass
estrain@7	430 #special test for Paratyphi B
estrain@7	431 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
estrain@7	432 normal = 0
estrain@7	433 mutation = 0
estrain@7	434 for x in special_gene_list:
estrain@7	435 if "gntR-family-regulatory-protein_dt-positive" in x:
estrain@7	436 normal = float(special_gene_list[x])
estrain@7	437 elif "gntR-family-regulatory-protein_dt-negative" in x:
estrain@7	438 mutation = float(special_gene_list[x])
estrain@7	439 #print(normal,mutation)
estrain@7	440 if normal > mutation:
estrain@7	441 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
estrain@7	442 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
estrain@7	443 star = "*"
estrain@7	444 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
estrain@7	445 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
estrain@7	446 elif normal < mutation:
estrain@7	447 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
estrain@7	448 predict_sero = predict_sero.strip()
estrain@7	449 star = "*"
estrain@7	450 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
estrain@7	451 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
estrain@7	452 else:
estrain@7	453 star = "*"
estrain@7	454 #star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
estrain@7	455 star_line = " " ## ed_SL_05152019: do not report this situation.
estrain@7	456 #special test for O13,22 and O13,23
estrain@7	457 if Otype=="13":
estrain@7	458 ex_dir = os.path.dirname(os.path.realpath(__file__))
estrain@7	459 f = open(ex_dir + '/special.pickle', 'rb')
estrain@7	460 special = pickle.load(f)
estrain@7	461 O22_O23=special['O22_O23']
estrain@7	462 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
estrain@7	463 O22_score=0
estrain@7	464 O23_score=0
estrain@7	465 for x in special_gene_list:
estrain@7	466 if "O:22" in x:
estrain@7	467 O22_score = O22_score+float(special_gene_list[x])
estrain@7	468 elif "O:23" in x:
estrain@7	469 O23_score = O23_score+float(special_gene_list[x])
estrain@7	470 #print(O22_score,O23_score)
estrain@7	471 for z in O22_O23[0]:
estrain@7	472 if predict_sero.split(" or ")[0] in z:
estrain@7	473 if O22_score > O23_score:
estrain@7	474 star = "*"
estrain@7	475 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
estrain@7	476 predict_sero = z[0]
estrain@7	477 elif O22_score < O23_score:
estrain@7	478 star = "*"
estrain@7	479 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
estrain@7	480 predict_sero = z[1]
estrain@7	481 else:
estrain@7	482 star = "*"
estrain@7	483 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
estrain@7	484 if " or " in predict_sero:
estrain@7	485 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
estrain@7	486 #special test for O6,8
estrain@7	487 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
estrain@7	488 #for x in merge_O68_list:
estrain@7	489 # if x in predict_sero:
estrain@7	490 # predict_sero=x
estrain@7	491 # star=""
estrain@7	492 # star_line=""
estrain@7	493 #special test for Montevideo; most of them are monophasic
estrain@7	494 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
estrain@7	495 #star="*"
estrain@7	496 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
estrain@7	497 return predict_form, predict_sero, star, star_line, claim
estrain@7	498 ### End of SeqSero Kmer part
estrain@7	499
estrain@7	500 ### Begin of SeqSero2 allele prediction and output
estrain@7	501 def xml_parse_score_comparision_seqsero(xmlfile):
estrain@7	502 #used to do seqsero xml analysis
estrain@7	503 from Bio.Blast import NCBIXML
estrain@7	504 handle=open(xmlfile)
estrain@7	505 handle=NCBIXML.parse(handle)
estrain@7	506 handle=list(handle)
estrain@7	507 List=[]
estrain@7	508 List_score=[]
estrain@7	509 List_ids=[]
estrain@7	510 List_query_region=[]
estrain@7	511 for i in range(len(handle)):
estrain@7	512 if len(handle[i].alignments)>0:
estrain@7	513 for j in range(len(handle[i].alignments)):
estrain@7	514 score=0
estrain@7	515 ids=0
estrain@7	516 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
estrain@7	517 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
estrain@7	518 for z in range(len(handle[i].alignments[j].hsps)):
estrain@7	519 hsp=handle[i].alignments[j].hsps[z]
estrain@7	520 temp=set(range(hsp.query_start,hsp.query_end))
estrain@7	521 if len(cover_region)==0:
estrain@7	522 cover_region=cover_region\|temp
estrain@7	523 fraction=1
estrain@7	524 else:
estrain@7	525 fraction=1-len(cover_region&temp)/float(len(temp))
estrain@7	526 cover_region=cover_region\|temp
estrain@7	527 if "last" in handle[i].query or "first" in handle[i].query:
estrain@7	528 score+=hsp.bits*fraction
estrain@7	529 ids+=float(hsp.identities)/handle[i].query_length*fraction
estrain@7	530 else:
estrain@7	531 score+=hsp.bits*fraction
estrain@7	532 ids+=float(hsp.identities)/handle[i].query_length*fraction
estrain@7	533 List_score.append(score)
estrain@7	534 List_ids.append(ids)
estrain@7	535 List_query_region.append(cover_region)
estrain@7	536 temp=zip(List,List_score,List_ids,List_query_region)
estrain@7	537 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
estrain@7	538 return Final_list
estrain@7	539
estrain@7	540
estrain@7	541 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
estrain@7	542 Old=L
estrain@7	543 L.sort()
estrain@7	544 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
estrain@7	545 count=[]
estrain@7	546 for j in range(len(L)):
estrain@7	547 y=0
estrain@7	548 for x in Old:
estrain@7	549 if L[j]==x:
estrain@7	550 y+=1
estrain@7	551 count.append(y)
estrain@7	552 if sort_on_fre!="none":
estrain@7	553 d=zip(*sorted(zip(count, L)))
estrain@7	554 L=d[1]
estrain@7	555 count=d[0]
estrain@7	556 return (L,count)
estrain@7	557
estrain@7	558 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
estrain@7	559 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
estrain@7	560 #this is mainly used for
estrain@7	561 a=nodes_vs_score_list
estrain@7	562 fliC_score=0
estrain@7	563 fljB_score=0
estrain@7	564 for z in a:
estrain@7	565 if "fliC" in z[0]:
estrain@7	566 fliC_score+=z[1]
estrain@7	567 elif "fljB" in z[0]:
estrain@7	568 fljB_score+=z[1]
estrain@7	569 if fliC_score>=fljB_score:
estrain@7	570 role="fliC"
estrain@7	571 else:
estrain@7	572 role="fljB"
estrain@7	573 return (role,abs(fliC_score-fljB_score))
estrain@7	574
estrain@7	575 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
estrain@7	576 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
estrain@7	577 #also used when no head or tail got blasted score for the contig
estrain@7	578 role=""
estrain@7	579 for z in Final_list_passed:
estrain@7	580 if node_name in z[0]:
estrain@7	581 role=z[0].split("_")[0]
estrain@7	582 break
estrain@7	583 return role
estrain@7	584
estrain@7	585 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
estrain@7	586 #nodes_list is the c created by c,d=Uniq(nodes) in below function
estrain@7	587 first_target=""
estrain@7	588 role_list=[]
estrain@7	589 for x in nodes_list:
estrain@7	590 a=[]
estrain@7	591 role=""
estrain@7	592 for y in tail_head_list:
estrain@7	593 if x in y[0]:
estrain@7	594 a.append(y)
estrain@7	595 if len(a)==4:
estrain@7	596 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
estrain@7	597 if diff<20:
estrain@7	598 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
estrain@7	599 elif len(a)==3:
estrain@7	600 ###however, if the one with highest score is the fewer one, compare their accumulation score
estrain@7	601 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
estrain@7	602 if diff<20:
estrain@7	603 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
estrain@7	604 ###end of above score comparison
estrain@7	605 elif len(a)==2:
estrain@7	606 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
estrain@7	607 temp=[]
estrain@7	608 for z in a:
estrain@7	609 temp.append(z[0].split("_")[0])
estrain@7	610 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
estrain@7	611 if len(m)==1:
estrain@7	612 pass
estrain@7	613 else:
estrain@7	614 pass
estrain@7	615 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
estrain@7	616 if diff<20:
estrain@7	617 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
estrain@7	618 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
estrain@7	619 elif len(a)==1:
estrain@7	620 #that one
estrain@7	621 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
estrain@7	622 if diff<20:
estrain@7	623 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
estrain@7	624 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
estrain@7	625 else:#a==0
estrain@7	626 #use Final_list_passed best match
estrain@7	627 for z in Final_list_passed:
estrain@7	628 if x in z[0]:
estrain@7	629 role=z[0].split("_")[0]
estrain@7	630 break
estrain@7	631 #print x,role,len(a)
estrain@7	632 role_list.append((role,x))
estrain@7	633 if len(role_list)==2:
estrain@7	634 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
estrain@7	635 #just use score to do a final test
estrain@7	636 role_list=[]
estrain@7	637 for x in nodes_list:
estrain@7	638 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
estrain@7	639 role_list.append((role,x))
estrain@7	640 return role_list
estrain@7	641
estrain@7	642 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
estrain@7	643 #used to decide which contig is FliC and which one is fljB
estrain@7	644 contigs=[]
estrain@7	645 nodes=[]
estrain@7	646 for x in Final_list_passed:
estrain@7	647 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
estrain@7	648 nodes.append(x[0].split("___")[1].strip())
estrain@7	649 c,d=Uniq(nodes)#c is node_list
estrain@7	650 #print c
estrain@7	651 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
estrain@7	652 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
estrain@7	653 return roles
estrain@7	654
estrain@7	655 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
estrain@7	656 #decide O based on Final_list
estrain@7	657 O_choice="?"
estrain@7	658 O_list=[]
estrain@7	659 special_genes={}
estrain@7	660 nodes=[]
estrain@7	661 for x in Final_list_passed:
estrain@7	662 if x[0].startswith("O-"):
estrain@7	663 nodes.append(x[0].split("___")[1].strip())
estrain@7	664 elif not x[0].startswith("fl"):
estrain@7	665 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
estrain@7	666 #print "special_genes:",special_genes
estrain@7	667 c,d=Uniq(nodes)
estrain@7	668 #print "potential O antigen contig",c
estrain@7	669 final_O=[]
estrain@7	670 O_nodes_list=[]
estrain@7	671 for x in c:#c is the list for contigs
estrain@7	672 temp=0
estrain@7	673 for y in Final_list_passed:
estrain@7	674 if x in y[0] and y[0].startswith("O-"):
estrain@7	675 final_O.append(y)
estrain@7	676 break
estrain@7	677 ### O contig has the problem of two genes on same contig, so do additional test
estrain@7	678 potenial_new_gene=""
estrain@7	679 for x in final_O:
estrain@7	680 pointer=0 #for genes merged or not
estrain@7	681 #not consider O-1,3,19_not_in_3,10, too short compared with others
estrain@7	682 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 3002 (for secureity can use 400*2) as flank region
estrain@7	683 pointer=x[0].split("___")[1].strip()#store the contig name
estrain@7	684 print(pointer)
estrain@7	685 if pointer!=0:#it has potential merge event
estrain@7	686 for y in Final_list:
estrain@7	687 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
estrain@7	688 potenial_new_gene=y
estrain@7	689 #print(potenial_new_gene)
estrain@7	690 break
estrain@7	691 if potenial_new_gene!="":
estrain@7	692 print("two differnt genes in same contig, fix it for O antigen")
estrain@7	693 print(potenial_new_gene[:3])
estrain@7	694 pointer=0
estrain@7	695 for y in final_O:
estrain@7	696 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
estrain@7	697 pointer=1
estrain@7	698 if pointer!=0: #changed to consider two genes in same contig
estrain@7	699 final_O.append(potenial_new_gene)
estrain@7	700 ### end of the two genes on same contig test
estrain@7	701 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
estrain@7	702 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
estrain@7	703 #print "$$$No Otype, due to no hit"#may need to be changed
estrain@7	704 O_choice="-"
estrain@7	705 else:
estrain@7	706 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
estrain@7	707 O_list=[]
estrain@7	708 O_list_less_contamination=[]
estrain@7	709 for x in final_O:
estrain@7	710 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
estrain@7	711 O_list.append(x[0].split("__")[0])
estrain@7	712 O_nodes_list.append(x[0].split("___")[1])
estrain@7	713 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
estrain@7	714 O_list_less_contamination.append(x[0].split("__")[0])
estrain@7	715 ### special test for O9,46 and O3,10 family
estrain@7	716 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
estrain@7	717 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
estrain@7	718 O_choice="O-9,46"
estrain@7	719 #print "$$$Most possilble Otype: O-9,46"
estrain@7	720 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
estrain@7	721 O_choice="O-9,46,27"
estrain@7	722 #print "$$$Most possilble Otype: O-9,46,27"
estrain@7	723 else:
estrain@7	724 O_choice="O-9"#next, detect O9 vs O2?
estrain@7	725 O2=0
estrain@7	726 O9=0
estrain@7	727 for z in special_genes:
estrain@7	728 if "tyr-O-9" in z:
estrain@7	729 O9=special_genes[z]
estrain@7	730 elif "tyr-O-2" in z:
estrain@7	731 O2=special_genes[z]
estrain@7	732 if O2>O9:
estrain@7	733 O_choice="O-2"
estrain@7	734 elif O2<O9:
estrain@7	735 pass
estrain@7	736 else:
estrain@7	737 pass
estrain@7	738 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
estrain@7	739 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
estrain@7	740 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
estrain@7	741 O_choice="O-3,10"
estrain@7	742 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
estrain@7	743 else:
estrain@7	744 O_choice="O-1,3,19"
estrain@7	745 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
estrain@7	746 ### end of special test for O9,46 and O3,10 family
estrain@7	747 else:
estrain@7	748 try:
estrain@7	749 max_score=0
estrain@7	750 for x in final_O:
estrain@7	751 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage0.15:#use x[2],08172018, the "coverage identity = cover_length identity"; also meet coverage threshold
estrain@7	752 max_score=x[2]#change from x[-1] to x[2],08172018
estrain@7	753 O_choice=x[0].split("_")[0]
estrain@7	754 if O_choice=="O-1,3,19":
estrain@7	755 O_choice=final_O[1][0].split("_")[0]
estrain@7	756 #print "$$$Most possilble Otype: ",O_choice
estrain@7	757 except:
estrain@7	758 pass
estrain@7	759 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
estrain@7	760 #print "O:",O_choice,O_nodes_list
estrain@7	761 Otypes=[]
estrain@7	762 for x in O_list:
estrain@7	763 if x!="O-1,3,19_not_in_3,10":
estrain@7	764 if "O-9,46_" not in x:
estrain@7	765 Otypes.append(x.split("_")[0])
estrain@7	766 else:
estrain@7	767 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
estrain@7	768 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
estrain@7	769 Otypes_uniq,Otypes_fre=Uniq(Otypes)
estrain@7	770 contamination_O=""
estrain@7	771 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
estrain@7	772 if len(Otypes_uniq)>2:
estrain@7	773 contamination_O="potential contamination from O antigen signals"
estrain@7	774 else:
estrain@7	775 if len(Otypes_uniq)>1:
estrain@7	776 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
estrain@7	777 contamination_O=""
estrain@7	778 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
estrain@7	779 contamination_O=""
estrain@7	780 else:
estrain@7	781 contamination_O="potential contamination from O antigen signals"
estrain@7	782 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
estrain@7	783 ### End of SeqSero2 allele prediction and output
estrain@7	784
estrain@7	785 def get_input_files(make_dir,input_file,data_type,dirpath):
estrain@7	786 #tell input files from datatype
estrain@7	787 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
estrain@7	788 for_fq=""
estrain@7	789 rev_fq=""
estrain@7	790 os.chdir(make_dir)
estrain@7	791 if data_type=="1":
estrain@7	792 input_file=input_file[0].split("/")[-1]
estrain@7	793 if input_file.endswith(".sra"):
estrain@7	794 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
estrain@7	795 for_fq=input_file.replace(".sra","_1.fastq")
estrain@7	796 rev_fq=input_file.replace(".sra","_2.fastq")
estrain@7	797 else:
estrain@7	798 core_id=input_file.split(".fastq")[0].split(".fq")[0]
estrain@7	799 for_fq=core_id+"_1.fastq"
estrain@7	800 rev_fq=core_id+"_2.fastq"
estrain@7	801 if input_file.endswith(".gz"):
estrain@7	802 subprocess.check_call("gzip -dc "+input_file+" \| "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
estrain@7	803 else:
estrain@7	804 subprocess.check_call("cat "+input_file+" \| "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True)
estrain@7	805 elif data_type=="2":
estrain@7	806 for_fq=input_file[0].split("/")[-1]
estrain@7	807 rev_fq=input_file[1].split("/")[-1]
estrain@7	808 elif data_type=="3":
estrain@7	809 input_file=input_file[0].split("/")[-1]
estrain@7	810 if input_file.endswith(".sra"):
estrain@7	811 subprocess.check_call("fastq-dump --split-files "+input_file,shell=True)
estrain@7	812 for_fq=input_file.replace(".sra","_1.fastq")
estrain@7	813 else:
estrain@7	814 for_fq=input_file
estrain@7	815 elif data_type in ["4","5","6"]:
estrain@7	816 for_fq=input_file[0].split("/")[-1]
estrain@7	817 os.chdir("..")
estrain@7	818 return for_fq,rev_fq
estrain@7	819
estrain@7	820 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
estrain@7	821 #get O and H types from Final_list from blast parsing; allele mode
estrain@7	822 from Bio import SeqIO
estrain@7	823 fliC_choice="-"
estrain@7	824 fljB_choice="-"
estrain@7	825 fliC_contig="NA"
estrain@7	826 fljB_contig="NA"
estrain@7	827 fliC_region=set([0])
estrain@7	828 fljB_region=set([0,])
estrain@7	829 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
estrain@7	830 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
estrain@7	831 O_choice="-"#no need to decide O contig for now, should be only one
estrain@7	832 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
estrain@7	833 O_choice=O_choice.split("-")[-1].strip()
estrain@7	834 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
estrain@7	835 O_choice="-"
estrain@7	836 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
estrain@7	837 #add alignment locations, used for further selection, 03312019
estrain@7	838 for i in range(len(H_contig_roles)):
estrain@7	839 x=H_contig_roles[i]
estrain@7	840 for y in Final_list_passed:
estrain@7	841 if x[1] in y[0] and y[0].startswith(x[0]):
estrain@7	842 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
estrain@7	843 break
estrain@7	844 log_file=open("SeqSero_log.txt","a")
estrain@7	845 extract_file=open("Extracted_antigen_alleles.fasta","a")
estrain@7	846 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
estrain@7	847
estrain@7	848 #print("O_contigs:")
estrain@7	849 log_file.write("O_contigs:\n")
estrain@7	850 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
estrain@7	851 extract_file.write("#O_contigs:\n")
estrain@7	852 for x in O_nodes_roles:
estrain@7	853 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
estrain@7	854 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
estrain@7	855 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
estrain@7	856 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
estrain@7	857 seqs=""
estrain@7	858 for z in handle_fasta:
estrain@7	859 if x[0].split("___")[-1]==z.description:
estrain@7	860 seqs=str(z.seq)
estrain@7	861 extract_file.write(title+seqs+"\n")
estrain@7	862 if len(H_contig_roles)!=0:
estrain@7	863 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
estrain@7	864 else:
estrain@7	865 highest_H_coverage=0
estrain@7	866 for x in H_contig_roles:
estrain@7	867 #if multiple choices, temporately select the one with longest length for now, will revise in further change
estrain@7	868 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
estrain@7	869 fliC_contig=x[1]
estrain@7	870 fliC_length=len(x[-1])
estrain@7	871 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
estrain@7	872 fljB_contig=x[1]
estrain@7	873 fljB_length=len(x[-1])
estrain@7	874 for x in Final_list_passed:
estrain@7	875 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
estrain@7	876 fliC_choice=x[0].split("_")[1]
estrain@7	877 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
estrain@7	878 fljB_choice=x[0].split("_")[1]
estrain@7	879 elif fliC_choice!="-" and fljB_choice!="-":
estrain@7	880 break
estrain@7	881 #now remove contigs not in middle core part
estrain@7	882 first_allele="NA"
estrain@7	883 first_allele_percentage=0
estrain@7	884 for x in Final_list:
estrain@7	885 if x[0].startswith("fliC") or x[0].startswith("fljB"):
estrain@7	886 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
estrain@7	887 first_allele_percentage=x[2]
estrain@7	888 break
estrain@7	889 additional_contigs=[]
estrain@7	890 for x in Final_list:
estrain@7	891 if first_allele in x[0]:
estrain@7	892 if (fliC_contig == x[0].split("___")[-1]):
estrain@7	893 fliC_region=x[3]
estrain@7	894 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
estrain@7	895 fljB_region=x[3]
estrain@7	896 else:
estrain@7	897 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
estrain@7	898 additional_contigs.append(x)
estrain@7	899 #else:
estrain@7	900 #print x[:3]
estrain@7	901 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
estrain@7	902 if first_allele_percentage>0.9:
estrain@7	903 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
estrain@7	904 target_region=fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
estrain@7	905 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
estrain@7	906 target_region=fliC_region\|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
estrain@7	907 else:
estrain@7	908 target_region=set()#doesn't do anything
estrain@7	909 else:
estrain@7	910 target_region=set()#doesn't do anything
estrain@7	911 #print(target_region)
estrain@7	912 #print(additional_contigs)
estrain@7	913 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
estrain@7	914 target_region=target_region2\|target_region
estrain@7	915 for x in additional_contigs:
estrain@7	916 removal=0
estrain@7	917 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
estrain@7	918 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length0.5<len(x[3])<contig_length1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
estrain@7	919 removal=1
estrain@7	920 else:
estrain@7	921 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
estrain@7	922 removal=1
estrain@7	923 else:
estrain@7	924 pass
estrain@7	925 if removal==1:
estrain@7	926 for y in H_contig_roles:
estrain@7	927 if y[1] in x[0]:
estrain@7	928 H_contig_roles.remove(y)
estrain@7	929 else:
estrain@7	930 pass
estrain@7	931 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length0.5,len(x[3]),contig_length1.5)
estrain@7	932 #end of removing none-middle contigs
estrain@7	933 #print("H_contigs:")
estrain@7	934 log_file.write("H_contigs:\n")
estrain@7	935 extract_file.write("#H_contigs:\n")
estrain@7	936 H_contig_stat=[]
estrain@7	937 H1_cont_stat={}
estrain@7	938 H2_cont_stat={}
estrain@7	939 for i in range(len(H_contig_roles)):
estrain@7	940 x=H_contig_roles[i]
estrain@7	941 a=0
estrain@7	942 for y in Final_list_passed:
estrain@7	943 if x[1] in y[0] and y[0].startswith(x[0]):
estrain@7	944 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
estrain@7	945 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
estrain@7	946 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
estrain@7	947 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
estrain@7	948 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
estrain@7	949 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
estrain@7	950 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
estrain@7	951 seqs=""
estrain@7	952 for z in handle_fasta:
estrain@7	953 if x[1]==z.description:
estrain@7	954 seqs=str(z.seq)
estrain@7	955 extract_file.write(title+seqs+"\n")
estrain@7	956 break
estrain@7	957 else:
estrain@7	958 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
estrain@7	959 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
estrain@7	960 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
estrain@7	961 seqs=""
estrain@7	962 for z in handle_fasta:
estrain@7	963 if x[1]==z.description:
estrain@7	964 seqs=str(z.seq)
estrain@7	965 extract_file.write(title+seqs+"\n")
estrain@7	966 if x[0]=="fliC":
estrain@7	967 if y[0].split("_")[1] not in H1_cont_stat:
estrain@7	968 H1_cont_stat[y[0].split("_")[1]]=y[2]
estrain@7	969 else:
estrain@7	970 H1_cont_stat[y[0].split("_")[1]]+=y[2]
estrain@7	971 if x[0]=="fljB":
estrain@7	972 if y[0].split("_")[1] not in H2_cont_stat:
estrain@7	973 H2_cont_stat[y[0].split("_")[1]]=y[2]
estrain@7	974 else:
estrain@7	975 H2_cont_stat[y[0].split("_")[1]]+=y[2]
estrain@7	976 break
estrain@7	977 #detect contaminations
estrain@7	978 #print(H1_cont_stat)
estrain@7	979 #print(H2_cont_stat)
estrain@7	980 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
estrain@7	981 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
estrain@7	982 contamination_H=""
estrain@7	983 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
estrain@7	984 contamination_H="potential contamination from H antigen signals"
estrain@7	985 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
estrain@7	986 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
estrain@7	987 #get additional antigens
estrain@7	988 """
estrain@7	989 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
estrain@7	990 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
estrain@7	991 O_choice="O-9,46"
estrain@7	992 #print "$$$Most possilble Otype: O-9,46"
estrain@7	993 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
estrain@7	994 O_choice="O-9,46,27"
estrain@7	995 #print "$$$Most possilble Otype: O-9,46,27"
estrain@7	996 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
estrain@7	997 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
estrain@7	998 O_choice="O-3,10"
estrain@7	999 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
estrain@7	1000 else:
estrain@7	1001 O_choice="O-1,3,19"
estrain@7	1002 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
estrain@7	1003 ### end of special test for O9,46 and O3,10 family
estrain@7	1004
estrain@7	1005 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
estrain@7	1006 if len(Otypes_uniq)>2:
estrain@7	1007 contamination_O="potential contamination from O antigen signals"
estrain@7	1008 else:
estrain@7	1009 if len(Otypes_uniq)>1:
estrain@7	1010 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
estrain@7	1011 contamination_O=""
estrain@7	1012 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
estrain@7	1013 contamination_O=""
estrain@7	1014 """
estrain@7	1015 additonal_antigents=[]
estrain@7	1016 #print(contamination_O)
estrain@7	1017 #print(contamination_H)
estrain@7	1018 log_file.write(contamination_O+"\n")
estrain@7	1019 log_file.write(contamination_H+"\n")
estrain@7	1020 log_file.close()
estrain@7	1021 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
estrain@7	1022
estrain@7	1023 def get_input_K(input_file,lib_dict,data_type,k_size):
estrain@7	1024 #kmer mode; get input_Ks from dict and data_type
estrain@7	1025 kmers = []
estrain@7	1026 for h in lib_dict:
estrain@7	1027 kmers += lib_dict[h]
estrain@7	1028 if data_type == '4':
estrain@7	1029 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
estrain@7	1030 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
estrain@7	1031 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
estrain@7	1032 elif data_type == '5':#minion_2d_fasta
estrain@7	1033 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
estrain@7	1034 if data_type == '6':#minion_2d_fastq
estrain@7	1035 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
estrain@7	1036 return input_Ks
estrain@7	1037
estrain@7	1038 def get_kmer_dict(lib_dict,input_Ks):
estrain@7	1039 #kmer mode; get predicted types
estrain@7	1040 O_dict = {}
estrain@7	1041 H_dict = {}
estrain@7	1042 Special_dict = {}
estrain@7	1043 for h in lib_dict:
estrain@7	1044 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
estrain@7	1045 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
estrain@7	1046 if h.startswith('O-') and score > 25:
estrain@7	1047 O_dict[h] = score
estrain@7	1048 if h.startswith('fl') and score > 40:
estrain@7	1049 H_dict[h] = score
estrain@7	1050 if (h[:2] != 'fl') and (h[:2] != 'O-'):
estrain@7	1051 Special_dict[h] = score
estrain@7	1052 return O_dict,H_dict,Special_dict
estrain@7	1053
estrain@7	1054 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
estrain@7	1055 log_file=open("SeqSero_log.txt","a")
estrain@7	1056 log_file.write("O_scores:\n")
estrain@7	1057 #call O:
estrain@7	1058 highest_O = '-'
estrain@7	1059 if len(O_dict) == 0:
estrain@7	1060 pass
estrain@7	1061 else:
estrain@7	1062 for x in O_dict:
estrain@7	1063 log_file.write(x+"\t"+str(O_dict[x])+"\n")
estrain@7	1064 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
estrain@7	1065 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
estrain@7	1066 highest_O = "O-9,46"
estrain@7	1067 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
estrain@7	1068 highest_O = "O-9,46,27"
estrain@7	1069 else:
estrain@7	1070 highest_O = "O-9" # next, detect O9 vs O2?
estrain@7	1071 O2 = 0
estrain@7	1072 O9 = 0
estrain@7	1073 for z in Special_dict:
estrain@7	1074 if "tyr-O-9" in z:
estrain@7	1075 O9 = float(Special_dict[z])
estrain@7	1076 if "tyr-O-2" in z:
estrain@7	1077 O2 = float(Special_dict[z])
estrain@7	1078 if O2 > O9:
estrain@7	1079 highest_O = "O-2"
estrain@7	1080 elif ("O-3,10_wzx__1539" in O_dict) and (
estrain@7	1081 "O-9,46_wzy__1191" in O_dict
estrain@7	1082 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
estrain@7	1083 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
estrain@7	1084 highest_O = "O-3,10"
estrain@7	1085 else:
estrain@7	1086 highest_O = "O-1,3,19"
estrain@7	1087 ### end of special test for O9,46 and O3,10 family
estrain@7	1088 else:
estrain@7	1089 try:
estrain@7	1090 max_score = 0
estrain@7	1091 for x in O_dict:
estrain@7	1092 if float(O_dict[x]) >= max_score:
estrain@7	1093 max_score = float(O_dict[x])
estrain@7	1094 highest_O = x.split("_")[0]
estrain@7	1095 if highest_O == "O-1,3,19":
estrain@7	1096 highest_O = '-'
estrain@7	1097 max_score = 0
estrain@7	1098 for x in O_dict:
estrain@7	1099 if x == 'O-1,3,19_not_in_3,10__130':
estrain@7	1100 pass
estrain@7	1101 else:
estrain@7	1102 if float(O_dict[x]) >= max_score:
estrain@7	1103 max_score = float(O_dict[x])
estrain@7	1104 highest_O = x.split("_")[0]
estrain@7	1105 except:
estrain@7	1106 pass
estrain@7	1107 #call_fliC:
estrain@7	1108 if len(H_dict)!=0:
estrain@7	1109 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
estrain@7	1110 else:
estrain@7	1111 highest_H_score_both_BC=0
estrain@7	1112 highest_fliC = '-'
estrain@7	1113 highest_fliC_raw = '-'
estrain@7	1114 highest_Score = 0
estrain@7	1115 log_file.write("\nH_scores:\n")
estrain@7	1116 for s in H_dict:
estrain@7	1117 log_file.write(s+"\t"+str(H_dict[s])+"\n")
estrain@7	1118 if s.startswith('fliC'):
estrain@7	1119 if float(H_dict[s]) > highest_Score:
estrain@7	1120 highest_fliC = s.split('_')[1]
estrain@7	1121 highest_fliC_raw = s
estrain@7	1122 highest_Score = float(H_dict[s])
estrain@7	1123 #call_fljB
estrain@7	1124 highest_fljB = '-'
estrain@7	1125 highest_fljB_raw = '-'
estrain@7	1126 highest_Score = 0
estrain@7	1127 for s in H_dict:
estrain@7	1128 if s.startswith('fljB'):
estrain@7	1129 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
estrain@7	1130 highest_fljB = s.split('_')[1]
estrain@7	1131 highest_fljB_raw = s
estrain@7	1132 highest_Score = float(H_dict[s])
estrain@7	1133 log_file.write("\nSpecial_scores:\n")
estrain@7	1134 for s in Special_dict:
estrain@7	1135 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
estrain@7	1136 log_file.close()
estrain@7	1137 return highest_O,highest_fliC,highest_fljB
estrain@7	1138
estrain@7	1139 def get_temp_file_names(for_fq,rev_fq):
estrain@7	1140 #seqsero2 -a; get temp file names
estrain@7	1141 sam=for_fq+".sam"
estrain@7	1142 bam=for_fq+".bam"
estrain@7	1143 sorted_bam=for_fq+"_sorted.bam"
estrain@7	1144 mapped_fq1=for_fq+"_mapped.fq"
estrain@7	1145 mapped_fq2=rev_fq+"_mapped.fq"
estrain@7	1146 combined_fq=for_fq+"_combined.fq"
estrain@7	1147 for_sai=for_fq+".sai"
estrain@7	1148 rev_sai=rev_fq+".sai"
estrain@7	1149 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
estrain@7	1150
estrain@7	1151 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
estrain@7	1152 #seqsero2 -a; do mapping and sort
estrain@7	1153 print("building database...")
estrain@7	1154 subprocess.check_call("bwa index "+database+ " 2>> data_log.txt",shell=True)
estrain@7	1155 print("mapping...")
estrain@7	1156 if mapping_mode=="mem":
estrain@7	1157 subprocess.check_call("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
estrain@7	1158 elif mapping_mode=="sam":
estrain@7	1159 if fnameB!="":
estrain@7	1160 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
estrain@7	1161 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai+ " 2>> data_log.txt",shell=True)
estrain@7	1162 subprocess.check_call("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam+ " 2>> data_log.txt",shell=True)
estrain@7	1163 else:
estrain@7	1164 subprocess.check_call("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai+ " 2>> data_log.txt",shell=True)
estrain@7	1165 subprocess.check_call("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam)
estrain@7	1166 subprocess.check_call("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True)
estrain@7	1167 ### check the version of samtools then use differnt commands
estrain@7	1168 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
estrain@7	1169 out, err = samtools_version.communicate()
estrain@7	1170 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
estrain@7	1171 print("check samtools version:",version)
estrain@7	1172 ### end of samtools version check and its analysis
estrain@7	1173 if LooseVersion(version)<=LooseVersion("1.2"):
estrain@7	1174 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True)
estrain@7	1175 else:
estrain@7	1176 subprocess.check_call("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True)
estrain@7	1177
estrain@7	1178 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
estrain@7	1179 #seqsero2 -a; extract, assembly and blast
estrain@7	1180 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True)
estrain@7	1181 #print("fnameA:",fnameA)
estrain@7	1182 #print("fnameB:",fnameB)
estrain@7	1183 if fnameB!="":
estrain@7	1184 subprocess.check_call("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 + " 2>> data_log.txt",shell=True)#2> /dev/null if want no output
estrain@7	1185 else:
estrain@7	1186 pass
estrain@7	1187 outdir=current_time+"_temp"
estrain@7	1188 print("assembling...")
estrain@7	1189 if int(threads)>4:
estrain@7	1190 t="4"
estrain@7	1191 else:
estrain@7	1192 t=threads
estrain@7	1193 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
estrain@7	1194 if fnameB!="":
estrain@7	1195 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
estrain@7	1196 else:
estrain@7	1197 subprocess.check_call("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir+ " >> data_log.txt 2>&1",shell=True)
estrain@7	1198 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
estrain@7	1199 subprocess.check_call("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True)
estrain@7	1200 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
estrain@7	1201 subprocess.check_call("rm -rf "+outdir+ " 2> /dev/null",shell=True)
estrain@7	1202 print("blasting...","\n")
estrain@7	1203 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
estrain@7	1204 subprocess.check_call('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl >> data_log.txt 2>&1',shell=True) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
estrain@7	1205 subprocess.check_call("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5 >> data_log.txt 2>&1",shell=True)###1/27/2015; 08272018, remove "-word_size 10"
estrain@7	1206 else:
estrain@7	1207 xmlfile="NA"
estrain@7	1208 return xmlfile,new_fasta
estrain@7	1209
estrain@7	1210 def judge_subspecies(fnameA,dirpath):
estrain@7	1211 #seqsero2 -a; judge subspecies on just forward raw reads fastq
estrain@7	1212 salmID_output=subprocess.Popen(dirpath+"/SalmID.py -i "+fnameA,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
estrain@7	1213 out, err = salmID_output.communicate()
estrain@7	1214 out=out.decode("utf-8")
estrain@7	1215 file=open("data_log.txt","a")
estrain@7	1216 file.write(out)
estrain@7	1217 file.close()
estrain@7	1218 salm_species_scores=out.split("\n")[1].split("\t")[6:]
estrain@7	1219 salm_species_results=out.split("\n")[0].split("\t")[6:]
estrain@7	1220 max_score=0
estrain@7	1221 max_score_index=1 #default is 1, means "I"
estrain@7	1222 for i in range(len(salm_species_scores)):
estrain@7	1223 if max_score<float(salm_species_scores[i]):
estrain@7	1224 max_score=float(salm_species_scores[i])
estrain@7	1225 max_score_index=i
estrain@7	1226 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
estrain@7	1227 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
estrain@7	1228 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
estrain@7	1229 if max_score<10:
estrain@7	1230 prediction="-"
estrain@7	1231 return prediction
estrain@7	1232
estrain@7	1233 def judge_subspecies_Kmer(Special_dict):
estrain@7	1234 #seqsero2 -k;
estrain@7	1235 max_score=0
estrain@7	1236 prediction="-" #default should be I
estrain@7	1237 for x in Special_dict:
estrain@7	1238 if "mer" in x:
estrain@7	1239 if max_score<float(Special_dict[x]):
estrain@7	1240 max_score=float(Special_dict[x])
estrain@7	1241 prediction=x.split("_")[-1].strip()
estrain@7	1242 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
estrain@7	1243 prediction="bongori"
estrain@7	1244 break
estrain@7	1245 return prediction
estrain@7	1246
estrain@7	1247 def main():
estrain@7	1248 #combine SeqSeroK and SeqSero2, also with SalmID
estrain@7	1249 args = parse_args()
estrain@7	1250 input_file = args.i
estrain@7	1251 data_type = args.t
estrain@7	1252 analysis_mode = args.m
estrain@7	1253 mapping_mode=args.b
estrain@7	1254 threads=args.p
estrain@7	1255 make_dir=args.d
estrain@7	1256 clean_mode=args.c
estrain@7	1257 k_size=27 #will change for bug fixing
estrain@7	1258 database="H_and_O_and_specific_genes.fasta"
estrain@7	1259 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
estrain@7	1260 if len(sys.argv)==1:
estrain@7	1261 subprocess.check_call(dirpath+"/SeqSero2_package.py -h",shell=True)#change name of python file
estrain@7	1262 else:
estrain@7	1263 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
estrain@7	1264 request_id += str(random.randint(1, 10000000))
estrain@7	1265 if make_dir is None:
estrain@7	1266 make_dir="SeqSero_result_"+request_id
estrain@7	1267 if os.path.isdir(make_dir):
estrain@7	1268 pass
estrain@7	1269 else:
estrain@7	1270 subprocess.check_call(["mkdir",make_dir])
estrain@7	1271 #subprocess.check_call("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
estrain@7	1272 #subprocess.check_call("ln -sr "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True)
estrain@7	1273 subprocess.check_call("ln -f -s "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True) ### ed_SL_05282019: use -f option to force the replacement of links, remove -r and use absolute path instead to avoid link issue (use 'type=os.path.abspath' in -i argument).
estrain@7	1274 ############################begin the real analysis
estrain@7	1275 if analysis_mode=="a":
estrain@7	1276 if data_type in ["1","2","3"]:#use allele mode
estrain@7	1277 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
estrain@7	1278 os.chdir(make_dir)
estrain@7	1279 ###add a function to tell input files
estrain@7	1280 fnameA=for_fq.split("/")[-1]
estrain@7	1281 fnameB=rev_fq.split("/")[-1]
estrain@7	1282 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
estrain@7	1283 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
estrain@7	1284 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
estrain@7	1285 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
estrain@7	1286 if xmlfile=="NA":
estrain@7	1287 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
estrain@7	1288 else:
estrain@7	1289 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
estrain@7	1290 file=open("data_log.txt","a")
estrain@7	1291 for x in Final_list:
estrain@7	1292 file.write("\t".join(str(y) for y in x)+"\n")
estrain@7	1293 file.close()
estrain@7	1294 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
estrain@7	1295 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
estrain@7	1296 subspecies=judge_subspecies(fnameA,dirpath) #predict subspecies
estrain@7	1297 ###output
estrain@7	1298 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
estrain@7	1299 claim="" #04132019, disable claim for new report requirement
estrain@7	1300 contamination_report=""
estrain@7	1301 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
estrain@7	1302 if contamination_O!="" and contamination_H=="":
estrain@7	1303 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
estrain@7	1304 elif contamination_O=="" and contamination_H!="":
estrain@7	1305 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
estrain@7	1306 elif contamination_O!="" and contamination_H!="":
estrain@7	1307 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
estrain@7	1308 if contamination_report!="":
estrain@7	1309 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
estrain@7	1310 contamination_report=" Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
estrain@7	1311 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
estrain@7	1312 if contamination_report+star_line+claim=="": #0413, new output style
estrain@7	1313 note=""
estrain@7	1314 else:
estrain@7	1315 note="Note:"
estrain@7	1316 if clean_mode:
estrain@7	1317 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
estrain@7	1318 make_dir="none-output-directory due to '-c' flag"
estrain@7	1319 else:
estrain@7	1320 new_file=open("SeqSero_result.txt","w")
estrain@7	1321 if O_choice=="":
estrain@7	1322 O_choice="-"
estrain@7	1323 if "N/A" not in predict_sero:
estrain@7	1324 new_file.write("Output_directory:\t"+make_dir+"\n"+
estrain@7	1325 "Input files:\t"+"\t".join(input_file)+"\n"+
estrain@7	1326 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1327 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
estrain@7	1328 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
estrain@7	1329 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1330 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1331 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
estrain@7	1332 note+contamination_report+star_line+claim+"\n")#+##
estrain@7	1333 else:
estrain@7	1334 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
estrain@7	1335 star_line="" #04132019, for new output requirement, diable star_line if "NA" in output
estrain@7	1336 new_file.write("Output_directory:\t"+make_dir+"\n"+
estrain@7	1337 "Input files:\t"+"\t".join(input_file)+"\n"+
estrain@7	1338 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1339 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
estrain@7	1340 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
estrain@7	1341 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1342 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1343 note+contamination_report+star_line+claim+"\n")#+##
estrain@7	1344 new_file.close()
estrain@7	1345 print("\n")
estrain@7	1346 #subprocess.check_call("cat Seqsero_result.txt",shell=True)
estrain@7	1347 #subprocess.call("rm H_and_O_and_specific_genes.fasta* .sra .bam .sam .fastq .gz .fq temp.txt .xml "+fnameA+"_db* 2> /dev/null",shell=True)
estrain@7	1348 subprocess.call("rm H_and_O_and_specific_genes.fasta* .sra .bam .sam .fastq .gz .fq temp.txt "+fnameA+"_db 2> /dev/null",shell=True)
estrain@7	1349 if "N/A" not in predict_sero:
estrain@7	1350 #print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\nNote:"+contamination_report+star+star_line+claim+"\n")#+##
estrain@7	1351 print("Output_directory:\t"+make_dir+"\n"+
estrain@7	1352 "Input files:\t"+"\t".join(input_file)+"\n"+
estrain@7	1353 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1354 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
estrain@7	1355 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
estrain@7	1356 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1357 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1358 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
estrain@7	1359 note+contamination_report+star_line+claim+"\n")#+##
estrain@7	1360 else:
estrain@7	1361 print("Output_directory:\t"+make_dir+"\n"+
estrain@7	1362 "Input files:\t"+"\t".join(input_file)+"\n"+
estrain@7	1363 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1364 "H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+
estrain@7	1365 "H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+
estrain@7	1366 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1367 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1368 note+contamination_report+star_line+claim+"\n")
estrain@7	1369 else:
estrain@7	1370 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
estrain@7	1371 elif analysis_mode=="k":
estrain@7	1372 ex_dir = os.path.dirname(os.path.realpath(__file__))
estrain@7	1373 #output_mode = args.mode
estrain@7	1374 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
estrain@7	1375 input_file = for_fq #-k will just use forward because not all reads were used
estrain@7	1376 os.chdir(make_dir)
estrain@7	1377 f = open(ex_dir + '/antigens.pickle', 'rb')
estrain@7	1378 lib_dict = pickle.load(f)
estrain@7	1379 f.close
estrain@7	1380 input_Ks=get_input_K(input_file,lib_dict,data_type,k_size)
estrain@7	1381 O_dict,H_dict,Special_dict=get_kmer_dict(lib_dict,input_Ks)
estrain@7	1382 highest_O,highest_fliC,highest_fljB=call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
estrain@7	1383 subspecies=judge_subspecies_Kmer(Special_dict)
estrain@7	1384 if subspecies=="IIb" or subspecies=="IIa":
estrain@7	1385 subspecies="II"
estrain@7	1386 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
estrain@7	1387 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
estrain@7	1388 claim="" #no claim any more based on new output requirement
estrain@7	1389 if star_line+claim=="": #0413, new output style
estrain@7	1390 note=""
estrain@7	1391 else:
estrain@7	1392 note="Note:"
estrain@7	1393 if clean_mode:
estrain@7	1394 subprocess.check_call("rm -rf ../"+make_dir,shell=True)
estrain@7	1395 make_dir="none-output-directory due to '-c' flag"
estrain@7	1396 ### ed_SL_05282019, fix the assignment issue of variable 'O_choice' using "-m k -c"
estrain@7	1397 if highest_O.split('-')[-1]=="":
estrain@7	1398 O_choice="-"
estrain@7	1399 else:
estrain@7	1400 O_choice=highest_O.split('-')[-1]
estrain@7	1401 ###
estrain@7	1402 else:
estrain@7	1403 if highest_O.split('-')[-1]=="":
estrain@7	1404 O_choice="-"
estrain@7	1405 else:
estrain@7	1406 O_choice=highest_O.split('-')[-1]
estrain@7	1407 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
estrain@7	1408 new_file=open("SeqSero_result.txt","w")
estrain@7	1409 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
estrain@7	1410 if "N/A" not in predict_sero:
estrain@7	1411 new_file.write("Output_directory:\t"+make_dir+"\n"+
estrain@7	1412 "Input files:\t"+input_file+"\n"+
estrain@7	1413 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1414 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
estrain@7	1415 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
estrain@7	1416 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1417 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1418 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
estrain@7	1419 note+star_line+claim+"\n")#+##
estrain@7	1420 else:
estrain@7	1421 #star_line=star_line.strip()+"\tNone such antigenic formula in KW.\n"
estrain@7	1422 star_line = "" #changed for new output requirement, 04132019
estrain@7	1423 new_file.write("Output_directory:\t"+make_dir+"\n"+
estrain@7	1424 "Input files:\t"+input_file+"\n"+
estrain@7	1425 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1426 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
estrain@7	1427 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
estrain@7	1428 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1429 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1430 note+star_line+claim+"\n")#+##
estrain@7	1431 new_file.close()
estrain@7	1432 subprocess.call("rm .fasta .fastq .gz .fq temp.txt .sra 2> /dev/null",shell=True)
estrain@7	1433 if "N/A" not in predict_sero:
estrain@7	1434 print("Output_directory:\t"+make_dir+"\n"+
estrain@7	1435 "Input files:\t"+input_file+"\n"+
estrain@7	1436 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1437 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
estrain@7	1438 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
estrain@7	1439 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1440 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1441 "Predicted serotype:\t"+predict_sero+"\n"+ # ed_SL_04152019: change serotype(s) to serotype
estrain@7	1442 note+star_line+claim+"\n")#+##
estrain@7	1443 else:
estrain@7	1444 print("Output_directory:\t"+make_dir+"\n"+
estrain@7	1445 "Input files:\t"+input_file+"\n"+
estrain@7	1446 "O antigen prediction:\t"+O_choice+"\n"+
estrain@7	1447 "H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+
estrain@7	1448 "H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+
estrain@7	1449 "Predicted subspecies:\t"+subspecies+"\n"+
estrain@7	1450 "Predicted antigenic profile:\t"+predict_form+"\n"+
estrain@7	1451 note+star_line+claim+"\n")#+##
estrain@7	1452
estrain@7	1453 if __name__ == '__main__':
estrain@7	1454 main()

Mercurial > repos > estrain > seqsero2_v1_0_1

annotate SeqSero2_package.py @ 7:a3accb97716a