seqsero_v2: SeqSero2/SeqSero2_package.py annotate

annotate SeqSero2/SeqSero2_package.py @ 3:b18e8cdfdf81

planemo upload

author	jpayne
date	Tue, 13 Nov 2018 16:18:17 -0500
parents	fae43708974d
children	b82e5f3c9187

rev	line source
jpayne@1	1 #!/usr/bin/env python3
jpayne@1	2
jpayne@1	3 import sys
jpayne@1	4 import time
jpayne@1	5 import random
jpayne@1	6 import os
jpayne@1	7 import subprocess
jpayne@1	8 import gzip
jpayne@1	9 import io
jpayne@1	10 import pickle
jpayne@1	11 import argparse
jpayne@1	12 import itertools
jpayne@1	13 from distutils.version import LooseVersion
jpayne@1	14
jpayne@1	15 from collections import OrderedDict
jpayne@1	16 from csv import DictWriter
jpayne@1	17
jpayne@1	18 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
jpayne@1	19
jpayne@1	20 ### SeqSero Kmer
jpayne@1	21 def parse_args():
jpayne@1	22 "Parse the input arguments, use '-h' for help."
jpayne@1	23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future
jpayne@1	24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
jpayne@1	25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
jpayne@1	26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode for mapping, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
jpayne@1	27 parser.add_argument("-p",default="1",help="<int>: threads used for mapping mode, if p>4, only 4 threads will be used for assembly, default=1")
jpayne@1	28 parser.add_argument("-m",choices=['k','a'],default="k",help="<string>: 'k'(kmer mode), 'a'(allele mode), default=k")
jpayne@1	29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
jpayne@1	30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
jpayne@1	31 return parser.parse_args()
jpayne@1	32
jpayne@1	33 def reverse_complement(sequence):
jpayne@1	34 complement = {
jpayne@1	35 'A': 'T',
jpayne@1	36 'C': 'G',
jpayne@1	37 'G': 'C',
jpayne@1	38 'T': 'A',
jpayne@1	39 'N': 'N',
jpayne@1	40 'M': 'K',
jpayne@1	41 'R': 'Y',
jpayne@1	42 'W': 'W',
jpayne@1	43 'S': 'S',
jpayne@1	44 'Y': 'R',
jpayne@1	45 'K': 'M',
jpayne@1	46 'V': 'B',
jpayne@1	47 'H': 'D',
jpayne@1	48 'D': 'H',
jpayne@1	49 'B': 'V'
jpayne@1	50 }
jpayne@1	51 return "".join(complement[base] for base in reversed(sequence))
jpayne@1	52
jpayne@1	53
jpayne@1	54 def createKmerDict_reads(list_of_strings, kmer):
jpayne@1	55 kmer_table = {}
jpayne@1	56 for string in list_of_strings:
jpayne@1	57 sequence = string.strip('\n')
jpayne@1	58 for i in range(len(sequence) - kmer + 1):
jpayne@1	59 new_mer = sequence[i:i + kmer].upper()
jpayne@1	60 new_mer_rc = reverse_complement(new_mer)
jpayne@1	61 if new_mer in kmer_table:
jpayne@1	62 kmer_table[new_mer.upper()] += 1
jpayne@1	63 else:
jpayne@1	64 kmer_table[new_mer.upper()] = 1
jpayne@1	65 if new_mer_rc in kmer_table:
jpayne@1	66 kmer_table[new_mer_rc.upper()] += 1
jpayne@1	67 else:
jpayne@1	68 kmer_table[new_mer_rc.upper()] = 1
jpayne@1	69 return kmer_table
jpayne@1	70
jpayne@1	71
jpayne@1	72 def multifasta_dict(multifasta):
jpayne@1	73 multifasta_list = [
jpayne@1	74 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
jpayne@1	75 ]
jpayne@1	76 headers = [i for i in multifasta_list if i[0] == '>']
jpayne@1	77 multifasta_dict = {}
jpayne@1	78 for h in headers:
jpayne@1	79 start = multifasta_list.index(h)
jpayne@1	80 for element in multifasta_list[start + 1:]:
jpayne@1	81 if element[0] == '>':
jpayne@1	82 break
jpayne@1	83 else:
jpayne@1	84 if h[1:] in multifasta_dict:
jpayne@1	85 multifasta_dict[h[1:]] += element
jpayne@1	86 else:
jpayne@1	87 multifasta_dict[h[1:]] = element
jpayne@1	88 return multifasta_dict
jpayne@1	89
jpayne@1	90
jpayne@1	91 def multifasta_single_string(multifasta):
jpayne@1	92 multifasta_list = [
jpayne@1	93 line.strip() for line in open(multifasta, 'r')
jpayne@1	94 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
jpayne@1	95 ]
jpayne@1	96 return ''.join(multifasta_list)
jpayne@1	97
jpayne@1	98
jpayne@1	99 def chunk_a_long_sequence(long_sequence, chunk_size=60):
jpayne@1	100 chunk_list = []
jpayne@1	101 steps = len(long_sequence) // 60 #how many chunks
jpayne@1	102 for i in range(steps):
jpayne@1	103 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
jpayne@1	104 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
jpayne@1	105 return chunk_list
jpayne@1	106
jpayne@1	107
jpayne@1	108 def target_multifasta_kmerizer(multifasta, k, kmerDict):
jpayne@1	109 forward_length = 300 #if find the target, put forward 300 bases
jpayne@1	110 reverse_length = 2200 #if find the target, put backward 2200 bases
jpayne@1	111 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
jpayne@1	112 target_mers = []
jpayne@1	113 long_single_string = multifasta_single_string(multifasta)
jpayne@1	114 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
jpayne@1	115 unit_length = len(multifasta_list[0])
jpayne@1	116 forward_lines = int(forward_length / unit_length) + 1
jpayne@1	117 reverse_lines = int(forward_length / unit_length) + 1
jpayne@1	118 start_num = 0
jpayne@1	119 end_num = 0
jpayne@1	120 for i in range(len(multifasta_list)):
jpayne@1	121 if i not in range(start_num, end_num): #avoid computational repetition
jpayne@1	122 line = multifasta_list[i]
jpayne@1	123 start = int((len(line) - k) // 2)
jpayne@1	124 s1 = line[start:k + start]
jpayne@1	125 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
jpayne@1	126 if i - forward_lines >= 0:
jpayne@1	127 start_num = i - forward_lines
jpayne@1	128 else:
jpayne@1	129 start_num = 0
jpayne@1	130 if i + reverse_lines <= len(multifasta_list) - 1:
jpayne@1	131 end_num = i + reverse_lines
jpayne@1	132 else:
jpayne@1	133 end_num = len(multifasta_list) - 1
jpayne@1	134 target_list = [
jpayne@1	135 x.strip() for x in multifasta_list[start_num:end_num]
jpayne@1	136 ]
jpayne@1	137 target_line = "".join(target_list)
jpayne@1	138 target_mers += [
jpayne@1	139 k1 for k1 in createKmerDict_reads([str(target_line)], k)
jpayne@1	140 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
jpayne@1	141 else:
jpayne@1	142 pass
jpayne@1	143 return set(target_mers)
jpayne@1	144
jpayne@1	145
jpayne@1	146 def target_read_kmerizer(file, k, kmerDict):
jpayne@1	147 i = 1
jpayne@1	148 n_reads = 0
jpayne@1	149 total_coverage = 0
jpayne@1	150 target_mers = []
jpayne@1	151 if file.endswith(".gz"):
jpayne@1	152 file_content = io.BufferedReader(gzip.open(file))
jpayne@1	153 else:
jpayne@1	154 file_content = open(file, "r").readlines()
jpayne@1	155 for line in file_content:
jpayne@1	156 start = int((len(line) - k) // 2)
jpayne@1	157 if i % 4 == 2:
jpayne@1	158 if file.endswith(".gz"):
jpayne@1	159 s1 = line[start:k + start].decode()
jpayne@1	160 line = line.decode()
jpayne@1	161 else:
jpayne@1	162 s1 = line[start:k + start]
jpayne@1	163 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
jpayne@1	164 n_reads += 1
jpayne@1	165 total_coverage += len(line)
jpayne@1	166 target_mers += [
jpayne@1	167 k1 for k1 in createKmerDict_reads([str(line)], k)
jpayne@1	168 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
jpayne@1	169 i += 1
jpayne@1	170 if total_coverage >= 4000000:
jpayne@1	171 break
jpayne@1	172 return set(target_mers)
jpayne@1	173
jpayne@1	174
jpayne@1	175 def minion_fasta_kmerizer(file, k, kmerDict):
jpayne@1	176 i = 1
jpayne@1	177 n_reads = 0
jpayne@1	178 total_coverage = 0
jpayne@1	179 target_mers = {}
jpayne@1	180 for line in open(file):
jpayne@1	181 if i % 2 == 0:
jpayne@1	182 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1	183 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1	184 if kmer in target_mers:
jpayne@1	185 target_mers[kmer] += 1
jpayne@1	186 else:
jpayne@1	187 target_mers[kmer] = 1
jpayne@1	188 if rc_kmer in target_mers:
jpayne@1	189 target_mers[rc_kmer] += 1
jpayne@1	190 else:
jpayne@1	191 target_mers[rc_kmer] = 1
jpayne@1	192 i += 1
jpayne@1	193 return set([h for h in target_mers])
jpayne@1	194
jpayne@1	195
jpayne@1	196 def minion_fastq_kmerizer(file, k, kmerDict):
jpayne@1	197 i = 1
jpayne@1	198 n_reads = 0
jpayne@1	199 total_coverage = 0
jpayne@1	200 target_mers = {}
jpayne@1	201 for line in open(file):
jpayne@1	202 if i % 4 == 2:
jpayne@1	203 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1	204 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1	205 if kmer in target_mers:
jpayne@1	206 target_mers[kmer] += 1
jpayne@1	207 else:
jpayne@1	208 target_mers[kmer] = 1
jpayne@1	209 if rc_kmer in target_mers:
jpayne@1	210 target_mers[rc_kmer] += 1
jpayne@1	211 else:
jpayne@1	212 target_mers[rc_kmer] = 1
jpayne@1	213 i += 1
jpayne@1	214 return set([h for h in target_mers])
jpayne@1	215
jpayne@1	216
jpayne@1	217 def multifasta_single_string2(multifasta):
jpayne@1	218 single_string = ''
jpayne@1	219 with open(multifasta, 'r') as f:
jpayne@1	220 for line in f:
jpayne@1	221 if line.strip()[0] == '>':
jpayne@1	222 pass
jpayne@1	223 else:
jpayne@1	224 single_string += line.strip()
jpayne@1	225 return single_string
jpayne@1	226
jpayne@1	227
jpayne@1	228 def kmers(seq, k):
jpayne@1	229 rev_comp = reverse_complement(seq)
jpayne@1	230 for start in range(1, len(seq) - k + 1):
jpayne@1	231 yield seq[start:start + k], rev_comp[-(start + k):-start]
jpayne@1	232
jpayne@1	233
jpayne@1	234 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
jpayne@1	235 multi_seq_dict = multifasta_dict(multifasta)
jpayne@1	236 lib_dict = {}
jpayne@1	237 for h in multi_seq_dict:
jpayne@1	238 lib_dict[h] = set(
jpayne@1	239 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
jpayne@1	240 return lib_dict
jpayne@1	241
jpayne@1	242
jpayne@1	243 def Combine(b, c):
jpayne@1	244 fliC_combinations = []
jpayne@1	245 fliC_combinations.append(",".join(c))
jpayne@1	246 temp_combinations = []
jpayne@1	247 for i in range(len(b)):
jpayne@1	248 for x in itertools.combinations(b, i + 1):
jpayne@1	249 temp_combinations.append(",".join(x))
jpayne@1	250 for x in temp_combinations:
jpayne@1	251 temp = []
jpayne@1	252 for y in c:
jpayne@1	253 temp.append(y)
jpayne@1	254 temp.append(x)
jpayne@1	255 temp = ",".join(temp)
jpayne@1	256 temp = temp.split(",")
jpayne@1	257 temp.sort()
jpayne@1	258 temp = ",".join(temp)
jpayne@1	259 fliC_combinations.append(temp)
jpayne@1	260 return fliC_combinations
jpayne@1	261
jpayne@1	262
jpayne@1	263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
jpayne@1	264 #like test_output_06012017.txt
jpayne@1	265 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
jpayne@1	266 from Initial_Conditions import phase1
jpayne@1	267 from Initial_Conditions import phase2
jpayne@1	268 from Initial_Conditions import phaseO
jpayne@1	269 from Initial_Conditions import sero
jpayne@1	270 from Initial_Conditions import subs
jpayne@1	271 seronames = []
jpayne@1	272 seronames_none_subspecies=[]
jpayne@1	273 for i in range(len(phase1)):
jpayne@1	274 fliC_combine = []
jpayne@1	275 fljB_combine = []
jpayne@1	276 if phaseO[i] == Otype: # no VII in KW, but it's there
jpayne@1	277 ### for fliC, detect every possible combinations to avoid the effect of "["
jpayne@1	278 if phase1[i].count("[") == 0:
jpayne@1	279 fliC_combine.append(phase1[i])
jpayne@1	280 elif phase1[i].count("[") >= 1:
jpayne@1	281 c = []
jpayne@1	282 b = []
jpayne@1	283 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
jpayne@1	284 "[") == 1:
jpayne@1	285 content = phase1[i].replace("[", "").replace("]", "")
jpayne@1	286 fliC_combine.append(content)
jpayne@1	287 fliC_combine.append("-")
jpayne@1	288 else:
jpayne@1	289 for x in phase1[i].split(","):
jpayne@1	290 if "[" in x:
jpayne@1	291 b.append(x.replace("[", "").replace("]", ""))
jpayne@1	292 else:
jpayne@1	293 c.append(x)
jpayne@1	294 fliC_combine = Combine(
jpayne@1	295 b, c
jpayne@1	296 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
jpayne@1	297 ### end of fliC "[" detect
jpayne@1	298 ### for fljB, detect every possible combinations to avoid the effect of "["
jpayne@1	299 if phase2[i].count("[") == 0:
jpayne@1	300 fljB_combine.append(phase2[i])
jpayne@1	301 elif phase2[i].count("[") >= 1:
jpayne@1	302 d = []
jpayne@1	303 e = []
jpayne@1	304 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
jpayne@1	305 "[") == 1:
jpayne@1	306 content = phase2[i].replace("[", "").replace("]", "")
jpayne@1	307 fljB_combine.append(content)
jpayne@1	308 fljB_combine.append("-")
jpayne@1	309 else:
jpayne@1	310 for x in phase2[i].split(","):
jpayne@1	311 if "[" in x:
jpayne@1	312 d.append(x.replace("[", "").replace("]", ""))
jpayne@1	313 else:
jpayne@1	314 e.append(x)
jpayne@1	315 fljB_combine = Combine(d, e)
jpayne@1	316 ### end of fljB "[" detect
jpayne@1	317 new_fliC = fliC.split(
jpayne@1	318 ","
jpayne@1	319 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
jpayne@1	320 new_fliC.sort()
jpayne@1	321 new_fliC = ",".join(new_fliC)
jpayne@1	322 new_fljB = fljB.split(",")
jpayne@1	323 new_fljB.sort()
jpayne@1	324 new_fljB = ",".join(new_fljB)
jpayne@1	325 if (new_fliC in fliC_combine
jpayne@1	326 or fliC in fliC_combine) and (new_fljB in fljB_combine
jpayne@1	327 or fljB in fljB_combine):
jpayne@1	328 if subs[i] == subspecies:
jpayne@1	329 seronames.append(sero[i])
jpayne@1	330 seronames_none_subspecies.append(sero[i])
jpayne@1	331 #analyze seronames
jpayne@1	332 subspecies_pointer=""
jpayne@1	333 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
jpayne@1	334 seronames=seronames_none_subspecies
jpayne@1	335 subspecies_pointer="1"
jpayne@1	336 if len(seronames) == 0:
jpayne@1	337 seronames = [
jpayne@1	338 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
jpayne@1	339 ]
jpayne@1	340 star = ""
jpayne@1	341 star_line = ""
jpayne@1	342 if len(seronames) > 1: #there are two possible predictions for serotypes
jpayne@1	343 star = "*"
jpayne@1	344 star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
jpayne@1	345 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
jpayne@1	346 star="*"
jpayne@1	347 star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
jpayne@1	348 if Otype=="":
jpayne@1	349 Otype="-"
jpayne@1	350 predict_form = Otype + ":" + fliC + ":" + fljB
jpayne@1	351 predict_sero = (" or ").join(seronames)
jpayne@1	352 ###special test for Enteritidis
jpayne@1	353 if predict_form == "9:g,m:-":
jpayne@1	354 sdf = "-"
jpayne@1	355 for x in special_gene_list:
jpayne@1	356 if x.startswith("sdf"):
jpayne@1	357 sdf = "+"
jpayne@1	358 predict_form = predict_form + " Sdf prediction:" + sdf
jpayne@1	359 if sdf == "-":
jpayne@1	360 star = "*"
jpayne@1	361 star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
jpayne@1	362 predict_sero = "Gallinarum/Enteritidis sdf -"
jpayne@1	363 ###end of special test for Enteritidis
jpayne@1	364 elif predict_form == "4:i:-":
jpayne@1	365 predict_sero = "potential monophasic variant of Typhimurium"
jpayne@1	366 elif predict_form == "4:r:-":
jpayne@1	367 predict_sero = "potential monophasic variant of Heidelberg"
jpayne@1	368 elif predict_form == "4:b:-":
jpayne@1	369 predict_sero = "potential monophasic variant of Paratyphi B"
jpayne@1	370 elif predict_form == "8:e,h:1,2":
jpayne@1	371 predict_sero = "Newport"
jpayne@1	372 star = "*"
jpayne@1	373 star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
jpayne@1	374 claim = "The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
jpayne@1	375 if "N/A" in predict_sero:
jpayne@1	376 claim = ""
jpayne@1	377 #special test for Typhimurium
jpayne@1	378 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
jpayne@1	379 normal = 0
jpayne@1	380 mutation = 0
jpayne@1	381 for x in special_gene_list:
jpayne@1	382 if "oafA-O-4_full" in x:
jpayne@1	383 normal = float(special_gene_list[x])
jpayne@1	384 elif "oafA-O-4_5-" in x:
jpayne@1	385 mutation = float(special_gene_list[x])
jpayne@1	386 if normal > mutation:
jpayne@1	387 pass
jpayne@1	388 elif normal < mutation:
jpayne@1	389 predict_sero = predict_sero.strip() + "(O5-)"
jpayne@1	390 star = "*"
jpayne@1	391 star_line = "Detected the deletion of O5-."
jpayne@1	392 else:
jpayne@1	393 pass
jpayne@1	394 #special test for Paratyphi B
jpayne@1	395 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
jpayne@1	396 normal = 0
jpayne@1	397 mutation = 0
jpayne@1	398 for x in special_gene_list:
jpayne@1	399 if "gntR-family-regulatory-protein_dt-positive" in x:
jpayne@1	400 normal = float(special_gene_list[x])
jpayne@1	401 elif "gntR-family-regulatory-protein_dt-negative" in x:
jpayne@1	402 mutation = float(special_gene_list[x])
jpayne@1	403 #print(normal,mutation)
jpayne@1	404 if normal > mutation:
jpayne@1	405 predict_sero = predict_sero.strip() + "(dt+)"
jpayne@1	406 star = "*"
jpayne@1	407 star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
jpayne@1	408 elif normal < mutation:
jpayne@1	409 predict_sero = predict_sero.strip() + "(dt-)"
jpayne@1	410 star = "*"
jpayne@1	411 star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
jpayne@1	412 else:
jpayne@1	413 star = "*"
jpayne@1	414 star_line = "Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
jpayne@1	415 #special test for O13,22 and O13,23
jpayne@1	416 if Otype=="13":
jpayne@1	417 #ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1	418 f = open(dirpath + '/special.pickle', 'rb')
jpayne@1	419 special = pickle.load(f)
jpayne@1	420 O22_O23=special['O22_O23']
jpayne@1	421 if predict_sero.split(" or ")[0] in O22_O23[-1]:
jpayne@1	422 O22_score=0
jpayne@1	423 O23_score=0
jpayne@1	424 for x in special_gene_list:
jpayne@1	425 if "O:22" in x:
jpayne@1	426 O22_score = O22_score+float(special_gene_list[x])
jpayne@1	427 elif "O:23" in x:
jpayne@1	428 O23_score = O23_score+float(special_gene_list[x])
jpayne@1	429 #print(O22_score,O23_score)
jpayne@1	430 for z in O22_O23[0]:
jpayne@1	431 if predict_sero.split(" or ")[0] in z:
jpayne@1	432 if O22_score > O23_score:
jpayne@1	433 star = "*"
jpayne@1	434 star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'."
jpayne@1	435 predict_sero = z[0]
jpayne@1	436 elif O22_score < O23_score:
jpayne@1	437 star = "*"
jpayne@1	438 star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'."
jpayne@1	439 predict_sero = z[1]
jpayne@1	440 else:
jpayne@1	441 star = "*"
jpayne@1	442 star_line = "Fail to detect O22 and O23 differences."
jpayne@1	443 #special test for O6,8
jpayne@1	444 merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"]
jpayne@1	445 for x in merge_O68_list:
jpayne@1	446 if x in predict_sero:
jpayne@1	447 predict_sero=x
jpayne@1	448 star=""
jpayne@1	449 star_line=""
jpayne@1	450 #special test for Montevideo; most of them are monophasic
jpayne@1	451 if "Montevideo" in predict_sero and "1,2,7" in predict_form:
jpayne@1	452 star="*"
jpayne@1	453 star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
jpayne@1	454 return predict_form, predict_sero, star, star_line, claim
jpayne@1	455 ### End of SeqSero Kmer part
jpayne@1	456
jpayne@1	457 ### Begin of SeqSero2 allele prediction and output
jpayne@1	458 def xml_parse_score_comparision_seqsero(xmlfile):
jpayne@1	459 #used to do seqsero xml analysis
jpayne@1	460 from Bio.Blast import NCBIXML
jpayne@1	461 handle=open(xmlfile)
jpayne@1	462 handle=NCBIXML.parse(handle)
jpayne@1	463 handle=list(handle)
jpayne@1	464 List=[]
jpayne@1	465 List_score=[]
jpayne@1	466 List_ids=[]
jpayne@1	467 List_query_region=[]
jpayne@1	468 for i in range(len(handle)):
jpayne@1	469 if len(handle[i].alignments)>0:
jpayne@1	470 for j in range(len(handle[i].alignments)):
jpayne@1	471 score=0
jpayne@1	472 ids=0
jpayne@1	473 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
jpayne@1	474 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
jpayne@1	475 for z in range(len(handle[i].alignments[j].hsps)):
jpayne@1	476 hsp=handle[i].alignments[j].hsps[z]
jpayne@1	477 temp=set(range(hsp.query_start,hsp.query_end))
jpayne@1	478 if len(cover_region)==0:
jpayne@1	479 cover_region=cover_region\|temp
jpayne@1	480 fraction=1
jpayne@1	481 else:
jpayne@1	482 fraction=1-len(cover_region&temp)/float(len(temp))
jpayne@1	483 cover_region=cover_region\|temp
jpayne@1	484 if "last" in handle[i].query or "first" in handle[i].query:
jpayne@1	485 score+=hsp.bits*fraction
jpayne@1	486 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1	487 else:
jpayne@1	488 score+=hsp.bits*fraction
jpayne@1	489 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1	490 List_score.append(score)
jpayne@1	491 List_ids.append(ids)
jpayne@1	492 List_query_region.append(cover_region)
jpayne@1	493 temp=zip(List,List_score,List_ids,List_query_region)
jpayne@1	494 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
jpayne@1	495 return Final_list
jpayne@1	496
jpayne@1	497
jpayne@1	498 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
jpayne@1	499 Old=L
jpayne@1	500 L.sort()
jpayne@1	501 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
jpayne@1	502 count=[]
jpayne@1	503 for j in range(len(L)):
jpayne@1	504 y=0
jpayne@1	505 for x in Old:
jpayne@1	506 if L[j]==x:
jpayne@1	507 y+=1
jpayne@1	508 count.append(y)
jpayne@1	509 if sort_on_fre!="none":
jpayne@1	510 d=zip(*sorted(zip(count, L)))
jpayne@1	511 L=d[1]
jpayne@1	512 count=d[0]
jpayne@1	513 return (L,count)
jpayne@1	514
jpayne@1	515 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
jpayne@1	516 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
jpayne@1	517 #this is mainly used for
jpayne@1	518 a=nodes_vs_score_list
jpayne@1	519 fliC_score=0
jpayne@1	520 fljB_score=0
jpayne@1	521 for z in a:
jpayne@1	522 if "fliC" in z[0]:
jpayne@1	523 fliC_score+=z[1]
jpayne@1	524 elif "fljB" in z[0]:
jpayne@1	525 fljB_score+=z[1]
jpayne@1	526 if fliC_score>=fljB_score:
jpayne@1	527 role="fliC"
jpayne@1	528 else:
jpayne@1	529 role="fljB"
jpayne@1	530 return (role,abs(fliC_score-fljB_score))
jpayne@1	531
jpayne@1	532 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
jpayne@1	533 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
jpayne@1	534 #also used when no head or tail got blasted score for the contig
jpayne@1	535 role=""
jpayne@1	536 for z in Final_list_passed:
jpayne@1	537 if node_name in z[0]:
jpayne@1	538 role=z[0].split("_")[0]
jpayne@1	539 break
jpayne@1	540 return role
jpayne@1	541
jpayne@1	542 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
jpayne@1	543 #nodes_list is the c created by c,d=Uniq(nodes) in below function
jpayne@1	544 first_target=""
jpayne@1	545 role_list=[]
jpayne@1	546 for x in nodes_list:
jpayne@1	547 a=[]
jpayne@1	548 role=""
jpayne@1	549 for y in tail_head_list:
jpayne@1	550 if x in y[0]:
jpayne@1	551 a.append(y)
jpayne@1	552 if len(a)==4:
jpayne@1	553 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	554 if diff<20:
jpayne@1	555 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	556 elif len(a)==3:
jpayne@1	557 ###however, if the one with highest score is the fewer one, compare their accumulation score
jpayne@1	558 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	559 if diff<20:
jpayne@1	560 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	561 ###end of above score comparison
jpayne@1	562 elif len(a)==2:
jpayne@1	563 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
jpayne@1	564 temp=[]
jpayne@1	565 for z in a:
jpayne@1	566 temp.append(z[0].split("_")[0])
jpayne@1	567 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
jpayne@1	568 if len(m)==1:
jpayne@1	569 pass
jpayne@1	570 else:
jpayne@1	571 pass
jpayne@1	572 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	573 if diff<20:
jpayne@1	574 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	575 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
jpayne@1	576 elif len(a)==1:
jpayne@1	577 #that one
jpayne@1	578 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	579 if diff<20:
jpayne@1	580 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	581 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
jpayne@1	582 else:#a==0
jpayne@1	583 #use Final_list_passed best match
jpayne@1	584 for z in Final_list_passed:
jpayne@1	585 if x in z[0]:
jpayne@1	586 role=z[0].split("_")[0]
jpayne@1	587 break
jpayne@1	588 #print x,role,len(a)
jpayne@1	589 role_list.append((role,x))
jpayne@1	590 if len(role_list)==2:
jpayne@1	591 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
jpayne@1	592 #just use score to do a final test
jpayne@1	593 role_list=[]
jpayne@1	594 for x in nodes_list:
jpayne@1	595 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	596 role_list.append((role,x))
jpayne@1	597 return role_list
jpayne@1	598
jpayne@1	599 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
jpayne@1	600 #used to decide which contig is FliC and which one is fljB
jpayne@1	601 contigs=[]
jpayne@1	602 nodes=[]
jpayne@1	603 for x in Final_list_passed:
jpayne@1	604 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
jpayne@1	605 nodes.append(x[0].split("___")[1].strip())
jpayne@1	606 c,d=Uniq(nodes)#c is node_list
jpayne@1	607 #print c
jpayne@1	608 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
jpayne@1	609 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
jpayne@1	610 return roles
jpayne@1	611
jpayne@1	612 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
jpayne@1	613 #decide O based on Final_list
jpayne@1	614 O_choice="?"
jpayne@1	615 O_list=[]
jpayne@1	616 special_genes={}
jpayne@1	617 nodes=[]
jpayne@1	618 for x in Final_list_passed:
jpayne@1	619 if x[0].startswith("O-"):
jpayne@1	620 nodes.append(x[0].split("___")[1].strip())
jpayne@1	621 elif not x[0].startswith("fl"):
jpayne@1	622 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
jpayne@1	623 #print "special_genes:",special_genes
jpayne@1	624 c,d=Uniq(nodes)
jpayne@1	625 #print "potential O antigen contig",c
jpayne@1	626 final_O=[]
jpayne@1	627 O_nodes_list=[]
jpayne@1	628 for x in c:#c is the list for contigs
jpayne@1	629 temp=0
jpayne@1	630 for y in Final_list_passed:
jpayne@1	631 if x in y[0] and y[0].startswith("O-"):
jpayne@1	632 final_O.append(y)
jpayne@1	633 break
jpayne@1	634 ### O contig has the problem of two genes on same contig, so do additional test
jpayne@1	635 potenial_new_gene=""
jpayne@1	636 for x in final_O:
jpayne@1	637 pointer=0 #for genes merged or not
jpayne@1	638 #not consider O-1,3,19_not_in_3,10, too short compared with others
jpayne@1	639 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 3002 (for secureity can use 400*2) as flank region
jpayne@1	640 pointer=x[0].split("___")[1].strip()#store the contig name
jpayne@1	641 print(pointer)
jpayne@1	642 if pointer!=0:#it has potential merge event
jpayne@1	643 for y in Final_list:
jpayne@1	644 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
jpayne@1	645 potenial_new_gene=y
jpayne@1	646 #print(potenial_new_gene)
jpayne@1	647 break
jpayne@1	648 if potenial_new_gene!="":
jpayne@1	649 print("two differnt genes in same contig, fix it for O antigen")
jpayne@1	650 print(potenial_new_gene[:3])
jpayne@1	651 final_O.append(potenial_new_gene)
jpayne@1	652 ### end of the two genes on same contig test
jpayne@1	653 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
jpayne@1	654 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
jpayne@1	655 #print "$$$No Otype, due to no hit"#may need to be changed
jpayne@1	656 O_choice="-"
jpayne@1	657 else:
jpayne@1	658 highest_O_coverage=max([float(x[0].split("_cov_")[-1]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
jpayne@1	659 O_list=[]
jpayne@1	660 O_list_less_contamination=[]
jpayne@1	661 for x in final_O:
jpayne@1	662 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
jpayne@1	663 O_list.append(x[0].split("__")[0])
jpayne@1	664 O_nodes_list.append(x[0].split("___")[1])
jpayne@1	665 if float(x[0].split("_cov_")[-1])>highest_O_coverage*0.15:
jpayne@1	666 O_list_less_contamination.append(x[0].split("__")[0])
jpayne@1	667 ### special test for O9,46 and O3,10 family
jpayne@1	668 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1	669 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	670 O_choice="O-9,46"
jpayne@1	671 #print "$$$Most possilble Otype: O-9,46"
jpayne@1	672 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
jpayne@1	673 O_choice="O-9,46,27"
jpayne@1	674 #print "$$$Most possilble Otype: O-9,46,27"
jpayne@1	675 else:
jpayne@1	676 O_choice="O-9"#next, detect O9 vs O2?
jpayne@1	677 O2=0
jpayne@1	678 O9=0
jpayne@1	679 for z in special_genes:
jpayne@1	680 if "tyr-O-9" in z:
jpayne@1	681 O9=special_genes[z]
jpayne@1	682 elif "tyr-O-2" in z:
jpayne@1	683 O2=special_genes[z]
jpayne@1	684 if O2>O9:
jpayne@1	685 O_choice="O-2"
jpayne@1	686 elif O2<O9:
jpayne@1	687 pass
jpayne@1	688 else:
jpayne@1	689 pass
jpayne@1	690 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
jpayne@1	691 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	692 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1	693 O_choice="O-3,10"
jpayne@1	694 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
jpayne@1	695 else:
jpayne@1	696 O_choice="O-1,3,19"
jpayne@1	697 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
jpayne@1	698 ### end of special test for O9,46 and O3,10 family
jpayne@1	699 else:
jpayne@1	700 try:
jpayne@1	701 max_score=0
jpayne@1	702 for x in final_O:
jpayne@1	703 if x[2]>=max_score and float(x[0].split("_cov_")[-1])>highest_O_coverage0.15:#use x[2],08172018, the "coverage identity = cover_length identity"; also meet coverage threshold
jpayne@1	704 max_score=x[2]#change from x[-1] to x[2],08172018
jpayne@1	705 O_choice=x[0].split("_")[0]
jpayne@1	706 if O_choice=="O-1,3,19":
jpayne@1	707 O_choice=final_O[1][0].split("_")[0]
jpayne@1	708 #print "$$$Most possilble Otype: ",O_choice
jpayne@1	709 except:
jpayne@1	710 pass
jpayne@1	711 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
jpayne@1	712 #print "O:",O_choice,O_nodes_list
jpayne@1	713 Otypes=[]
jpayne@1	714 for x in O_list:
jpayne@1	715 if x!="O-1,3,19_not_in_3,10":
jpayne@1	716 if "O-9,46_" not in x:
jpayne@1	717 Otypes.append(x.split("_")[0])
jpayne@1	718 else:
jpayne@1	719 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
jpayne@1	720 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
jpayne@1	721 Otypes_uniq,Otypes_fre=Uniq(Otypes)
jpayne@1	722 contamination_O=""
jpayne@1	723 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
jpayne@1	724 if len(Otypes_uniq)>2:
jpayne@1	725 contamination_O="potential contamination from O antigen signals"
jpayne@1	726 else:
jpayne@1	727 if len(Otypes_uniq)>1:
jpayne@1	728 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1	729 contamination_O=""
jpayne@1	730 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1	731 contamination_O=""
jpayne@1	732 else:
jpayne@1	733 contamination_O="potential contamination from O antigen signals"
jpayne@1	734 return O_choice,O_nodes_list,special_genes,final_O,contamination_O
jpayne@1	735 ### End of SeqSero2 allele prediction and output
jpayne@1	736
jpayne@1	737 def get_input_files(make_dir,input_file,data_type,dirpath):
jpayne@1	738 #tell input files from datatype
jpayne@1	739 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
jpayne@1	740 for_fq=""
jpayne@1	741 rev_fq=""
jpayne@1	742 old_dir = os.getcwd()
jpayne@1	743 os.chdir(make_dir)
jpayne@1	744 if data_type=="1":
jpayne@1	745 input_file=input_file[0].split("/")[-1]
jpayne@1	746 if input_file.endswith(".sra"):
jpayne@1	747 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1	748 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1	749 rev_fq=input_file.replace(".sra","_2.fastq")
jpayne@1	750 else:
jpayne@1	751 core_id=input_file.split(".fastq")[0].split(".fq")[0]
jpayne@1	752 for_fq=core_id+"_1.fastq"
jpayne@1	753 rev_fq=core_id+"_2.fastq"
jpayne@1	754 if input_file.endswith(".gz"):
jpayne@1	755 subprocess.check_output("gzip -dc "+input_file+" \| "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1	756 else:
jpayne@1	757 subprocess.check_output("cat "+input_file+" \| "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1	758 elif data_type=="2":
jpayne@1	759 for_fq=input_file[0].split("/")[-1]
jpayne@1	760 rev_fq=input_file[1].split("/")[-1]
jpayne@1	761 elif data_type=="3":
jpayne@1	762 input_file=input_file[0].split("/")[-1]
jpayne@1	763 if input_file.endswith(".sra"):
jpayne@1	764 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1	765 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1	766 else:
jpayne@1	767 for_fq=input_file
jpayne@1	768 elif data_type in ["4","5","6"]:
jpayne@1	769 for_fq=input_file[0].split("/")[-1]
jpayne@1	770 os.chdir(old_dir)
jpayne@1	771 return for_fq,rev_fq
jpayne@1	772
jpayne@1	773 def predict_O_and_H_types(Final_list,Final_list_passed):
jpayne@1	774 #get O and H types from Final_list from blast parsing; allele mode
jpayne@1	775 fliC_choice="-"
jpayne@1	776 fljB_choice="-"
jpayne@1	777 fliC_contig="NA"
jpayne@1	778 fljB_contig="NA"
jpayne@1	779 fliC_region=set([0])
jpayne@1	780 fljB_region=set([0,])
jpayne@1	781 fliC_length=0 #can be changed to coverage in future
jpayne@1	782 fljB_length=0 #can be changed to coverage in future
jpayne@1	783 O_choice="-"#no need to decide O contig for now, should be only one
jpayne@1	784 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
jpayne@1	785 O_choice=O_choice.split("-")[-1].strip()
jpayne@1	786 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
jpayne@1	787 O_choice="-"
jpayne@1	788 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
jpayne@1	789 log_file=open("SeqSero_log.txt","a")
jpayne@1	790 print("O_contigs:")
jpayne@1	791 log_file.write("O_contigs:\n")
jpayne@1	792 for x in O_nodes_roles:
jpayne@1	793 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
jpayne@1	794 print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
jpayne@1	795 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+" "+"blast score: "+str(x[1])+"identity%:"+str(round(x[2]*100,2))+"% "+str(min(x[-1]))+" to "+str(max(x[-1]))+"\n")
jpayne@1	796 if len(H_contig_roles)!=0:
jpayne@1	797 highest_H_coverage=max([float(x[1].split("_cov_")[-1]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
jpayne@1	798 else:
jpayne@1	799 highest_H_coverage=0
jpayne@1	800 for x in H_contig_roles:
jpayne@1	801 #if multiple choices, temporately select the one with longest length for now, will revise in further change
jpayne@1	802 if "fliC" == x[0] and int(x[1].split("_")[3])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
jpayne@1	803 fliC_contig=x[1]
jpayne@1	804 fliC_length=int(x[1].split("_")[3])
jpayne@1	805 elif "fljB" == x[0] and int(x[1].split("_")[3])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1])>highest_H_coverage*0.13:
jpayne@1	806 fljB_contig=x[1]
jpayne@1	807 fljB_length=int(x[1].split("_")[3])
jpayne@1	808 for x in Final_list_passed:
jpayne@1	809 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
jpayne@1	810 fliC_choice=x[0].split("_")[1]
jpayne@1	811 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
jpayne@1	812 fljB_choice=x[0].split("_")[1]
jpayne@1	813 elif fliC_choice!="-" and fljB_choice!="-":
jpayne@1	814 break
jpayne@1	815 #now remove contigs not in middle core part
jpayne@1	816 first_allele="NA"
jpayne@1	817 first_allele_percentage=0
jpayne@1	818 for x in Final_list:
jpayne@1	819 if x[0].startswith("fliC") or x[0].startswith("fljB"):
jpayne@1	820 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
jpayne@1	821 first_allele_percentage=x[2]
jpayne@1	822 break
jpayne@1	823 additional_contigs=[]
jpayne@1	824 for x in Final_list:
jpayne@1	825 if first_allele in x[0]:
jpayne@1	826 if (fliC_contig == x[0].split("___")[-1]):
jpayne@1	827 fliC_region=x[3]
jpayne@1	828 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
jpayne@1	829 fljB_region=x[3]
jpayne@1	830 else:
jpayne@1	831 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
jpayne@1	832 additional_contigs.append(x)
jpayne@1	833 #else:
jpayne@1	834 #print x[:3]
jpayne@1	835 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
jpayne@1	836 if first_allele_percentage>0.9:
jpayne@1	837 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
jpayne@1	838 target_region=fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
jpayne@1	839 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
jpayne@1	840 target_region=fliC_region\|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
jpayne@1	841 else:
jpayne@1	842 target_region=set()#doesn't do anything
jpayne@1	843 else:
jpayne@1	844 target_region=set()#doesn't do anything
jpayne@1	845 #print(target_region)
jpayne@1	846 #print(additional_contigs)
jpayne@1	847 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
jpayne@1	848 target_region=target_region2\|target_region
jpayne@1	849 for x in additional_contigs:
jpayne@1	850 removal=0
jpayne@1	851 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
jpayne@1	852 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length0.5<len(x[3])<contig_length1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
jpayne@1	853 removal=1
jpayne@1	854 else:
jpayne@1	855 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
jpayne@1	856 removal=1
jpayne@1	857 else:
jpayne@1	858 pass
jpayne@1	859 if removal==1:
jpayne@1	860 for y in H_contig_roles:
jpayne@1	861 if y[1] in x[0]:
jpayne@1	862 H_contig_roles.remove(y)
jpayne@1	863 else:
jpayne@1	864 pass
jpayne@1	865 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length0.5,len(x[3]),contig_length1.5)
jpayne@1	866 #end of removing none-middle contigs
jpayne@1	867 print("H_contigs:")
jpayne@1	868 log_file.write("H_contigs:\n")
jpayne@1	869 H_contig_stat=[]
jpayne@1	870 H1_cont_stat={}
jpayne@1	871 H2_cont_stat={}
jpayne@1	872 for i in range(len(H_contig_roles)):
jpayne@1	873 x=H_contig_roles[i]
jpayne@1	874 a=0
jpayne@1	875 for y in Final_list_passed:
jpayne@1	876 if x[1] in y[0] and y[0].startswith(x[0]):
jpayne@1	877 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
jpayne@1	878 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
jpayne@1	879 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
jpayne@1	880 print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
jpayne@1	881 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
jpayne@1	882 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
jpayne@1	883 break
jpayne@1	884 else:
jpayne@1	885 print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
jpayne@1	886 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+" "+"blast_score: "+str(y[1])+" identity%:"+str(round(y[2]*100,2))+"% "+str(min(y[-1]))+" to "+str(max(y[-1]))+"\n")
jpayne@1	887 if x[0]=="fliC":
jpayne@1	888 if y[0].split("_")[1] not in H1_cont_stat:
jpayne@1	889 H1_cont_stat[y[0].split("_")[1]]=y[2]
jpayne@1	890 else:
jpayne@1	891 H1_cont_stat[y[0].split("_")[1]]+=y[2]
jpayne@1	892 if x[0]=="fljB":
jpayne@1	893 if y[0].split("_")[1] not in H2_cont_stat:
jpayne@1	894 H2_cont_stat[y[0].split("_")[1]]=y[2]
jpayne@1	895 else:
jpayne@1	896 H2_cont_stat[y[0].split("_")[1]]+=y[2]
jpayne@1	897 break
jpayne@1	898 #detect contaminations
jpayne@1	899 #print(H1_cont_stat)
jpayne@1	900 #print(H2_cont_stat)
jpayne@1	901 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
jpayne@1	902 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
jpayne@1	903 contamination_H=""
jpayne@1	904 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
jpayne@1	905 contamination_H="potential contamination from H antigen signals"
jpayne@1	906 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
jpayne@1	907 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
jpayne@1	908 print(contamination_O)
jpayne@1	909 print(contamination_H)
jpayne@1	910 log_file.write(contamination_O+"\n")
jpayne@1	911 log_file.write(contamination_H+"\n")
jpayne@1	912 log_file.close()
jpayne@1	913 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H
jpayne@1	914
jpayne@1	915 def get_input_K(input_file,lib_dict,data_type,k_size):
jpayne@1	916 #kmer mode; get input_Ks from dict and data_type
jpayne@1	917 kmers = []
jpayne@1	918 for h in lib_dict:
jpayne@1	919 kmers += lib_dict[h]
jpayne@1	920 if data_type == '4':
jpayne@1	921 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
jpayne@1	922 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
jpayne@1	923 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
jpayne@1	924 elif data_type == '5':#minion_2d_fasta
jpayne@1	925 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
jpayne@1	926 if data_type == '6':#minion_2d_fastq
jpayne@1	927 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
jpayne@1	928 return input_Ks
jpayne@1	929
jpayne@1	930 def get_kmer_dict(lib_dict,input_Ks):
jpayne@1	931 #kmer mode; get predicted types
jpayne@1	932 O_dict = {}
jpayne@1	933 H_dict = {}
jpayne@1	934 Special_dict = {}
jpayne@1	935 for h in lib_dict:
jpayne@1	936 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
jpayne@1	937 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
jpayne@1	938 if h.startswith('O-') and score > 25:
jpayne@1	939 O_dict[h] = score
jpayne@1	940 if h.startswith('fl') and score > 40:
jpayne@1	941 H_dict[h] = score
jpayne@1	942 if (h[:2] != 'fl') and (h[:2] != 'O-'):
jpayne@1	943 Special_dict[h] = score
jpayne@1	944 return O_dict,H_dict,Special_dict
jpayne@1	945
jpayne@1	946 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
jpayne@1	947 log_file=open("SeqSero_log.txt","a")
jpayne@1	948 log_file.write("O_scores:\n")
jpayne@1	949 #call O:
jpayne@1	950 highest_O = '-'
jpayne@1	951 if len(O_dict) == 0:
jpayne@1	952 pass
jpayne@1	953 else:
jpayne@1	954 for x in O_dict:
jpayne@1	955 log_file.write(x+"\t"+str(O_dict[x])+"\n")
jpayne@1	956 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1	957 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	958 highest_O = "O-9,46"
jpayne@1	959 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
jpayne@1	960 highest_O = "O-9,46,27"
jpayne@1	961 else:
jpayne@1	962 highest_O = "O-9" # next, detect O9 vs O2?
jpayne@1	963 O2 = 0
jpayne@1	964 O9 = 0
jpayne@1	965 for z in Special_dict:
jpayne@1	966 if "tyr-O-9" in z:
jpayne@1	967 O9 = float(Special_dict[z])
jpayne@1	968 if "tyr-O-2" in z:
jpayne@1	969 O2 = float(Special_dict[z])
jpayne@1	970 if O2 > O9:
jpayne@1	971 highest_O = "O-2"
jpayne@1	972 elif ("O-3,10_wzx__1539" in O_dict) and (
jpayne@1	973 "O-9,46_wzy__1191" in O_dict
jpayne@1	974 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	975 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1	976 highest_O = "O-3,10"
jpayne@1	977 else:
jpayne@1	978 highest_O = "O-1,3,19"
jpayne@1	979 ### end of special test for O9,46 and O3,10 family
jpayne@1	980 else:
jpayne@1	981 try:
jpayne@1	982 max_score = 0
jpayne@1	983 for x in O_dict:
jpayne@1	984 if float(O_dict[x]) >= max_score:
jpayne@1	985 max_score = float(O_dict[x])
jpayne@1	986 highest_O = x.split("_")[0]
jpayne@1	987 if highest_O == "O-1,3,19":
jpayne@1	988 highest_O = '-'
jpayne@1	989 max_score = 0
jpayne@1	990 for x in O_dict:
jpayne@1	991 if x == 'O-1,3,19_not_in_3,10__130':
jpayne@1	992 pass
jpayne@1	993 else:
jpayne@1	994 if float(O_dict[x]) >= max_score:
jpayne@1	995 max_score = float(O_dict[x])
jpayne@1	996 highest_O = x.split("_")[0]
jpayne@1	997 except:
jpayne@1	998 pass
jpayne@1	999 #call_fliC:
jpayne@1	1000 if len(H_dict)!=0:
jpayne@1	1001 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
jpayne@1	1002 else:
jpayne@1	1003 highest_H_score_both_BC=0
jpayne@1	1004 highest_fliC = '-'
jpayne@1	1005 highest_fliC_raw = '-'
jpayne@1	1006 highest_Score = 0
jpayne@1	1007 log_file.write("\nH_scores:\n")
jpayne@1	1008 for s in H_dict:
jpayne@1	1009 log_file.write(s+"\t"+str(H_dict[s])+"\n")
jpayne@1	1010 if s.startswith('fliC'):
jpayne@1	1011 if float(H_dict[s]) > highest_Score:
jpayne@1	1012 highest_fliC = s.split('_')[1]
jpayne@1	1013 highest_fliC_raw = s
jpayne@1	1014 highest_Score = float(H_dict[s])
jpayne@1	1015 #call_fljB
jpayne@1	1016 highest_fljB = '-'
jpayne@1	1017 highest_fljB_raw = '-'
jpayne@1	1018 highest_Score = 0
jpayne@1	1019 for s in H_dict:
jpayne@1	1020 if s.startswith('fljB'):
jpayne@1	1021 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
jpayne@1	1022 highest_fljB = s.split('_')[1]
jpayne@1	1023 highest_fljB_raw = s
jpayne@1	1024 highest_Score = float(H_dict[s])
jpayne@1	1025 log_file.write("\nSpecial_scores:\n")
jpayne@1	1026 for s in Special_dict:
jpayne@1	1027 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
jpayne@1	1028 log_file.close()
jpayne@1	1029 return highest_O,highest_fliC,highest_fljB
jpayne@1	1030
jpayne@1	1031 def get_temp_file_names(for_fq,rev_fq):
jpayne@1	1032 #seqsero2 -a; get temp file names
jpayne@1	1033 sam=for_fq+".sam"
jpayne@1	1034 bam=for_fq+".bam"
jpayne@1	1035 sorted_bam=for_fq+"_sorted.bam"
jpayne@1	1036 mapped_fq1=for_fq+"_mapped.fq"
jpayne@1	1037 mapped_fq2=rev_fq+"_mapped.fq"
jpayne@1	1038 combined_fq=for_fq+"_combined.fq"
jpayne@1	1039 for_sai=for_fq+".sai"
jpayne@1	1040 rev_sai=rev_fq+".sai"
jpayne@1	1041 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
jpayne@1	1042
jpayne@1	1043 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
jpayne@1	1044 #seqsero2 -a; do mapping and sort
jpayne@1	1045 print("building database...")
jpayne@1	1046 subprocess.check_output("bwa index "+database,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1047 print("mapping...")
jpayne@1	1048 if mapping_mode=="mem":
jpayne@1	1049 subprocess.check_output("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1050 elif mapping_mode=="sam":
jpayne@1	1051 if fnameB!="":
jpayne@1	1052 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1053 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1054 subprocess.check_output("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1055 else:
jpayne@1	1056 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1057 subprocess.check_output("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam, stderr=subprocess.STDOUT)
jpayne@1	1058 subprocess.check_output("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1059 ### check the version of samtools then use differnt commands
jpayne@1	1060 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
jpayne@1	1061 out, err = samtools_version.communicate()
jpayne@1	1062 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
jpayne@1	1063 print("check samtools version:",version)
jpayne@1	1064 ### end of samtools version check and its analysis
jpayne@1	1065 if LooseVersion(version)<=LooseVersion("1.2"):
jpayne@1	1066 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1067 else:
jpayne@1	1068 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1069
jpayne@1	1070 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
jpayne@1	1071 #seqsero2 -a; extract, assembly and blast
jpayne@1	1072 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1073 if fnameB!="":
jpayne@1	1074 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 ,shell=True, stderr=subprocess.STDOUT)#2> /dev/null if want no output
jpayne@1	1075 else:
jpayne@1	1076 pass
jpayne@1	1077 outdir=current_time+"_temp"
jpayne@1	1078 print("assembling...")
jpayne@1	1079 if int(threads)>4:
jpayne@1	1080 t="4"
jpayne@1	1081 else:
jpayne@1	1082 t=threads
jpayne@1	1083 if os.path.getsize(combined_fq)>100 and os.path.getsize(mapped_fq1)>100:#if not, then it's "-:-:-"
jpayne@1	1084 if fnameB!="":
jpayne@1	1085 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1086 else:
jpayne@1	1087 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1088 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
jpayne@1	1089 subprocess.check_output("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1090 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
jpayne@1	1091 subprocess.check_output("rm -rf "+outdir+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1092 print("blasting...","\n")
jpayne@1	1093 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
jpayne@1	1094 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
jpayne@1	1095 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
jpayne@1	1096 else:
jpayne@1	1097 xmlfile="NA"
jpayne@1	1098 return xmlfile
jpayne@1	1099
jpayne@1	1100 def judge_subspecies(fnameA):
jpayne@1	1101 #seqsero2 -a; judge subspecies on just forward raw reads fastq
jpayne@1	1102 salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT)
jpayne@1	1103 #out, err = salmID_output.communicate()
jpayne@1	1104 #out=out.decode("utf-8")
jpayne@1	1105 out = salmID_output.decode("utf-8")
jpayne@1	1106 file=open("data_log.txt","a")
jpayne@1	1107 file.write(out)
jpayne@1	1108 file.close()
jpayne@1	1109 salm_species_scores=out.split("\n")[1].split("\t")[6:]
jpayne@1	1110 salm_species_results=out.split("\n")[0].split("\t")[6:]
jpayne@1	1111 max_score=0
jpayne@1	1112 max_score_index=1 #default is 1, means "I"
jpayne@1	1113 for i in range(len(salm_species_scores)):
jpayne@1	1114 if max_score<float(salm_species_scores[i]):
jpayne@1	1115 max_score=float(salm_species_scores[i])
jpayne@1	1116 max_score_index=i
jpayne@1	1117 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
jpayne@1	1118 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
jpayne@1	1119 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
jpayne@1	1120 if max_score<10:
jpayne@1	1121 prediction="-"
jpayne@1	1122 return prediction
jpayne@1	1123
jpayne@1	1124 def judge_subspecies_Kmer(Special_dict):
jpayne@1	1125 #seqsero2 -k;
jpayne@1	1126 max_score=0
jpayne@1	1127 prediction="-" #default should be I
jpayne@1	1128 for x in Special_dict:
jpayne@1	1129 if "mer" in x:
jpayne@1	1130 if max_score<float(Special_dict[x]):
jpayne@1	1131 max_score=float(Special_dict[x])
jpayne@1	1132 prediction=x.split("_")[-1].strip()
jpayne@1	1133 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
jpayne@1	1134 prediction="bongori"
jpayne@1	1135 break
jpayne@1	1136 return prediction
jpayne@1	1137
jpayne@1	1138 def main():
jpayne@1	1139 #combine SeqSeroK and SeqSero2, also with SalmID
jpayne@1	1140 args = parse_args()
jpayne@1	1141 input_file = args.i
jpayne@1	1142 data_type = args.t
jpayne@1	1143 analysis_mode = args.m
jpayne@1	1144 mapping_mode=args.b
jpayne@1	1145 threads=args.p
jpayne@1	1146 make_dir=args.d
jpayne@1	1147 clean_mode=args.c
jpayne@1	1148 k_size=27 #will change for bug fixing
jpayne@1	1149 database="H_and_O_and_specific_genes.fasta"
jpayne@1	1150
jpayne@1	1151 if len(sys.argv)==1:
jpayne@1	1152 subprocess.check_output(dirpath+"/SeqSero2_package.py -h",shell=True, stderr=subprocess.STDOUT)#change name of python file
jpayne@1	1153 else:
jpayne@1	1154 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
jpayne@1	1155 request_id += str(random.randint(1, 10000000))
jpayne@1	1156 if make_dir is None:
jpayne@1	1157 make_dir="SeqSero_result_"+request_id
jpayne@1	1158 if os.path.isdir(make_dir):
jpayne@1	1159 pass
jpayne@1	1160 else:
jpayne@1	1161 subprocess.check_output(["mkdir",make_dir])
jpayne@1	1162 #subprocess.check_output("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1163 subprocess.check_output("ln -srf "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1164 ############################begin the real analysis
jpayne@1	1165 if analysis_mode=="a":
jpayne@1	1166 if data_type in ["1","2","3"]:#use allele mode
jpayne@1	1167 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
jpayne@1	1168 os.chdir(make_dir)
jpayne@1	1169 ###add a function to tell input files
jpayne@1	1170 fnameA=for_fq.split("/")[-1]
jpayne@1	1171 fnameB=rev_fq.split("/")[-1]
jpayne@1	1172 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
jpayne@1	1173 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
jpayne@1	1174 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
jpayne@1	1175 xmlfile=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
jpayne@1	1176 if xmlfile=="NA":
jpayne@1	1177 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
jpayne@1	1178 else:
jpayne@1	1179 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
jpayne@1	1180 file=open("data_log.txt","a")
jpayne@1	1181 for x in Final_list:
jpayne@1	1182 file.write("\t".join(str(y) for y in x)+"\n")
jpayne@1	1183 file.close()
jpayne@1	1184 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
jpayne@1	1185 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=predict_O_and_H_types(Final_list,Final_list_passed) #predict O, fliC and fljB
jpayne@1	1186 subspecies=judge_subspecies(fnameA) #predict subspecies
jpayne@1	1187 ###output
jpayne@1	1188 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
jpayne@1	1189 contamination_report=""
jpayne@1	1190 if contamination_O!="" and contamination_H=="":
jpayne@1	1191 contamination_report="#detected potential contamination of mixed serotypes from O antigen signals.\n"
jpayne@1	1192 elif contamination_O=="" and contamination_H!="":
jpayne@1	1193 contamination_report="#detected potential contamination of mixed serotypes or potential thrid H phase from H antigen signals.\n"
jpayne@1	1194 elif contamination_O!="" and contamination_H!="":
jpayne@1	1195 contamination_report="#detected potential contamination of mixed serotypes from both O and H antigen signals.\n"
jpayne@1	1196 if clean_mode:
jpayne@1	1197 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1198 make_dir="none-output-directory due to '-c' flag"
jpayne@1	1199 else:
jpayne@1	1200 #new_file=open("Seqsero_result.txt","w")
jpayne@1	1201 if O_choice=="":
jpayne@1	1202 O_choice="-"
jpayne@1	1203 result = OrderedDict(
jpayne@1	1204 sample_name = input_file[0],
jpayne@1	1205 O_antigen_prediction = O_choice,
jpayne@1	1206 H1_antigen_prediction = fliC_choice,
jpayne@1	1207 H2_antigen_prediction = fljB_choice,
jpayne@1	1208 predicted_antigenic_profile = predict_form,
jpayne@1	1209 predicted_subspecies = subspecies,
jpayne@1	1210 predicted_serotype = predict_sero,
jpayne@1	1211 note=claim.replace('\n','')
jpayne@1	1212 )
jpayne@1	1213 with open("Seqsero_result.tsv","w") as new_file:
jpayne@1	1214 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1	1215 #new_file.close()
jpayne@1	1216 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
jpayne@1	1217 wrtr.writeheader()
jpayne@1	1218 wrtr.writerow(result)
jpayne@1	1219
jpayne@1	1220
jpayne@1	1221 # new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
jpayne@1	1222 # new_file.close()
jpayne@1	1223 #subprocess.check_output("cat Seqsero_result.txt",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1224 #subprocess.call("rm H_and_O_and_specific_genes.fasta* .sra .bam .sam .fastq .gz .fq temp.txt .xml "+fnameA+"_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1225 subprocess.call("rm H_and_O_and_specific_genes.fasta* .sra .bam .sam .fastq .gz .fq temp.txt "+fnameA+"_db 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1226 print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
jpayne@1	1227 else:
jpayne@1	1228 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
jpayne@1	1229 elif analysis_mode=="k":
jpayne@1	1230 ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1	1231 #output_mode = args.mode
jpayne@1	1232 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
jpayne@1	1233 input_file = for_fq #-k will just use forward because not all reads were used
jpayne@1	1234 os.chdir(make_dir)
jpayne@1	1235 f = open(ex_dir + '/antigens.pickle', 'rb')
jpayne@1	1236 lib_dict = pickle.load(f)
jpayne@1	1237 f.close
jpayne@1	1238 input_Ks = get_input_K(input_file,lib_dict,data_type,k_size)
jpayne@1	1239 O_dict,H_dict,Special_dict = get_kmer_dict(lib_dict,input_Ks)
jpayne@1	1240 highest_O,highest_fliC,highest_fljB = call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
jpayne@1	1241 subspecies = judge_subspecies_Kmer(Special_dict)
jpayne@1	1242 if subspecies=="IIb" or subspecies=="IIa":
jpayne@1	1243 subspecies="II"
jpayne@1	1244 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
jpayne@1	1245 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
jpayne@1	1246 if clean_mode:
jpayne@1	1247 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1248 make_dir="none-output-directory due to '-c' flag"
jpayne@1	1249 else:
jpayne@1	1250 if highest_O.split('-')[-1]=="":
jpayne@1	1251 O_choice="-"
jpayne@1	1252 else:
jpayne@1	1253 O_choice=highest_O.split('-')[-1]
jpayne@1	1254 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
jpayne@1	1255 result = OrderedDict(
jpayne@1	1256 sample_name = input_file,
jpayne@1	1257 O_antigen_prediction = O_choice,
jpayne@1	1258 H1_antigen_prediction = highest_fliC,
jpayne@1	1259 H2_antigen_prediction = highest_fljB,
jpayne@1	1260 predicted_antigenic_profile = predict_form,
jpayne@1	1261 predicted_subspecies = subspecies,
jpayne@1	1262 predicted_serotype = predict_sero,
jpayne@1	1263 note=claim.replace('\n','')
jpayne@1	1264 )
jpayne@1	1265 with open("Seqsero_result.tsv","w") as new_file:
jpayne@1	1266 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1	1267 #new_file.close()
jpayne@1	1268 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
jpayne@1	1269 wrtr.writeheader()
jpayne@1	1270 wrtr.writerow(result)
jpayne@1	1271 subprocess.call("rm .fasta .fastq .gz .fq temp.txt .sra 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1272 print("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1	1273 return 0
jpayne@1	1274
jpayne@1	1275 if __name__ == '__main__':
jpayne@1	1276 try:
jpayne@1	1277 quit(main())
jpayne@1	1278 except subprocess.CalledProcessError as e:
jpayne@1	1279 print(str(e.output))
jpayne@1	1280 print(e)
jpayne@1	1281 quit(e.returncode)

Mercurial > repos > jpayne > seqsero_v2

annotate SeqSero2/SeqSero2_package.py @ 3:b18e8cdfdf81