seqsero_v2: SeqSero2/SeqSero2_package.py annotate

annotate SeqSero2/SeqSero2_package.py @ 16:3b6d5b60968f tip

Uploaded

author	estrain
date	Wed, 01 Mar 2023 13:21:51 -0500
parents	4ac593d4b40f
children

rev	line source
jpayne@1	1 #!/usr/bin/env python3
jpayne@1	2
jpayne@1	3 import sys
jpayne@1	4 import time
jpayne@1	5 import random
jpayne@1	6 import os
jpayne@1	7 import subprocess
jpayne@1	8 import gzip
jpayne@1	9 import io
jpayne@1	10 import pickle
jpayne@1	11 import argparse
jpayne@1	12 import itertools
jpayne@1	13 from distutils.version import LooseVersion
jpayne@1	14
jpayne@1	15 from collections import OrderedDict
jpayne@1	16 from csv import DictWriter
jpayne@1	17
jpayne@1	18 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
jpayne@1	19
jpayne@1	20 ### SeqSero Kmer
jpayne@1	21 def parse_args():
jpayne@1	22 "Parse the input arguments, use '-h' for help."
jpayne@1	23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDevelopper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email:seqsero@gmail.com')#add "-m <data_type>" in future
jpayne@1	24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
jpayne@1	25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
jpayne@7	26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
jpayne@7	27 parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1")
jpayne@7	28 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a")
jpayne@1	29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
jpayne@1	30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
jpayne@1	31 return parser.parse_args()
jpayne@1	32
jpayne@1	33 def reverse_complement(sequence):
jpayne@1	34 complement = {
jpayne@1	35 'A': 'T',
jpayne@1	36 'C': 'G',
jpayne@1	37 'G': 'C',
jpayne@1	38 'T': 'A',
jpayne@1	39 'N': 'N',
jpayne@1	40 'M': 'K',
jpayne@1	41 'R': 'Y',
jpayne@1	42 'W': 'W',
jpayne@1	43 'S': 'S',
jpayne@1	44 'Y': 'R',
jpayne@1	45 'K': 'M',
jpayne@1	46 'V': 'B',
jpayne@1	47 'H': 'D',
jpayne@1	48 'D': 'H',
jpayne@1	49 'B': 'V'
jpayne@1	50 }
jpayne@1	51 return "".join(complement[base] for base in reversed(sequence))
jpayne@1	52
jpayne@1	53
jpayne@1	54 def createKmerDict_reads(list_of_strings, kmer):
jpayne@1	55 kmer_table = {}
jpayne@1	56 for string in list_of_strings:
jpayne@1	57 sequence = string.strip('\n')
jpayne@1	58 for i in range(len(sequence) - kmer + 1):
jpayne@1	59 new_mer = sequence[i:i + kmer].upper()
jpayne@1	60 new_mer_rc = reverse_complement(new_mer)
jpayne@1	61 if new_mer in kmer_table:
jpayne@1	62 kmer_table[new_mer.upper()] += 1
jpayne@1	63 else:
jpayne@1	64 kmer_table[new_mer.upper()] = 1
jpayne@1	65 if new_mer_rc in kmer_table:
jpayne@1	66 kmer_table[new_mer_rc.upper()] += 1
jpayne@1	67 else:
jpayne@1	68 kmer_table[new_mer_rc.upper()] = 1
jpayne@1	69 return kmer_table
jpayne@1	70
jpayne@1	71
jpayne@1	72 def multifasta_dict(multifasta):
jpayne@1	73 multifasta_list = [
jpayne@1	74 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
jpayne@1	75 ]
jpayne@1	76 headers = [i for i in multifasta_list if i[0] == '>']
jpayne@1	77 multifasta_dict = {}
jpayne@1	78 for h in headers:
jpayne@1	79 start = multifasta_list.index(h)
jpayne@1	80 for element in multifasta_list[start + 1:]:
jpayne@1	81 if element[0] == '>':
jpayne@1	82 break
jpayne@1	83 else:
jpayne@1	84 if h[1:] in multifasta_dict:
jpayne@1	85 multifasta_dict[h[1:]] += element
jpayne@1	86 else:
jpayne@1	87 multifasta_dict[h[1:]] = element
jpayne@1	88 return multifasta_dict
jpayne@1	89
jpayne@1	90
jpayne@1	91 def multifasta_single_string(multifasta):
jpayne@1	92 multifasta_list = [
jpayne@1	93 line.strip() for line in open(multifasta, 'r')
jpayne@1	94 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
jpayne@1	95 ]
jpayne@1	96 return ''.join(multifasta_list)
jpayne@1	97
jpayne@1	98
jpayne@1	99 def chunk_a_long_sequence(long_sequence, chunk_size=60):
jpayne@1	100 chunk_list = []
jpayne@1	101 steps = len(long_sequence) // 60 #how many chunks
jpayne@1	102 for i in range(steps):
jpayne@1	103 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
jpayne@1	104 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
jpayne@1	105 return chunk_list
jpayne@1	106
jpayne@1	107
jpayne@1	108 def target_multifasta_kmerizer(multifasta, k, kmerDict):
jpayne@1	109 forward_length = 300 #if find the target, put forward 300 bases
jpayne@1	110 reverse_length = 2200 #if find the target, put backward 2200 bases
jpayne@1	111 chunk_size = 60 #it will firstly chunk the single long sequence to multiple smaller sequences, it controls the size of those smaller sequences
jpayne@1	112 target_mers = []
jpayne@1	113 long_single_string = multifasta_single_string(multifasta)
jpayne@1	114 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
jpayne@1	115 unit_length = len(multifasta_list[0])
jpayne@1	116 forward_lines = int(forward_length / unit_length) + 1
jpayne@1	117 reverse_lines = int(forward_length / unit_length) + 1
jpayne@1	118 start_num = 0
jpayne@1	119 end_num = 0
jpayne@1	120 for i in range(len(multifasta_list)):
jpayne@1	121 if i not in range(start_num, end_num): #avoid computational repetition
jpayne@1	122 line = multifasta_list[i]
jpayne@1	123 start = int((len(line) - k) // 2)
jpayne@1	124 s1 = line[start:k + start]
jpayne@1	125 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
jpayne@1	126 if i - forward_lines >= 0:
jpayne@1	127 start_num = i - forward_lines
jpayne@1	128 else:
jpayne@1	129 start_num = 0
jpayne@1	130 if i + reverse_lines <= len(multifasta_list) - 1:
jpayne@1	131 end_num = i + reverse_lines
jpayne@1	132 else:
jpayne@1	133 end_num = len(multifasta_list) - 1
jpayne@1	134 target_list = [
jpayne@1	135 x.strip() for x in multifasta_list[start_num:end_num]
jpayne@1	136 ]
jpayne@1	137 target_line = "".join(target_list)
jpayne@1	138 target_mers += [
jpayne@1	139 k1 for k1 in createKmerDict_reads([str(target_line)], k)
jpayne@1	140 ] ##changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
jpayne@1	141 else:
jpayne@1	142 pass
jpayne@1	143 return set(target_mers)
jpayne@1	144
jpayne@1	145
jpayne@1	146 def target_read_kmerizer(file, k, kmerDict):
jpayne@1	147 i = 1
jpayne@1	148 n_reads = 0
jpayne@1	149 total_coverage = 0
jpayne@1	150 target_mers = []
jpayne@1	151 if file.endswith(".gz"):
jpayne@1	152 file_content = io.BufferedReader(gzip.open(file))
jpayne@1	153 else:
jpayne@1	154 file_content = open(file, "r").readlines()
jpayne@1	155 for line in file_content:
jpayne@1	156 start = int((len(line) - k) // 2)
jpayne@1	157 if i % 4 == 2:
jpayne@1	158 if file.endswith(".gz"):
jpayne@1	159 s1 = line[start:k + start].decode()
jpayne@1	160 line = line.decode()
jpayne@1	161 else:
jpayne@1	162 s1 = line[start:k + start]
jpayne@1	163 if s1 in kmerDict: #detect it is a potential read or not (use the middle part)
jpayne@1	164 n_reads += 1
jpayne@1	165 total_coverage += len(line)
jpayne@1	166 target_mers += [
jpayne@1	167 k1 for k1 in createKmerDict_reads([str(line)], k)
jpayne@1	168 ] #changed k to k1, just want to avoid the mixes of this "k" (kmer) to the "k" above (kmer length)
jpayne@1	169 i += 1
jpayne@1	170 if total_coverage >= 4000000:
jpayne@1	171 break
jpayne@1	172 return set(target_mers)
jpayne@1	173
jpayne@1	174
jpayne@1	175 def minion_fasta_kmerizer(file, k, kmerDict):
jpayne@1	176 i = 1
jpayne@1	177 n_reads = 0
jpayne@1	178 total_coverage = 0
jpayne@1	179 target_mers = {}
jpayne@1	180 for line in open(file):
jpayne@1	181 if i % 2 == 0:
jpayne@1	182 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1	183 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1	184 if kmer in target_mers:
jpayne@1	185 target_mers[kmer] += 1
jpayne@1	186 else:
jpayne@1	187 target_mers[kmer] = 1
jpayne@1	188 if rc_kmer in target_mers:
jpayne@1	189 target_mers[rc_kmer] += 1
jpayne@1	190 else:
jpayne@1	191 target_mers[rc_kmer] = 1
jpayne@1	192 i += 1
jpayne@1	193 return set([h for h in target_mers])
jpayne@1	194
jpayne@1	195
jpayne@1	196 def minion_fastq_kmerizer(file, k, kmerDict):
jpayne@1	197 i = 1
jpayne@1	198 n_reads = 0
jpayne@1	199 total_coverage = 0
jpayne@1	200 target_mers = {}
jpayne@1	201 for line in open(file):
jpayne@1	202 if i % 4 == 2:
jpayne@1	203 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1	204 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1	205 if kmer in target_mers:
jpayne@1	206 target_mers[kmer] += 1
jpayne@1	207 else:
jpayne@1	208 target_mers[kmer] = 1
jpayne@1	209 if rc_kmer in target_mers:
jpayne@1	210 target_mers[rc_kmer] += 1
jpayne@1	211 else:
jpayne@1	212 target_mers[rc_kmer] = 1
jpayne@1	213 i += 1
jpayne@1	214 return set([h for h in target_mers])
jpayne@1	215
jpayne@1	216
jpayne@1	217 def multifasta_single_string2(multifasta):
jpayne@1	218 single_string = ''
jpayne@1	219 with open(multifasta, 'r') as f:
jpayne@1	220 for line in f:
jpayne@1	221 if line.strip()[0] == '>':
jpayne@1	222 pass
jpayne@1	223 else:
jpayne@1	224 single_string += line.strip()
jpayne@1	225 return single_string
jpayne@1	226
jpayne@1	227
jpayne@1	228 def kmers(seq, k):
jpayne@1	229 rev_comp = reverse_complement(seq)
jpayne@1	230 for start in range(1, len(seq) - k + 1):
jpayne@1	231 yield seq[start:start + k], rev_comp[-(start + k):-start]
jpayne@1	232
jpayne@1	233
jpayne@1	234 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
jpayne@1	235 multi_seq_dict = multifasta_dict(multifasta)
jpayne@1	236 lib_dict = {}
jpayne@1	237 for h in multi_seq_dict:
jpayne@1	238 lib_dict[h] = set(
jpayne@1	239 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
jpayne@1	240 return lib_dict
jpayne@1	241
jpayne@1	242
jpayne@1	243 def Combine(b, c):
jpayne@1	244 fliC_combinations = []
jpayne@1	245 fliC_combinations.append(",".join(c))
jpayne@1	246 temp_combinations = []
jpayne@1	247 for i in range(len(b)):
jpayne@1	248 for x in itertools.combinations(b, i + 1):
jpayne@1	249 temp_combinations.append(",".join(x))
jpayne@1	250 for x in temp_combinations:
jpayne@1	251 temp = []
jpayne@1	252 for y in c:
jpayne@1	253 temp.append(y)
jpayne@1	254 temp.append(x)
jpayne@1	255 temp = ",".join(temp)
jpayne@1	256 temp = temp.split(",")
jpayne@1	257 temp.sort()
jpayne@1	258 temp = ",".join(temp)
jpayne@1	259 fliC_combinations.append(temp)
jpayne@1	260 return fliC_combinations
jpayne@1	261
jpayne@1	262
jpayne@1	263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
jpayne@1	264 #like test_output_06012017.txt
jpayne@1	265 #can add more varialbles like sdf-type, sub-species-type in future (we can conclude it into a special-gene-list)
jpayne@4	266 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
jpayne@4	267 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
jpayne@4	268 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used for decide whether to
jpayne@1	269 seronames = []
jpayne@1	270 seronames_none_subspecies=[]
jpayne@1	271 for i in range(len(phase1)):
jpayne@1	272 fliC_combine = []
jpayne@1	273 fljB_combine = []
jpayne@1	274 if phaseO[i] == Otype: # no VII in KW, but it's there
jpayne@1	275 ### for fliC, detect every possible combinations to avoid the effect of "["
jpayne@1	276 if phase1[i].count("[") == 0:
jpayne@1	277 fliC_combine.append(phase1[i])
jpayne@1	278 elif phase1[i].count("[") >= 1:
jpayne@1	279 c = []
jpayne@1	280 b = []
jpayne@1	281 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
jpayne@1	282 "[") == 1:
jpayne@1	283 content = phase1[i].replace("[", "").replace("]", "")
jpayne@1	284 fliC_combine.append(content)
jpayne@1	285 fliC_combine.append("-")
jpayne@1	286 else:
jpayne@1	287 for x in phase1[i].split(","):
jpayne@1	288 if "[" in x:
jpayne@1	289 b.append(x.replace("[", "").replace("]", ""))
jpayne@1	290 else:
jpayne@1	291 c.append(x)
jpayne@1	292 fliC_combine = Combine(
jpayne@1	293 b, c
jpayne@1	294 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
jpayne@1	295 ### end of fliC "[" detect
jpayne@1	296 ### for fljB, detect every possible combinations to avoid the effect of "["
jpayne@1	297 if phase2[i].count("[") == 0:
jpayne@1	298 fljB_combine.append(phase2[i])
jpayne@1	299 elif phase2[i].count("[") >= 1:
jpayne@1	300 d = []
jpayne@1	301 e = []
jpayne@1	302 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
jpayne@1	303 "[") == 1:
jpayne@1	304 content = phase2[i].replace("[", "").replace("]", "")
jpayne@1	305 fljB_combine.append(content)
jpayne@1	306 fljB_combine.append("-")
jpayne@1	307 else:
jpayne@1	308 for x in phase2[i].split(","):
jpayne@1	309 if "[" in x:
jpayne@1	310 d.append(x.replace("[", "").replace("]", ""))
jpayne@1	311 else:
jpayne@1	312 e.append(x)
jpayne@1	313 fljB_combine = Combine(d, e)
jpayne@1	314 ### end of fljB "[" detect
jpayne@1	315 new_fliC = fliC.split(
jpayne@1	316 ","
jpayne@1	317 ) #because some antigen like r,[i] not follow alphabetical order, so use this one to judge and can avoid missings
jpayne@1	318 new_fliC.sort()
jpayne@1	319 new_fliC = ",".join(new_fliC)
jpayne@1	320 new_fljB = fljB.split(",")
jpayne@1	321 new_fljB.sort()
jpayne@1	322 new_fljB = ",".join(new_fljB)
jpayne@1	323 if (new_fliC in fliC_combine
jpayne@1	324 or fliC in fliC_combine) and (new_fljB in fljB_combine
jpayne@1	325 or fljB in fljB_combine):
jpayne@4	326 ######start, remove_list,rename_dict, added on 11/11/2018
jpayne@4	327 if sero[i] not in remove_list:
jpayne@4	328 temp_sero=sero[i]
jpayne@4	329 if temp_sero in rename_dict:
jpayne@4	330 temp_sero=rename_dict[temp_sero] #rename if in the rename list
jpayne@4	331 if temp_sero not in seronames:#the new sero may already included, if yes, then not consider
jpayne@4	332 if subs[i] == subspecies:
jpayne@4	333 seronames.append(temp_sero)
jpayne@4	334 seronames_none_subspecies.append(temp_sero)
jpayne@4	335 else:
jpayne@4	336 pass
jpayne@4	337 else:
jpayne@4	338 pass
jpayne@4	339 ######end, added on 11/11/2018
jpayne@1	340 #analyze seronames
jpayne@1	341 subspecies_pointer=""
jpayne@1	342 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
jpayne@1	343 seronames=seronames_none_subspecies
jpayne@1	344 subspecies_pointer="1"
jpayne@1	345 if len(seronames) == 0:
jpayne@1	346 seronames = [
jpayne@1	347 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
jpayne@1	348 ]
jpayne@1	349 star = ""
jpayne@1	350 star_line = ""
jpayne@1	351 if len(seronames) > 1: #there are two possible predictions for serotypes
jpayne@1	352 star = "*"
jpayne@7	353 #changed 04072019
jpayne@7	354 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
jpayne@1	355 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
jpayne@1	356 star="*"
jpayne@7	357 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
jpayne@7	358 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
jpayne@1	359 if Otype=="":
jpayne@1	360 Otype="-"
jpayne@1	361 predict_form = Otype + ":" + fliC + ":" + fljB
jpayne@1	362 predict_sero = (" or ").join(seronames)
jpayne@1	363 ###special test for Enteritidis
jpayne@1	364 if predict_form == "9:g,m:-":
jpayne@1	365 sdf = "-"
jpayne@1	366 for x in special_gene_list:
jpayne@1	367 if x.startswith("sdf"):
jpayne@1	368 sdf = "+"
jpayne@7	369 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
jpayne@7	370 star_line=" sdf gene detected." # ed_SL_04152019: new output format
jpayne@7	371 #predict_form = predict_form + " Sdf prediction:" + sdf
jpayne@7	372 predict_form = predict_form #changed 04072019
jpayne@1	373 if sdf == "-":
jpayne@1	374 star = "*"
jpayne@7	375 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
jpayne@7	376 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
jpayne@7	377 #changed in 04072019, for new output
jpayne@7	378 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
jpayne@7	379 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
jpayne@7	380 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
jpayne@1	381 ###end of special test for Enteritidis
jpayne@1	382 elif predict_form == "4:i:-":
jpayne@7	383 predict_sero = "4:i:-"
jpayne@1	384 elif predict_form == "4:r:-":
jpayne@7	385 predict_sero = "4:r:-"
jpayne@1	386 elif predict_form == "4:b:-":
jpayne@7	387 predict_sero = "4:b:-"
jpayne@4	388 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
jpayne@4	389 #predict_sero = "Newport"
jpayne@4	390 #star = "*"
jpayne@4	391 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
jpayne@7	392 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
jpayne@1	393 if "N/A" in predict_sero:
jpayne@1	394 claim = ""
jpayne@1	395 #special test for Typhimurium
jpayne@1	396 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
jpayne@1	397 normal = 0
jpayne@1	398 mutation = 0
jpayne@1	399 for x in special_gene_list:
jpayne@1	400 if "oafA-O-4_full" in x:
jpayne@1	401 normal = float(special_gene_list[x])
jpayne@1	402 elif "oafA-O-4_5-" in x:
jpayne@1	403 mutation = float(special_gene_list[x])
jpayne@1	404 if normal > mutation:
jpayne@1	405 pass
jpayne@1	406 elif normal < mutation:
jpayne@7	407 #predict_sero = predict_sero.strip() + "(O5-)"
jpayne@7	408 predict_sero = predict_sero.strip() #diable special sero for new output requirement, 04132019
jpayne@1	409 star = "*"
jpayne@7	410 #star_line = "Detected the deletion of O5-."
jpayne@7	411 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
jpayne@1	412 else:
jpayne@1	413 pass
jpayne@1	414 #special test for Paratyphi B
jpayne@1	415 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
jpayne@1	416 normal = 0
jpayne@1	417 mutation = 0
jpayne@1	418 for x in special_gene_list:
jpayne@1	419 if "gntR-family-regulatory-protein_dt-positive" in x:
jpayne@1	420 normal = float(special_gene_list[x])
jpayne@1	421 elif "gntR-family-regulatory-protein_dt-negative" in x:
jpayne@1	422 mutation = float(special_gene_list[x])
jpayne@1	423 #print(normal,mutation)
jpayne@1	424 if normal > mutation:
jpayne@7	425 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
jpayne@7	426 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
jpayne@1	427 star = "*"
jpayne@7	428 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
jpayne@7	429 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
jpayne@1	430 elif normal < mutation:
jpayne@7	431 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
jpayne@7	432 predict_sero = predict_sero.strip()
jpayne@1	433 star = "*"
jpayne@7	434 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
jpayne@7	435 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
jpayne@1	436 else:
jpayne@1	437 star = "*"
jpayne@7	438 star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
jpayne@1	439 #special test for O13,22 and O13,23
jpayne@1	440 if Otype=="13":
jpayne@1	441 #ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1	442 f = open(dirpath + '/special.pickle', 'rb')
jpayne@1	443 special = pickle.load(f)
jpayne@1	444 O22_O23=special['O22_O23']
jpayne@4	445 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
jpayne@1	446 O22_score=0
jpayne@1	447 O23_score=0
jpayne@1	448 for x in special_gene_list:
jpayne@1	449 if "O:22" in x:
jpayne@1	450 O22_score = O22_score+float(special_gene_list[x])
jpayne@1	451 elif "O:23" in x:
jpayne@1	452 O23_score = O23_score+float(special_gene_list[x])
jpayne@1	453 #print(O22_score,O23_score)
jpayne@1	454 for z in O22_O23[0]:
jpayne@1	455 if predict_sero.split(" or ")[0] in z:
jpayne@1	456 if O22_score > O23_score:
jpayne@1	457 star = "*"
jpayne@7	458 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
jpayne@1	459 predict_sero = z[0]
jpayne@1	460 elif O22_score < O23_score:
jpayne@1	461 star = "*"
jpayne@7	462 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
jpayne@1	463 predict_sero = z[1]
jpayne@1	464 else:
jpayne@1	465 star = "*"
jpayne@7	466 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
jpayne@7	467 if " or " in predict_sero:
jpayne@7	468 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
jpayne@1	469 #special test for O6,8
jpayne@4	470 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
jpayne@4	471 #for x in merge_O68_list:
jpayne@4	472 # if x in predict_sero:
jpayne@4	473 # predict_sero=x
jpayne@4	474 # star=""
jpayne@4	475 # star_line=""
jpayne@1	476 #special test for Montevideo; most of them are monophasic
jpayne@4	477 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
jpayne@4	478 #star="*"
jpayne@4	479 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
jpayne@1	480 return predict_form, predict_sero, star, star_line, claim
jpayne@1	481 ### End of SeqSero Kmer part
jpayne@1	482
jpayne@1	483 ### Begin of SeqSero2 allele prediction and output
jpayne@1	484 def xml_parse_score_comparision_seqsero(xmlfile):
jpayne@1	485 #used to do seqsero xml analysis
jpayne@1	486 from Bio.Blast import NCBIXML
jpayne@1	487 handle=open(xmlfile)
jpayne@1	488 handle=NCBIXML.parse(handle)
jpayne@1	489 handle=list(handle)
jpayne@1	490 List=[]
jpayne@1	491 List_score=[]
jpayne@1	492 List_ids=[]
jpayne@1	493 List_query_region=[]
jpayne@1	494 for i in range(len(handle)):
jpayne@1	495 if len(handle[i].alignments)>0:
jpayne@1	496 for j in range(len(handle[i].alignments)):
jpayne@1	497 score=0
jpayne@1	498 ids=0
jpayne@1	499 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
jpayne@1	500 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
jpayne@1	501 for z in range(len(handle[i].alignments[j].hsps)):
jpayne@1	502 hsp=handle[i].alignments[j].hsps[z]
jpayne@1	503 temp=set(range(hsp.query_start,hsp.query_end))
jpayne@1	504 if len(cover_region)==0:
jpayne@1	505 cover_region=cover_region\|temp
jpayne@1	506 fraction=1
jpayne@1	507 else:
jpayne@1	508 fraction=1-len(cover_region&temp)/float(len(temp))
jpayne@1	509 cover_region=cover_region\|temp
jpayne@1	510 if "last" in handle[i].query or "first" in handle[i].query:
jpayne@1	511 score+=hsp.bits*fraction
jpayne@1	512 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1	513 else:
jpayne@1	514 score+=hsp.bits*fraction
jpayne@1	515 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1	516 List_score.append(score)
jpayne@1	517 List_ids.append(ids)
jpayne@1	518 List_query_region.append(cover_region)
jpayne@1	519 temp=zip(List,List_score,List_ids,List_query_region)
jpayne@1	520 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
jpayne@1	521 return Final_list
jpayne@1	522
jpayne@1	523
jpayne@1	524 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
jpayne@1	525 Old=L
jpayne@1	526 L.sort()
jpayne@1	527 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
jpayne@1	528 count=[]
jpayne@1	529 for j in range(len(L)):
jpayne@1	530 y=0
jpayne@1	531 for x in Old:
jpayne@1	532 if L[j]==x:
jpayne@1	533 y+=1
jpayne@1	534 count.append(y)
jpayne@1	535 if sort_on_fre!="none":
jpayne@1	536 d=zip(*sorted(zip(count, L)))
jpayne@1	537 L=d[1]
jpayne@1	538 count=d[0]
jpayne@1	539 return (L,count)
jpayne@1	540
jpayne@1	541 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
jpayne@1	542 #used to predict it's fliC or fljB for one contig, based on tail and head score, but output the score difference,if it is very small, then not reliable, use blast score for whole contig to test
jpayne@1	543 #this is mainly used for
jpayne@1	544 a=nodes_vs_score_list
jpayne@1	545 fliC_score=0
jpayne@1	546 fljB_score=0
jpayne@1	547 for z in a:
jpayne@1	548 if "fliC" in z[0]:
jpayne@1	549 fliC_score+=z[1]
jpayne@1	550 elif "fljB" in z[0]:
jpayne@1	551 fljB_score+=z[1]
jpayne@1	552 if fliC_score>=fljB_score:
jpayne@1	553 role="fliC"
jpayne@1	554 else:
jpayne@1	555 role="fljB"
jpayne@1	556 return (role,abs(fliC_score-fljB_score))
jpayne@1	557
jpayne@1	558 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
jpayne@1	559 #used to predict contig is fliC or fljB, if the differnce score value on above head_and_tail is less than 10 (quite small)
jpayne@1	560 #also used when no head or tail got blasted score for the contig
jpayne@1	561 role=""
jpayne@1	562 for z in Final_list_passed:
jpayne@1	563 if node_name in z[0]:
jpayne@1	564 role=z[0].split("_")[0]
jpayne@1	565 break
jpayne@1	566 return role
jpayne@1	567
jpayne@1	568 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
jpayne@1	569 #nodes_list is the c created by c,d=Uniq(nodes) in below function
jpayne@1	570 first_target=""
jpayne@1	571 role_list=[]
jpayne@1	572 for x in nodes_list:
jpayne@1	573 a=[]
jpayne@1	574 role=""
jpayne@1	575 for y in tail_head_list:
jpayne@1	576 if x in y[0]:
jpayne@1	577 a.append(y)
jpayne@1	578 if len(a)==4:
jpayne@1	579 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	580 if diff<20:
jpayne@1	581 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	582 elif len(a)==3:
jpayne@1	583 ###however, if the one with highest score is the fewer one, compare their accumulation score
jpayne@1	584 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	585 if diff<20:
jpayne@1	586 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	587 ###end of above score comparison
jpayne@1	588 elif len(a)==2:
jpayne@1	589 #must on same node, if not, then decide with unit blast score, blast-score/length_of_special_sequence(30 or 37)
jpayne@1	590 temp=[]
jpayne@1	591 for z in a:
jpayne@1	592 temp.append(z[0].split("_")[0])
jpayne@1	593 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
jpayne@1	594 if len(m)==1:
jpayne@1	595 pass
jpayne@1	596 else:
jpayne@1	597 pass
jpayne@1	598 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	599 if diff<20:
jpayne@1	600 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	601 ###need to desgin a algorithm to guess most possible situation for nodes_list, See the situations of test evaluation
jpayne@1	602 elif len(a)==1:
jpayne@1	603 #that one
jpayne@1	604 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1	605 if diff<20:
jpayne@1	606 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	607 #need to evaluate, in future, may set up a cut-off, if not met, then just find Final_list_passed best match,like when "a==0"
jpayne@1	608 else:#a==0
jpayne@1	609 #use Final_list_passed best match
jpayne@1	610 for z in Final_list_passed:
jpayne@1	611 if x in z[0]:
jpayne@1	612 role=z[0].split("_")[0]
jpayne@1	613 break
jpayne@1	614 #print x,role,len(a)
jpayne@1	615 role_list.append((role,x))
jpayne@1	616 if len(role_list)==2:
jpayne@1	617 if role_list[0][0]==role_list[1][0]:#this is the most cocmmon error, two antigen were assigned to same phase
jpayne@1	618 #just use score to do a final test
jpayne@1	619 role_list=[]
jpayne@1	620 for x in nodes_list:
jpayne@1	621 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1	622 role_list.append((role,x))
jpayne@1	623 return role_list
jpayne@1	624
jpayne@1	625 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
jpayne@1	626 #used to decide which contig is FliC and which one is fljB
jpayne@1	627 contigs=[]
jpayne@1	628 nodes=[]
jpayne@1	629 for x in Final_list_passed:
jpayne@1	630 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
jpayne@1	631 nodes.append(x[0].split("___")[1].strip())
jpayne@1	632 c,d=Uniq(nodes)#c is node_list
jpayne@1	633 #print c
jpayne@1	634 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
jpayne@1	635 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
jpayne@1	636 return roles
jpayne@1	637
jpayne@1	638 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
jpayne@1	639 #decide O based on Final_list
jpayne@1	640 O_choice="?"
jpayne@1	641 O_list=[]
jpayne@1	642 special_genes={}
jpayne@1	643 nodes=[]
jpayne@1	644 for x in Final_list_passed:
jpayne@1	645 if x[0].startswith("O-"):
jpayne@1	646 nodes.append(x[0].split("___")[1].strip())
jpayne@1	647 elif not x[0].startswith("fl"):
jpayne@1	648 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
jpayne@1	649 #print "special_genes:",special_genes
jpayne@1	650 c,d=Uniq(nodes)
jpayne@1	651 #print "potential O antigen contig",c
jpayne@1	652 final_O=[]
jpayne@1	653 O_nodes_list=[]
jpayne@1	654 for x in c:#c is the list for contigs
jpayne@1	655 temp=0
jpayne@1	656 for y in Final_list_passed:
jpayne@1	657 if x in y[0] and y[0].startswith("O-"):
jpayne@1	658 final_O.append(y)
jpayne@1	659 break
jpayne@1	660 ### O contig has the problem of two genes on same contig, so do additional test
jpayne@1	661 potenial_new_gene=""
jpayne@1	662 for x in final_O:
jpayne@1	663 pointer=0 #for genes merged or not
jpayne@1	664 #not consider O-1,3,19_not_in_3,10, too short compared with others
jpayne@1	665 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now give 3002 (for secureity can use 400*2) as flank region
jpayne@1	666 pointer=x[0].split("___")[1].strip()#store the contig name
jpayne@1	667 print(pointer)
jpayne@1	668 if pointer!=0:#it has potential merge event
jpayne@1	669 for y in Final_list:
jpayne@1	670 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])y[2] and y[1]>=400)):#that's a realtively strict filter now; if passed, it has merge event and add one more to final_O
jpayne@1	671 potenial_new_gene=y
jpayne@1	672 #print(potenial_new_gene)
jpayne@1	673 break
jpayne@1	674 if potenial_new_gene!="":
jpayne@1	675 print("two differnt genes in same contig, fix it for O antigen")
jpayne@1	676 print(potenial_new_gene[:3])
jpayne@4	677 pointer=0
jpayne@4	678 for y in final_O:
jpayne@4	679 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
jpayne@4	680 pointer=1
jpayne@4	681 if pointer!=1:
jpayne@4	682 final_O.append(potenial_new_gene)
jpayne@1	683 ### end of the two genes on same contig test
jpayne@1	684 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
jpayne@1	685 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
jpayne@1	686 #print "$$$No Otype, due to no hit"#may need to be changed
jpayne@1	687 O_choice="-"
jpayne@1	688 else:
jpayne@7	689 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
jpayne@1	690 O_list=[]
jpayne@1	691 O_list_less_contamination=[]
jpayne@1	692 for x in final_O:
jpayne@1	693 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
jpayne@1	694 O_list.append(x[0].split("__")[0])
jpayne@1	695 O_nodes_list.append(x[0].split("___")[1])
jpayne@7	696 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
jpayne@1	697 O_list_less_contamination.append(x[0].split("__")[0])
jpayne@1	698 ### special test for O9,46 and O3,10 family
jpayne@1	699 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1	700 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	701 O_choice="O-9,46"
jpayne@1	702 #print "$$$Most possilble Otype: O-9,46"
jpayne@1	703 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
jpayne@1	704 O_choice="O-9,46,27"
jpayne@1	705 #print "$$$Most possilble Otype: O-9,46,27"
jpayne@1	706 else:
jpayne@1	707 O_choice="O-9"#next, detect O9 vs O2?
jpayne@1	708 O2=0
jpayne@1	709 O9=0
jpayne@1	710 for z in special_genes:
jpayne@1	711 if "tyr-O-9" in z:
jpayne@1	712 O9=special_genes[z]
jpayne@1	713 elif "tyr-O-2" in z:
jpayne@1	714 O2=special_genes[z]
jpayne@1	715 if O2>O9:
jpayne@1	716 O_choice="O-2"
jpayne@1	717 elif O2<O9:
jpayne@1	718 pass
jpayne@1	719 else:
jpayne@1	720 pass
jpayne@1	721 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
jpayne@1	722 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	723 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1	724 O_choice="O-3,10"
jpayne@1	725 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
jpayne@1	726 else:
jpayne@1	727 O_choice="O-1,3,19"
jpayne@1	728 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
jpayne@1	729 ### end of special test for O9,46 and O3,10 family
jpayne@1	730 else:
jpayne@1	731 try:
jpayne@1	732 max_score=0
jpayne@1	733 for x in final_O:
jpayne@7	734 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage0.15:#use x[2],08172018, the "coverage identity = cover_length identity"; also meet coverage threshold
jpayne@1	735 max_score=x[2]#change from x[-1] to x[2],08172018
jpayne@1	736 O_choice=x[0].split("_")[0]
jpayne@1	737 if O_choice=="O-1,3,19":
jpayne@1	738 O_choice=final_O[1][0].split("_")[0]
jpayne@1	739 #print "$$$Most possilble Otype: ",O_choice
jpayne@1	740 except:
jpayne@1	741 pass
jpayne@1	742 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
jpayne@1	743 #print "O:",O_choice,O_nodes_list
jpayne@1	744 Otypes=[]
jpayne@1	745 for x in O_list:
jpayne@1	746 if x!="O-1,3,19_not_in_3,10":
jpayne@1	747 if "O-9,46_" not in x:
jpayne@1	748 Otypes.append(x.split("_")[0])
jpayne@1	749 else:
jpayne@1	750 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
jpayne@1	751 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
jpayne@1	752 Otypes_uniq,Otypes_fre=Uniq(Otypes)
jpayne@1	753 contamination_O=""
jpayne@1	754 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
jpayne@1	755 if len(Otypes_uniq)>2:
jpayne@1	756 contamination_O="potential contamination from O antigen signals"
jpayne@1	757 else:
jpayne@1	758 if len(Otypes_uniq)>1:
jpayne@1	759 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1	760 contamination_O=""
jpayne@1	761 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1	762 contamination_O=""
jpayne@1	763 else:
jpayne@1	764 contamination_O="potential contamination from O antigen signals"
jpayne@4	765 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
jpayne@1	766 ### End of SeqSero2 allele prediction and output
jpayne@1	767
jpayne@1	768 def get_input_files(make_dir,input_file,data_type,dirpath):
jpayne@1	769 #tell input files from datatype
jpayne@1	770 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
jpayne@1	771 for_fq=""
jpayne@1	772 rev_fq=""
jpayne@1	773 old_dir = os.getcwd()
jpayne@1	774 os.chdir(make_dir)
jpayne@1	775 if data_type=="1":
jpayne@1	776 input_file=input_file[0].split("/")[-1]
jpayne@1	777 if input_file.endswith(".sra"):
jpayne@1	778 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1	779 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1	780 rev_fq=input_file.replace(".sra","_2.fastq")
jpayne@1	781 else:
jpayne@1	782 core_id=input_file.split(".fastq")[0].split(".fq")[0]
jpayne@1	783 for_fq=core_id+"_1.fastq"
jpayne@1	784 rev_fq=core_id+"_2.fastq"
jpayne@1	785 if input_file.endswith(".gz"):
jpayne@1	786 subprocess.check_output("gzip -dc "+input_file+" \| "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1	787 else:
jpayne@1	788 subprocess.check_output("cat "+input_file+" \| "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1	789 elif data_type=="2":
jpayne@1	790 for_fq=input_file[0].split("/")[-1]
jpayne@1	791 rev_fq=input_file[1].split("/")[-1]
jpayne@1	792 elif data_type=="3":
jpayne@1	793 input_file=input_file[0].split("/")[-1]
jpayne@1	794 if input_file.endswith(".sra"):
jpayne@1	795 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1	796 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1	797 else:
jpayne@1	798 for_fq=input_file
jpayne@1	799 elif data_type in ["4","5","6"]:
jpayne@1	800 for_fq=input_file[0].split("/")[-1]
jpayne@1	801 os.chdir(old_dir)
jpayne@1	802 return for_fq,rev_fq
jpayne@1	803
jpayne@4	804 def predict_O_and_H_types(Final_list,Final_list_passed,new_fasta):
jpayne@1	805 #get O and H types from Final_list from blast parsing; allele mode
jpayne@4	806 from Bio import SeqIO
jpayne@1	807 fliC_choice="-"
jpayne@1	808 fljB_choice="-"
jpayne@1	809 fliC_contig="NA"
jpayne@1	810 fljB_contig="NA"
jpayne@1	811 fliC_region=set([0])
jpayne@1	812 fljB_region=set([0,])
jpayne@7	813 fliC_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
jpayne@7	814 fljB_length=0 #can be changed to coverage in future; in 03292019, changed to ailgned length
jpayne@1	815 O_choice="-"#no need to decide O contig for now, should be only one
jpayne@4	816 O_choice,O_nodes,special_gene_list,O_nodes_roles,contamination_O,Otypes_uniq=decide_O_type_and_get_special_genes(Final_list,Final_list_passed)#decide the O antigen type and also return special-gene-list for further identification
jpayne@1	817 O_choice=O_choice.split("-")[-1].strip()
jpayne@1	818 if (O_choice=="1,3,19" and len(O_nodes_roles)==1 and "1,3,19" in O_nodes_roles[0][0]) or O_choice=="":
jpayne@1	819 O_choice="-"
jpayne@1	820 H_contig_roles=decide_contig_roles_for_H_antigen(Final_list,Final_list_passed)#decide the H antigen contig is fliC or fljB
jpayne@7	821 #add alignment locations, used for further selection, 03312019
jpayne@7	822 for i in range(len(H_contig_roles)):
jpayne@7	823 x=H_contig_roles[i]
jpayne@7	824 for y in Final_list_passed:
jpayne@7	825 if x[1] in y[0] and y[0].startswith(x[0]):
jpayne@7	826 H_contig_roles[i]+=H_contig_roles[i]+(y[-1],)
jpayne@7	827 break
jpayne@1	828 log_file=open("SeqSero_log.txt","a")
jpayne@4	829 extract_file=open("Extracted_antigen_alleles.fasta","a")
jpayne@4	830 handle_fasta=list(SeqIO.parse(new_fasta,"fasta"))
jpayne@4	831
jpayne@4	832 #print("O_contigs:")
jpayne@1	833 log_file.write("O_contigs:\n")
jpayne@4	834 extract_file.write("#Sequences with antigen signals (if the micro-assembled contig only covers the flanking region, it will not be used for contamination analysis)\n")
jpayne@4	835 extract_file.write("#O_contigs:\n")
jpayne@1	836 for x in O_nodes_roles:
jpayne@1	837 if "O-1,3,19_not_in_3,10" not in x[0]:#O-1,3,19_not_in_3,10 is just a small size marker
jpayne@4	838 #print(x[0].split("___")[-1],x[0].split("__")[0],"blast score:",x[1],"identity%:",str(round(x[2]*100,2))+"%",str(min(x[-1]))+" to "+str(max(x[-1])))
jpayne@4	839 log_file.write(x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n")
jpayne@4	840 title=">"+x[0].split("___")[-1]+" "+x[0].split("__")[0]+"; "+"blast score: "+str(x[1])+" identity%: "+str(round(x[2]*100,2))+"%; alignment from "+str(min(x[-1]))+" to "+str(max(x[-1]))+" of antigen\n"
jpayne@4	841 seqs=""
jpayne@4	842 for z in handle_fasta:
jpayne@4	843 if x[0].split("___")[-1]==z.description:
jpayne@4	844 seqs=str(z.seq)
jpayne@4	845 extract_file.write(title+seqs+"\n")
jpayne@1	846 if len(H_contig_roles)!=0:
jpayne@7	847 highest_H_coverage=max([float(x[1].split("_cov_")[-1].split("_")[0]) for x in H_contig_roles]) #less than highest*0.1 would be regarded as contamination and noises, they will still be considered in contamination detection and logs, but not used as final serotype output
jpayne@1	848 else:
jpayne@1	849 highest_H_coverage=0
jpayne@1	850 for x in H_contig_roles:
jpayne@1	851 #if multiple choices, temporately select the one with longest length for now, will revise in further change
jpayne@7	852 if "fliC" == x[0] and len(x[-1])>=fliC_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:#remember to avoid the effect of O-type contig, so should not in O_node list
jpayne@1	853 fliC_contig=x[1]
jpayne@7	854 fliC_length=len(x[-1])
jpayne@7	855 elif "fljB" == x[0] and len(x[-1])>=fljB_length and x[1] not in O_nodes and float(x[1].split("_cov_")[-1].split("_")[0])>highest_H_coverage*0.13:
jpayne@1	856 fljB_contig=x[1]
jpayne@7	857 fljB_length=len(x[-1])
jpayne@1	858 for x in Final_list_passed:
jpayne@1	859 if fliC_choice=="-" and "fliC_" in x[0] and fliC_contig in x[0]:
jpayne@1	860 fliC_choice=x[0].split("_")[1]
jpayne@1	861 elif fljB_choice=="-" and "fljB_" in x[0] and fljB_contig in x[0]:
jpayne@1	862 fljB_choice=x[0].split("_")[1]
jpayne@1	863 elif fliC_choice!="-" and fljB_choice!="-":
jpayne@1	864 break
jpayne@1	865 #now remove contigs not in middle core part
jpayne@1	866 first_allele="NA"
jpayne@1	867 first_allele_percentage=0
jpayne@1	868 for x in Final_list:
jpayne@1	869 if x[0].startswith("fliC") or x[0].startswith("fljB"):
jpayne@1	870 first_allele=x[0].split("__")[0] #used to filter those un-middle contigs
jpayne@1	871 first_allele_percentage=x[2]
jpayne@1	872 break
jpayne@1	873 additional_contigs=[]
jpayne@1	874 for x in Final_list:
jpayne@1	875 if first_allele in x[0]:
jpayne@1	876 if (fliC_contig == x[0].split("___")[-1]):
jpayne@1	877 fliC_region=x[3]
jpayne@1	878 elif fljB_contig!="NA" and (fljB_contig == x[0].split("___")[-1]):
jpayne@1	879 fljB_region=x[3]
jpayne@1	880 else:
jpayne@1	881 if x[1]*1.1>int(x[0].split("___")[1].split("_")[3]):#loose threshold by multiplying 1.1
jpayne@1	882 additional_contigs.append(x)
jpayne@1	883 #else:
jpayne@1	884 #print x[:3]
jpayne@1	885 #we can just use the fljB region (or fliC depends on size), no matter set() or contain a large locations (without middle part); however, if none of them is fully assembled, use 500 and 1200 as conservative cut-off
jpayne@1	886 if first_allele_percentage>0.9:
jpayne@1	887 if len(fliC_region)>len(fljB_region) and (max(fljB_region)-min(fljB_region))>1000:
jpayne@1	888 target_region=fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region)))) #fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
jpayne@1	889 elif len(fliC_region)<len(fljB_region) and (max(fliC_region)-min(fliC_region))>1000:
jpayne@1	890 target_region=fliC_region\|(fljB_region-set(range(min(fliC_region),max(fliC_region)))) #fljB_region\|(fliC_region-set(range(min(fljB_region),max(fljB_region))))
jpayne@1	891 else:
jpayne@1	892 target_region=set()#doesn't do anything
jpayne@1	893 else:
jpayne@1	894 target_region=set()#doesn't do anything
jpayne@1	895 #print(target_region)
jpayne@1	896 #print(additional_contigs)
jpayne@1	897 target_region2=set(list(range(0,525))+list(range(1200,1700)))#I found to use 500 to 1200 as special region would be best
jpayne@1	898 target_region=target_region2\|target_region
jpayne@1	899 for x in additional_contigs:
jpayne@1	900 removal=0
jpayne@1	901 contig_length=int(x[0].split("___")[1].split("length_")[-1].split("_")[0])
jpayne@1	902 if fljB_contig not in x[0] and fliC_contig not in x[0] and len(target_region&x[3])/float(len(x[3]))>0.65 and contig_length0.5<len(x[3])<contig_length1.5: #consider length and alignment length for now, but very loose,0.5 and 1.5 as cut-off
jpayne@1	903 removal=1
jpayne@1	904 else:
jpayne@1	905 if first_allele_percentage > 0.9 and float(x[0].split("__")[1].split("___")[0])*x[2]/len(x[-1])>0.96:#if high similiarity with middle part of first allele (first allele >0.9, already cover middle part)
jpayne@1	906 removal=1
jpayne@1	907 else:
jpayne@1	908 pass
jpayne@1	909 if removal==1:
jpayne@1	910 for y in H_contig_roles:
jpayne@1	911 if y[1] in x[0]:
jpayne@1	912 H_contig_roles.remove(y)
jpayne@1	913 else:
jpayne@1	914 pass
jpayne@1	915 #print(x[:3],contig_length,len(target_region&x[3])/float(len(x[3])),contig_length0.5,len(x[3]),contig_length1.5)
jpayne@1	916 #end of removing none-middle contigs
jpayne@4	917 #print("H_contigs:")
jpayne@1	918 log_file.write("H_contigs:\n")
jpayne@4	919 extract_file.write("#H_contigs:\n")
jpayne@1	920 H_contig_stat=[]
jpayne@1	921 H1_cont_stat={}
jpayne@1	922 H2_cont_stat={}
jpayne@1	923 for i in range(len(H_contig_roles)):
jpayne@1	924 x=H_contig_roles[i]
jpayne@1	925 a=0
jpayne@1	926 for y in Final_list_passed:
jpayne@1	927 if x[1] in y[0] and y[0].startswith(x[0]):
jpayne@1	928 if "first" in y[0] or "last" in y[0]: #this is the final filter to decide it's fliC or fljB, if can't pass, then can't decide
jpayne@1	929 for y in Final_list_passed: #it's impossible to has the "first" and "last" allele as prediction, so re-do it
jpayne@1	930 if x[1] in y[0]:#it's very possible to be third phase allele, so no need to make it must be fliC or fljB
jpayne@4	931 #print(x[1],"can't_decide_fliC_or_fljB",y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
jpayne@4	932 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
jpayne@1	933 H_contig_roles[i]="can't decide fliC or fljB, may be third phase"
jpayne@4	934 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antiten\n"
jpayne@4	935 seqs=""
jpayne@4	936 for z in handle_fasta:
jpayne@4	937 if x[1]==z.description:
jpayne@4	938 seqs=str(z.seq)
jpayne@4	939 extract_file.write(title+seqs+"\n")
jpayne@1	940 break
jpayne@1	941 else:
jpayne@4	942 #print(x[1],x[0],y[0].split("_")[1],"blast_score:",y[1],"identity%:",str(round(y[2]*100,2))+"%",str(min(y[-1]))+" to "+str(max(y[-1])))
jpayne@4	943 log_file.write(x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n")
jpayne@4	944 title=">"+x[1]+" "+x[0]+" "+y[0].split("_")[1]+"; "+"blast score: "+str(y[1])+" identity%: "+str(round(y[2]*100,2))+"%; alignment from "+str(min(y[-1]))+" to "+str(max(y[-1]))+" of antigen\n"
jpayne@4	945 seqs=""
jpayne@4	946 for z in handle_fasta:
jpayne@4	947 if x[1]==z.description:
jpayne@4	948 seqs=str(z.seq)
jpayne@4	949 extract_file.write(title+seqs+"\n")
jpayne@1	950 if x[0]=="fliC":
jpayne@1	951 if y[0].split("_")[1] not in H1_cont_stat:
jpayne@1	952 H1_cont_stat[y[0].split("_")[1]]=y[2]
jpayne@1	953 else:
jpayne@1	954 H1_cont_stat[y[0].split("_")[1]]+=y[2]
jpayne@1	955 if x[0]=="fljB":
jpayne@1	956 if y[0].split("_")[1] not in H2_cont_stat:
jpayne@1	957 H2_cont_stat[y[0].split("_")[1]]=y[2]
jpayne@1	958 else:
jpayne@1	959 H2_cont_stat[y[0].split("_")[1]]+=y[2]
jpayne@1	960 break
jpayne@1	961 #detect contaminations
jpayne@1	962 #print(H1_cont_stat)
jpayne@1	963 #print(H2_cont_stat)
jpayne@1	964 H1_cont_stat_list=[x for x in H1_cont_stat if H1_cont_stat[x]>0.2]
jpayne@1	965 H2_cont_stat_list=[x for x in H2_cont_stat if H2_cont_stat[x]>0.2]
jpayne@1	966 contamination_H=""
jpayne@1	967 if len(H1_cont_stat_list)>1 or len(H2_cont_stat_list)>1:
jpayne@1	968 contamination_H="potential contamination from H antigen signals"
jpayne@1	969 elif len(H2_cont_stat_list)==1 and fljB_contig=="NA":
jpayne@1	970 contamination_H="potential contamination from H antigen signals, uncommon weak fljB signals detected"
jpayne@4	971 #get additional antigens
jpayne@4	972 """
jpayne@4	973 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@4	974 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
jpayne@4	975 O_choice="O-9,46"
jpayne@4	976 #print "$$$Most possilble Otype: O-9,46"
jpayne@4	977 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
jpayne@4	978 O_choice="O-9,46,27"
jpayne@4	979 #print "$$$Most possilble Otype: O-9,46,27"
jpayne@4	980 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@4	981 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
jpayne@4	982 O_choice="O-3,10"
jpayne@4	983 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
jpayne@4	984 else:
jpayne@4	985 O_choice="O-1,3,19"
jpayne@4	986 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
jpayne@4	987 ### end of special test for O9,46 and O3,10 family
jpayne@4	988
jpayne@4	989 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
jpayne@4	990 if len(Otypes_uniq)>2:
jpayne@4	991 contamination_O="potential contamination from O antigen signals"
jpayne@4	992 else:
jpayne@4	993 if len(Otypes_uniq)>1:
jpayne@4	994 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@4	995 contamination_O=""
jpayne@4	996 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@4	997 contamination_O=""
jpayne@4	998 """
jpayne@4	999 additonal_antigents=[]
jpayne@4	1000 #print(contamination_O)
jpayne@4	1001 #print(contamination_H)
jpayne@1	1002 log_file.write(contamination_O+"\n")
jpayne@1	1003 log_file.write(contamination_H+"\n")
jpayne@1	1004 log_file.close()
jpayne@4	1005 return O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list
jpayne@1	1006
jpayne@1	1007 def get_input_K(input_file,lib_dict,data_type,k_size):
jpayne@1	1008 #kmer mode; get input_Ks from dict and data_type
jpayne@1	1009 kmers = []
jpayne@1	1010 for h in lib_dict:
jpayne@1	1011 kmers += lib_dict[h]
jpayne@1	1012 if data_type == '4':
jpayne@1	1013 input_Ks = target_multifasta_kmerizer(input_file, k_size, set(kmers))
jpayne@1	1014 elif data_type == '1' or data_type == '2' or data_type == '3':#set it for now, will change later
jpayne@1	1015 input_Ks = target_read_kmerizer(input_file, k_size, set(kmers))
jpayne@1	1016 elif data_type == '5':#minion_2d_fasta
jpayne@1	1017 input_Ks = minion_fasta_kmerizer(input_file, k_size, set(kmers))
jpayne@1	1018 if data_type == '6':#minion_2d_fastq
jpayne@1	1019 input_Ks = minion_fastq_kmerizer(input_file, k_size, set(kmers))
jpayne@1	1020 return input_Ks
jpayne@1	1021
jpayne@1	1022 def get_kmer_dict(lib_dict,input_Ks):
jpayne@1	1023 #kmer mode; get predicted types
jpayne@1	1024 O_dict = {}
jpayne@1	1025 H_dict = {}
jpayne@1	1026 Special_dict = {}
jpayne@1	1027 for h in lib_dict:
jpayne@1	1028 score = (len(lib_dict[h] & input_Ks) / len(lib_dict[h])) * 100
jpayne@1	1029 if score > 1: # Arbitrary cut-off for similarity score very low but seems necessary to detect O-3,10 in some cases
jpayne@1	1030 if h.startswith('O-') and score > 25:
jpayne@1	1031 O_dict[h] = score
jpayne@1	1032 if h.startswith('fl') and score > 40:
jpayne@1	1033 H_dict[h] = score
jpayne@1	1034 if (h[:2] != 'fl') and (h[:2] != 'O-'):
jpayne@1	1035 Special_dict[h] = score
jpayne@1	1036 return O_dict,H_dict,Special_dict
jpayne@1	1037
jpayne@1	1038 def call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir):
jpayne@1	1039 log_file=open("SeqSero_log.txt","a")
jpayne@1	1040 log_file.write("O_scores:\n")
jpayne@1	1041 #call O:
jpayne@1	1042 highest_O = '-'
jpayne@1	1043 if len(O_dict) == 0:
jpayne@1	1044 pass
jpayne@1	1045 else:
jpayne@1	1046 for x in O_dict:
jpayne@1	1047 log_file.write(x+"\t"+str(O_dict[x])+"\n")
jpayne@1	1048 if ('O-9,46_wbaV__1002' in O_dict and O_dict['O-9,46_wbaV__1002']>70) or ("O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002" in O_dict and O_dict['O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254__1002']>70): # not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1	1049 if 'O-9,46_wzy__1191' in O_dict: # and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	1050 highest_O = "O-9,46"
jpayne@1	1051 elif "O-9,46,27_partial_wzy__1019" in O_dict: # and float(O94627)/float(num_1) > 0.1
jpayne@1	1052 highest_O = "O-9,46,27"
jpayne@1	1053 else:
jpayne@1	1054 highest_O = "O-9" # next, detect O9 vs O2?
jpayne@1	1055 O2 = 0
jpayne@1	1056 O9 = 0
jpayne@1	1057 for z in Special_dict:
jpayne@1	1058 if "tyr-O-9" in z:
jpayne@1	1059 O9 = float(Special_dict[z])
jpayne@1	1060 if "tyr-O-2" in z:
jpayne@1	1061 O2 = float(Special_dict[z])
jpayne@1	1062 if O2 > O9:
jpayne@1	1063 highest_O = "O-2"
jpayne@1	1064 elif ("O-3,10_wzx__1539" in O_dict) and (
jpayne@1	1065 "O-9,46_wzy__1191" in O_dict
jpayne@1	1066 ): # and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1	1067 if "O-3,10_not_in_1,3,19__1519" in O_dict: # and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1	1068 highest_O = "O-3,10"
jpayne@1	1069 else:
jpayne@1	1070 highest_O = "O-1,3,19"
jpayne@1	1071 ### end of special test for O9,46 and O3,10 family
jpayne@1	1072 else:
jpayne@1	1073 try:
jpayne@1	1074 max_score = 0
jpayne@1	1075 for x in O_dict:
jpayne@1	1076 if float(O_dict[x]) >= max_score:
jpayne@1	1077 max_score = float(O_dict[x])
jpayne@1	1078 highest_O = x.split("_")[0]
jpayne@1	1079 if highest_O == "O-1,3,19":
jpayne@1	1080 highest_O = '-'
jpayne@1	1081 max_score = 0
jpayne@1	1082 for x in O_dict:
jpayne@1	1083 if x == 'O-1,3,19_not_in_3,10__130':
jpayne@1	1084 pass
jpayne@1	1085 else:
jpayne@1	1086 if float(O_dict[x]) >= max_score:
jpayne@1	1087 max_score = float(O_dict[x])
jpayne@1	1088 highest_O = x.split("_")[0]
jpayne@1	1089 except:
jpayne@1	1090 pass
jpayne@1	1091 #call_fliC:
jpayne@1	1092 if len(H_dict)!=0:
jpayne@1	1093 highest_H_score_both_BC=H_dict[max(H_dict.keys(), key=(lambda k: H_dict[k]))] #used to detect whether fljB existed or not
jpayne@1	1094 else:
jpayne@1	1095 highest_H_score_both_BC=0
jpayne@1	1096 highest_fliC = '-'
jpayne@1	1097 highest_fliC_raw = '-'
jpayne@1	1098 highest_Score = 0
jpayne@1	1099 log_file.write("\nH_scores:\n")
jpayne@1	1100 for s in H_dict:
jpayne@1	1101 log_file.write(s+"\t"+str(H_dict[s])+"\n")
jpayne@1	1102 if s.startswith('fliC'):
jpayne@1	1103 if float(H_dict[s]) > highest_Score:
jpayne@1	1104 highest_fliC = s.split('_')[1]
jpayne@1	1105 highest_fliC_raw = s
jpayne@1	1106 highest_Score = float(H_dict[s])
jpayne@1	1107 #call_fljB
jpayne@1	1108 highest_fljB = '-'
jpayne@1	1109 highest_fljB_raw = '-'
jpayne@1	1110 highest_Score = 0
jpayne@1	1111 for s in H_dict:
jpayne@1	1112 if s.startswith('fljB'):
jpayne@1	1113 if float(H_dict[s]) > highest_Score and float(H_dict[s]) > highest_H_score_both_BC * 0.65: #fljB is special, so use highest_H_score_both_BC to give a general estimate of coverage, currently 0.65 seems pretty good; the reason use a high (0.65) is some fliC and fljB shared with each other
jpayne@1	1114 highest_fljB = s.split('_')[1]
jpayne@1	1115 highest_fljB_raw = s
jpayne@1	1116 highest_Score = float(H_dict[s])
jpayne@1	1117 log_file.write("\nSpecial_scores:\n")
jpayne@1	1118 for s in Special_dict:
jpayne@1	1119 log_file.write(s+"\t"+str(Special_dict[s])+"\n")
jpayne@1	1120 log_file.close()
jpayne@1	1121 return highest_O,highest_fliC,highest_fljB
jpayne@1	1122
jpayne@1	1123 def get_temp_file_names(for_fq,rev_fq):
jpayne@1	1124 #seqsero2 -a; get temp file names
jpayne@1	1125 sam=for_fq+".sam"
jpayne@1	1126 bam=for_fq+".bam"
jpayne@1	1127 sorted_bam=for_fq+"_sorted.bam"
jpayne@1	1128 mapped_fq1=for_fq+"_mapped.fq"
jpayne@1	1129 mapped_fq2=rev_fq+"_mapped.fq"
jpayne@1	1130 combined_fq=for_fq+"_combined.fq"
jpayne@1	1131 for_sai=for_fq+".sai"
jpayne@1	1132 rev_sai=rev_fq+".sai"
jpayne@1	1133 return sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai
jpayne@1	1134
jpayne@1	1135 def map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode):
jpayne@1	1136 #seqsero2 -a; do mapping and sort
jpayne@1	1137 print("building database...")
jpayne@1	1138 subprocess.check_output("bwa index "+database,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1139 print("mapping...")
jpayne@1	1140 if mapping_mode=="mem":
jpayne@1	1141 subprocess.check_output("bwa mem -k 17 -t "+threads+" "+database+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1142 elif mapping_mode=="sam":
jpayne@1	1143 if fnameB!="":
jpayne@1	1144 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1145 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameB+" > "+rev_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1146 subprocess.check_output("bwa sampe "+database+" "+for_sai+" "+ rev_sai+" "+fnameA+" "+fnameB+" > "+sam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1147 else:
jpayne@1	1148 subprocess.check_output("bwa aln -t "+threads+" "+database+" "+fnameA+" > "+for_sai,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1149 subprocess.check_output("bwa samse "+database+" "+for_sai+" "+for_fq+" > "+sam, stderr=subprocess.STDOUT)
jpayne@1	1150 subprocess.check_output("samtools view -@ "+threads+" -F 4 -Sh "+sam+" > "+bam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1151 ### check the version of samtools then use differnt commands
jpayne@1	1152 samtools_version=subprocess.Popen(["samtools"],stdout=subprocess.PIPE,stderr=subprocess.PIPE)
jpayne@1	1153 out, err = samtools_version.communicate()
jpayne@1	1154 version = str(err).split("ersion:")[1].strip().split(" ")[0].strip()
jpayne@1	1155 print("check samtools version:",version)
jpayne@1	1156 ### end of samtools version check and its analysis
jpayne@1	1157 if LooseVersion(version)<=LooseVersion("1.2"):
jpayne@1	1158 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" "+fnameA+"_sorted",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1159 else:
jpayne@1	1160 subprocess.check_output("samtools sort -@ "+threads+" -n "+bam+" >"+sorted_bam,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1161
jpayne@1	1162 def extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode):
jpayne@1	1163 #seqsero2 -a; extract, assembly and blast
jpayne@1	1164 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+combined_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1165 if fnameB!="":
jpayne@1	1166 subprocess.check_output("bamToFastq -i "+sorted_bam+" -fq "+mapped_fq1+" -fq2 "+mapped_fq2 ,shell=True, stderr=subprocess.STDOUT)#2> /dev/null if want no output
jpayne@1	1167 else:
jpayne@1	1168 pass
jpayne@1	1169 outdir=current_time+"_temp"
jpayne@1	1170 print("assembling...")
jpayne@1	1171 if int(threads)>4:
jpayne@1	1172 t="4"
jpayne@1	1173 else:
jpayne@1	1174 t=threads
jpayne@12	1175 new_fasta = None
jpayne@7	1176 if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-"
jpayne@1	1177 if fnameB!="":
jpayne@1	1178 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1179 else:
jpayne@1	1180 subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1181 new_fasta=fnameA+"_"+database+"_"+mapping_mode+".fasta"
jpayne@1	1182 subprocess.check_output("mv "+outdir+"/contigs.fasta "+new_fasta+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1183 #os.system("mv "+outdir+"/scaffolds.fasta "+new_fasta+ " 2> /dev/null") contigs.fasta
jpayne@1	1184 subprocess.check_output("rm -rf "+outdir+ " 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1185 print("blasting...","\n")
jpayne@1	1186 xmlfile="blasted_output.xml"#fnameA+"-extracted_vs_"+database+"_"+mapping_mode+".xml"
jpayne@1	1187 subprocess.check_output('makeblastdb -in '+new_fasta+' -out '+new_fasta+'_db '+'-dbtype nucl',shell=True, stderr=subprocess.STDOUT) #temp.txt is to forbid the blast result interrupt the output of our program###1/27/2015
jpayne@1	1188 subprocess.check_output("blastn -query "+database+" -db "+new_fasta+"_db -out "+xmlfile+" -outfmt 5",shell=True, stderr=subprocess.STDOUT)###1/27/2015; 08272018, remove "-word_size 10"
jpayne@1	1189 else:
jpayne@1	1190 xmlfile="NA"
jpayne@4	1191 return xmlfile,new_fasta
jpayne@1	1192
jpayne@1	1193 def judge_subspecies(fnameA):
jpayne@1	1194 #seqsero2 -a; judge subspecies on just forward raw reads fastq
jpayne@12	1195 salmID_output=subprocess.check_output(os.path.join(dirpath, "SalmID/SalmID.py") + ' -i ' +fnameA, shell=True, stderr=subprocess.STDOUT)
jpayne@1	1196 #out, err = salmID_output.communicate()
jpayne@1	1197 #out=out.decode("utf-8")
jpayne@1	1198 out = salmID_output.decode("utf-8")
jpayne@1	1199 file=open("data_log.txt","a")
jpayne@1	1200 file.write(out)
jpayne@1	1201 file.close()
jpayne@1	1202 salm_species_scores=out.split("\n")[1].split("\t")[6:]
jpayne@1	1203 salm_species_results=out.split("\n")[0].split("\t")[6:]
jpayne@1	1204 max_score=0
jpayne@1	1205 max_score_index=1 #default is 1, means "I"
jpayne@1	1206 for i in range(len(salm_species_scores)):
jpayne@1	1207 if max_score<float(salm_species_scores[i]):
jpayne@1	1208 max_score=float(salm_species_scores[i])
jpayne@1	1209 max_score_index=i
jpayne@1	1210 prediction=salm_species_results[max_score_index].split(".")[1].strip().split(" ")[0]
jpayne@1	1211 if float(out.split("\n")[1].split("\t")[4]) > float(out.split("\n")[1].split("\t")[5]): #bongori and enterica compare
jpayne@1	1212 prediction="bongori" #if not, the prediction would always be enterica, since they are located in the later part
jpayne@1	1213 if max_score<10:
jpayne@1	1214 prediction="-"
jpayne@1	1215 return prediction
jpayne@1	1216
jpayne@1	1217 def judge_subspecies_Kmer(Special_dict):
jpayne@1	1218 #seqsero2 -k;
jpayne@1	1219 max_score=0
jpayne@1	1220 prediction="-" #default should be I
jpayne@1	1221 for x in Special_dict:
jpayne@1	1222 if "mer" in x:
jpayne@1	1223 if max_score<float(Special_dict[x]):
jpayne@1	1224 max_score=float(Special_dict[x])
jpayne@1	1225 prediction=x.split("_")[-1].strip()
jpayne@1	1226 if x.split("_")[-1].strip()=="bongori" and float(Special_dict[x])>95:#if bongori already, then no need to test enterica
jpayne@1	1227 prediction="bongori"
jpayne@1	1228 break
jpayne@1	1229 return prediction
jpayne@1	1230
jpayne@1	1231 def main():
jpayne@1	1232 #combine SeqSeroK and SeqSero2, also with SalmID
jpayne@1	1233 args = parse_args()
jpayne@1	1234 input_file = args.i
jpayne@1	1235 data_type = args.t
jpayne@1	1236 analysis_mode = args.m
jpayne@1	1237 mapping_mode=args.b
jpayne@1	1238 threads=args.p
jpayne@1	1239 make_dir=args.d
jpayne@1	1240 clean_mode=args.c
jpayne@1	1241 k_size=27 #will change for bug fixing
jpayne@1	1242 database="H_and_O_and_specific_genes.fasta"
jpayne@1	1243
jpayne@1	1244 if len(sys.argv)==1:
jpayne@1	1245 subprocess.check_output(dirpath+"/SeqSero2_package.py -h",shell=True, stderr=subprocess.STDOUT)#change name of python file
jpayne@1	1246 else:
jpayne@1	1247 request_id = time.strftime("%m_%d_%Y_%H_%M_%S", time.localtime())
jpayne@1	1248 request_id += str(random.randint(1, 10000000))
jpayne@1	1249 if make_dir is None:
jpayne@1	1250 make_dir="SeqSero_result_"+request_id
jpayne@1	1251 if os.path.isdir(make_dir):
jpayne@1	1252 pass
jpayne@1	1253 else:
jpayne@1	1254 subprocess.check_output(["mkdir",make_dir])
jpayne@1	1255 #subprocess.check_output("cp "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1256 subprocess.check_output("ln -srf "+dirpath+"/"+database+" "+" ".join(input_file)+" "+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1257 ############################begin the real analysis
jpayne@1	1258 if analysis_mode=="a":
jpayne@1	1259 if data_type in ["1","2","3"]:#use allele mode
jpayne@12	1260 H1_cont_stat_list = H2_cont_stat_list = []
jpayne@1	1261 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
jpayne@1	1262 os.chdir(make_dir)
jpayne@1	1263 ###add a function to tell input files
jpayne@1	1264 fnameA=for_fq.split("/")[-1]
jpayne@1	1265 fnameB=rev_fq.split("/")[-1]
jpayne@1	1266 current_time=time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
jpayne@1	1267 sam,bam,sorted_bam,mapped_fq1,mapped_fq2,combined_fq,for_sai,rev_sai=get_temp_file_names(fnameA,fnameB) #get temp files id
jpayne@1	1268 map_and_sort(threads,database,fnameA,fnameB,sam,bam,for_sai,rev_sai,sorted_bam,mapping_mode) #do mapping and sort
jpayne@4	1269 xmlfile,new_fasta=extract_mapped_reads_and_do_assembly_and_blast(current_time,sorted_bam,combined_fq,mapped_fq1,mapped_fq2,threads,fnameA,fnameB,database,mapping_mode) #extract the mapped reads and do micro assembly and blast
jpayne@1	1270 if xmlfile=="NA":
jpayne@1	1271 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H=("-","-","-",[],"","")
jpayne@1	1272 else:
jpayne@1	1273 Final_list=xml_parse_score_comparision_seqsero(xmlfile) #analyze xml and get parsed results
jpayne@1	1274 file=open("data_log.txt","a")
jpayne@1	1275 for x in Final_list:
jpayne@1	1276 file.write("\t".join(str(y) for y in x)+"\n")
jpayne@1	1277 file.close()
jpayne@7	1278 Final_list_passed=[x for x in Final_list if float(x[0].split("_cov_")[1].split("_")[0])>=0.9 and (x[1]>=int(x[0].split("__")[1]) or x[1]>=int(x[0].split("___")[1].split("_")[3]) or x[1]>1000)]
jpayne@4	1279 O_choice,fliC_choice,fljB_choice,special_gene_list,contamination_O,contamination_H,Otypes_uniq,H1_cont_stat_list,H2_cont_stat_list=predict_O_and_H_types(Final_list,Final_list_passed,new_fasta) #predict O, fliC and fljB
jpayne@1	1280 subspecies=judge_subspecies(fnameA) #predict subspecies
jpayne@1	1281 ###output
jpayne@1	1282 predict_form,predict_sero,star,star_line,claim=seqsero_from_formula_to_serotypes(O_choice,fliC_choice,fljB_choice,special_gene_list,subspecies)
jpayne@7	1283 claim="" #04132019, disable claim for new report requirement
jpayne@1	1284 contamination_report=""
jpayne@4	1285 H_list=["fliC_"+x for x in H1_cont_stat_list if len(x)>0]+["fljB_"+x for x in H2_cont_stat_list if len(x)>0]
jpayne@1	1286 if contamination_O!="" and contamination_H=="":
jpayne@4	1287 contamination_report="#Potential inter-serotype contamination detected from O antigen signals. All O-antigens detected:"+"\t".join(Otypes_uniq)+"."
jpayne@1	1288 elif contamination_O=="" and contamination_H!="":
jpayne@4	1289 contamination_report="#Potential inter-serotype contamination detected or potential thrid H phase from H antigen signals. All H-antigens detected:"+"\t".join(H_list)+"."
jpayne@1	1290 elif contamination_O!="" and contamination_H!="":
jpayne@4	1291 contamination_report="#Potential inter-serotype contamination detected from both O and H antigen signals.All O-antigens detected:"+"\t".join(Otypes_uniq)+". All H-antigens detected:"+"\t".join(H_list)+"."
jpayne@4	1292 if contamination_report!="":
jpayne@7	1293 #contamination_report="potential inter-serotype contamination detected (please refer below antigen signal report for details)." #above contamination_reports are for back-up and bug fixing #web-based mode need to be re-used, 04132019
jpayne@7	1294 contamination_report="Co-existence of multiple serotypes detected, indicating potential inter-serotype contamination. See 'Extracted_antigen_alleles.fasta' for detected serotype determinant alleles."
jpayne@7	1295 #claim="\n"+open("Extracted_antigen_alleles.fasta","r").read()#used to store H and O antigen sequeences #04132019, need to change if using web-version
jpayne@7	1296 if contamination_report+star_line+claim=="": #0413, new output style
jpayne@7	1297 note=""
jpayne@7	1298 else:
jpayne@7	1299 note="Note:"
jpayne@1	1300 if clean_mode:
jpayne@1	1301 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1302 make_dir="none-output-directory due to '-c' flag"
jpayne@1	1303 else:
jpayne@1	1304 #new_file=open("Seqsero_result.txt","w")
jpayne@1	1305 if O_choice=="":
jpayne@1	1306 O_choice="-"
jpayne@1	1307 result = OrderedDict(
jpayne@1	1308 sample_name = input_file[0],
jpayne@1	1309 O_antigen_prediction = O_choice,
jpayne@1	1310 H1_antigen_prediction = fliC_choice,
jpayne@1	1311 H2_antigen_prediction = fljB_choice,
jpayne@1	1312 predicted_antigenic_profile = predict_form,
jpayne@1	1313 predicted_subspecies = subspecies,
jpayne@7	1314 predicted_serotype = "{}{}".format(predict_sero, star),
jpayne@1	1315 note=claim.replace('\n','')
jpayne@1	1316 )
jpayne@9	1317 result['*'] = star_line.replace('\t', ' ').replace('\n','')
jpayne@1	1318 with open("Seqsero_result.tsv","w") as new_file:
jpayne@1	1319 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1	1320 #new_file.close()
jpayne@1	1321 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
jpayne@1	1322 wrtr.writeheader()
jpayne@1	1323 wrtr.writerow(result)
jpayne@1	1324
jpayne@1	1325
jpayne@1	1326 # new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
jpayne@1	1327 # new_file.close()
jpayne@1	1328 #subprocess.check_output("cat Seqsero_result.txt",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1329 #subprocess.call("rm H_and_O_and_specific_genes.fasta* .sra .bam .sam .fastq .gz .fq temp.txt .xml "+fnameA+"_db* 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1330 subprocess.call("rm H_and_O_and_specific_genes.fasta* .sra .bam .sam .fastq .gz .fq temp.txt "+fnameA+"_db 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1331 print("Output_directory:"+make_dir+"\nInput files:\t"+for_fq+" "+rev_fq+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+fliC_choice+"\n"+"H2 antigen prediction(fljB):\t"+fljB_choice+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+contamination_report+star+star_line+claim+"\n")#+##
jpayne@1	1332 else:
jpayne@1	1333 print("Allele modes only support raw reads datatype, i.e. '-t 1 or 2 or 3'; please use '-m k'")
jpayne@1	1334 elif analysis_mode=="k":
jpayne@1	1335 ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1	1336 #output_mode = args.mode
jpayne@1	1337 for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath)
jpayne@1	1338 input_file = for_fq #-k will just use forward because not all reads were used
jpayne@1	1339 os.chdir(make_dir)
jpayne@1	1340 f = open(ex_dir + '/antigens.pickle', 'rb')
jpayne@1	1341 lib_dict = pickle.load(f)
jpayne@1	1342 f.close
jpayne@1	1343 input_Ks = get_input_K(input_file,lib_dict,data_type,k_size)
jpayne@1	1344 O_dict,H_dict,Special_dict = get_kmer_dict(lib_dict,input_Ks)
jpayne@1	1345 highest_O,highest_fliC,highest_fljB = call_O_and_H_type(O_dict,H_dict,Special_dict,make_dir)
jpayne@1	1346 subspecies = judge_subspecies_Kmer(Special_dict)
jpayne@1	1347 if subspecies=="IIb" or subspecies=="IIa":
jpayne@1	1348 subspecies="II"
jpayne@1	1349 predict_form,predict_sero,star,star_line,claim = seqsero_from_formula_to_serotypes(
jpayne@1	1350 highest_O.split('-')[1], highest_fliC, highest_fljB, Special_dict,subspecies)
jpayne@7	1351 claim="" #no claim any more based on new output requirement
jpayne@7	1352 if star_line+claim=="": #0413, new output style
jpayne@7	1353 note=""
jpayne@7	1354 else:
jpayne@7	1355 note="Note:"
jpayne@1	1356 if clean_mode:
jpayne@1	1357 subprocess.check_output("rm -rf ../"+make_dir,shell=True, stderr=subprocess.STDOUT)
jpayne@1	1358 make_dir="none-output-directory due to '-c' flag"
jpayne@1	1359 else:
jpayne@1	1360 if highest_O.split('-')[-1]=="":
jpayne@1	1361 O_choice="-"
jpayne@1	1362 else:
jpayne@1	1363 O_choice=highest_O.split('-')[-1]
jpayne@1	1364 #print("Output_directory:"+make_dir+"\tInput_file:"+input_file+"\tPredicted subpecies:"+subspecies + '\tPredicted antigenic profile:' + predict_form + '\tPredicted serotype(s):' + predict_sero)
jpayne@1	1365 result = OrderedDict(
jpayne@1	1366 sample_name = input_file,
jpayne@1	1367 O_antigen_prediction = O_choice,
jpayne@1	1368 H1_antigen_prediction = highest_fliC,
jpayne@1	1369 H2_antigen_prediction = highest_fljB,
jpayne@1	1370 predicted_antigenic_profile = predict_form,
jpayne@1	1371 predicted_subspecies = subspecies,
jpayne@7	1372 predicted_serotype = "{}{}".format(predict_sero, star),
jpayne@1	1373 note=claim.replace('\n','')
jpayne@1	1374 )
jpayne@9	1375 result['*'] = star_line.replace('\t', ' ').replace('\n','')
jpayne@1	1376 with open("Seqsero_result.tsv","w") as new_file:
jpayne@1	1377 #new_file.write("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1	1378 #new_file.close()
jpayne@1	1379 wrtr = DictWriter(new_file, delimiter='\t', fieldnames=result.keys())
jpayne@1	1380 wrtr.writeheader()
jpayne@1	1381 wrtr.writerow(result)
jpayne@1	1382 subprocess.call("rm .fasta .fastq .gz .fq temp.txt .sra 2> /dev/null",shell=True, stderr=subprocess.STDOUT)
jpayne@1	1383 print("Output_directory:"+make_dir+"\nInput files:\t"+input_file+"\n"+"O antigen prediction:\t"+O_choice+"\n"+"H1 antigen prediction(fliC):\t"+highest_fliC+"\n"+"H2 antigen prediction(fljB):\t"+highest_fljB+"\n"+"Predicted antigenic profile:\t"+predict_form+"\n"+"Predicted subspecies:\t"+subspecies+"\n"+"Predicted serotype(s):\t"+predict_sero+star+"\n"+star+star_line+claim+"\n")#+##
jpayne@1	1384 return 0
jpayne@1	1385
jpayne@1	1386 if __name__ == '__main__':
jpayne@1	1387 try:
jpayne@1	1388 quit(main())
jpayne@1	1389 except subprocess.CalledProcessError as e:
jpayne@1	1390 print(str(e.output))
jpayne@1	1391 print(e)
jpayne@1	1392 quit(e.returncode)

Mercurial > repos > jpayne > seqsero_v2

annotate SeqSero2/SeqSero2_package.py @ 16:3b6d5b60968f tip