annotate SeqSero2/SeqSero2_package.py @ 16:3b6d5b60968f tip

Uploaded
author estrain
date Wed, 01 Mar 2023 13:21:51 -0500
parents 4ac593d4b40f
children
rev   line source
jpayne@1 1 #!/usr/bin/env python3
jpayne@1 2
jpayne@1 3 import sys
jpayne@1 4 import time
jpayne@1 5 import random
jpayne@1 6 import os
jpayne@1 7 import subprocess
jpayne@1 8 import gzip
jpayne@1 9 import io
jpayne@1 10 import pickle
jpayne@1 11 import argparse
jpayne@1 12 import itertools
jpayne@1 13 from distutils.version import LooseVersion
jpayne@1 14
jpayne@1 15 from collections import OrderedDict
jpayne@1 16 from csv import DictWriter
jpayne@1 17
jpayne@1 18 dirpath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
jpayne@1 19
jpayne@1 20 ### SeqSero Kmer
jpayne@1 21 def parse_args():
jpayne@1 22 "Parse the input arguments, use '-h' for help."
jpayne@1 23 parser = argparse.ArgumentParser(usage='SeqSero2_package.py -t <data_type> -m <mode> -i <input_data> [-p <number of threads>] [-b <BWA_algorithm>]\n\nDeveloper: Shaokang Zhang (zskzsk@uga.edu), Hendrik C Den-Bakker (Hendrik.DenBakker@uga.edu) and Xiangyu Deng (xdeng@uga.edu)\n\nContact email: seqsero@gmail.com')#add "-m <data_type>" in future
jpayne@1 24 parser.add_argument("-i",nargs="+",help="<string>: path/to/input_data")
jpayne@1 25 parser.add_argument("-t",choices=['1','2','3','4','5','6'],help="<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)")
jpayne@7 26 parser.add_argument("-b",choices=['sam','mem'],default="mem",help="<string>: mode used for mapping in allele workflow, 'sam'(bwa samse/sampe), 'mem'(bwa mem), default=mem")
jpayne@7 27 parser.add_argument("-p",default="1",help="<int>: threads used for allele workflow, if p>4, only 4 threads will be used for assembly step, default=1")
jpayne@7 28 parser.add_argument("-m",choices=['k','a'],default="a",help="<string>: 'k'(kmer workflow), 'a'(allele workflow), default=a")
jpayne@1 29 parser.add_argument("-d",help="<string>: output directory name, if not set, the output directory would be 'SeqSero_result_'+time stamp+one random number")
jpayne@1 30 parser.add_argument("-c",action="store_true",help="<flag>: if '-c' was flagged, SeqSero2 will use clean mode and only output serotyping prediction, the directory containing log files will be deleted")
jpayne@1 31 return parser.parse_args()
jpayne@1 32
jpayne@1 33 def reverse_complement(sequence):
jpayne@1 34 complement = {
jpayne@1 35 'A': 'T',
jpayne@1 36 'C': 'G',
jpayne@1 37 'G': 'C',
jpayne@1 38 'T': 'A',
jpayne@1 39 'N': 'N',
jpayne@1 40 'M': 'K',
jpayne@1 41 'R': 'Y',
jpayne@1 42 'W': 'W',
jpayne@1 43 'S': 'S',
jpayne@1 44 'Y': 'R',
jpayne@1 45 'K': 'M',
jpayne@1 46 'V': 'B',
jpayne@1 47 'H': 'D',
jpayne@1 48 'D': 'H',
jpayne@1 49 'B': 'V'
jpayne@1 50 }
jpayne@1 51 return "".join(complement[base] for base in reversed(sequence))
jpayne@1 52
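# Illustrative usage (worked example added by the editor, not part of the original file):
# >>> reverse_complement("ATGC")
# 'GCAT'
# IUPAC ambiguity codes (M/R/W/S/Y/K/V/H/D/B) are complemented as well; lowercase or
# unknown characters would raise a KeyError, which is why callers upper-case their input first.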
jpayne@1 53
jpayne@1 54 def createKmerDict_reads(list_of_strings, kmer):
jpayne@1 55 kmer_table = {}
jpayne@1 56 for string in list_of_strings:
jpayne@1 57 sequence = string.strip('\n')
jpayne@1 58 for i in range(len(sequence) - kmer + 1):
jpayne@1 59 new_mer = sequence[i:i + kmer].upper()
jpayne@1 60 new_mer_rc = reverse_complement(new_mer)
jpayne@1 61 if new_mer in kmer_table:
jpayne@1 62 kmer_table[new_mer.upper()] += 1
jpayne@1 63 else:
jpayne@1 64 kmer_table[new_mer.upper()] = 1
jpayne@1 65 if new_mer_rc in kmer_table:
jpayne@1 66 kmer_table[new_mer_rc.upper()] += 1
jpayne@1 67 else:
jpayne@1 68 kmer_table[new_mer_rc.upper()] = 1
jpayne@1 69 return kmer_table
jpayne@1 70
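# Illustrative usage (worked example added by the editor, not part of the original file):
# both strands are counted, so every k-mer and its reverse complement appear in the table.
# >>> createKmerDict_reads(["ATGCA"], 3)
# {'ATG': 1, 'CAT': 1, 'TGC': 2, 'GCA': 2}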
jpayne@1 71
jpayne@1 72 def multifasta_dict(multifasta):
jpayne@1 73 multifasta_list = [
jpayne@1 74 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
jpayne@1 75 ]
jpayne@1 76 headers = [i for i in multifasta_list if i[0] == '>']
jpayne@1 77 multifasta_dict = {}
jpayne@1 78 for h in headers:
jpayne@1 79 start = multifasta_list.index(h)
jpayne@1 80 for element in multifasta_list[start + 1:]:
jpayne@1 81 if element[0] == '>':
jpayne@1 82 break
jpayne@1 83 else:
jpayne@1 84 if h[1:] in multifasta_dict:
jpayne@1 85 multifasta_dict[h[1:]] += element
jpayne@1 86 else:
jpayne@1 87 multifasta_dict[h[1:]] = element
jpayne@1 88 return multifasta_dict
jpayne@1 89
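# Illustrative usage (hypothetical file contents, added by the editor, not part of the original file):
# a FASTA file containing ">seq1\nATG\nCCC\n>seq2\nGGG\n" yields {'seq1': 'ATGCCC', 'seq2': 'GGG'}.
# Note that duplicate headers would collide, because list.index() always returns the first occurrence.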
jpayne@1 90
jpayne@1 91 def multifasta_single_string(multifasta):
jpayne@1 92 multifasta_list = [
jpayne@1 93 line.strip() for line in open(multifasta, 'r')
jpayne@1 94 if (len(line.strip()) > 0) and (line.strip()[0] != '>')
jpayne@1 95 ]
jpayne@1 96 return ''.join(multifasta_list)
jpayne@1 97
jpayne@1 98
jpayne@1 99 def chunk_a_long_sequence(long_sequence, chunk_size=60):
jpayne@1 100 chunk_list = []
jpayne@1 101 steps = len(long_sequence) // chunk_size #how many chunks
jpayne@1 102 for i in range(steps):
jpayne@1 103 chunk_list.append(long_sequence[i * chunk_size:(i + 1) * chunk_size])
jpayne@1 104 chunk_list.append(long_sequence[steps * chunk_size:len(long_sequence)])
jpayne@1 105 return chunk_list
jpayne@1 106
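# Illustrative usage (worked example added by the editor, not part of the original file):
# a 150-base sequence with the default chunk_size=60 gives chunks of 60, 60 and 30 bases;
# when the length is an exact multiple of chunk_size, the final appended chunk is an empty string.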
jpayne@1 107
jpayne@1 108 def target_multifasta_kmerizer(multifasta, k, kmerDict):
jpayne@1 109 forward_length = 300 #if the target is found, extend the window 300 bases upstream
jpayne@1 110 reverse_length = 2200 #if the target is found, extend the window 2200 bases downstream
jpayne@1 111 chunk_size = 60 #the single long sequence is first split into chunks; this controls the chunk size
jpayne@1 112 target_mers = []
jpayne@1 113 long_single_string = multifasta_single_string(multifasta)
jpayne@1 114 multifasta_list = chunk_a_long_sequence(long_single_string, chunk_size)
jpayne@1 115 unit_length = len(multifasta_list[0])
jpayne@1 116 forward_lines = int(forward_length / unit_length) + 1
jpayne@1 117 reverse_lines = int(reverse_length / unit_length) + 1
jpayne@1 118 start_num = 0
jpayne@1 119 end_num = 0
jpayne@1 120 for i in range(len(multifasta_list)):
jpayne@1 121 if i not in range(start_num, end_num): #avoid computational repetition
jpayne@1 122 line = multifasta_list[i]
jpayne@1 123 start = int((len(line) - k) // 2)
jpayne@1 124 s1 = line[start:k + start]
jpayne@1 125 if s1 in kmerDict: #check whether this chunk is a potential target (using its middle k-mer)
jpayne@1 126 if i - forward_lines >= 0:
jpayne@1 127 start_num = i - forward_lines
jpayne@1 128 else:
jpayne@1 129 start_num = 0
jpayne@1 130 if i + reverse_lines <= len(multifasta_list) - 1:
jpayne@1 131 end_num = i + reverse_lines
jpayne@1 132 else:
jpayne@1 133 end_num = len(multifasta_list) - 1
jpayne@1 134 target_list = [
jpayne@1 135 x.strip() for x in multifasta_list[start_num:end_num]
jpayne@1 136 ]
jpayne@1 137 target_line = "".join(target_list)
jpayne@1 138 target_mers += [
jpayne@1 139 k1 for k1 in createKmerDict_reads([str(target_line)], k)
jpayne@1 140 ] ##use k1 to avoid confusing this loop variable (a k-mer) with the k above (the k-mer length)
jpayne@1 141 else:
jpayne@1 142 pass
jpayne@1 143 return set(target_mers)
jpayne@1 144
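# Worked example of the window arithmetic above (editor's note, not part of the original file):
# with chunk_size=60 and forward_length=300, forward_lines = int(300/60)+1 = 6 chunks (~360 bases upstream);
# with reverse_length=2200, reverse_lines = int(2200/60)+1 = 37 chunks (~2220 bases downstream).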
jpayne@1 145
jpayne@1 146 def target_read_kmerizer(file, k, kmerDict):
jpayne@1 147 i = 1
jpayne@1 148 n_reads = 0
jpayne@1 149 total_coverage = 0
jpayne@1 150 target_mers = []
jpayne@1 151 if file.endswith(".gz"):
jpayne@1 152 file_content = io.BufferedReader(gzip.open(file))
jpayne@1 153 else:
jpayne@1 154 file_content = open(file, "r").readlines()
jpayne@1 155 for line in file_content:
jpayne@1 156 start = int((len(line) - k) // 2)
jpayne@1 157 if i % 4 == 2:
jpayne@1 158 if file.endswith(".gz"):
jpayne@1 159 s1 = line[start:k + start].decode()
jpayne@1 160 line = line.decode()
jpayne@1 161 else:
jpayne@1 162 s1 = line[start:k + start]
jpayne@1 163 if s1 in kmerDict: #check whether this read is a potential target (using its middle k-mer)
jpayne@1 164 n_reads += 1
jpayne@1 165 total_coverage += len(line)
jpayne@1 166 target_mers += [
jpayne@1 167 k1 for k1 in createKmerDict_reads([str(line)], k)
jpayne@1 168 ] #use k1 to avoid confusing this loop variable (a k-mer) with the k above (the k-mer length)
jpayne@1 169 i += 1
jpayne@1 170 if total_coverage >= 4000000:
jpayne@1 171 break
jpayne@1 172 return set(target_mers)
jpayne@1 173
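# Editor's note (not part of the original file): FASTQ records are 4 lines each, so the
# i % 4 == 2 test selects only the sequence lines; sampling stops once roughly 4,000,000
# bases of k-mer-matching reads have been accumulated in total_coverage.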
jpayne@1 174
jpayne@1 175 def minion_fasta_kmerizer(file, k, kmerDict):
jpayne@1 176 i = 1
jpayne@1 177 n_reads = 0
jpayne@1 178 total_coverage = 0
jpayne@1 179 target_mers = {}
jpayne@1 180 for line in open(file):
jpayne@1 181 if i % 2 == 0:
jpayne@1 182 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1 183 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1 184 if kmer in target_mers:
jpayne@1 185 target_mers[kmer] += 1
jpayne@1 186 else:
jpayne@1 187 target_mers[kmer] = 1
jpayne@1 188 if rc_kmer in target_mers:
jpayne@1 189 target_mers[rc_kmer] += 1
jpayne@1 190 else:
jpayne@1 191 target_mers[rc_kmer] = 1
jpayne@1 192 i += 1
jpayne@1 193 return set([h for h in target_mers])
jpayne@1 194
jpayne@1 195
jpayne@1 196 def minion_fastq_kmerizer(file, k, kmerDict):
jpayne@1 197 i = 1
jpayne@1 198 n_reads = 0
jpayne@1 199 total_coverage = 0
jpayne@1 200 target_mers = {}
jpayne@1 201 for line in open(file):
jpayne@1 202 if i % 4 == 2:
jpayne@1 203 for kmer, rc_kmer in kmers(line.strip().upper(), k):
jpayne@1 204 if (kmer in kmerDict) or (rc_kmer in kmerDict):
jpayne@1 205 if kmer in target_mers:
jpayne@1 206 target_mers[kmer] += 1
jpayne@1 207 else:
jpayne@1 208 target_mers[kmer] = 1
jpayne@1 209 if rc_kmer in target_mers:
jpayne@1 210 target_mers[rc_kmer] += 1
jpayne@1 211 else:
jpayne@1 212 target_mers[rc_kmer] = 1
jpayne@1 213 i += 1
jpayne@1 214 return set([h for h in target_mers])
jpayne@1 215
jpayne@1 216
jpayne@1 217 def multifasta_single_string2(multifasta):
jpayne@1 218 single_string = ''
jpayne@1 219 with open(multifasta, 'r') as f:
jpayne@1 220 for line in f:
jpayne@1 221 if line.strip()[0] == '>':
jpayne@1 222 pass
jpayne@1 223 else:
jpayne@1 224 single_string += line.strip()
jpayne@1 225 return single_string
jpayne@1 226
jpayne@1 227
jpayne@1 228 def kmers(seq, k):
jpayne@1 229 rev_comp = reverse_complement(seq)
jpayne@1 230 for start in range(1, len(seq) - k + 1):
jpayne@1 231 yield seq[start:start + k], rev_comp[-(start + k):-start]
jpayne@1 232
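# Illustrative example (added by the editor, not part of the original file): for seq="ATGCA", k=3
# the generator yields ('TGC', 'GCA') and ('GCA', 'TGC'), i.e. each k-mer paired with its reverse
# complement taken from the pre-computed rev_comp string; the k-mer at position 0 is skipped
# because the slice rev_comp[-k:-0] would be empty.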
jpayne@1 233
jpayne@1 234 def multifasta_to_kmers_dict(multifasta,k_size):#used to create database kmer set
jpayne@1 235 multi_seq_dict = multifasta_dict(multifasta)
jpayne@1 236 lib_dict = {}
jpayne@1 237 for h in multi_seq_dict:
jpayne@1 238 lib_dict[h] = set(
jpayne@1 239 [k for k in createKmerDict_reads([multi_seq_dict[h]], k_size)])
jpayne@1 240 return lib_dict
jpayne@1 241
jpayne@1 242
jpayne@1 243 def Combine(b, c):
jpayne@1 244 fliC_combinations = []
jpayne@1 245 fliC_combinations.append(",".join(c))
jpayne@1 246 temp_combinations = []
jpayne@1 247 for i in range(len(b)):
jpayne@1 248 for x in itertools.combinations(b, i + 1):
jpayne@1 249 temp_combinations.append(",".join(x))
jpayne@1 250 for x in temp_combinations:
jpayne@1 251 temp = []
jpayne@1 252 for y in c:
jpayne@1 253 temp.append(y)
jpayne@1 254 temp.append(x)
jpayne@1 255 temp = ",".join(temp)
jpayne@1 256 temp = temp.split(",")
jpayne@1 257 temp.sort()
jpayne@1 258 temp = ",".join(temp)
jpayne@1 259 fliC_combinations.append(temp)
jpayne@1 260 return fliC_combinations
jpayne@1 261
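# Illustrative usage (worked example added by the editor, not part of the original file):
# b holds the optional (bracketed) antigens and c the obligatory ones, so
# >>> Combine(["g"], ["f", "t"])
# ['f,t', 'f,g,t']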
jpayne@1 262
jpayne@1 263 def seqsero_from_formula_to_serotypes(Otype, fliC, fljB, special_gene_list,subspecies):
jpayne@1 264 #like test_output_06012017.txt
jpayne@1 265 #can add more variables like sdf-type and sub-species-type in the future (they can be folded into a special-gene-list)
jpayne@4 266 from Initial_Conditions import phase1,phase2,phaseO,sero,subs,remove_list,rename_dict
jpayne@4 267 rename_dict_not_anymore=[rename_dict[x] for x in rename_dict]
jpayne@4 268 rename_dict_all=rename_dict_not_anymore+list(rename_dict) #used to decide whether a serotype has already been merged/renamed
jpayne@1 269 seronames = []
jpayne@1 270 seronames_none_subspecies=[]
jpayne@1 271 for i in range(len(phase1)):
jpayne@1 272 fliC_combine = []
jpayne@1 273 fljB_combine = []
jpayne@1 274 if phaseO[i] == Otype: # no VII in KW, but it's there
jpayne@1 275 ### for fliC, detect every possible combinations to avoid the effect of "["
jpayne@1 276 if phase1[i].count("[") == 0:
jpayne@1 277 fliC_combine.append(phase1[i])
jpayne@1 278 elif phase1[i].count("[") >= 1:
jpayne@1 279 c = []
jpayne@1 280 b = []
jpayne@1 281 if phase1[i][0] == "[" and phase1[i][-1] == "]" and phase1[i].count(
jpayne@1 282 "[") == 1:
jpayne@1 283 content = phase1[i].replace("[", "").replace("]", "")
jpayne@1 284 fliC_combine.append(content)
jpayne@1 285 fliC_combine.append("-")
jpayne@1 286 else:
jpayne@1 287 for x in phase1[i].split(","):
jpayne@1 288 if "[" in x:
jpayne@1 289 b.append(x.replace("[", "").replace("]", ""))
jpayne@1 290 else:
jpayne@1 291 c.append(x)
jpayne@1 292 fliC_combine = Combine(
jpayne@1 293 b, c
jpayne@1 294 ) #Combine will offer every possible combinations of the formula, like f,[g],t: f,t f,g,t
jpayne@1 295 ### end of fliC "[" detect
jpayne@1 296 ### for fljB, detect every possible combinations to avoid the effect of "["
jpayne@1 297 if phase2[i].count("[") == 0:
jpayne@1 298 fljB_combine.append(phase2[i])
jpayne@1 299 elif phase2[i].count("[") >= 1:
jpayne@1 300 d = []
jpayne@1 301 e = []
jpayne@1 302 if phase2[i][0] == "[" and phase2[i][-1] == "]" and phase2[i].count(
jpayne@1 303 "[") == 1:
jpayne@1 304 content = phase2[i].replace("[", "").replace("]", "")
jpayne@1 305 fljB_combine.append(content)
jpayne@1 306 fljB_combine.append("-")
jpayne@1 307 else:
jpayne@1 308 for x in phase2[i].split(","):
jpayne@1 309 if "[" in x:
jpayne@1 310 d.append(x.replace("[", "").replace("]", ""))
jpayne@1 311 else:
jpayne@1 312 e.append(x)
jpayne@1 313 fljB_combine = Combine(d, e)
jpayne@1 314 ### end of fljB "[" detect
jpayne@1 315 new_fliC = fliC.split(
jpayne@1 316 ","
jpayne@1 317 ) #some antigens, e.g. r,[i], do not follow alphabetical order; sorting both sides before comparison avoids missed matches
jpayne@1 318 new_fliC.sort()
jpayne@1 319 new_fliC = ",".join(new_fliC)
jpayne@1 320 new_fljB = fljB.split(",")
jpayne@1 321 new_fljB.sort()
jpayne@1 322 new_fljB = ",".join(new_fljB)
jpayne@1 323 if (new_fliC in fliC_combine
jpayne@1 324 or fliC in fliC_combine) and (new_fljB in fljB_combine
jpayne@1 325 or fljB in fljB_combine):
jpayne@4 326 ######start, remove_list,rename_dict, added on 11/11/2018
jpayne@4 327 if sero[i] not in remove_list:
jpayne@4 328 temp_sero=sero[i]
jpayne@4 329 if temp_sero in rename_dict:
jpayne@4 330 temp_sero=rename_dict[temp_sero] #rename if in the rename list
jpayne@4 331 if temp_sero not in seronames:#the new serotype may already be included; if so, skip it
jpayne@4 332 if subs[i] == subspecies:
jpayne@4 333 seronames.append(temp_sero)
jpayne@4 334 seronames_none_subspecies.append(temp_sero)
jpayne@4 335 else:
jpayne@4 336 pass
jpayne@4 337 else:
jpayne@4 338 pass
jpayne@4 339 ######end, added on 11/11/2018
jpayne@1 340 #analyze seronames
jpayne@1 341 subspecies_pointer=""
jpayne@1 342 if len(seronames) == 0 and len(seronames_none_subspecies)!=0:
jpayne@1 343 seronames=seronames_none_subspecies
jpayne@1 344 subspecies_pointer="1"
jpayne@1 345 if len(seronames) == 0:
jpayne@1 346 seronames = [
jpayne@1 347 "N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)"
jpayne@1 348 ]
jpayne@1 349 star = ""
jpayne@1 350 star_line = ""
jpayne@1 351 if len(seronames) > 1: #there is more than one possible serotype prediction
jpayne@1 352 star = "*"
jpayne@7 353 #changed 04072019
jpayne@7 354 #star_line = "The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
jpayne@1 355 if subspecies_pointer=="1" and len(seronames_none_subspecies)!=0:
jpayne@1 356 star="*"
jpayne@7 357 star_line=" The predicted O and H antigens correspond to serotype '"+(" or ").join(seronames)+"' in the Kauffmann-White scheme. The predicted subspecies by SalmID (github.com/hcdenbakker/SalmID) may not be consistent with subspecies designation in the Kauffmann-White scheme." + star_line
jpayne@7 358 #star_line="The formula with this subspieces prediction can't get a serotype in KW manual, and the serotyping prediction was made without considering it."+star_line
jpayne@1 359 if Otype=="":
jpayne@1 360 Otype="-"
jpayne@1 361 predict_form = Otype + ":" + fliC + ":" + fljB
jpayne@1 362 predict_sero = (" or ").join(seronames)
jpayne@1 363 ###special test for Enteritidis
jpayne@1 364 if predict_form == "9:g,m:-":
jpayne@1 365 sdf = "-"
jpayne@1 366 for x in special_gene_list:
jpayne@1 367 if x.startswith("sdf"):
jpayne@1 368 sdf = "+"
jpayne@7 369 #star_line="Detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
jpayne@7 370 star_line=" sdf gene detected." # ed_SL_04152019: new output format
jpayne@7 371 #predict_form = predict_form + " Sdf prediction:" + sdf
jpayne@7 372 predict_form = predict_form #changed 04072019
jpayne@1 373 if sdf == "-":
jpayne@1 374 star = "*"
jpayne@7 375 #star_line="Didn't detected sdf gene, a marker to differentiate Gallinarum and Enteritidis"
jpayne@7 376 star_line=" sdf gene not detected." # ed_SL_04152019: new output format
jpayne@7 377 #changed in 04072019, for new output
jpayne@7 378 #star_line = "Additional characterization is necessary to assign a serotype to this strain. Commonly circulating strains of serotype Enteritidis are sdf+, although sdf- strains of serotype Enteritidis are known to exist. Serotype Gallinarum is typically sdf- but should be quite rare. Sdf- strains of serotype Enteritidis and serotype Gallinarum can be differentiated by phenotypic profile or genetic criteria.\n"
jpayne@7 379 #predict_sero = "Gallinarum/Enteritidis" #04132019, for new output requirement
jpayne@7 380 predict_sero = "Gallinarum or Enteritidis" # ed_SL_04152019: new output format
jpayne@1 381 ###end of special test for Enteritidis
jpayne@1 382 elif predict_form == "4:i:-":
jpayne@7 383 predict_sero = "4:i:-"
jpayne@1 384 elif predict_form == "4:r:-":
jpayne@7 385 predict_sero = "4:r:-"
jpayne@1 386 elif predict_form == "4:b:-":
jpayne@7 387 predict_sero = "4:b:-"
jpayne@4 388 #elif predict_form == "8:e,h:1,2": #removed after official merge of newport and bardo
jpayne@4 389 #predict_sero = "Newport"
jpayne@4 390 #star = "*"
jpayne@4 391 #star_line = "Serotype Bardo shares the same antigenic profile with Newport, but Bardo is exceedingly rare."
jpayne@7 392 claim = " The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme. New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies. Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.\n"
jpayne@1 393 if "N/A" in predict_sero:
jpayne@1 394 claim = ""
jpayne@1 395 #special test for Typhimurium
jpayne@1 396 if "Typhimurium" in predict_sero or predict_form == "4:i:-":
jpayne@1 397 normal = 0
jpayne@1 398 mutation = 0
jpayne@1 399 for x in special_gene_list:
jpayne@1 400 if "oafA-O-4_full" in x:
jpayne@1 401 normal = float(special_gene_list[x])
jpayne@1 402 elif "oafA-O-4_5-" in x:
jpayne@1 403 mutation = float(special_gene_list[x])
jpayne@1 404 if normal > mutation:
jpayne@1 405 pass
jpayne@1 406 elif normal < mutation:
jpayne@7 407 #predict_sero = predict_sero.strip() + "(O5-)"
jpayne@7 408 predict_sero = predict_sero.strip() #disable special sero for new output requirement, 04132019
jpayne@1 409 star = "*"
jpayne@7 410 #star_line = "Detected the deletion of O5-."
jpayne@7 411 star_line = " Detected a deletion that causes O5- variant of Typhimurium." # ed_SL_04152019: new output format
jpayne@1 412 else:
jpayne@1 413 pass
jpayne@1 414 #special test for Paratyphi B
jpayne@1 415 if "Paratyphi B" in predict_sero or predict_form == "4:b:-":
jpayne@1 416 normal = 0
jpayne@1 417 mutation = 0
jpayne@1 418 for x in special_gene_list:
jpayne@1 419 if "gntR-family-regulatory-protein_dt-positive" in x:
jpayne@1 420 normal = float(special_gene_list[x])
jpayne@1 421 elif "gntR-family-regulatory-protein_dt-negative" in x:
jpayne@1 422 mutation = float(special_gene_list[x])
jpayne@1 423 #print(normal,mutation)
jpayne@1 424 if normal > mutation:
jpayne@7 425 #predict_sero = predict_sero.strip() + "(dt+)" #diable special sero for new output requirement, 04132019
jpayne@7 426 predict_sero = predict_sero.strip()+' var. L(+) tartrate+' if "Paratyphi B" in predict_sero else predict_sero.strip() # ed_SL_04152019: new output format
jpayne@1 427 star = "*"
jpayne@7 428 #star_line = "Didn't detect the SNP for dt- which means this isolate is a Paratyphi B variant L(+) tartrate(+)."
jpayne@7 429 star_line = " The SNP that causes d-Tartrate nonfermentating phenotype was not detected. " # ed_SL_04152019: new output format
jpayne@1 430 elif normal < mutation:
jpayne@7 431 #predict_sero = predict_sero.strip() + "(dt-)" #diable special sero for new output requirement, 04132019
jpayne@7 432 predict_sero = predict_sero.strip()
jpayne@1 433 star = "*"
jpayne@7 434 #star_line = "Detected the SNP for dt- which means this isolate is a systemic pathovar of Paratyphi B."
jpayne@7 435 star_line = " Detected the SNP for d-Tartrate nonfermenting phenotype." # ed_SL_04152019: new output format
jpayne@1 436 else:
jpayne@1 437 star = "*"
jpayne@7 438 star_line = " Failed to detect the SNP for dt-, can't decide it's a Paratyphi B variant L(+) tartrate(+) or not."
jpayne@1 439 #special test for O13,22 and O13,23
jpayne@1 440 if Otype=="13":
jpayne@1 441 #ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1 442 f = open(dirpath + '/special.pickle', 'rb')
jpayne@1 443 special = pickle.load(f)
jpayne@1 444 O22_O23=special['O22_O23']
jpayne@4 445 if predict_sero.split(" or ")[0] in O22_O23[-1] and predict_sero.split(" or ")[0] not in rename_dict_all:#if in rename_dict_all, then it means already merged, no need to analyze
jpayne@1 446 O22_score=0
jpayne@1 447 O23_score=0
jpayne@1 448 for x in special_gene_list:
jpayne@1 449 if "O:22" in x:
jpayne@1 450 O22_score = O22_score+float(special_gene_list[x])
jpayne@1 451 elif "O:23" in x:
jpayne@1 452 O23_score = O23_score+float(special_gene_list[x])
jpayne@1 453 #print(O22_score,O23_score)
jpayne@1 454 for z in O22_O23[0]:
jpayne@1 455 if predict_sero.split(" or ")[0] in z:
jpayne@1 456 if O22_score > O23_score:
jpayne@1 457 star = "*"
jpayne@7 458 #star_line = "Detected O22 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
jpayne@1 459 predict_sero = z[0]
jpayne@1 460 elif O22_score < O23_score:
jpayne@1 461 star = "*"
jpayne@7 462 #star_line = "Detected O23 specific genes to further differenciate '"+predict_sero+"'." #diabled for new output requirement, 04132019
jpayne@1 463 predict_sero = z[1]
jpayne@1 464 else:
jpayne@1 465 star = "*"
jpayne@7 466 #star_line = "Fail to detect O22 and O23 differences." #diabled for new output requirement, 04132019
jpayne@7 467 if " or " in predict_sero:
jpayne@7 468 star_line = star_line + " The predicted serotypes share the same general formula:\t" + Otype + ":" + fliC + ":" + fljB + "\n"
jpayne@1 469 #special test for O6,8
jpayne@4 470 #merge_O68_list=["Blockley","Bovismorbificans","Hadar","Litchfield","Manhattan","Muenchen"] #remove 11/11/2018, because already in merge list
jpayne@4 471 #for x in merge_O68_list:
jpayne@4 472 # if x in predict_sero:
jpayne@4 473 # predict_sero=x
jpayne@4 474 # star=""
jpayne@4 475 # star_line=""
jpayne@1 476 #special test for Montevideo; most of them are monophasic
jpayne@4 477 #if "Montevideo" in predict_sero and "1,2,7" in predict_form: #remove 11/11/2018, because already in merge list
jpayne@4 478 #star="*"
jpayne@4 479 #star_line="Montevideo is almost always monophasic, having an antigen called for the fljB position may be a result of Salmonella-Salmonella contamination."
jpayne@1 480 return predict_form, predict_sero, star, star_line, claim
jpayne@1 481 ### End of SeqSero Kmer part
jpayne@1 482
jpayne@1 483 ### Begin of SeqSero2 allele prediction and output
jpayne@1 484 def xml_parse_score_comparision_seqsero(xmlfile):
jpayne@1 485 #used to do seqsero xml analysis
jpayne@1 486 from Bio.Blast import NCBIXML
jpayne@1 487 handle=open(xmlfile)
jpayne@1 488 handle=NCBIXML.parse(handle)
jpayne@1 489 handle=list(handle)
jpayne@1 490 List=[]
jpayne@1 491 List_score=[]
jpayne@1 492 List_ids=[]
jpayne@1 493 List_query_region=[]
jpayne@1 494 for i in range(len(handle)):
jpayne@1 495 if len(handle[i].alignments)>0:
jpayne@1 496 for j in range(len(handle[i].alignments)):
jpayne@1 497 score=0
jpayne@1 498 ids=0
jpayne@1 499 cover_region=set() #fixed problem that repeated calculation leading percentage > 1
jpayne@1 500 List.append(handle[i].query.strip()+"___"+handle[i].alignments[j].hit_def)
jpayne@1 501 for z in range(len(handle[i].alignments[j].hsps)):
jpayne@1 502 hsp=handle[i].alignments[j].hsps[z]
jpayne@1 503 temp=set(range(hsp.query_start,hsp.query_end))
jpayne@1 504 if len(cover_region)==0:
jpayne@1 505 cover_region=cover_region|temp
jpayne@1 506 fraction=1
jpayne@1 507 else:
jpayne@1 508 fraction=1-len(cover_region&temp)/float(len(temp))
jpayne@1 509 cover_region=cover_region|temp
jpayne@1 510 if "last" in handle[i].query or "first" in handle[i].query:
jpayne@1 511 score+=hsp.bits*fraction
jpayne@1 512 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1 513 else:
jpayne@1 514 score+=hsp.bits*fraction
jpayne@1 515 ids+=float(hsp.identities)/handle[i].query_length*fraction
jpayne@1 516 List_score.append(score)
jpayne@1 517 List_ids.append(ids)
jpayne@1 518 List_query_region.append(cover_region)
jpayne@1 519 temp=zip(List,List_score,List_ids,List_query_region)
jpayne@1 520 Final_list=sorted(temp, key=lambda d:d[1], reverse = True)
jpayne@1 521 return Final_list
jpayne@1 522
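# Editor's note (not part of the original file): each entry of the returned Final_list is a tuple
# (query___hit_def, accumulated bit score, accumulated identity fraction of the query, covered query positions),
# sorted by the accumulated bit score in descending order.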
jpayne@1 523
jpayne@1 524 def Uniq(L,sort_on_fre="none"): #return the uniq list and the count number
jpayne@1 525 Old=L
jpayne@1 526 L.sort()
jpayne@1 527 L = [L[i] for i in range(len(L)) if L[i] not in L[:i]]
jpayne@1 528 count=[]
jpayne@1 529 for j in range(len(L)):
jpayne@1 530 y=0
jpayne@1 531 for x in Old:
jpayne@1 532 if L[j]==x:
jpayne@1 533 y+=1
jpayne@1 534 count.append(y)
jpayne@1 535 if sort_on_fre!="none":
jpayne@1 536 d=sorted(zip(count, L)) #sort (count, value) pairs by frequency; a zip object is not subscriptable in Python 3
jpayne@1 537 L=[x[1] for x in d]
jpayne@1 538 count=[x[0] for x in d]
jpayne@1 539 return (L,count)
jpayne@1 540
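# Illustrative usage (worked example added by the editor, not part of the original file);
# note that the input list is sorted in place.
# >>> Uniq(["b", "a", "b"])
# (['a', 'b'], [1, 2])
# With sort_on_fre set, the unique values are reordered by ascending frequency.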
jpayne@1 541 def judge_fliC_or_fljB_from_head_tail_for_one_contig(nodes_vs_score_list):
jpayne@1 542 #predict whether a single contig is fliC or fljB based on head and tail scores, and also return the score difference; if the difference is very small the call is unreliable and the whole-contig blast score should be used instead
jpayne@1 543 #this is mainly used for
jpayne@1 544 a=nodes_vs_score_list
jpayne@1 545 fliC_score=0
jpayne@1 546 fljB_score=0
jpayne@1 547 for z in a:
jpayne@1 548 if "fliC" in z[0]:
jpayne@1 549 fliC_score+=z[1]
jpayne@1 550 elif "fljB" in z[0]:
jpayne@1 551 fljB_score+=z[1]
jpayne@1 552 if fliC_score>=fljB_score:
jpayne@1 553 role="fliC"
jpayne@1 554 else:
jpayne@1 555 role="fljB"
jpayne@1 556 return (role,abs(fliC_score-fljB_score))
jpayne@1 557
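# Illustrative usage (hypothetical contig names and scores, added by the editor, not part of the original file):
# >>> judge_fliC_or_fljB_from_head_tail_for_one_contig([("fliC_first_NODE_1", 50.0), ("fljB_last_NODE_1", 30.0)])
# ('fliC', 20.0)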
jpayne@1 558 def judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(node_name,Final_list,Final_list_passed):
jpayne@1 559 #used to predict whether a contig is fliC or fljB when the head-and-tail score difference above is small (less than 20)
jpayne@1 560 #also used when the contig got no blast score for its head or tail
jpayne@1 561 role=""
jpayne@1 562 for z in Final_list_passed:
jpayne@1 563 if node_name in z[0]:
jpayne@1 564 role=z[0].split("_")[0]
jpayne@1 565 break
jpayne@1 566 return role
jpayne@1 567
jpayne@1 568 def fliC_or_fljB_judge_from_head_tail_sequence(nodes_list,tail_head_list,Final_list,Final_list_passed):
jpayne@1 569 #nodes_list is the c created by c,d=Uniq(nodes) in below function
jpayne@1 570 first_target=""
jpayne@1 571 role_list=[]
jpayne@1 572 for x in nodes_list:
jpayne@1 573 a=[]
jpayne@1 574 role=""
jpayne@1 575 for y in tail_head_list:
jpayne@1 576 if x in y[0]:
jpayne@1 577 a.append(y)
jpayne@1 578 if len(a)==4:
jpayne@1 579 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 580 if diff<20:
jpayne@1 581 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 582 elif len(a)==3:
jpayne@1 583 ###however, if the hit with the highest score belongs to the minority role, compare the accumulated scores
jpayne@1 584 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 585 if diff<20:
jpayne@1 586 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 587 ###end of above score comparison
jpayne@1 588 elif len(a)==2:
jpayne@1 589 #should be on the same node; if not, decide using the unit blast score: blast-score/length_of_special_sequence (30 or 37)
jpayne@1 590 temp=[]
jpayne@1 591 for z in a:
jpayne@1 592 temp.append(z[0].split("_")[0])
jpayne@1 593 m,n=Uniq(temp)#should only have one choice, but weird situation might occur too
jpayne@1 594 if len(m)==1:
jpayne@1 595 pass
jpayne@1 596 else:
jpayne@1 597 pass
jpayne@1 598 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 599 if diff<20:
jpayne@1 600 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 601 ###need to design an algorithm to guess the most likely situation for nodes_list; see the test evaluation cases
jpayne@1 602 elif len(a)==1:
jpayne@1 603 #that one
jpayne@1 604 role,diff=judge_fliC_or_fljB_from_head_tail_for_one_contig(a)
jpayne@1 605 if diff<20:
jpayne@1 606 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 607 #needs evaluation; in the future a cut-off could be set, and if it is not met, fall back to the best Final_list_passed match, like the a==0 case
jpayne@1 608 else:#a==0
jpayne@1 609 #use Final_list_passed best match
jpayne@1 610 for z in Final_list_passed:
jpayne@1 611 if x in z[0]:
jpayne@1 612 role=z[0].split("_")[0]
jpayne@1 613 break
jpayne@1 614 #print x,role,len(a)
jpayne@1 615 role_list.append((role,x))
jpayne@1 616 if len(role_list)==2:
jpayne@1 617 if role_list[0][0]==role_list[1][0]:#this is the most common error: two antigens were assigned to the same phase
jpayne@1 618 #just use score to do a final test
jpayne@1 619 role_list=[]
jpayne@1 620 for x in nodes_list:
jpayne@1 621 role=judge_fliC_or_fljB_from_whole_contig_blast_score_ranking(x,Final_list,Final_list_passed)
jpayne@1 622 role_list.append((role,x))
jpayne@1 623 return role_list
jpayne@1 624
jpayne@1 625 def decide_contig_roles_for_H_antigen(Final_list,Final_list_passed):
jpayne@1 626 #used to decide which contig is fliC and which one is fljB
jpayne@1 627 contigs=[]
jpayne@1 628 nodes=[]
jpayne@1 629 for x in Final_list_passed:
jpayne@1 630 if x[0].startswith("fl") and "last" not in x[0] and "first" not in x[0]:
jpayne@1 631 nodes.append(x[0].split("___")[1].strip())
jpayne@1 632 c,d=Uniq(nodes)#c is node_list
jpayne@1 633 #print c
jpayne@1 634 tail_head_list=[x for x in Final_list if ("last" in x[0] or "first" in x[0])]
jpayne@1 635 roles=fliC_or_fljB_judge_from_head_tail_sequence(c,tail_head_list,Final_list,Final_list_passed)
jpayne@1 636 return roles
jpayne@1 637
jpayne@1 638 def decide_O_type_and_get_special_genes(Final_list,Final_list_passed):
jpayne@1 639 #decide O based on Final_list
jpayne@1 640 O_choice="?"
jpayne@1 641 O_list=[]
jpayne@1 642 special_genes={}
jpayne@1 643 nodes=[]
jpayne@1 644 for x in Final_list_passed:
jpayne@1 645 if x[0].startswith("O-"):
jpayne@1 646 nodes.append(x[0].split("___")[1].strip())
jpayne@1 647 elif not x[0].startswith("fl"):
jpayne@1 648 special_genes[x[0]]=x[2]#08172018, x[2] changed from x[-1]
jpayne@1 649 #print "special_genes:",special_genes
jpayne@1 650 c,d=Uniq(nodes)
jpayne@1 651 #print "potential O antigen contig",c
jpayne@1 652 final_O=[]
jpayne@1 653 O_nodes_list=[]
jpayne@1 654 for x in c:#c is the list for contigs
jpayne@1 655 temp=0
jpayne@1 656 for y in Final_list_passed:
jpayne@1 657 if x in y[0] and y[0].startswith("O-"):
jpayne@1 658 final_O.append(y)
jpayne@1 659 break
jpayne@1 660 ### O contig has the problem of two genes on same contig, so do additional test
jpayne@1 661 potenial_new_gene=""
jpayne@1 662 for x in final_O:
jpayne@1 663 pointer=0 #for genes merged or not
jpayne@1 664 #do not consider O-1,3,19_not_in_3,10; it is too short compared with the others
jpayne@1 665 if "O-1,3,19_not_in_3,10" not in x[0] and int(x[0].split("__")[1].split("___")[0])*x[2]+850 <= int(x[0].split("length_")[1].split("_")[0]):#gene length << contig length; for now use 300*2 (for safety, 400*2 could be used) as the flank region
jpayne@1 666 pointer=x[0].split("___")[1].strip()#store the contig name
jpayne@1 667 print(pointer)
jpayne@1 668 if pointer!=0:#it has potential merge event
jpayne@1 669 for y in Final_list:
jpayne@1 670 if pointer in y[0] and y not in final_O and (y[1]>=int(y[0].split("__")[1].split("___")[0])*1.5 or (y[1]>=int(y[0].split("__")[1].split("___")[0])*y[2] and y[1]>=400)):#that's a relatively strict filter now; if it passes, a merge event is assumed and one more gene is added to final_O
jpayne@1 671 potenial_new_gene=y
jpayne@1 672 #print(potenial_new_gene)
jpayne@1 673 break
jpayne@1 674 if potenial_new_gene!="":
jpayne@1 675 print("two differnt genes in same contig, fix it for O antigen")
jpayne@1 676 print(potenial_new_gene[:3])
jpayne@4 677 pointer=0
jpayne@4 678 for y in final_O:
jpayne@4 679 if y[0].split("___")[-1]==potenial_new_gene[0].split("___")[-1]:
jpayne@4 680 pointer=1
jpayne@4 681 if pointer!=1:
jpayne@4 682 final_O.append(potenial_new_gene)
jpayne@1 683 ### end of the two genes on same contig test
jpayne@1 684 final_O=sorted(final_O,key=lambda x: x[2], reverse=True)#sorted
jpayne@1 685 if len(final_O)==0 or (len(final_O)==1 and "O-1,3,19_not_in_3,10" in final_O[0][0]):
jpayne@1 686 #print "$$$No Otype, due to no hit"#may need to be changed
jpayne@1 687 O_choice="-"
jpayne@1 688 else:
jpayne@7 689 highest_O_coverage=max([float(x[0].split("_cov_")[-1].split("_")[0]) for x in final_O if "O-1,3,19_not_in_3,10" not in x[0]])
jpayne@1 690 O_list=[]
jpayne@1 691 O_list_less_contamination=[]
jpayne@1 692 for x in final_O:
jpayne@1 693 if not "O-1,3,19_not_in_3,10__130" in x[0]:#O-1,3,19_not_in_3,10 is too small, which may affect further analysis; to avoid contamination affect, use 0.15 of highest coverage as cut-off
jpayne@1 694 O_list.append(x[0].split("__")[0])
jpayne@1 695 O_nodes_list.append(x[0].split("___")[1])
jpayne@7 696 if float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:
jpayne@1 697 O_list_less_contamination.append(x[0].split("__")[0])
jpayne@1 698 ### special test for O9,46 and O3,10 family
jpayne@1 699 if ("O-9,46_wbaV" in O_list or "O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254" in O_list) and O_list_less_contamination[0].startswith("O-9,"):#not sure should use and float(O9_wbaV)/float(num_1) > 0.1
jpayne@1 700 if "O-9,46_wzy" in O_list:#and float(O946_wzy)/float(num_1) > 0.1
jpayne@1 701 O_choice="O-9,46"
jpayne@1 702 #print "$$$Most possilble Otype: O-9,46"
jpayne@1 703 elif "O-9,46,27_partial_wzy" in O_list:#and float(O94627)/float(num_1) > 0.1
jpayne@1 704 O_choice="O-9,46,27"
jpayne@1 705 #print "$$$Most possilble Otype: O-9,46,27"
jpayne@1 706 else:
jpayne@1 707 O_choice="O-9"#next, detect O9 vs O2?
jpayne@1 708 O2=0
jpayne@1 709 O9=0
jpayne@1 710 for z in special_genes:
jpayne@1 711 if "tyr-O-9" in z:
jpayne@1 712 O9=special_genes[z]
jpayne@1 713 elif "tyr-O-2" in z:
jpayne@1 714 O2=special_genes[z]
jpayne@1 715 if O2>O9:
jpayne@1 716 O_choice="O-2"
jpayne@1 717 elif O2<O9:
jpayne@1 718 pass
jpayne@1 719 else:
jpayne@1 720 pass
jpayne@1 721 #print "$$$No suitable one, because can't distinct it's O-9 or O-2, but O-9 has a more possibility."
jpayne@1 722 elif ("O-3,10_wzx" in O_list) and ("O-9,46_wzy" in O_list) and (O_list[0].startswith("O-3,10") or O_list_less_contamination[0].startswith("O-9,46_wzy")):#and float(O310_wzx)/float(num_1) > 0.1 and float(O946_wzy)/float(num_1) > 0.1
jpayne@1 723 if "O-3,10_not_in_1,3,19" in O_list:#and float(O310_no_1319)/float(num_1) > 0.1
jpayne@1 724 O_choice="O-3,10"
jpayne@1 725 #print "$$$Most possilble Otype: O-3,10 (contain O-3,10_not_in_1,3,19)"
jpayne@1 726 else:
jpayne@1 727 O_choice="O-1,3,19"
jpayne@1 728 #print "$$$Most possilble Otype: O-1,3,19 (not contain O-3,10_not_in_1,3,19)"
jpayne@1 729 ### end of special test for O9,46 and O3,10 family
jpayne@1 730 else:
jpayne@1 731 try:
jpayne@1 732 max_score=0
jpayne@1 733 for x in final_O:
jpayne@7 734 if x[2]>=max_score and float(x[0].split("_cov_")[-1].split("_")[0])>highest_O_coverage*0.15:#use x[2],08172018, the "coverage identity = cover_length * identity"; also meet coverage threshold
jpayne@1 735 max_score=x[2]#change from x[-1] to x[2],08172018
jpayne@1 736 O_choice=x[0].split("_")[0]
jpayne@1 737 if O_choice=="O-1,3,19":
jpayne@1 738 O_choice=final_O[1][0].split("_")[0]
jpayne@1 739 #print "$$$Most possilble Otype: ",O_choice
jpayne@1 740 except:
jpayne@1 741 pass
jpayne@1 742 #print "$$$No suitable Otype, or failure of mapping (please check the quality of raw reads)"
jpayne@1 743 #print "O:",O_choice,O_nodes_list
jpayne@1 744 Otypes=[]
jpayne@1 745 for x in O_list:
jpayne@1 746 if x!="O-1,3,19_not_in_3,10":
jpayne@1 747 if "O-9,46_" not in x:
jpayne@1 748 Otypes.append(x.split("_")[0])
jpayne@1 749 else:
jpayne@1 750 Otypes.append(x.split("-from")[0])#O-9,46_wbaV-from-II-9,12:z29:1,5-SRR1346254
jpayne@1 751 #Otypes=[x.split("_")[0] for x in O_list if x!="O-1,3,19_not_in_3,10"]
jpayne@1 752 Otypes_uniq,Otypes_fre=Uniq(Otypes)
jpayne@1 753 contamination_O=""
jpayne@1 754 if O_choice=="O-9,46,27" or O_choice=="O-3,10" or O_choice=="O-1,3,19":
jpayne@1 755 if len(Otypes_uniq)>2:
jpayne@1 756 contamination_O="potential contamination from O antigen signals"
jpayne@1 757 else:
jpayne@1 758 if len(Otypes_uniq)>1:
jpayne@1 759 if O_choice=="O-4" and len(Otypes_uniq)==2 and "O-9,46,27" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1 760 contamination_O=""
jpayne@1 761 elif O_choice=="O-9,46" and len(Otypes_uniq)==2 and "O-9,46_wbaV" in Otypes_uniq and "O-9,46_wzy" in Otypes_uniq: #for special 4,12,27 case such as Bredeney and Schwarzengrund
jpayne@1 762 contamination_O=""
jpayne@1 763 else:
jpayne@1 764 contamination_O="potential contamination from O antigen signals"
jpayne@4 765 return O_choice,O_nodes_list,special_genes,final_O,contamination_O,Otypes_uniq
jpayne@1 766 ### End of SeqSero2 allele prediction and output
jpayne@1 767
jpayne@1 768 def get_input_files(make_dir,input_file,data_type,dirpath):
jpayne@1 769 #tell input files from datatype
jpayne@1 770 #"<int>: '1'(pair-end reads, interleaved),'2'(pair-end reads, seperated),'3'(single-end reads), '4'(assembly),'5'(nanopore fasta),'6'(nanopore fastq)"
jpayne@1 771 for_fq=""
jpayne@1 772 rev_fq=""
jpayne@1 773 old_dir = os.getcwd()
jpayne@1 774 os.chdir(make_dir)
jpayne@1 775 if data_type=="1":
jpayne@1 776 input_file=input_file[0].split("/")[-1]
jpayne@1 777 if input_file.endswith(".sra"):
jpayne@1 778 subprocess.check_output("fastq-dump --split-files "+input_file,shell=True, stderr=subprocess.STDOUT)
jpayne@1 779 for_fq=input_file.replace(".sra","_1.fastq")
jpayne@1 780 rev_fq=input_file.replace(".sra","_2.fastq")
jpayne@1 781 else:
jpayne@1 782 core_id=input_file.split(".fastq")[0].split(".fq")[0]
jpayne@1 783 for_fq=core_id+"_1.fastq"
jpayne@1 784 rev_fq=core_id+"_2.fastq"
jpayne@1 785 if input_file.endswith(".gz"):
jpayne@1 786 subprocess.check_output("gzip -dc "+input_file+" | "+dirpath+"/deinterleave_fastq.sh "+for_fq+" "+rev_fq,shell=True, stderr=subprocess.STDOUT)
jpayne@1 787 else:
jpayne@1