annotate SeqSero2_update_kmer_database.py @ 0:18c8b4d6ab1e

Uploaded
author estrain
date Wed, 02 Oct 2019 16:51:30 -0400
parents
children
rev   line source
estrain@0 1 #!/usr/bin/env python3
estrain@0 2
estrain@0 3 import argparse
estrain@0 4 import os,subprocess
estrain@0 5 import pickle
estrain@0 6
estrain@0 7 ### SeqSero Kmer
def parse_args(argv=None):
    """Parse the command-line arguments; use '-h' for help.

    The parser defines no options of its own, so parsing only provides the
    '-h/--help' output and rejects unexpected arguments.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what
            the script did before this parameter existed.

    Returns:
        The ``argparse.Namespace`` produced by the parser; it has no
        attributes because no arguments are declared.
    """
    parser = argparse.ArgumentParser(
        usage='Just type "SeqSero2_update_kmer_database.py", it will update kmer database automatically')
    return parser.parse_args(argv)
estrain@0 12
# IUPAC nucleotide codes mapped to their complements.  Built once at import
# time rather than inside reverse_complement(), which is called for every
# k-mer and previously rebuilt this table on each call.
_COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
    'N': 'N',
    'M': 'K',
    'R': 'Y',
    'W': 'W',
    'S': 'S',
    'Y': 'R',
    'K': 'M',
    'V': 'B',
    'H': 'D',
    'D': 'H',
    'B': 'V',
}
# Accept lower-case codes too, preserving the case of each base.
_COMPLEMENT.update(
    {base.lower(): comp.lower() for base, comp in list(_COMPLEMENT.items())})


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence.

    Args:
        sequence: String of IUPAC nucleotide codes (A, C, G, T, N, M, R, W,
            S, Y, K, V, H, D, B), in upper or lower case.

    Returns:
        A new string with the bases in reverse order and each base replaced
        by its complement; the case of every base is preserved.

    Raises:
        KeyError: If the sequence contains a character that is not one of
            the supported codes.
    """
    return "".join(_COMPLEMENT[base] for base in reversed(sequence))
estrain@0 32
def multifasta_dict(multifasta):
    """Read a multi-FASTA file into a ``{header: sequence}`` dictionary.

    The file is read in a single pass.  Blank lines are skipped and every
    line is stripped of surrounding whitespace.  A line starting with '>'
    opens a new record; the following lines, up to the next header, are
    concatenated to form that record's sequence.

    Args:
        multifasta: Path of the FASTA file to read.

    Returns:
        A dict mapping each header (without the leading '>') to its
        sequence as one string, in order of first appearance.  Headers
        that are not followed by any sequence line are omitted.  If a
        header occurs more than once, the sequences of all its records are
        concatenated in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunks_by_header = {}
    current = None  # header of the record being read; None before the first '>'
    # The context manager closes the file even if reading fails.
    with open(multifasta, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line[0] == '>':
                current = line[1:]
            elif current is not None:
                # Sequence text before the first header belongs to no
                # record and is ignored.
                chunks_by_header.setdefault(current, []).append(line)
    # Join once per record instead of growing strings line by line.
    return {
        header: ''.join(chunks)
        for header, chunks in chunks_by_header.items()
    }
estrain@0 50
def createKmerDict_reads(list_of_strings, kmer):
    """Count every k-mer, and its reverse complement, in the given strings.

    Args:
        list_of_strings: Iterable of sequence strings; newline characters
            at either end of each string are removed before counting.
        kmer: Length of the k-mers to extract.

    Returns:
        A dict mapping each upper-cased k-mer (forward and reverse
        complement) to the number of times it was counted.  A k-mer equal
        to its own reverse complement is counted twice per occurrence.
    """
    counts = {}
    for entry in list_of_strings:
        seq = entry.strip('\n')
        window_count = len(seq) - kmer + 1
        for start in range(window_count):
            forward = seq[start:start + kmer].upper()
            # reverse_complement() maps upper-case codes to upper-case
            # codes, so its result needs no further case folding.
            for mer in (forward, reverse_complement(forward)):
                counts[mer] = counts.get(mer, 0) + 1
    return counts
estrain@0 67
def multifasta_to_kmers_dict(multifasta, kmer=27):
    """Build the set of k-mers for every record of a multi-FASTA file.

    Args:
        multifasta: Path of the FASTA file to read.
        kmer: Length of the k-mers to extract.  Defaults to 27, the value
            that used to be hard-coded here.

    Returns:
        A dict mapping each FASTA header to the set of k-mers (forward and
        reverse complement) found in that record's sequence.
    """
    sequences = multifasta_dict(multifasta)
    # Only the distinct k-mers are kept; the counts are discarded.
    return {
        header: set(createKmerDict_reads([sequence], kmer))
        for header, sequence in sequences.items()
    }
estrain@0 75
def get_salmid_invA_database(ex_dir):
    """Load the pickled invA k-mer dictionary from ``ex_dir``.

    Args:
        ex_dir: Directory containing the ``invA_mers_dict`` pickle file.

    Returns:
        The unpickled dictionary with its ``'version'`` entry removed when
        one is present.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load a database file that comes from a trusted source.
    # The context manager closes the handle, which was previously leaked.
    with open(ex_dir + '/invA_mers_dict', 'rb') as handle:
        invA_dict = pickle.load(handle)
    try:
        del invA_dict['version']
    except KeyError:
        # No 'version' entry to strip; nothing to do.
        pass
    return invA_dict
estrain@0 85
def get_salmid_rpoB_database(ex_dir):
    """Load the pickled rpoB k-mer dictionary from ``ex_dir``.

    Args:
        ex_dir: Directory containing the ``rpoB_mers_dict`` pickle file.

    Returns:
        The unpickled dictionary with its ``'version'`` entry removed when
        one is present.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load a database file that comes from a trusted source.
    # The context manager closes the handle, which was previously leaked.
    with open(ex_dir + '/rpoB_mers_dict', 'rb') as handle:
        rpoB_dict = pickle.load(handle)
    try:
        del rpoB_dict['version']
    except KeyError:
        # No 'version' entry to strip; nothing to do.
        pass
    return rpoB_dict
estrain@0 95
def main():
    """Rebuild ``antigens.pickle`` next to this script.

    Reads ``H_and_O_and_specific_genes.fasta`` from the script's directory,
    converts every record into its set of 27-mers, merges in the invA
    k-mer dictionary and writes the combined dictionary to
    ``antigens.pickle`` in the same directory.
    """
    # No options are defined; parsing only serves '-h' and rejects
    # unexpected command-line arguments.
    parse_args()
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    lib_dict = multifasta_to_kmers_dict(ex_dir + '/H_and_O_and_specific_genes.fasta')
    invA_dict = get_salmid_invA_database(ex_dir)
    # Merge into a copy; on a key collision the invA entry replaces the
    # FASTA-derived one.
    lib_dict_new = lib_dict.copy()
    lib_dict_new.update(invA_dict)
    # The rpoB k-mers (get_salmid_rpoB_database) are not merged here.
    # The context manager closes the output file even if pickling fails.
    with open(ex_dir + '/antigens.pickle', "wb") as f:
        pickle.dump(lib_dict_new, f)


if __name__ == '__main__':
    main()