annotate SeqSero2/SeqSero2_update_kmer_database.py @ 6:5b84740f0463

planemo upload
author jpayne
date Thu, 18 Apr 2019 16:14:32 -0400
parents fae43708974d
children
rev   line source
jpayne@1 1 #!/usr/bin/env python3
jpayne@1 2
jpayne@1 3 import argparse
jpayne@1 4 import os,subprocess
jpayne@1 5 import pickle
jpayne@1 6
jpayne@1 7 ### SeqSero Kmer
jpayne@1 8 def parse_args():
jpayne@1 9 "Parse the input arguments, use '-h' for help."
jpayne@1 10 parser = argparse.ArgumentParser(usage='Just type "SeqSero2_update_kmer_database.py", it will update kmer database automatically')
jpayne@1 11 return parser.parse_args()
jpayne@1 12
jpayne@1 13 def reverse_complement(sequence):
jpayne@1 14 complement = {
jpayne@1 15 'A': 'T',
jpayne@1 16 'C': 'G',
jpayne@1 17 'G': 'C',
jpayne@1 18 'T': 'A',
jpayne@1 19 'N': 'N',
jpayne@1 20 'M': 'K',
jpayne@1 21 'R': 'Y',
jpayne@1 22 'W': 'W',
jpayne@1 23 'S': 'S',
jpayne@1 24 'Y': 'R',
jpayne@1 25 'K': 'M',
jpayne@1 26 'V': 'B',
jpayne@1 27 'H': 'D',
jpayne@1 28 'D': 'H',
jpayne@1 29 'B': 'V'
jpayne@1 30 }
jpayne@1 31 return "".join(complement[base] for base in reversed(sequence))
jpayne@1 32
jpayne@1 33 def multifasta_dict(multifasta):
jpayne@1 34 multifasta_list = [
jpayne@1 35 line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0
jpayne@1 36 ]
jpayne@1 37 headers = [i for i in multifasta_list if i[0] == '>']
jpayne@1 38 multifasta_dict = {}
jpayne@1 39 for h in headers:
jpayne@1 40 start = multifasta_list.index(h)
jpayne@1 41 for element in multifasta_list[start + 1:]:
jpayne@1 42 if element[0] == '>':
jpayne@1 43 break
jpayne@1 44 else:
jpayne@1 45 if h[1:] in multifasta_dict:
jpayne@1 46 multifasta_dict[h[1:]] += element
jpayne@1 47 else:
jpayne@1 48 multifasta_dict[h[1:]] = element
jpayne@1 49 return multifasta_dict
jpayne@1 50
jpayne@1 51 def createKmerDict_reads(list_of_strings, kmer):
jpayne@1 52 kmer_table = {}
jpayne@1 53 for string in list_of_strings:
jpayne@1 54 sequence = string.strip('\n')
jpayne@1 55 for i in range(len(sequence) - kmer + 1):
jpayne@1 56 new_mer = sequence[i:i + kmer].upper()
jpayne@1 57 new_mer_rc = reverse_complement(new_mer)
jpayne@1 58 if new_mer in kmer_table:
jpayne@1 59 kmer_table[new_mer.upper()] += 1
jpayne@1 60 else:
jpayne@1 61 kmer_table[new_mer.upper()] = 1
jpayne@1 62 if new_mer_rc in kmer_table:
jpayne@1 63 kmer_table[new_mer_rc.upper()] += 1
jpayne@1 64 else:
jpayne@1 65 kmer_table[new_mer_rc.upper()] = 1
jpayne@1 66 return kmer_table
jpayne@1 67
jpayne@1 68 def multifasta_to_kmers_dict(multifasta):
jpayne@1 69 multi_seq_dict = multifasta_dict(multifasta)
jpayne@1 70 lib_dict = {}
jpayne@1 71 for h in multi_seq_dict:
jpayne@1 72 lib_dict[h] = set(
jpayne@1 73 [k for k in createKmerDict_reads([multi_seq_dict[h]], 27)])
jpayne@1 74 return lib_dict
jpayne@1 75
jpayne@1 76 def get_salmid_invA_database(ex_dir):
jpayne@1 77 # read invA kmer and return it
jpayne@1 78 a = open(ex_dir + '/invA_mers_dict', 'rb')
jpayne@1 79 invA_dict = pickle.load(a)
jpayne@1 80 try:
jpayne@1 81 del invA_dict['version']
jpayne@1 82 except:
jpayne@1 83 pass
jpayne@1 84 return invA_dict
jpayne@1 85
jpayne@1 86 def get_salmid_rpoB_database(ex_dir):
jpayne@1 87 # read invA kmer and return it
jpayne@1 88 a = open(ex_dir + '/rpoB_mers_dict', 'rb')
jpayne@1 89 rpoB_dict = pickle.load(a)
jpayne@1 90 try:
jpayne@1 91 del rpoB_dict['version']
jpayne@1 92 except:
jpayne@1 93 pass
jpayne@1 94 return rpoB_dict
jpayne@1 95
jpayne@1 96 def main():
jpayne@1 97 args = parse_args()
jpayne@1 98 ex_dir = os.path.dirname(os.path.realpath(__file__))
jpayne@1 99 lib_dict = multifasta_to_kmers_dict(ex_dir + '/H_and_O_and_specific_genes.fasta')
jpayne@1 100 invA_dict=get_salmid_invA_database(ex_dir)
jpayne@1 101 #rpoB_dict=get_salmid_rpoB_database(ex_dir)
jpayne@1 102 lib_dict_new = lib_dict.copy()
jpayne@1 103 #print(len(lib_dict_new))
jpayne@1 104 lib_dict_new.update(invA_dict)
jpayne@1 105 #print(len(lib_dict_new))
jpayne@1 106 #lib_dict_new.update(rpoB_dict)
jpayne@1 107 #print(len(lib_dict_new))
jpayne@1 108 f = open(ex_dir + '/antigens.pickle', "wb")
jpayne@1 109 pickle.dump(lib_dict_new, f)
jpayne@1 110 f.close()
jpayne@1 111
jpayne@1 112 if __name__ == '__main__':
jpayne@1 113 main()