estrain@0: #!/usr/bin/env python3 estrain@0: estrain@0: import argparse estrain@0: import os,subprocess estrain@0: import pickle estrain@0: estrain@0: ### SeqSero Kmer estrain@0: def parse_args(): estrain@0: "Parse the input arguments, use '-h' for help." estrain@0: parser = argparse.ArgumentParser(usage='Just type "SeqSero2_update_kmer_database.py", it will update kmer database automatically') estrain@0: return parser.parse_args() estrain@0: estrain@0: def reverse_complement(sequence): estrain@0: complement = { estrain@0: 'A': 'T', estrain@0: 'C': 'G', estrain@0: 'G': 'C', estrain@0: 'T': 'A', estrain@0: 'N': 'N', estrain@0: 'M': 'K', estrain@0: 'R': 'Y', estrain@0: 'W': 'W', estrain@0: 'S': 'S', estrain@0: 'Y': 'R', estrain@0: 'K': 'M', estrain@0: 'V': 'B', estrain@0: 'H': 'D', estrain@0: 'D': 'H', estrain@0: 'B': 'V' estrain@0: } estrain@0: return "".join(complement[base] for base in reversed(sequence)) estrain@0: estrain@0: def multifasta_dict(multifasta): estrain@0: multifasta_list = [ estrain@0: line.strip() for line in open(multifasta, 'r') if len(line.strip()) > 0 estrain@0: ] estrain@0: headers = [i for i in multifasta_list if i[0] == '>'] estrain@0: multifasta_dict = {} estrain@0: for h in headers: estrain@0: start = multifasta_list.index(h) estrain@0: for element in multifasta_list[start + 1:]: estrain@0: if element[0] == '>': estrain@0: break estrain@0: else: estrain@0: if h[1:] in multifasta_dict: estrain@0: multifasta_dict[h[1:]] += element estrain@0: else: estrain@0: multifasta_dict[h[1:]] = element estrain@0: return multifasta_dict estrain@0: estrain@0: def createKmerDict_reads(list_of_strings, kmer): estrain@0: kmer_table = {} estrain@0: for string in list_of_strings: estrain@0: sequence = string.strip('\n') estrain@0: for i in range(len(sequence) - kmer + 1): estrain@0: new_mer = sequence[i:i + kmer].upper() estrain@0: new_mer_rc = reverse_complement(new_mer) estrain@0: if new_mer in kmer_table: estrain@0: kmer_table[new_mer.upper()] += 1 estrain@0: else: estrain@0: kmer_table[new_mer.upper()] = 1 estrain@0: if new_mer_rc in kmer_table: estrain@0: kmer_table[new_mer_rc.upper()] += 1 estrain@0: else: estrain@0: kmer_table[new_mer_rc.upper()] = 1 estrain@0: return kmer_table estrain@0: estrain@0: def multifasta_to_kmers_dict(multifasta): estrain@0: multi_seq_dict = multifasta_dict(multifasta) estrain@0: lib_dict = {} estrain@0: for h in multi_seq_dict: estrain@0: lib_dict[h] = set( estrain@0: [k for k in createKmerDict_reads([multi_seq_dict[h]], 27)]) estrain@0: return lib_dict estrain@0: estrain@0: def get_salmid_invA_database(ex_dir): estrain@0: # read invA kmer and return it estrain@0: a = open(ex_dir + '/invA_mers_dict', 'rb') estrain@0: invA_dict = pickle.load(a) estrain@0: try: estrain@0: del invA_dict['version'] estrain@0: except: estrain@0: pass estrain@0: return invA_dict estrain@0: estrain@0: def get_salmid_rpoB_database(ex_dir): estrain@0: # read invA kmer and return it estrain@0: a = open(ex_dir + '/rpoB_mers_dict', 'rb') estrain@0: rpoB_dict = pickle.load(a) estrain@0: try: estrain@0: del rpoB_dict['version'] estrain@0: except: estrain@0: pass estrain@0: return rpoB_dict estrain@0: estrain@0: def main(): estrain@0: args = parse_args() estrain@0: ex_dir = os.path.dirname(os.path.realpath(__file__)) estrain@0: lib_dict = multifasta_to_kmers_dict(ex_dir + '/H_and_O_and_specific_genes.fasta') estrain@0: invA_dict=get_salmid_invA_database(ex_dir) estrain@0: #rpoB_dict=get_salmid_rpoB_database(ex_dir) estrain@0: lib_dict_new = lib_dict.copy() estrain@0: #print(len(lib_dict_new)) estrain@0: lib_dict_new.update(invA_dict) estrain@0: #print(len(lib_dict_new)) estrain@0: #lib_dict_new.update(rpoB_dict) estrain@0: #print(len(lib_dict_new)) estrain@0: f = open(ex_dir + '/antigens.pickle', "wb") estrain@0: pickle.dump(lib_dict_new, f) estrain@0: f.close() estrain@0: estrain@0: if __name__ == '__main__': estrain@0: main()