cstrittmatter@0: #!/usr/bin/env python3 cstrittmatter@0: # -*- coding: utf-8 -*- cstrittmatter@0: """ cstrittmatter@0: ############################################################################ cstrittmatter@0: # Istituto Superiore di Sanita' cstrittmatter@0: # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC) cstrittmatter@0: # Developer: Arnold Knijn arnold.knijn@iss.it cstrittmatter@0: ############################################################################ cstrittmatter@0: """ cstrittmatter@0: cstrittmatter@0: import argparse cstrittmatter@0: import sys cstrittmatter@0: import os cstrittmatter@0: import subprocess cstrittmatter@0: import shutil cstrittmatter@0: cstrittmatter@0: def __main__(): cstrittmatter@0: parser = argparse.ArgumentParser() cstrittmatter@0: parser.add_argument('-i', '--input', dest='input', help='input alignment file') cstrittmatter@0: parser.add_argument('-o', '--output', dest='output', help='output consensus file') cstrittmatter@0: args = parser.parse_args() cstrittmatter@0: cstrittmatter@0: sequences = [] cstrittmatter@0: varsequences = [] cstrittmatter@0: # read input cstrittmatter@0: with open(args.input) as alignments: cstrittmatter@0: for alignment in alignments: cstrittmatter@0: if alignment[0] != ">": cstrittmatter@0: sequences.append(alignment.rstrip()) cstrittmatter@0: numsequences = len(sequences) cstrittmatter@0: for j in range(0, numsequences + 1): cstrittmatter@0: varsequences.append("") cstrittmatter@0: lstnumvariants = [] cstrittmatter@0: lstnumhyphens = [] cstrittmatter@0: # loop over the columns cstrittmatter@0: for i in range(0, len(sequences[0])): cstrittmatter@0: variants = [] cstrittmatter@0: numhyphens = 0 cstrittmatter@0: # loop over the original rows to obtain variants cstrittmatter@0: for j in range(0, numsequences): cstrittmatter@0: if sequences[j][i:i+1] == "-": cstrittmatter@0: numhyphens = numhyphens + 1 cstrittmatter@0: if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-": cstrittmatter@0: variants.append(sequences[j][i:i+1]) cstrittmatter@0: lstnumvariants.append(len(variants)) cstrittmatter@0: lstnumhyphens.append(numhyphens) cstrittmatter@0: # create varsequences with a template of the variants cstrittmatter@0: for j in range(0, numsequences): cstrittmatter@0: if lstnumhyphens[i] == 0: cstrittmatter@0: varsequences[j] = varsequences[j] + "-" cstrittmatter@0: elif lstnumvariants[i] < 2: cstrittmatter@0: varsequences[j] = varsequences[j] + "-" cstrittmatter@0: else: cstrittmatter@0: varsequences[j] = varsequences[j] + sequences[j][i:i+1] cstrittmatter@0: if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0: cstrittmatter@0: varsequences[numsequences] = varsequences[numsequences] + variants[0] cstrittmatter@0: else: cstrittmatter@0: varsequences[numsequences] = varsequences[numsequences] + "-" cstrittmatter@0: # loop over the columns, apply single variant cstrittmatter@0: for i in range(0, len(sequences[0])): cstrittmatter@0: if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0: cstrittmatter@0: # loop over all the rows to apply single variant to "-" cstrittmatter@0: for j in range(0, len(sequences)): cstrittmatter@0: if sequences[j][i:i+1] == "-": cstrittmatter@0: lstsequence = list(sequences[j]) cstrittmatter@0: lstsequence[i] = varsequences[numsequences][i:i+1] cstrittmatter@0: sequences[j] = ''.join(lstsequence) cstrittmatter@0: # loop over the rows of the sequences, apply multiple variants cstrittmatter@0: for j in range(0, len(sequences)): cstrittmatter@0: # loop over the columns cstrittmatter@0: for i in range(0, len(sequences[0])): cstrittmatter@0: variants = [] cstrittmatter@0: if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1: cstrittmatter@0: # loop over the rows of the varsequences cstrittmatter@0: for k in range(0, numsequences): cstrittmatter@0: if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-": cstrittmatter@0: variants.append(varsequences[k][i:i+1]) cstrittmatter@0: if len(variants) == 1: cstrittmatter@0: lstsequence = list(sequences[j]) cstrittmatter@0: lstsequence[i] = variants[0] cstrittmatter@0: sequences[j] = ''.join(lstsequence) cstrittmatter@0: else: cstrittmatter@0: lstsequence[i] = variants[len(variants) - 1] cstrittmatter@0: sequences.append(''.join(lstsequence)) cstrittmatter@0: # eliminate duplicate sequences cstrittmatter@0: sequences_unique = list(set(sequences)) cstrittmatter@0: # write consensus sequences to output cstrittmatter@0: consensus = open(args.output, 'w') cstrittmatter@0: n = 1 cstrittmatter@0: for sequence in sequences_unique: cstrittmatter@0: consensus.write(">consensus_" + str(n) + "\n") cstrittmatter@0: consensus.write(sequence + "\n") cstrittmatter@0: n = n + 1 cstrittmatter@0: cstrittmatter@0: cstrittmatter@0: if __name__ == "__main__": cstrittmatter@0: __main__()