cstrittmatter@0
|
1 #!/usr/bin/env python3
|
cstrittmatter@0
|
2 # -*- coding: utf-8 -*-
|
cstrittmatter@0
|
3 """
|
cstrittmatter@0
|
4 ############################################################################
|
cstrittmatter@0
|
5 # Istituto Superiore di Sanita'
|
cstrittmatter@0
|
6 # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC)
|
cstrittmatter@0
|
7 # Developer: Arnold Knijn arnold.knijn@iss.it
|
cstrittmatter@0
|
8 ############################################################################
|
cstrittmatter@0
|
9 """
|
cstrittmatter@0
|
10
|
cstrittmatter@0
|
11 import argparse
|
cstrittmatter@0
|
12 import sys
|
cstrittmatter@0
|
13 import os
|
cstrittmatter@0
|
14 import subprocess
|
cstrittmatter@0
|
15 import shutil
|
cstrittmatter@0
|
16
|
cstrittmatter@0
|
17 def __main__():
|
cstrittmatter@0
|
18 parser = argparse.ArgumentParser()
|
cstrittmatter@0
|
19 parser.add_argument('-i', '--input', dest='input', help='input alignment file')
|
cstrittmatter@0
|
20 parser.add_argument('-o', '--output', dest='output', help='output consensus file')
|
cstrittmatter@0
|
21 args = parser.parse_args()
|
cstrittmatter@0
|
22
|
cstrittmatter@0
|
23 sequences = []
|
cstrittmatter@0
|
24 varsequences = []
|
cstrittmatter@0
|
25 # read input
|
cstrittmatter@0
|
26 with open(args.input) as alignments:
|
cstrittmatter@0
|
27 for alignment in alignments:
|
cstrittmatter@0
|
28 if alignment[0] != ">":
|
cstrittmatter@0
|
29 sequences.append(alignment.rstrip())
|
cstrittmatter@0
|
30 numsequences = len(sequences)
|
cstrittmatter@0
|
31 for j in range(0, numsequences + 1):
|
cstrittmatter@0
|
32 varsequences.append("")
|
cstrittmatter@0
|
33 lstnumvariants = []
|
cstrittmatter@0
|
34 lstnumhyphens = []
|
cstrittmatter@0
|
35 # loop over the columns
|
cstrittmatter@0
|
36 for i in range(0, len(sequences[0])):
|
cstrittmatter@0
|
37 variants = []
|
cstrittmatter@0
|
38 numhyphens = 0
|
cstrittmatter@0
|
39 # loop over the original rows to obtain variants
|
cstrittmatter@0
|
40 for j in range(0, numsequences):
|
cstrittmatter@0
|
41 if sequences[j][i:i+1] == "-":
|
cstrittmatter@0
|
42 numhyphens = numhyphens + 1
|
cstrittmatter@0
|
43 if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-":
|
cstrittmatter@0
|
44 variants.append(sequences[j][i:i+1])
|
cstrittmatter@0
|
45 lstnumvariants.append(len(variants))
|
cstrittmatter@0
|
46 lstnumhyphens.append(numhyphens)
|
cstrittmatter@0
|
47 # create varsequences with a template of the variants
|
cstrittmatter@0
|
48 for j in range(0, numsequences):
|
cstrittmatter@0
|
49 if lstnumhyphens[i] == 0:
|
cstrittmatter@0
|
50 varsequences[j] = varsequences[j] + "-"
|
cstrittmatter@0
|
51 elif lstnumvariants[i] < 2:
|
cstrittmatter@0
|
52 varsequences[j] = varsequences[j] + "-"
|
cstrittmatter@0
|
53 else:
|
cstrittmatter@0
|
54 varsequences[j] = varsequences[j] + sequences[j][i:i+1]
|
cstrittmatter@0
|
55 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
|
cstrittmatter@0
|
56 varsequences[numsequences] = varsequences[numsequences] + variants[0]
|
cstrittmatter@0
|
57 else:
|
cstrittmatter@0
|
58 varsequences[numsequences] = varsequences[numsequences] + "-"
|
cstrittmatter@0
|
59 # loop over the columns, apply single variant
|
cstrittmatter@0
|
60 for i in range(0, len(sequences[0])):
|
cstrittmatter@0
|
61 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
|
cstrittmatter@0
|
62 # loop over all the rows to apply single variant to "-"
|
cstrittmatter@0
|
63 for j in range(0, len(sequences)):
|
cstrittmatter@0
|
64 if sequences[j][i:i+1] == "-":
|
cstrittmatter@0
|
65 lstsequence = list(sequences[j])
|
cstrittmatter@0
|
66 lstsequence[i] = varsequences[numsequences][i:i+1]
|
cstrittmatter@0
|
67 sequences[j] = ''.join(lstsequence)
|
cstrittmatter@0
|
68 # loop over the rows of the sequences, apply multiple variants
|
cstrittmatter@0
|
69 for j in range(0, len(sequences)):
|
cstrittmatter@0
|
70 # loop over the columns
|
cstrittmatter@0
|
71 for i in range(0, len(sequences[0])):
|
cstrittmatter@0
|
72 variants = []
|
cstrittmatter@0
|
73 if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1:
|
cstrittmatter@0
|
74 # loop over the rows of the varsequences
|
cstrittmatter@0
|
75 for k in range(0, numsequences):
|
cstrittmatter@0
|
76 if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-":
|
cstrittmatter@0
|
77 variants.append(varsequences[k][i:i+1])
|
cstrittmatter@0
|
78 if len(variants) == 1:
|
cstrittmatter@0
|
79 lstsequence = list(sequences[j])
|
cstrittmatter@0
|
80 lstsequence[i] = variants[0]
|
cstrittmatter@0
|
81 sequences[j] = ''.join(lstsequence)
|
cstrittmatter@0
|
82 else:
|
cstrittmatter@0
|
83 lstsequence[i] = variants[len(variants) - 1]
|
cstrittmatter@0
|
84 sequences.append(''.join(lstsequence))
|
cstrittmatter@0
|
85 # eliminate duplicate sequences
|
cstrittmatter@0
|
86 sequences_unique = list(set(sequences))
|
cstrittmatter@0
|
87 # write consensus sequences to output
|
cstrittmatter@0
|
88 consensus = open(args.output, 'w')
|
cstrittmatter@0
|
89 n = 1
|
cstrittmatter@0
|
90 for sequence in sequences_unique:
|
cstrittmatter@0
|
91 consensus.write(">consensus_" + str(n) + "\n")
|
cstrittmatter@0
|
92 consensus.write(sequence + "\n")
|
cstrittmatter@0
|
93 n = n + 1
|
cstrittmatter@0
|
94
|
cstrittmatter@0
|
95
|
cstrittmatter@0
|
96 if __name__ == "__main__":
|
cstrittmatter@0
|
97 __main__() |