annotate scripts/GetConsensus.py @ 0:8be2feb96994

"planemo upload commit cb65588391944306ff3cb32a23e1c28f65122014"
author cstrittmatter
date Fri, 11 Mar 2022 15:50:35 -0500
parents
children
rev   line source
cstrittmatter@0 1 #!/usr/bin/env python3
cstrittmatter@0 2 # -*- coding: utf-8 -*-
cstrittmatter@0 3 """
cstrittmatter@0 4 ############################################################################
cstrittmatter@0 5 # Istituto Superiore di Sanita'
cstrittmatter@0 6 # European Union Reference Laboratory (EU-RL) for Escherichia coli, including Verotoxigenic E. coli (VTEC)
cstrittmatter@0 7 # Developer: Arnold Knijn arnold.knijn@iss.it
cstrittmatter@0 8 ############################################################################
cstrittmatter@0 9 """
cstrittmatter@0 10
cstrittmatter@0 11 import argparse
cstrittmatter@0 12 import sys
cstrittmatter@0 13 import os
cstrittmatter@0 14 import subprocess
cstrittmatter@0 15 import shutil
cstrittmatter@0 16
cstrittmatter@0 17 def __main__():
cstrittmatter@0 18 parser = argparse.ArgumentParser()
cstrittmatter@0 19 parser.add_argument('-i', '--input', dest='input', help='input alignment file')
cstrittmatter@0 20 parser.add_argument('-o', '--output', dest='output', help='output consensus file')
cstrittmatter@0 21 args = parser.parse_args()
cstrittmatter@0 22
cstrittmatter@0 23 sequences = []
cstrittmatter@0 24 varsequences = []
cstrittmatter@0 25 # read input
cstrittmatter@0 26 with open(args.input) as alignments:
cstrittmatter@0 27 for alignment in alignments:
cstrittmatter@0 28 if alignment[0] != ">":
cstrittmatter@0 29 sequences.append(alignment.rstrip())
cstrittmatter@0 30 numsequences = len(sequences)
cstrittmatter@0 31 for j in range(0, numsequences + 1):
cstrittmatter@0 32 varsequences.append("")
cstrittmatter@0 33 lstnumvariants = []
cstrittmatter@0 34 lstnumhyphens = []
cstrittmatter@0 35 # loop over the columns
cstrittmatter@0 36 for i in range(0, len(sequences[0])):
cstrittmatter@0 37 variants = []
cstrittmatter@0 38 numhyphens = 0
cstrittmatter@0 39 # loop over the original rows to obtain variants
cstrittmatter@0 40 for j in range(0, numsequences):
cstrittmatter@0 41 if sequences[j][i:i+1] == "-":
cstrittmatter@0 42 numhyphens = numhyphens + 1
cstrittmatter@0 43 if sequences[j][i:i+1] not in variants and sequences[j][i:i+1] != "-":
cstrittmatter@0 44 variants.append(sequences[j][i:i+1])
cstrittmatter@0 45 lstnumvariants.append(len(variants))
cstrittmatter@0 46 lstnumhyphens.append(numhyphens)
cstrittmatter@0 47 # create varsequences with a template of the variants
cstrittmatter@0 48 for j in range(0, numsequences):
cstrittmatter@0 49 if lstnumhyphens[i] == 0:
cstrittmatter@0 50 varsequences[j] = varsequences[j] + "-"
cstrittmatter@0 51 elif lstnumvariants[i] < 2:
cstrittmatter@0 52 varsequences[j] = varsequences[j] + "-"
cstrittmatter@0 53 else:
cstrittmatter@0 54 varsequences[j] = varsequences[j] + sequences[j][i:i+1]
cstrittmatter@0 55 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
cstrittmatter@0 56 varsequences[numsequences] = varsequences[numsequences] + variants[0]
cstrittmatter@0 57 else:
cstrittmatter@0 58 varsequences[numsequences] = varsequences[numsequences] + "-"
cstrittmatter@0 59 # loop over the columns, apply single variant
cstrittmatter@0 60 for i in range(0, len(sequences[0])):
cstrittmatter@0 61 if lstnumvariants[i] == 1 and lstnumhyphens[i] > 0:
cstrittmatter@0 62 # loop over all the rows to apply single variant to "-"
cstrittmatter@0 63 for j in range(0, len(sequences)):
cstrittmatter@0 64 if sequences[j][i:i+1] == "-":
cstrittmatter@0 65 lstsequence = list(sequences[j])
cstrittmatter@0 66 lstsequence[i] = varsequences[numsequences][i:i+1]
cstrittmatter@0 67 sequences[j] = ''.join(lstsequence)
cstrittmatter@0 68 # loop over the rows of the sequences, apply multiple variants
cstrittmatter@0 69 for j in range(0, len(sequences)):
cstrittmatter@0 70 # loop over the columns
cstrittmatter@0 71 for i in range(0, len(sequences[0])):
cstrittmatter@0 72 variants = []
cstrittmatter@0 73 if sequences[j][i:i+1] == "-" and lstnumvariants[i] > 1:
cstrittmatter@0 74 # loop over the rows of the varsequences
cstrittmatter@0 75 for k in range(0, numsequences):
cstrittmatter@0 76 if varsequences[k][i:i+1] not in variants and varsequences[k][i:i+1] != "-":
cstrittmatter@0 77 variants.append(varsequences[k][i:i+1])
cstrittmatter@0 78 if len(variants) == 1:
cstrittmatter@0 79 lstsequence = list(sequences[j])
cstrittmatter@0 80 lstsequence[i] = variants[0]
cstrittmatter@0 81 sequences[j] = ''.join(lstsequence)
cstrittmatter@0 82 else:
cstrittmatter@0 83 lstsequence[i] = variants[len(variants) - 1]
cstrittmatter@0 84 sequences.append(''.join(lstsequence))
cstrittmatter@0 85 # eliminate duplicate sequences
cstrittmatter@0 86 sequences_unique = list(set(sequences))
cstrittmatter@0 87 # write consensus sequences to output
cstrittmatter@0 88 consensus = open(args.output, 'w')
cstrittmatter@0 89 n = 1
cstrittmatter@0 90 for sequence in sequences_unique:
cstrittmatter@0 91 consensus.write(">consensus_" + str(n) + "\n")
cstrittmatter@0 92 consensus.write(sequence + "\n")
cstrittmatter@0 93 n = n + 1
cstrittmatter@0 94
cstrittmatter@0 95
cstrittmatter@0 96 if __name__ == "__main__":
cstrittmatter@0 97 __main__()