csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/Pileup.py comparison

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/Pileup.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

comparison

equal deleted inserted replaced

-:0e9998148a16
+:33d812a61356
+'''Tools for working with files in the samtools pileup -c format.'''
+import collections
+import pysam
# Record type for a substitution (non-indel) pileup line. ``iterate`` fills
# the fields positionally from the whitespace-separated columns of a line.
PileupSubstitution = collections.namedtuple(
    "PileupSubstitution",
    "chromosome pos reference_base genotype "
    "consensus_quality snp_quality mapping_quality coverage "
    "read_bases base_qualities")
# Record type for an indel pileup line (third column is ``*``). ``iterate``
# fills the fields positionally from the whitespace-separated columns.
PileupIndel = collections.namedtuple(
    "PileupIndel",
    "chromosome pos reference_base genotype "
    "consensus_quality snp_quality mapping_quality coverage "
    "first_allele second_allele "
    "reads_first reads_second reads_diff")
def iterate(infile):
    '''Iterate over a ``samtools pileup -c`` formatted file.

    *infile* can be any iterator over lines.

    The function yields named tuples of the type
    :class:`pysam.Pileup.PileupSubstitution` or
    :class:`pysam.Pileup.PileupIndel`. A line is treated as an indel
    line when its third column is ``*``.

    Raises :class:`pysam.SamtoolsError` if a line has too few columns
    or a column cannot be converted to its expected type.

    .. note::
       The parser converts to 0-based coordinates
    '''
    def to_zero_based(field):
        return int(field) - 1

    conv_subst = (str, to_zero_based, str, str,
                  int, int, int, int, str, str)
    conv_indel = (str, to_zero_based, str, str,
                  int, int, int, int, str, str, int, int, int)

    for line in infile:
        # split() already discards the trailing newline; slicing off the
        # last character would truncate a final line without a newline.
        fields = line.split()
        if len(fields) < 3:
            raise pysam.SamtoolsError("parsing error in line: `%s`" % line)

        if fields[2] == "*":
            record_type, converters = PileupIndel, conv_indel
        else:
            record_type, converters = PileupSubstitution, conv_subst

        try:
            # zip() truncates to the shorter sequence: missing columns make
            # the namedtuple constructor raise TypeError, malformed integers
            # make int() raise ValueError.
            record = record_type(
                *[conv(value) for conv, value in zip(converters, fields)])
        except (TypeError, ValueError):
            raise pysam.SamtoolsError("parsing error in line: `%s`" % line)

        # yield outside the try block so that exceptions thrown into the
        # generator by the consumer are not mistaken for parsing errors.
        yield record
# Lookup table from one- or two-base genotypes to a one-letter code.
ENCODE_GENOTYPE = {
    # single bases map to themselves
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    # homozygous genotypes map to the base
    'AA': 'A',
    'CC': 'C',
    'GG': 'G',
    'TT': 'T',
    'UU': 'U',
    # heterozygous genotypes map to the IUPAC ambiguity letter: lower
    # case when the first base sorts before the second, else upper case
    'AG': 'r',
    'GA': 'R',
    'CT': 'y',
    'TC': 'Y',
    'AC': 'm',
    'CA': 'M',
    'GT': 'k',
    'TG': 'K',
    'CG': 's',
    'GC': 'S',
    'AT': 'w',
    'TA': 'W',
}
# Lookup table from a one-letter code back to a two-base genotype.
DECODE_GENOTYPE = {
    # plain bases decode to the homozygous genotype
    'A': 'AA',
    'C': 'CC',
    'G': 'GG',
    'T': 'TT',
    # both cases of an ambiguity letter decode to the same two bases
    'r': 'AG',
    'R': 'AG',
    'y': 'CT',
    'Y': 'CT',
    'm': 'AC',
    'M': 'AC',
    'k': 'GT',
    'K': 'GT',
    's': 'CG',
    'S': 'CG',
    'w': 'AT',
    'W': 'AT',
}
+# ------------------------------------------------------------
def encodeGenotype(code):
    '''Return the one-letter code for a genotype such as GG or GA.

    *code* is upper-cased before the lookup in ``ENCODE_GENOTYPE``.
    For two different bases the returned letter is lower case if
    code[0] < code[1], otherwise it is upper case.

    Raises KeyError if the genotype is not in the table.
    '''
    key = code.upper()
    return ENCODE_GENOTYPE[key]
def decodeGenotype(code):
    '''Return the two-letter genotype for a one-letter code like m or M.

    This is the reverse lookup to :meth:`encodeGenotype`, using
    ``DECODE_GENOTYPE``. *code* is looked up as given, without case
    conversion.

    Raises KeyError if the code is not in the table.
    '''
    two_letter = DECODE_GENOTYPE[code]
    return two_letter
def translateIndelGenotypeFromVCF(vcf_genotypes, ref):
    '''Translate indel alleles from vcf to pileup notation.

    *vcf_genotypes* is a sequence of allele strings and *ref* is the
    reference allele string they are compared against.

    Returns a tuple ``(genotypes, offset)``. *genotypes* is the
    ``/``-joined pileup notation of the alleles: ``*`` for an allele
    equal to *ref*, ``-BASES`` for a deletion and ``+BASES`` for an
    insertion. *offset* is the position offset shared by all
    non-reference alleles.

    Raises ValueError if an allele cannot be described as a single
    insertion or deletion relative to *ref*. Raises AssertionError if an
    allele differs from *ref* but has the same length, or if the
    non-reference alleles do not yield exactly one distinct offset
    (which includes the case of no non-reference allele at all).
    '''

    def getPrefix(s1, s2):
        '''get common prefix of strings s1 and s2.'''
        n = 0
        for a, b in zip(s1, s2):
            if a != b:
                break
            n += 1
        return s1[:n]

    def getSuffix(s1, s2):
        '''get common suffix of strings s1 and s2.'''
        n = 0
        for a, b in zip(reversed(s1), reversed(s2)):
            if a != b:
                break
            n += 1
        return s1[len(s1) - n:]

    def getGenotype(variant, ref):
        '''return (pileup notation, offset) for a single allele.'''
        if variant == ref:
            return "*", 0

        if len(ref) == len(variant):
            # explicit raise instead of ``assert`` so that the check is
            # not removed when running with -O
            raise AssertionError("snp?")

        # NOTE(review): the insertion branches below return offsets that
        # are one larger than the corresponding deletion branches
        # (0 vs. -1, len(prefix) vs. len(prefix) - 1). Kept as in the
        # original - confirm that this is intended.
        if len(ref) > len(variant):
            # is a deletion
            if ref.startswith(variant):
                return "-%s" % ref[len(variant):], len(variant) - 1
            if ref.endswith(variant):
                return "-%s" % ref[:-len(variant)], -1

            prefix = getPrefix(ref, variant)
            suffix = getSuffix(ref, variant)
            # number of bases of the shorter allele that are claimed by
            # both prefix and suffix
            shared = len(prefix) + len(suffix) - len(variant)
            if shared < 0:
                raise ValueError(
                    "cannot translate allele `%s` against reference `%s`"
                    % (variant, ref))
            # tail > 0 here because the startswith case was handled above
            tail = len(suffix) - shared
            return "-%s" % ref[len(prefix):-tail], len(prefix) - 1

        # is an insertion
        if variant.startswith(ref):
            return "+%s" % variant[len(ref):], len(ref) - 1
        if variant.endswith(ref):
            # the inserted bases are everything in front of the trailing
            # copy of ref (mirrors the deletion case above)
            return "+%s" % variant[:-len(ref)], 0

        prefix = getPrefix(ref, variant)
        suffix = getSuffix(ref, variant)
        shared = len(prefix) + len(suffix) - len(ref)
        if shared < 0:
            raise ValueError(
                "cannot translate allele `%s` against reference `%s`"
                % (variant, ref))
        tail = len(suffix) - shared
        return "+%s" % variant[len(prefix):-tail], len(prefix)

    genotypes, offsets = [], []
    for variant in vcf_genotypes:
        # a ValueError from getGenotype propagates to the caller
        g, offset = getGenotype(variant, ref)
        genotypes.append(g)
        if g != "*":
            offsets.append(offset)

    if len(set(offsets)) != 1:
        raise AssertionError("multiple offsets for indel")

    return "/".join(genotypes), offsets[0]
def vcf2pileup(vcf, sample):
    '''Convert a vcf record to a pileup record.

    *vcf* is a record that provides the attributes ``contig``, ``pos``,
    ``ref``, ``alt`` and ``info`` and can be indexed by *sample* to get
    the per-sample fields.

    Returns a :class:`PileupIndel` if the reference or any alternative
    allele is longer than one base, otherwise a
    :class:`PileupSubstitution`. Returns None if the first genotype
    entry is ``.``.

    Raises ValueError if the sample has more than one genotype call at
    this position.
    '''
    chromosome = vcf.contig
    pos = vcf.pos
    reference = vcf.ref
    # copy into a list so that concatenation works for any sequence type
    alternatives = list(vcf.alt)
    alleles = [reference] + alternatives

    data = vcf[sample]

    # get genotype
    # NOTE(review): assumes data["GT"] holds one entry per genotype
    # call, each a sequence of allele indices and separators - confirm
    # against the VCF parser.
    genotypes = data["GT"]
    if len(genotypes) > 1:
        raise ValueError("only single genotype per position, %s" % (str(vcf)))
    genotypes = genotypes[0]

    # not a variant
    if genotypes[0] == ".":
        return None

    # skip the allele separators: "/" (unphased) and "|" (phased)
    genotypes = [alleles[int(x)] for x in genotypes if x not in ("/", "|")]

    # snp_quality is "genotype quality"
    snp_quality = consensus_quality = data.get("GQ", [0])[0]
    mapping_quality = vcf.info.get("MQ", [0])[0]
    coverage = data.get("DP", 0)

    # any() copes with an empty list of alternative alleles, where max()
    # would raise
    is_indel = len(reference) > 1 or any(len(x) > 1 for x in alternatives)

    if is_indel:
        genotype, offset = translateIndelGenotypeFromVCF(genotypes, reference)
        return PileupIndel(chromosome,
                           pos + offset,
                           "*",
                           genotype,
                           consensus_quality,
                           snp_quality,
                           mapping_quality,
                           coverage,
                           genotype,
                           "<" * len(genotype),
                           0,
                           0,
                           0)

    genotype = encodeGenotype("".join(genotypes))
    # read bases and base qualities are left empty for records built
    # from vcf
    read_bases = ""
    base_qualities = ""
    return PileupSubstitution(chromosome, pos, reference,
                              genotype, consensus_quality,
                              snp_quality, mapping_quality,
                              coverage, read_bases,
                              base_qualities)
def iterate_from_vcf(infile, sample):
    '''Iterate over a vcf-formatted file.

    *infile* can be any iterator over lines.

    The function yields named tuples of the type
    :class:`pysam.Pileup.PileupSubstitution` or
    :class:`pysam.Pileup.PileupIndel` for the given *sample*.

    Positions without a snp will be skipped.

    Raises KeyError if *sample* is not among the samples of the file.

    This method is wasteful and written to support some legacy code
    that expects samtools pileup output.

    Better use the vcf parser directly.
    '''
    vcf = pysam.VCF()
    vcf.connect(infile)

    if sample not in vcf.getsamples():
        # interpolate the sample name; the original message left the
        # placeholder unfilled
        raise KeyError("sample %s not in vcf file" % sample)

    for row in vcf.fetch():
        result = vcf2pileup(row, sample)
        # vcf2pileup returns None for positions without a called variant
        if result is not None:
            yield result

Mercurial > repos > rliterman > csp2

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/Pileup.py @ 69:33d812a61356