'''Tools for working with files in the samtools pileup -c format.'''
import collections
import pysam

# Record for a substitution (non-indel) line.  The field order matches
# the column order consumed by :func:`iterate`.
PileupSubstitution = collections.namedtuple(
    "PileupSubstitution",
    (
        "chromosome",
        "pos",
        "reference_base",
        "genotype",
        "consensus_quality",
        "snp_quality",
        "mapping_quality",
        "coverage",
        "read_bases",
        "base_qualities",
    ))

# Record for an indel line (third column is "*").  The field order
# matches the column order consumed by :func:`iterate`.
PileupIndel = collections.namedtuple(
    "PileupIndel",
    (
        "chromosome",
        "pos",
        "reference_base",
        "genotype",
        "consensus_quality",
        "snp_quality",
        "mapping_quality",
        "coverage",
        "first_allele",
        "second_allele",
        "reads_first",
        "reads_second",
        "reads_diff",
    ))


def iterate(infile):
    '''iterate over ``samtools pileup -c`` formatted file.

    *infile* can be any iterator over lines.

    The function yields named tuples of the type
    :class:`pysam.Pileup.PileupSubstitution`
    or :class:`pysam.Pileup.PileupIndel`. A line is treated as an
    indel record when its third column is ``*``.

    Columns are split on whitespace. Columns beyond those of the
    record type are ignored. A line with fewer than three columns,
    with too few columns for its record type, or with a column that
    can not be converted raises :class:`pysam.SamtoolsError`.

    .. note::

       The parser converts to 0-based coordinates
    '''

    def to_zero_based(x):
        return int(x) - 1

    conv_subst = (str, to_zero_based, str,
                  str, int, int, int, int, str, str)
    conv_indel = (str, to_zero_based, str, str, int,
                  int, int, int, str, str, int, int, int)

    for line in infile:
        # split() without arguments already discards the line
        # terminator; slicing off the last character first would
        # destroy data on a final line without a trailing newline.
        fields = line.split()
        try:
            if fields[2] == "*":
                record = PileupIndel(
                    *[conv(value)
                      for conv, value in zip(conv_indel, fields)])
            else:
                record = PileupSubstitution(
                    *[conv(value)
                      for conv, value in zip(conv_subst, fields)])
        except (IndexError, TypeError, ValueError):
            # IndexError: fewer than three columns (e.g. blank line)
            # TypeError: too few columns for the named tuple
            # ValueError: a numeric column is not an integer
            raise pysam.SamtoolsError(
                "parsing error in line: `%s`" % line)
        yield record


# Maps one- or two-letter genotypes to a one-letter code.
# Heterozygous genotypes map to a lower case letter when the first
# base sorts before the second and to upper case otherwise.
ENCODE_GENOTYPE = {
    'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
    'AA': 'A', 'CC': 'C', 'GG': 'G', 'TT': 'T', 'UU': 'U',
    'AG': 'r', 'GA': 'R',
    'CT': 'y', 'TC': 'Y',
    'AC': 'm', 'CA': 'M',
    'GT': 'k', 'TG': 'K',
    'CG': 's', 'GC': 'S',
    'AT': 'w', 'TA': 'W',
}

# Maps one-letter codes back to two-letter genotypes.  Both cases of
# a heterozygous code decode to the same, alphabetically ordered pair.
DECODE_GENOTYPE = {
    'A': 'AA',
    'C': 'CC',
    'G': 'GG',
    'T': 'TT',
    'r': 'AG', 'R': 'AG',
    'y': 'CT', 'Y': 'CT',
    'm': 'AC', 'M': 'AC',
    'k': 'GT', 'K': 'GT',
    's': 'CG', 'S': 'CG',
    'w': 'AT', 'W': 'AT',
}

# ------------------------------------------------------------


def encodeGenotype(code):
    '''encode genotypes like GG, GA into a one-letter code.
    The returned code is lower case if code[0] < code[1], otherwise
    it is uppercase.

    *code* is upper-cased before the lookup; a genotype that is not
    in ``ENCODE_GENOTYPE`` raises :class:`KeyError`.
    '''
    return ENCODE_GENOTYPE[code.upper()]


def decodeGenotype(code):
    '''decode single letter genotypes like m, M into two letters.
    This is the reverse operation to :meth:`encodeGenotype`.

    The lookup is case sensitive; a code that is not in
    ``DECODE_GENOTYPE`` raises :class:`KeyError`.
    '''
    return DECODE_GENOTYPE[code]


def translateIndelGenotypeFromVCF(vcf_genotypes, ref):
    '''translate indel from vcf to pileup format.

    *vcf_genotypes* is a sequence of allele strings and *ref* the
    reference allele string. Each allele becomes ``*`` (same as
    reference), ``-<bases>`` (deletion) or ``+<bases>`` (insertion).

    Returns a tuple of the translated alleles joined by ``/`` and the
    offset shared by all non-reference alleles.

    Raises :class:`ValueError` if an allele can not be expressed as a
    single insertion or deletion relative to *ref*.
    '''

    # indels
    def getPrefix(s1, s2):
        '''get common prefix of strings s1 and s2.'''
        n = min(len(s1), len(s2))
        for x in range(n):
            if s1[x] != s2[x]:
                return s1[:x]
        return s1[:n]

    def getSuffix(s1, s2):
        '''get common suffix of strings s1 and s2.'''
        n = min(len(s1), len(s2))
        if s1[-1] != s2[-1]:
            return ""
        for x in range(-2, -n - 1, -1):
            if s1[x] != s2[x]:
                return s1[x + 1:]
        return s1[-n:]

    def getGenotype(variant, ref):
        '''return (pileup allele, offset) for a single allele.'''

        if variant == ref:
            return "*", 0

        if len(ref) > len(variant):
            # is a deletion
            if ref.startswith(variant):
                return "-%s" % ref[len(variant):], len(variant) - 1
            elif ref.endswith(variant):
                return "-%s" % ref[:-len(variant)], -1
            else:
                prefix = getPrefix(ref, variant)
                suffix = getSuffix(ref, variant)
                # number of variant bases claimed by both prefix and
                # suffix; negative means the middle bases differ too
                shared = len(prefix) + len(suffix) - len(variant)
                if shared < 0:
                    raise ValueError()
                return ("-%s" % ref[len(prefix):-(len(suffix) - shared)],
                        len(prefix) - 1)

        elif len(ref) < len(variant):
            # is an insertion
            if variant.startswith(ref):
                return "+%s" % variant[len(ref):], len(ref) - 1
            elif variant.endswith(ref):
                # the inserted bases are everything in front of the
                # trailing copy of ref (mirrors the deletion case)
                return "+%s" % variant[:-len(ref)], 0
            else:
                prefix = getPrefix(ref, variant)
                suffix = getSuffix(ref, variant)
                shared = len(prefix) + len(suffix) - len(ref)
                if shared < 0:
                    raise ValueError()

                return ("+%s" % variant[len(prefix):-(len(suffix) - shared)],
                        len(prefix))
        else:
            # same length but different sequence
            assert 0, "snp?"

    # in pileup, the position refers to the base
    # after the coordinate, hence subtract 1
    # pos -= 1

    genotypes, offsets = [], []
    is_error = True

    for variant in vcf_genotypes:
        try:
            g, offset = getGenotype(variant, ref)
        except ValueError:
            break

        genotypes.append(g)
        if g != "*":
            offsets.append(offset)

    else:
        # loop finished without break: every allele was translated
        is_error = False

    if is_error:
        raise ValueError()

    # also fails when no allele differs from the reference, as
    # offsets is empty in that case
    assert len(set(offsets)) == 1, "multiple offsets for indel"
    offset = offsets[0]

    genotypes = "/".join(genotypes)
    return genotypes, offset


def vcf2pileup(vcf, sample):
    '''convert vcf record to pileup record.

    *vcf* is a record providing ``contig``, ``pos``, ``ref``, ``alt``
    and ``info`` attributes and per-sample data via ``vcf[sample]``.

    Returns a :class:`PileupIndel` if the reference or any alternative
    allele is longer than one base, otherwise a
    :class:`PileupSubstitution`. Returns None if the genotype of
    *sample* starts with ``.``.

    Raises :class:`ValueError` if the sample has more than one
    genotype at the position.
    '''

    chromosome = vcf.contig
    pos = vcf.pos
    reference = vcf.ref
    alleles = [reference] + vcf.alt

    data = vcf[sample]

    # get genotype
    genotypes = data["GT"]
    if len(genotypes) > 1:
        raise ValueError("only single genotype per position, %s" % (str(vcf)))

    genotypes = genotypes[0]

    # not a variant
    if genotypes[0] == ".":
        return None

    # skip the allele separators; "|" marks a phased genotype
    genotypes = [alleles[int(x)] for x in genotypes if x not in ("/", "|")]

    # snp_quality is "genotype quality"
    snp_quality = consensus_quality = data.get("GQ", [0])[0]
    mapping_quality = vcf.info.get("MQ", [0])[0]
    coverage = data.get("DP", 0)

    # any() copes with records that have no alternative allele,
    # where max() over an empty list would raise
    if len(reference) > 1 or any(len(x) > 1 for x in vcf.alt):
        # indel
        genotype, offset = translateIndelGenotypeFromVCF(genotypes, reference)

        return PileupIndel(chromosome,
                           pos + offset,
                           "*",
                           genotype,
                           consensus_quality,
                           snp_quality,
                           mapping_quality,
                           coverage,
                           genotype,
                           "<" * len(genotype),
                           0,
                           0,
                           0)

    else:
        genotype = encodeGenotype("".join(genotypes))
        # read-level information is not available from the vcf record
        read_bases = ""
        base_qualities = ""

        return PileupSubstitution(chromosome, pos, reference,
                                  genotype, consensus_quality,
                                  snp_quality, mapping_quality,
                                  coverage, read_bases,
                                  base_qualities)


def iterate_from_vcf(infile, sample):
    '''iterate over a vcf-formatted file.

    *infile* can be any iterator over lines.

    The function yields named tuples of the type
    :class:`pysam.Pileup.PileupSubstitution` or
    :class:`pysam.Pileup.PileupIndel`.

    Positions without a snp will be skipped.

    Raises :class:`KeyError` if *sample* is not among the samples of
    the vcf file.

    This method is wasteful and written to support some legacy code
    that expects samtools pileup output.

    Better use the vcf parser directly.

    '''
    vcf = pysam.VCF()
    vcf.connect(infile)

    if sample not in vcf.getsamples():
        raise KeyError("sample %s not in vcf file" % sample)

    for row in vcf.fetch():
        result = vcf2pileup(row, sample)
        if result:
            yield result