csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcvcf.pyx annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcvcf.pyx @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # cython: embedsignature=True
jpayne@68	2 #
jpayne@68	3 # Code to read, write and edit VCF files
jpayne@68	4 #
jpayne@68	5 # VCF lines are encoded as a dictionary with these keys (note: all lowercase):
jpayne@68	6 # 'chrom': string
jpayne@68	7 # 'pos': integer
jpayne@68	8 # 'id': string
jpayne@68	9 # 'ref': string
jpayne@68	10 # 'alt': list of strings
jpayne@68	11 # 'qual': integer
jpayne@68	12 # 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
jpayne@68	13 # 'info': dictionary of values (see below)
jpayne@68	14 # 'format': list of keys (strings)
jpayne@68	15 # sample keys: dictionary of values (see below)
jpayne@68	16 #
jpayne@68	17 # The sample keys are accessible through vcf.getsamples()
jpayne@68	18 #
jpayne@68	19 # A dictionary of values contains value keys (defined in ##INFO or
jpayne@68	20 # ##FORMAT lines) which map to a list, containing integers, floats,
jpayne@68	21 # strings, or characters. Missing values are replaced by a particular
jpayne@68	22 # value, often -1 or .
jpayne@68	23 #
jpayne@68	24 # Genotypes are not stored as a string, but as a list of 1 or 3
jpayne@68	25 # elements (for haploid and diploid samples), the first (and last) the
jpayne@68	26 # integer representing an allele, and the second the separation
jpayne@68	27 # character. Note that there is just one genotype per sample, but for
jpayne@68	28 # consistency the single element is stored in a list.
jpayne@68	29 #
jpayne@68	30 # Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
jpayne@68	31 # (key, value) pairs and are accessible through getheader()
jpayne@68	32 #
jpayne@68	33 # The VCF class can be instantiated with a 'regions' variable
jpayne@68	34 # consisting of tuples (chrom,start,end) encoding 0-based half-open
jpayne@68	35 # segments. Only variants with a position inside the segment will be
jpayne@68	36 # parsed. A regions parser is available under parse_regions.
jpayne@68	37 #
jpayne@68	38 # When instantiated, a reference can be passed to the VCF class. This
jpayne@68	39 # may be any class that supports a fetch(chrom, start, end) method.
jpayne@68	40 #
jpayne@68	41 # NOTE: the position that is returned to Python is 0-based, NOT
jpayne@68	42 # 1-based as in the VCF file.
jpayne@68	43 # NOTE: There is also preliminary VCF functionality in the VariantFile class.
jpayne@68	44 #
jpayne@68	45 # TODO:
jpayne@68	46 # only v4.0 writing is complete; alleles are not converted to v3.3 format
jpayne@68	47 #
jpayne@68	48
jpayne@68	49 from collections import namedtuple, defaultdict
jpayne@68	50 from operator import itemgetter
jpayne@68	51 import sys, re, copy, bisect
jpayne@68	52
jpayne@68	53 from libc.stdlib cimport atoi
jpayne@68	54 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
jpayne@68	55 from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
jpayne@68	56
jpayne@68	57 cimport pysam.libctabix as libctabix
jpayne@68	58 cimport pysam.libctabixproxies as libctabixproxies
jpayne@68	59
jpayne@68	60 from pysam.libcutils cimport force_str
jpayne@68	61
jpayne@68	62 import pysam
jpayne@68	63
jpayne@68	64 gtsRegEx = re.compile("[\|/\\\\]")
jpayne@68	65 alleleRegEx = re.compile('^[ACGTN]+$')
jpayne@68	66
jpayne@68	67 # Utility function. Uses 0-based coordinates
jpayne@68	68 def get_sequence(chrom, start, end, fa):
jpayne@68	69 # obtain sequence from .fa file, without truncation
jpayne@68	70 if end<=start: return ""
jpayne@68	71 if not fa: return "N"*(end-start)
jpayne@68	72 if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
jpayne@68	73 sequence = fa.fetch(chrom, start, end).upper()
jpayne@68	74 if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
jpayne@68	75 return sequence
jpayne@68	76
jpayne@68	77 # Utility function. Parses a region string
jpayne@68	78 def parse_regions( string ):
jpayne@68	79 result = []
jpayne@68	80 for r in string.split(','):
jpayne@68	81 elts = r.split(':')
jpayne@68	82 chrom, start, end = elts[0], 0, 3000000000
jpayne@68	83 if len(elts)==1: pass
jpayne@68	84 elif len(elts)==2:
jpayne@68	85 if len(elts[1])>0:
jpayne@68	86 ielts = elts[1].split('-')
jpayne@68	87 if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r)
jpayne@68	88 try: start, end = int(ielts[0])-1, int(ielts[1])
jpayne@68	89 except: raise ValueError("Don't understand region string '%s'" % r)
jpayne@68	90 else:
jpayne@68	91 raise ValueError("Don't understand region string '%s'" % r)
jpayne@68	92 result.append( (chrom,start,end) )
jpayne@68	93 return result
jpayne@68	94
jpayne@68	95
jpayne@68	96 FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
jpayne@68	97
jpayne@68	98 ###########################################################################################################
jpayne@68	99 #
jpayne@68	100 # New class
jpayne@68	101 #
jpayne@68	102 ###########################################################################################################
jpayne@68	103
jpayne@68	104 cdef class VCFRecord(libctabixproxies.TupleProxy):
jpayne@68	105 '''vcf record.
jpayne@68	106
jpayne@68	107 initialized from data and vcf meta
jpayne@68	108 '''
jpayne@68	109
jpayne@68	110 cdef vcf
jpayne@68	111 cdef char * contig
jpayne@68	112 cdef uint32_t pos
jpayne@68	113
jpayne@68	114 def __init__(self, vcf):
jpayne@68	115 self.vcf = vcf
jpayne@68	116 self.encoding = vcf.encoding
jpayne@68	117
jpayne@68	118 # if len(data) != len(self.vcf._samples):
jpayne@68	119 # self.vcf.error(str(data),
jpayne@68	120 # self.BAD_NUMBER_OF_COLUMNS,
jpayne@68	121 # "expected %s for %s samples (%s), got %s" % \
jpayne@68	122 # (len(self.vcf._samples),
jpayne@68	123 # len(self.vcf._samples),
jpayne@68	124 # self.vcf._samples,
jpayne@68	125 # len(data)))
jpayne@68	126
jpayne@68	127 def __cinit__(self, vcf):
jpayne@68	128 # start indexed access at genotypes
jpayne@68	129 self.offset = 9
jpayne@68	130
jpayne@68	131 self.vcf = vcf
jpayne@68	132 self.encoding = vcf.encoding
jpayne@68	133
jpayne@68	134 def error(self, line, error, opt=None):
jpayne@68	135 '''raise error.'''
jpayne@68	136 # pass to vcf file for error handling
jpayne@68	137 return self.vcf.error(line, error, opt)
jpayne@68	138
jpayne@68	139 cdef update(self, char * buffer, size_t nbytes):
jpayne@68	140 '''update internal data.
jpayne@68	141
jpayne@68	142 nbytes does not include the terminal '\0'.
jpayne@68	143 '''
jpayne@68	144 libctabixproxies.TupleProxy.update(self, buffer, nbytes)
jpayne@68	145
jpayne@68	146 self.contig = self.fields[0]
jpayne@68	147 # vcf counts from 1 - correct here
jpayne@68	148 self.pos = atoi(self.fields[1]) - 1
jpayne@68	149
jpayne@68	150 def __len__(self):
jpayne@68	151 return max(0, self.nfields - 9)
jpayne@68	152
jpayne@68	153 property contig:
jpayne@68	154 def __get__(self): return self.contig
jpayne@68	155
jpayne@68	156 property pos:
jpayne@68	157 def __get__(self): return self.pos
jpayne@68	158
jpayne@68	159 property id:
jpayne@68	160 def __get__(self): return self.fields[2]
jpayne@68	161
jpayne@68	162 property ref:
jpayne@68	163 def __get__(self):
jpayne@68	164 return self.fields[3]
jpayne@68	165
jpayne@68	166 property alt:
jpayne@68	167 def __get__(self):
jpayne@68	168 # convert v3.3 to v4.0 alleles below
jpayne@68	169 alt = self.fields[4]
jpayne@68	170 if alt == ".": alt = []
jpayne@68	171 else: alt = alt.upper().split(',')
jpayne@68	172 return alt
jpayne@68	173
jpayne@68	174 property qual:
jpayne@68	175 def __get__(self):
jpayne@68	176 qual = self.fields[5]
jpayne@68	177 if qual == b".": qual = -1
jpayne@68	178 else:
jpayne@68	179 try: qual = float(qual)
jpayne@68	180 except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
jpayne@68	181 return qual
jpayne@68	182
jpayne@68	183 property filter:
jpayne@68	184 def __get__(self):
jpayne@68	185 f = self.fields[6]
jpayne@68	186 # postpone checking that filters exist. Encode missing filter or no filtering as empty list
jpayne@68	187 if f == b"." or f == b"PASS" or f == b"0": return []
jpayne@68	188 else: return f.split(';')
jpayne@68	189
jpayne@68	190 property info:
jpayne@68	191 def __get__(self):
jpayne@68	192 col = self.fields[7]
jpayne@68	193 # dictionary of keys, and list of values
jpayne@68	194 info = {}
jpayne@68	195 if col != b".":
jpayne@68	196 for blurp in col.split(';'):
jpayne@68	197 elts = blurp.split('=')
jpayne@68	198 if len(elts) == 1: v = None
jpayne@68	199 elif len(elts) == 2: v = elts[1]
jpayne@68	200 else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
jpayne@68	201 info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf))
jpayne@68	202 return info
jpayne@68	203
jpayne@68	204 property format:
jpayne@68	205 def __get__(self):
jpayne@68	206 return self.fields[8].split(':')
jpayne@68	207
jpayne@68	208 property samples:
jpayne@68	209 def __get__(self):
jpayne@68	210 return self.vcf._samples
jpayne@68	211
jpayne@68	212 def __getitem__(self, key):
jpayne@68	213
jpayne@68	214 # parse sample columns
jpayne@68	215 values = self.fields[self.vcf._sample2column[key]].split(':')
jpayne@68	216 alt = self.alt
jpayne@68	217 format = self.format
jpayne@68	218
jpayne@68	219 if len(values) > len(format):
jpayne@68	220 self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
jpayne@68	221 (len(values),key,len(format)))
jpayne@68	222
jpayne@68	223 result = {}
jpayne@68	224 for idx in range(len(format)):
jpayne@68	225 expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
jpayne@68	226 if idx < len(values): value = values[idx]
jpayne@68	227 else:
jpayne@68	228 if expected == -1: value = "."
jpayne@68	229 else: value = ",".join(["."]*expected)
jpayne@68	230
jpayne@68	231 result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
jpayne@68	232 if expected != -1 and len(result[format[idx]]) != expected:
jpayne@68	233 self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
jpayne@68	234 "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
jpayne@68	235 if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
jpayne@68	236 result[format[idx]] = result[format[idx]][:expected]
jpayne@68	237
jpayne@68	238 return result
jpayne@68	239
jpayne@68	240
jpayne@68	241 cdef class asVCFRecord(libctabix.Parser):
jpayne@68	242 '''converts a :term:`tabix row` into a VCF record.'''
jpayne@68	243 cdef vcffile
jpayne@68	244 def __init__(self, vcffile):
jpayne@68	245 self.vcffile = vcffile
jpayne@68	246
jpayne@68	247 cdef parse(self, char * buffer, int len):
jpayne@68	248 cdef VCFRecord r
jpayne@68	249 r = VCFRecord(self.vcffile)
jpayne@68	250 r.copy(buffer, len)
jpayne@68	251 return r
jpayne@68	252
jpayne@68	253 class VCF(object):
jpayne@68	254
jpayne@68	255 # types
jpayne@68	256 NT_UNKNOWN = 0
jpayne@68	257 NT_NUMBER = 1
jpayne@68	258 NT_ALLELES = 2
jpayne@68	259 NT_NR_ALLELES = 3
jpayne@68	260 NT_GENOTYPES = 4
jpayne@68	261 NT_PHASED_GENOTYPES = 5
jpayne@68	262
jpayne@68	263 _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
jpayne@68	264 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
jpayne@68	265 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
jpayne@68	266 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
jpayne@68	267 4:"POS_NOT_NUMERICAL:Position column is not numerical",
jpayne@68	268 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
jpayne@68	269 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
jpayne@68	270 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
jpayne@68	271 8:"POS_NOT_POSITIVE:Position field must be >0",
jpayne@68	272 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
jpayne@68	273 10:"ERROR_INFO_STRING:Error while parsing info field",
jpayne@68	274 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
jpayne@68	275 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
jpayne@68	276 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string",
jpayne@68	277 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
jpayne@68	278 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
jpayne@68	279 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)",
jpayne@68	280 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
jpayne@68	281 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
jpayne@68	282 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
jpayne@68	283 20:"MISSING_REF:Reference allele missing",
jpayne@68	284 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
jpayne@68	285 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets",
jpayne@68	286 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
jpayne@68	287 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
jpayne@68	288 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
jpayne@68	289 26:"WRONG_REF:Wrong reference %s",
jpayne@68	290 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
jpayne@68	291 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
jpayne@68	292 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
jpayne@68	293 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
jpayne@68	294 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
jpayne@68	295 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
jpayne@68	296 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
jpayne@68	297 }
jpayne@68	298
jpayne@68	299 # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
jpayne@68	300 _header = []
jpayne@68	301
jpayne@68	302 # version number; 33=v3.3; 40=v4.0
jpayne@68	303 _version = 40
jpayne@68	304
jpayne@68	305 # info, filter and format data
jpayne@68	306 _info = {}
jpayne@68	307 _filter = {}
jpayne@68	308 _format = {}
jpayne@68	309
jpayne@68	310 # header; and required columns
jpayne@68	311 _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
jpayne@68	312 _samples = []
jpayne@68	313
jpayne@68	314 # control behaviour
jpayne@68	315 _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD
jpayne@68	316 _warn_errors = set([])
jpayne@68	317 _leftalign = False
jpayne@68	318
jpayne@68	319 # reference sequence
jpayne@68	320 _reference = None
jpayne@68	321
jpayne@68	322 # regions to include; None includes everything
jpayne@68	323 _regions = None
jpayne@68	324
jpayne@68	325 # statefull stuff
jpayne@68	326 _lineno = -1
jpayne@68	327 _line = None
jpayne@68	328 _lines = None
jpayne@68	329
jpayne@68	330 def __init__(self, _copy=None, reference=None, regions=None,
jpayne@68	331 lines=None, leftalign=False):
jpayne@68	332 # make error identifiers accessible by name
jpayne@68	333 for id in self._errors.keys():
jpayne@68	334 self.__dict__[self._errors[id].split(':')[0]] = id
jpayne@68	335 if _copy != None:
jpayne@68	336 self._leftalign = _copy._leftalign
jpayne@68	337 self._header = _copy._header[:]
jpayne@68	338 self._version = _copy._version
jpayne@68	339 self._info = copy.deepcopy(_copy._info)
jpayne@68	340 self._filter = copy.deepcopy(_copy._filter)
jpayne@68	341 self._format = copy.deepcopy(_copy._format)
jpayne@68	342 self._samples = _copy._samples[:]
jpayne@68	343 self._sample2column = copy.deepcopy(_copy._sample2column)
jpayne@68	344 self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
jpayne@68	345 self._warn_errors = copy.deepcopy(_copy._warn_errors)
jpayne@68	346 self._reference = _copy._reference
jpayne@68	347 self._regions = _copy._regions
jpayne@68	348 if reference: self._reference = reference
jpayne@68	349 if regions: self._regions = regions
jpayne@68	350 if leftalign: self._leftalign = leftalign
jpayne@68	351 self._lines = lines
jpayne@68	352 self.encoding = "ascii"
jpayne@68	353 self.tabixfile = None
jpayne@68	354
jpayne@68	355 def error(self,line,error,opt=None):
jpayne@68	356 if error in self._ignored_errors: return
jpayne@68	357 errorlabel, errorstring = self._errors[error].split(':')
jpayne@68	358 if opt: errorstring = errorstring % opt
jpayne@68	359 errwarn = ["Error","Warning"][error in self._warn_errors]
jpayne@68	360 errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
jpayne@68	361 if error in self._warn_errors: return
jpayne@68	362 raise ValueError(errorstring)
jpayne@68	363
jpayne@68	364 def parse_format(self,line,format,filter=False):
jpayne@68	365 if self._version == 40:
jpayne@68	366 if not format.startswith('<'):
jpayne@68	367 self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
jpayne@68	368 format = "<"+format
jpayne@68	369 if not format.endswith('>'):
jpayne@68	370 self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
jpayne@68	371 format += ">"
jpayne@68	372 format = format[1:-1]
jpayne@68	373 data = {'id':None,'number':None,'type':None,'descr':None}
jpayne@68	374 idx = 0
jpayne@68	375 while len(format.strip())>0:
jpayne@68	376 elts = format.strip().split(',')
jpayne@68	377 first, rest = elts[0], ','.join(elts[1:])
jpayne@68	378 if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
jpayne@68	379 if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
jpayne@68	380 if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	381 first = ["ID=","Number=","Type=","Description="][idx] + first
jpayne@68	382 if first.startswith('ID='): data['id'] = first.split('=')[1]
jpayne@68	383 elif first.startswith('Number='): data['number'] = first.split('=')[1]
jpayne@68	384 elif first.startswith('Type='): data['type'] = first.split('=')[1]
jpayne@68	385 elif first.startswith('Description='):
jpayne@68	386 elts = format.split('"')
jpayne@68	387 if len(elts)<3:
jpayne@68	388 self.error(line,self.FORMAT_MISSING_QUOTES)
jpayne@68	389 elts = first.split('=') + [rest]
jpayne@68	390 data['descr'] = elts[1]
jpayne@68	391 rest = '"'.join(elts[2:])
jpayne@68	392 if rest.startswith(','): rest = rest[1:]
jpayne@68	393 else:
jpayne@68	394 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	395 format = rest
jpayne@68	396 idx += 1
jpayne@68	397 if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
jpayne@68	398 if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	399 if 'descr' not in data:
jpayne@68	400 # missing description
jpayne@68	401 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	402 data['descr'] = ""
jpayne@68	403 if not data['type'] and not data['number']:
jpayne@68	404 # fine, ##filter format
jpayne@68	405 return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
jpayne@68	406 if not data['type'] in ["Integer","Float","Character","String","Flag"]:
jpayne@68	407 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	408 # I would like a missing-value field, but it isn't there
jpayne@68	409 if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
jpayne@68	410 else: data['missing'] = '.'
jpayne@68	411 if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	412 try:
jpayne@68	413 n = int(data['number'])
jpayne@68	414 t = self.NT_NUMBER
jpayne@68	415 except ValueError:
jpayne@68	416 n = -1
jpayne@68	417 if data['number'] == '.': t = self.NT_UNKNOWN
jpayne@68	418 elif data['number'] == '#alleles': t = self.NT_ALLELES
jpayne@68	419 elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
jpayne@68	420 elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
jpayne@68	421 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
jpayne@68	422 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
jpayne@68	423 # abbreviations added in VCF version v4.1
jpayne@68	424 elif data['number'] == 'A': t = self.NT_ALLELES
jpayne@68	425 elif data['number'] == 'G': t = self.NT_GENOTYPES
jpayne@68	426 else:
jpayne@68	427 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
jpayne@68	428 # if number is 0 - type must be Flag
jpayne@68	429 if n == 0 and data['type'] != 'Flag':
jpayne@68	430 self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
jpayne@68	431 # force type 'Flag' if no number
jpayne@68	432 data['type'] = 'Flag'
jpayne@68	433
jpayne@68	434 return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
jpayne@68	435
jpayne@68	436 def format_format( self, fmt, filter=False ):
jpayne@68	437 values = [('ID',fmt.id)]
jpayne@68	438 if fmt.number != None and not filter:
jpayne@68	439 if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
jpayne@68	440 elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
jpayne@68	441 elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
jpayne@68	442 elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
jpayne@68	443 elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
jpayne@68	444 elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
jpayne@68	445 else:
jpayne@68	446 raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
jpayne@68	447 values.append( ('Number',nmb) )
jpayne@68	448 values.append( ('Type', fmt.type) )
jpayne@68	449 values.append( ('Description', '"' + fmt.description + '"') )
jpayne@68	450 if self._version == 33:
jpayne@68	451 format = ",".join([v for k,v in values])
jpayne@68	452 else:
jpayne@68	453 format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
jpayne@68	454 return format
jpayne@68	455
jpayne@68	456 def get_expected(self, format, formatdict, alt):
jpayne@68	457 fmt = formatdict[format]
jpayne@68	458 if fmt.numbertype == self.NT_UNKNOWN: return -1
jpayne@68	459 if fmt.numbertype == self.NT_NUMBER: return fmt.number
jpayne@68	460 if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
jpayne@68	461 if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
jpayne@68	462 if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
jpayne@68	463 if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
jpayne@68	464 return 0
jpayne@68	465
jpayne@68	466
jpayne@68	467 def _add_definition(self, formatdict, key, data, line ):
jpayne@68	468 if key in formatdict: return
jpayne@68	469 self.error(line,self.ERROR_UNKNOWN_KEY,key)
jpayne@68	470 if data == None:
jpayne@68	471 formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
jpayne@68	472 return
jpayne@68	473 if data == []: data = [""] # unsure what type -- say string
jpayne@68	474 if type(data[0]) == type(0.0):
jpayne@68	475 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
jpayne@68	476 return
jpayne@68	477 if type(data[0]) == type(0):
jpayne@68	478 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
jpayne@68	479 return
jpayne@68	480 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
jpayne@68	481
jpayne@68	482
jpayne@68	483 # todo: trim trailing missing values
jpayne@68	484 def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
jpayne@68	485 output, sdata = [], []
jpayne@68	486 if type(data) == type([]): # for FORMAT field, make data with dummy values
jpayne@68	487 d = {}
jpayne@68	488 for k in data: d[k] = []
jpayne@68	489 data = d
jpayne@68	490 # convert missing values; and silently add definitions if required
jpayne@68	491 for k in data:
jpayne@68	492 self._add_definition( format, k, data[k], "(output)" )
jpayne@68	493 for idx,v in enumerate(data[k]):
jpayne@68	494 if v == format[k].missingvalue: data[k][idx] = "."
jpayne@68	495 # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
jpayne@68	496 for k in data:
jpayne@68	497 if k != 'GT': sdata.append( (k,data[k]) )
jpayne@68	498 sdata.sort()
jpayne@68	499 if 'GT' in data:
jpayne@68	500 sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
jpayne@68	501 for k,v in sdata:
jpayne@68	502 if v == []: v = None
jpayne@68	503 if key and value:
jpayne@68	504 if v != None: output.append( k+"="+','.join(map(str,v)) )
jpayne@68	505 else: output.append( k )
jpayne@68	506 elif key: output.append(k)
jpayne@68	507 elif value:
jpayne@68	508 if v != None: output.append( ','.join(map(str,v)) )
jpayne@68	509 else: output.append( "." ) # should not happen
jpayne@68	510 # snip off trailing missing data
jpayne@68	511 while len(output) > 1:
jpayne@68	512 last = output[-1].replace(',','').replace('.','')
jpayne@68	513 if len(last)>0: break
jpayne@68	514 output = output[:-1]
jpayne@68	515 return separator.join(output)
jpayne@68	516
jpayne@68	517
jpayne@68	518 def enter_default_format(self):
jpayne@68	519 for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
jpayne@68	520 FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
jpayne@68	521 FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
jpayne@68	522 FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
jpayne@68	523 FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
jpayne@68	524 FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
jpayne@68	525 FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
jpayne@68	526 FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
jpayne@68	527 FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
jpayne@68	528 FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
jpayne@68	529 FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
jpayne@68	530 FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
jpayne@68	531 FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1),
jpayne@68	532 FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
jpayne@68	533 ]:
jpayne@68	534 if f.id not in self._format:
jpayne@68	535 self._format[f.id] = f
jpayne@68	536
jpayne@68	537 def parse_header(self, line):
jpayne@68	538
jpayne@68	539 assert line.startswith('##')
jpayne@68	540 elts = line[2:].split('=')
jpayne@68	541 key = elts[0].strip()
jpayne@68	542 value = '='.join(elts[1:]).strip()
jpayne@68	543 if key == "fileformat":
jpayne@68	544 if value == "VCFv3.3":
jpayne@68	545 self._version = 33
jpayne@68	546 elif value == "VCFv4.0":
jpayne@68	547 self._version = 40
jpayne@68	548 elif value == "VCFv4.1":
jpayne@68	549 # AH - for testing
jpayne@68	550 self._version = 40
jpayne@68	551 elif value == "VCFv4.2":
jpayne@68	552 # AH - for testing
jpayne@68	553 self._version = 40
jpayne@68	554 else:
jpayne@68	555 self.error(line,self.UNKNOWN_FORMAT_STRING)
jpayne@68	556 elif key == "INFO":
jpayne@68	557 f = self.parse_format(line, value)
jpayne@68	558 self._info[ f.id ] = f
jpayne@68	559 elif key == "FILTER":
jpayne@68	560 f = self.parse_format(line, value, filter=True)
jpayne@68	561 self._filter[ f.id ] = f
jpayne@68	562 elif key == "FORMAT":
jpayne@68	563 f = self.parse_format(line, value)
jpayne@68	564 self._format[ f.id ] = f
jpayne@68	565 else:
jpayne@68	566 # keep other keys in the header field
jpayne@68	567 self._header.append( (key,value) )
jpayne@68	568
jpayne@68	569
jpayne@68	570 def write_header( self, stream ):
jpayne@68	571 stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
jpayne@68	572 for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
jpayne@68	573 for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
jpayne@68	574 for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
jpayne@68	575
jpayne@68	576
jpayne@68	577 def parse_heading( self, line ):
jpayne@68	578 assert line.startswith('#')
jpayne@68	579 assert not line.startswith('##')
jpayne@68	580 headings = line[1:].split('\t')
jpayne@68	581 # test for 8, as FORMAT field might be missing
jpayne@68	582 if len(headings)==1 and len(line[1:].split()) >= 8:
jpayne@68	583 self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
jpayne@68	584 headings = line[1:].split()
jpayne@68	585
jpayne@68	586 for i,s in enumerate(self._required):
jpayne@68	587
jpayne@68	588 if len(headings)<=i or headings[i] != s:
jpayne@68	589
jpayne@68	590 if len(headings) <= i:
jpayne@68	591 err = "(%sth entry not found)" % (i+1)
jpayne@68	592 else:
jpayne@68	593 err = "(found %s, expected %s)" % (headings[i],s)
jpayne@68	594
jpayne@68	595 #self.error(line,self.BADLY_FORMATTED_HEADING,err)
jpayne@68	596 # allow FORMAT column to be absent
jpayne@68	597 if len(headings) == 8:
jpayne@68	598 headings.append("FORMAT")
jpayne@68	599 else:
jpayne@68	600 self.error(line,self.BADLY_FORMATTED_HEADING,err)
jpayne@68	601
jpayne@68	602 self._samples = headings[9:]
jpayne@68	603 self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
jpayne@68	604
jpayne@68	605 def write_heading( self, stream ):
jpayne@68	606 stream.write("#" + "\t".join(self._required + self._samples) + "\n")
jpayne@68	607
jpayne@68	608 def convertGT(self, GTstring):
jpayne@68	609 if GTstring == ".": return ["."]
jpayne@68	610 try:
jpayne@68	611 gts = gtsRegEx.split(GTstring)
jpayne@68	612 if len(gts) == 1: return [int(gts[0])]
jpayne@68	613 if len(gts) != 2: raise ValueError()
jpayne@68	614 if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
jpayne@68	615 return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
jpayne@68	616 except ValueError:
jpayne@68	617 self.error(self._line,self.BAD_GENOTYPE,GTstring)
jpayne@68	618 return [".","\|","."]
jpayne@68	619
jpayne@68	620 def convertGTback(self, GTdata):
jpayne@68	621 return ''.join(map(str,GTdata))
jpayne@68	622
jpayne@68	623 def parse_formatdata( self, key, value, formatdict, line ):
jpayne@68	624 # To do: check that the right number of values is present
jpayne@68	625 f = formatdict.get(key,None)
jpayne@68	626 if f == None:
jpayne@68	627 self._add_definition(formatdict, key, value, line )
jpayne@68	628 f = formatdict[key]
jpayne@68	629 if f.type == "Flag":
jpayne@68	630 if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
jpayne@68	631 return []
jpayne@68	632 values = value.split(',')
jpayne@68	633 # deal with trailing data in some early VCF files
jpayne@68	634 if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
jpayne@68	635 self.error(line,self.ERROR_TRAILING_DATA,values[-1])
jpayne@68	636 values[-1] = values[-1].split(';')[0]
jpayne@68	637 if f.type == "Integer":
jpayne@68	638 for idx,v in enumerate(values):
jpayne@68	639 try:
jpayne@68	640 if v == ".": values[idx] = f.missingvalue
jpayne@68	641 else: values[idx] = int(v)
jpayne@68	642 except:
jpayne@68	643 self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
jpayne@68	644 return [0] * len(values)
jpayne@68	645 return values
jpayne@68	646 elif f.type == "String":
jpayne@68	647 self._line = line
jpayne@68	648 if f.id == "GT": values = list(map( self.convertGT, values ))
jpayne@68	649 return values
jpayne@68	650 elif f.type == "Character":
jpayne@68	651 for v in values:
jpayne@68	652 if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
jpayne@68	653 return values
jpayne@68	654 elif f.type == "Float":
jpayne@68	655 for idx,v in enumerate(values):
jpayne@68	656 if v == ".": values[idx] = f.missingvalue
jpayne@68	657 try: return list(map(float,values))
jpayne@68	658 except:
jpayne@68	659 self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
jpayne@68	660 return [0.0] * len(values)
jpayne@68	661 else:
jpayne@68	662 # can't happen
jpayne@68	663 self.error(line,self.ERROR_INFO_STRING)
jpayne@68	664
jpayne@68	665 def inregion(self, chrom, pos):
jpayne@68	666 if not self._regions: return True
jpayne@68	667 for r in self._regions:
jpayne@68	668 if r[0] == chrom and r[1] <= pos < r[2]: return True
jpayne@68	669 return False
jpayne@68	670
jpayne@68	671 def parse_data( self, line, lineparse=False ):
jpayne@68	672 cols = line.split('\t')
jpayne@68	673 if len(cols) != len(self._samples)+9:
jpayne@68	674 # gracefully deal with absent FORMAT column
jpayne@68	675 # and those missing samples
jpayne@68	676 if len(cols) == 8:
jpayne@68	677 cols.append("")
jpayne@68	678 else:
jpayne@68	679 self.error(line,
jpayne@68	680 self.BAD_NUMBER_OF_COLUMNS,
jpayne@68	681 "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
jpayne@68	682
jpayne@68	683 chrom = cols[0]
jpayne@68	684
jpayne@68	685 # get 0-based position
jpayne@68	686 try: pos = int(cols[1])-1
jpayne@68	687 except: self.error(line,self.POS_NOT_NUMERICAL)
jpayne@68	688 if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
jpayne@68	689
jpayne@68	690 # implement filtering
jpayne@68	691 if not self.inregion(chrom,pos): return None
jpayne@68	692
jpayne@68	693 # end of first-pass parse for sortedVCF
jpayne@68	694 if lineparse: return chrom, pos, line
jpayne@68	695
jpayne@68	696 id = cols[2]
jpayne@68	697
jpayne@68	698 ref = cols[3].upper()
jpayne@68	699 if ref == ".":
jpayne@68	700 self.error(line,self.MISSING_REF)
jpayne@68	701 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
jpayne@68	702 else: ref = ""
jpayne@68	703 else:
jpayne@68	704 for c in ref:
jpayne@68	705 if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
jpayne@68	706 if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
jpayne@68	707
jpayne@68	708 # make sure reference is sane
jpayne@68	709 if self._reference:
jpayne@68	710 left = max(0,pos-100)
jpayne@68	711 faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
jpayne@68	712 faref = faref_leftflank[pos-left:]
jpayne@68	713 if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
jpayne@68	714 ref = faref
jpayne@68	715
jpayne@68	716 # convert v3.3 to v4.0 alleles below
jpayne@68	717 if cols[4] == ".": alt = []
jpayne@68	718 else: alt = cols[4].upper().split(',')
jpayne@68	719
jpayne@68	720 if cols[5] == ".": qual = -1
jpayne@68	721 else:
jpayne@68	722 try: qual = float(cols[5])
jpayne@68	723 except: self.error(line,self.QUAL_NOT_NUMERICAL)
jpayne@68	724
jpayne@68	725 # postpone checking that filters exist. Encode missing filter or no filtering as empty list
jpayne@68	726 if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
jpayne@68	727 else: filter = cols[6].split(';')
jpayne@68	728
jpayne@68	729 # dictionary of keys, and list of values
jpayne@68	730 info = {}
jpayne@68	731 if cols[7] != ".":
jpayne@68	732 for blurp in cols[7].split(';'):
jpayne@68	733 elts = blurp.split('=')
jpayne@68	734 if len(elts) == 1: v = None
jpayne@68	735 elif len(elts) == 2: v = elts[1]
jpayne@68	736 else: self.error(line,self.ERROR_INFO_STRING)
jpayne@68	737 info[elts[0]] = self.parse_formatdata(elts[0],
jpayne@68	738 v,
jpayne@68	739 self._info,
jpayne@68	740 line)
jpayne@68	741
jpayne@68	742 # Gracefully deal with absent FORMAT column
jpayne@68	743 if cols[8] == "": format = []
jpayne@68	744 else: format = cols[8].split(':')
jpayne@68	745
jpayne@68	746 # check: all filters are defined
jpayne@68	747 for f in filter:
jpayne@68	748 if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
jpayne@68	749
jpayne@68	750 # check: format fields are defined
jpayne@68	751 if self._format:
jpayne@68	752 for f in format:
jpayne@68	753 if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
jpayne@68	754
jpayne@68	755 # convert v3.3 alleles
jpayne@68	756 if self._version == 33:
jpayne@68	757 if len(ref) != 1: self.error(line,self.V33_BAD_REF)
jpayne@68	758 newalts = []
jpayne@68	759 have_deletions = False
jpayne@68	760 for a in alt:
jpayne@68	761 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
jpayne@68	762 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
jpayne@68	763 elif a.startswith('D'): # allow D<seq> and D<num>
jpayne@68	764 have_deletions = True
jpayne@68	765 try:
jpayne@68	766 l = int(a[1:]) # throws ValueError if sequence
jpayne@68	767 if len(ref) < l: # add to reference if necessary
jpayne@68	768 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
jpayne@68	769 ref += addns
jpayne@68	770 for i,na in enumerate(newalts): newalts[i] = na+addns
jpayne@68	771 a = ref[l:] # new deletion, deleting pos...pos+l
jpayne@68	772 except ValueError:
jpayne@68	773 s = a[1:]
jpayne@68	774 if len(ref) < len(s): # add Ns to reference if necessary
jpayne@68	775 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
jpayne@68	776 if not s.endswith(addns) and addns != 'N'*len(addns):
jpayne@68	777 self.error(line,self.V33_UNMATCHED_DELETION,
jpayne@68	778 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
jpayne@68	779 ref += addns
jpayne@68	780 for i,na in enumerate(newalts): newalts[i] = na+addns
jpayne@68	781 a = ref[len(s):] # new deletion, deleting from pos
jpayne@68	782 else:
jpayne@68	783 self.error(line,self.V33_BAD_ALLELE)
jpayne@68	784 newalts.append(a)
jpayne@68	785 alt = newalts
jpayne@68	786 # deletion alleles exist, add dummy 1st reference allele, and account for leading base
jpayne@68	787 if have_deletions:
jpayne@68	788 if pos == 0:
jpayne@68	789 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
jpayne@68	790 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
jpayne@68	791 ref += addn
jpayne@68	792 alt = [allele+addn for allele in alt]
jpayne@68	793 else:
jpayne@68	794 addn = get_sequence(chrom,pos-1,pos,self._reference)
jpayne@68	795 ref = addn + ref
jpayne@68	796 alt = [addn + allele for allele in alt]
jpayne@68	797 pos -= 1
jpayne@68	798 else:
jpayne@68	799 # format v4.0 -- just check for nucleotides
jpayne@68	800 for allele in alt:
jpayne@68	801 if not alleleRegEx.match(allele):
jpayne@68	802 self.error(line,self.V40_BAD_ALLELE,allele)
jpayne@68	803
jpayne@68	804 # check for leading nucleotide in indel calls
jpayne@68	805 for allele in alt:
jpayne@68	806 if len(allele) != len(ref):
jpayne@68	807 if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
jpayne@68	808 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
jpayne@68	809 self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
jpayne@68	810
jpayne@68	811 # trim trailing bases in alleles
jpayne@68	812 # AH: not certain why trimming this needs to be added
jpayne@68	813 # disabled now for unit testing
jpayne@68	814 # if alt:
jpayne@68	815 # for i in range(1,min(len(ref),min(map(len,alt)))):
jpayne@68	816 # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
jpayne@68	817 # break
jpayne@68	818 # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
jpayne@68	819
jpayne@68	820 # left-align alleles, if a reference is available
jpayne@68	821 if self._leftalign and self._reference:
jpayne@68	822 while left < pos:
jpayne@68	823 movable = True
jpayne@68	824 for allele in alt:
jpayne@68	825 if len(allele) > len(ref):
jpayne@68	826 longest, shortest = allele, ref
jpayne@68	827 else:
jpayne@68	828 longest, shortest = ref, allele
jpayne@68	829 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
jpayne@68	830 movable = False
jpayne@68	831 if longest[-1].upper() != longest[len(shortest)-1].upper():
jpayne@68	832 movable = False
jpayne@68	833 if not movable:
jpayne@68	834 break
jpayne@68	835 ref = ref[:-1]
jpayne@68	836 alt = [allele[:-1] for allele in alt]
jpayne@68	837 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
jpayne@68	838 ref = faref_leftflank[pos-left-1] + ref
jpayne@68	839 alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
jpayne@68	840 pos -= 1
jpayne@68	841
jpayne@68	842 # parse sample columns
jpayne@68	843 samples = []
jpayne@68	844 for sample in cols[9:]:
jpayne@68	845 dict = {}
jpayne@68	846 values = sample.split(':')
jpayne@68	847 if len(values) > len(format):
jpayne@68	848 self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
jpayne@68	849 for idx in range(len(format)):
jpayne@68	850 expected = self.get_expected(format[idx], self._format, alt)
jpayne@68	851 if idx < len(values): value = values[idx]
jpayne@68	852 else:
jpayne@68	853 if expected == -1: value = "."
jpayne@68	854 else: value = ",".join(["."]*expected)
jpayne@68	855
jpayne@68	856 dict[format[idx]] = self.parse_formatdata(format[idx],
jpayne@68	857 value,
jpayne@68	858 self._format,
jpayne@68	859 line)
jpayne@68	860 if expected != -1 and len(dict[format[idx]]) != expected:
jpayne@68	861 self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
jpayne@68	862 "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
jpayne@68	863 if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
jpayne@68	864 dict[format[idx]] = dict[format[idx]][:expected]
jpayne@68	865 samples.append( dict )
jpayne@68	866
jpayne@68	867 # done
jpayne@68	868 d = {'chrom':chrom,
jpayne@68	869 'pos':pos, # return 0-based position
jpayne@68	870 'id':id,
jpayne@68	871 'ref':ref,
jpayne@68	872 'alt':alt,
jpayne@68	873 'qual':qual,
jpayne@68	874 'filter':filter,
jpayne@68	875 'info':info,
jpayne@68	876 'format':format}
jpayne@68	877 for key,value in zip(self._samples,samples):
jpayne@68	878 d[key] = value
jpayne@68	879
jpayne@68	880 return d
jpayne@68	881
jpayne@68	882
jpayne@68	883 def write_data(self, stream, data):
jpayne@68	884 required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
jpayne@68	885 for k in required:
jpayne@68	886 if k not in data: raise ValueError("Required key %s not found in data" % str(k))
jpayne@68	887 if data['alt'] == []: alt = "."
jpayne@68	888 else: alt = ",".join(data['alt'])
jpayne@68	889 if data['filter'] == None: filter = "."
jpayne@68	890 elif data['filter'] == []:
jpayne@68	891 if self._version == 33: filter = "0"
jpayne@68	892 else: filter = "PASS"
jpayne@68	893 else: filter = ';'.join(data['filter'])
jpayne@68	894 if data['qual'] == -1: qual = "."
jpayne@68	895 else: qual = str(data['qual'])
jpayne@68	896
jpayne@68	897 output = [data['chrom'],
jpayne@68	898 str(data['pos']+1), # change to 1-based position
jpayne@68	899 data['id'],
jpayne@68	900 data['ref'],
jpayne@68	901 alt,
jpayne@68	902 qual,
jpayne@68	903 filter,
jpayne@68	904 self.format_formatdata(
jpayne@68	905 data['info'], self._info, separator=";"),
jpayne@68	906 self.format_formatdata(
jpayne@68	907 data['format'], self._format, value=False)]
jpayne@68	908
jpayne@68	909 for s in self._samples:
jpayne@68	910 output.append(self.format_formatdata(
jpayne@68	911 data[s], self._format, key=False))
jpayne@68	912
jpayne@68	913 stream.write( "\t".join(output) + "\n" )
jpayne@68	914
jpayne@68	915 def _parse_header(self, stream):
jpayne@68	916 self._lineno = 0
jpayne@68	917 for line in stream:
jpayne@68	918 line = force_str(line, self.encoding)
jpayne@68	919 self._lineno += 1
jpayne@68	920 if line.startswith('##'):
jpayne@68	921 self.parse_header(line.strip())
jpayne@68	922 elif line.startswith('#'):
jpayne@68	923 self.parse_heading(line.strip())
jpayne@68	924 self.enter_default_format()
jpayne@68	925 else:
jpayne@68	926 break
jpayne@68	927 return line
jpayne@68	928
jpayne@68	929 def _parse(self, line, stream):
jpayne@68	930 # deal with files with header only
jpayne@68	931 if line.startswith("##"): return
jpayne@68	932 if len(line.strip()) > 0:
jpayne@68	933 d = self.parse_data( line.strip() )
jpayne@68	934 if d: yield d
jpayne@68	935 for line in stream:
jpayne@68	936 self._lineno += 1
jpayne@68	937 if self._lines and self._lineno > self._lines: raise StopIteration
jpayne@68	938 d = self.parse_data( line.strip() )
jpayne@68	939 if d: yield d
jpayne@68	940
jpayne@68	941 ######################################################################################################
jpayne@68	942 #
jpayne@68	943 # API follows
jpayne@68	944 #
jpayne@68	945 ######################################################################################################
jpayne@68	946
jpayne@68	947 def getsamples(self):
jpayne@68	948 """ List of samples in VCF file """
jpayne@68	949 return self._samples
jpayne@68	950
jpayne@68	951 def setsamples(self,samples):
jpayne@68	952 """ List of samples in VCF file """
jpayne@68	953 self._samples = samples
jpayne@68	954
jpayne@68	955 def getheader(self):
jpayne@68	956 """ List of header key-value pairs (strings) """
jpayne@68	957 return self._header
jpayne@68	958
jpayne@68	959 def setheader(self,header):
jpayne@68	960 """ List of header key-value pairs (strings) """
jpayne@68	961 self._header = header
jpayne@68	962
jpayne@68	963 def getinfo(self):
jpayne@68	964 """ Dictionary of ##INFO tags, as VCF.FORMAT values """
jpayne@68	965 return self._info
jpayne@68	966
jpayne@68	967 def setinfo(self,info):
jpayne@68	968 """ Dictionary of ##INFO tags, as VCF.FORMAT values """
jpayne@68	969 self._info = info
jpayne@68	970
jpayne@68	971 def getformat(self):
jpayne@68	972 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
jpayne@68	973 return self._format
jpayne@68	974
jpayne@68	975 def setformat(self,format):
jpayne@68	976 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
jpayne@68	977 self._format = format
jpayne@68	978
jpayne@68	979 def getfilter(self):
jpayne@68	980 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
jpayne@68	981 return self._filter
jpayne@68	982
jpayne@68	983 def setfilter(self,filter):
jpayne@68	984 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
jpayne@68	985 self._filter = filter
jpayne@68	986
jpayne@68	987 def setversion(self, version):
jpayne@68	988 if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
jpayne@68	989 self._version = version
jpayne@68	990
jpayne@68	991 def setregions(self, regions):
jpayne@68	992 self._regions = regions
jpayne@68	993
jpayne@68	994 def setreference(self, ref):
jpayne@68	995 """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
jpayne@68	996 self._reference = ref
jpayne@68	997
jpayne@68	998 def ignoreerror(self, errorstring):
jpayne@68	999 try: self._ignored_errors.add(self.__dict__[errorstring])
jpayne@68	1000 except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
jpayne@68	1001
jpayne@68	1002 def warnerror(self, errorstring):
jpayne@68	1003 try: self._warn_errors.add(self.__dict__[errorstring])
jpayne@68	1004 except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
jpayne@68	1005
jpayne@68	1006 def parse(self, stream):
jpayne@68	1007 """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
jpayne@68	1008 last_line = self._parse_header(stream)
jpayne@68	1009 # now return a generator that does the actual work. In this way the pre-processing is done
jpayne@68	1010 # before the first piece of data is yielded
jpayne@68	1011 return self._parse(last_line, stream)
jpayne@68	1012
jpayne@68	1013 def write(self, stream, datagenerator):
jpayne@68	1014 """ Writes a VCF file to a stream, using a data generator (or list) """
jpayne@68	1015 self.write_header(stream)
jpayne@68	1016 self.write_heading(stream)
jpayne@68	1017 for data in datagenerator: self.write_data(stream,data)
jpayne@68	1018
jpayne@68	1019 def writeheader(self, stream):
jpayne@68	1020 """ Writes a VCF header """
jpayne@68	1021 self.write_header(stream)
jpayne@68	1022 self.write_heading(stream)
jpayne@68	1023
jpayne@68	1024 def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
jpayne@68	1025 """ Utility function: compares two calls for equality """
jpayne@68	1026 # a variant should always be assigned to a unique position, one base before
jpayne@68	1027 # the leftmost position of the alignment gap. If this rule is implemented
jpayne@68	1028 # correctly, the two positions must be equal for the calls to be identical.
jpayne@68	1029 if pos1 != pos2: return False
jpayne@68	1030 # from both calls, trim rightmost bases when identical. Do this safely, i.e.
jpayne@68	1031 # only when the reference bases are not Ns
jpayne@68	1032 while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
jpayne@68	1033 ref1 = ref1[:-1]
jpayne@68	1034 alt1 = alt1[:-1]
jpayne@68	1035 while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
jpayne@68	1036 ref2 = ref2[:-1]
jpayne@68	1037 alt2 = alt2[:-1]
jpayne@68	1038 # now, the alternative alleles must be identical
jpayne@68	1039 return alt1 == alt2
jpayne@68	1040
jpayne@68	1041 ###########################################################################################################
jpayne@68	1042 ###########################################################################################################
jpayne@68	1043 ## API functions added by Andreas
jpayne@68	1044 ###########################################################################################################
jpayne@68	1045
jpayne@68	1046 def connect(self, filename, encoding="ascii"):
jpayne@68	1047 '''connect to tabix file.'''
jpayne@68	1048 self.encoding=encoding
jpayne@68	1049 self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
jpayne@68	1050 self._parse_header(self.tabixfile.header)
jpayne@68	1051
jpayne@68	1052 def __del__(self):
jpayne@68	1053 self.close()
jpayne@68	1054 self.tabixfile = None
jpayne@68	1055
jpayne@68	1056 def close(self):
jpayne@68	1057 if self.tabixfile:
jpayne@68	1058 self.tabixfile.close()
jpayne@68	1059 self.tabixfile = None
jpayne@68	1060
jpayne@68	1061 def fetch(self,
jpayne@68	1062 reference=None,
jpayne@68	1063 start=None,
jpayne@68	1064 end=None,
jpayne@68	1065 region=None ):
jpayne@68	1066 """ Parse a stream of VCF-formatted lines.
jpayne@68	1067 Initializes class instance and return generator """
jpayne@68	1068 return self.tabixfile.fetch(
jpayne@68	1069 reference,
jpayne@68	1070 start,
jpayne@68	1071 end,
jpayne@68	1072 region,
jpayne@68	1073 parser = asVCFRecord(self))
jpayne@68	1074
jpayne@68	1075 def validate(self, record):
jpayne@68	1076 '''validate vcf record.
jpayne@68	1077
jpayne@68	1078 returns a validated record.
jpayne@68	1079 '''
jpayne@68	1080
jpayne@68	1081 raise NotImplementedError("needs to be checked")
jpayne@68	1082
jpayne@68	1083 chrom, pos = record.chrom, record.pos
jpayne@68	1084
jpayne@68	1085 # check reference
jpayne@68	1086 ref = record.ref
jpayne@68	1087 if ref == ".":
jpayne@68	1088 self.error(str(record),self.MISSING_REF)
jpayne@68	1089 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
jpayne@68	1090 else: ref = ""
jpayne@68	1091 else:
jpayne@68	1092 for c in ref:
jpayne@68	1093 if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
jpayne@68	1094 if "N" in ref: ref = get_sequence(chrom,
jpayne@68	1095 pos,
jpayne@68	1096 pos+len(ref),
jpayne@68	1097 self._reference)
jpayne@68	1098
jpayne@68	1099 # make sure reference is sane
jpayne@68	1100 if self._reference:
jpayne@68	1101 left = max(0,self.pos-100)
jpayne@68	1102 faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
jpayne@68	1103 faref = faref_leftflank[pos-left:]
jpayne@68	1104 if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
jpayne@68	1105 ref = faref
jpayne@68	1106
jpayne@68	1107 # check: format fields are defined
jpayne@68	1108 for f in record.format:
jpayne@68	1109 if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
jpayne@68	1110
jpayne@68	1111 # check: all filters are defined
jpayne@68	1112 for f in record.filter:
jpayne@68	1113 if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
jpayne@68	1114
jpayne@68	1115 # convert v3.3 alleles
jpayne@68	1116 if self._version == 33:
jpayne@68	1117 if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
jpayne@68	1118 newalts = []
jpayne@68	1119 have_deletions = False
jpayne@68	1120 for a in alt:
jpayne@68	1121 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
jpayne@68	1122 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
jpayne@68	1123 elif a.startswith('D'): # allow D<seq> and D<num>
jpayne@68	1124 have_deletions = True
jpayne@68	1125 try:
jpayne@68	1126 l = int(a[1:]) # throws ValueError if sequence
jpayne@68	1127 if len(ref) < l: # add to reference if necessary
jpayne@68	1128 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
jpayne@68	1129 ref += addns
jpayne@68	1130 for i,na in enumerate(newalts): newalts[i] = na+addns
jpayne@68	1131 a = ref[l:] # new deletion, deleting pos...pos+l
jpayne@68	1132 except ValueError:
jpayne@68	1133 s = a[1:]
jpayne@68	1134 if len(ref) < len(s): # add Ns to reference if necessary
jpayne@68	1135 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
jpayne@68	1136 if not s.endswith(addns) and addns != 'N'*len(addns):
jpayne@68	1137 self.error(str(record),self.V33_UNMATCHED_DELETION,
jpayne@68	1138 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
jpayne@68	1139 ref += addns
jpayne@68	1140 for i,na in enumerate(newalts): newalts[i] = na+addns
jpayne@68	1141 a = ref[len(s):] # new deletion, deleting from pos
jpayne@68	1142 else:
jpayne@68	1143 self.error(str(record),self.V33_BAD_ALLELE)
jpayne@68	1144 newalts.append(a)
jpayne@68	1145 alt = newalts
jpayne@68	1146 # deletion alleles exist, add dummy 1st reference allele, and account for leading base
jpayne@68	1147 if have_deletions:
jpayne@68	1148 if pos == 0:
jpayne@68	1149 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
jpayne@68	1150 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
jpayne@68	1151 ref += addn
jpayne@68	1152 alt = [allele+addn for allele in alt]
jpayne@68	1153 else:
jpayne@68	1154 addn = get_sequence(chrom,pos-1,pos,self._reference)
jpayne@68	1155 ref = addn + ref
jpayne@68	1156 alt = [addn + allele for allele in alt]
jpayne@68	1157 pos -= 1
jpayne@68	1158 else:
jpayne@68	1159 # format v4.0 -- just check for nucleotides
jpayne@68	1160 for allele in alt:
jpayne@68	1161 if not alleleRegEx.match(allele):
jpayne@68	1162 self.error(str(record),self.V40_BAD_ALLELE,allele)
jpayne@68	1163
jpayne@68	1164
jpayne@68	1165 # check for leading nucleotide in indel calls
jpayne@68	1166 for allele in alt:
jpayne@68	1167 if len(allele) != len(ref):
jpayne@68	1168 if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
jpayne@68	1169 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
jpayne@68	1170 self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
jpayne@68	1171
jpayne@68	1172 # trim trailing bases in alleles
jpayne@68	1173 # AH: not certain why trimming this needs to be added
jpayne@68	1174 # disabled now for unit testing
jpayne@68	1175 # for i in range(1,min(len(ref),min(map(len,alt)))):
jpayne@68	1176 # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
jpayne@68	1177 # break
jpayne@68	1178 # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
jpayne@68	1179
jpayne@68	1180 # left-align alleles, if a reference is available
jpayne@68	1181 if self._leftalign and self._reference:
jpayne@68	1182 while left < pos:
jpayne@68	1183 movable = True
jpayne@68	1184 for allele in alt:
jpayne@68	1185 if len(allele) > len(ref):
jpayne@68	1186 longest, shortest = allele, ref
jpayne@68	1187 else:
jpayne@68	1188 longest, shortest = ref, allele
jpayne@68	1189 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
jpayne@68	1190 movable = False
jpayne@68	1191 if longest[-1].upper() != longest[len(shortest)-1].upper():
jpayne@68	1192 movable = False
jpayne@68	1193 if not movable:
jpayne@68	1194 break
jpayne@68	1195 ref = ref[:-1]
jpayne@68	1196 alt = [allele[:-1] for allele in alt]
jpayne@68	1197 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
jpayne@68	1198 ref = faref_leftflank[pos-left-1] + ref
jpayne@68	1199 alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
jpayne@68	1200 pos -= 1
jpayne@68	1201
jpayne@68	1202 __all__ = [
jpayne@68	1203 "VCF", "VCFRecord", ]

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pysam/libcvcf.pyx @ 68:5028fdace37b