jpayne@68
|
1 # cython: embedsignature=True
|
jpayne@68
|
2 #
|
jpayne@68
|
3 # Code to read, write and edit VCF files
|
jpayne@68
|
4 #
|
jpayne@68
|
5 # VCF lines are encoded as a dictionary with these keys (note: all lowercase):
|
jpayne@68
|
6 # 'chrom': string
|
jpayne@68
|
7 # 'pos': integer
|
jpayne@68
|
8 # 'id': string
|
jpayne@68
|
9 # 'ref': string
|
jpayne@68
|
10 # 'alt': list of strings
|
jpayne@68
|
11 # 'qual': integer
|
jpayne@68
|
12 # 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
|
jpayne@68
|
13 # 'info': dictionary of values (see below)
|
jpayne@68
|
14 # 'format': list of keys (strings)
|
jpayne@68
|
15 # sample keys: dictionary of values (see below)
|
jpayne@68
|
16 #
|
jpayne@68
|
17 # The sample keys are accessible through vcf.getsamples()
|
jpayne@68
|
18 #
|
jpayne@68
|
19 # A dictionary of values contains value keys (defined in ##INFO or
|
jpayne@68
|
20 # ##FORMAT lines) which map to a list, containing integers, floats,
|
jpayne@68
|
21 # strings, or characters. Missing values are replaced by a particular
|
jpayne@68
|
22 # value, often -1 or .
|
jpayne@68
|
23 #
|
jpayne@68
|
24 # Genotypes are not stored as a string, but as a list of 1 or 3
|
jpayne@68
|
25 # elements (for haploid and diploid samples), the first (and last) the
|
jpayne@68
|
26 # integer representing an allele, and the second the separation
|
jpayne@68
|
27 # character. Note that there is just one genotype per sample, but for
|
jpayne@68
|
28 # consistency the single element is stored in a list.
|
jpayne@68
|
29 #
|
jpayne@68
|
30 # Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
|
jpayne@68
|
31 # (key, value) pairs and are accessible through getheader()
|
jpayne@68
|
32 #
|
jpayne@68
|
33 # The VCF class can be instantiated with a 'regions' variable
|
jpayne@68
|
34 # consisting of tuples (chrom,start,end) encoding 0-based half-open
|
jpayne@68
|
35 # segments. Only variants with a position inside the segment will be
|
jpayne@68
|
36 # parsed. A regions parser is available under parse_regions.
|
jpayne@68
|
37 #
|
jpayne@68
|
38 # When instantiated, a reference can be passed to the VCF class. This
|
jpayne@68
|
39 # may be any class that supports a fetch(chrom, start, end) method.
|
jpayne@68
|
40 #
|
jpayne@68
|
41 # NOTE: the position that is returned to Python is 0-based, NOT
|
jpayne@68
|
42 # 1-based as in the VCF file.
|
jpayne@68
|
43 # NOTE: There is also preliminary VCF functionality in the VariantFile class.
|
jpayne@68
|
44 #
|
jpayne@68
|
45 # TODO:
|
jpayne@68
|
46 # only v4.0 writing is complete; alleles are not converted to v3.3 format
|
jpayne@68
|
47 #
|
jpayne@68
|
48
|
jpayne@68
|
49 from collections import namedtuple, defaultdict
|
jpayne@68
|
50 from operator import itemgetter
|
jpayne@68
|
51 import sys, re, copy, bisect
|
jpayne@68
|
52
|
jpayne@68
|
53 from libc.stdlib cimport atoi
|
jpayne@68
|
54 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
|
jpayne@68
|
55 from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
|
jpayne@68
|
56
|
jpayne@68
|
57 cimport pysam.libctabix as libctabix
|
jpayne@68
|
58 cimport pysam.libctabixproxies as libctabixproxies
|
jpayne@68
|
59
|
jpayne@68
|
60 from pysam.libcutils cimport force_str
|
jpayne@68
|
61
|
jpayne@68
|
62 import pysam
|
jpayne@68
|
63
|
jpayne@68
|
64 gtsRegEx = re.compile("[|/\\\\]")
|
jpayne@68
|
65 alleleRegEx = re.compile('^[ACGTN]+$')
|
jpayne@68
|
66
|
jpayne@68
|
67 # Utility function. Uses 0-based coordinates
|
jpayne@68
|
68 def get_sequence(chrom, start, end, fa):
|
jpayne@68
|
69 # obtain sequence from .fa file, without truncation
|
jpayne@68
|
70 if end<=start: return ""
|
jpayne@68
|
71 if not fa: return "N"*(end-start)
|
jpayne@68
|
72 if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
|
jpayne@68
|
73 sequence = fa.fetch(chrom, start, end).upper()
|
jpayne@68
|
74 if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
|
jpayne@68
|
75 return sequence
|
jpayne@68
|
76
|
jpayne@68
|
77 # Utility function. Parses a region string
|
jpayne@68
|
78 def parse_regions( string ):
|
jpayne@68
|
79 result = []
|
jpayne@68
|
80 for r in string.split(','):
|
jpayne@68
|
81 elts = r.split(':')
|
jpayne@68
|
82 chrom, start, end = elts[0], 0, 3000000000
|
jpayne@68
|
83 if len(elts)==1: pass
|
jpayne@68
|
84 elif len(elts)==2:
|
jpayne@68
|
85 if len(elts[1])>0:
|
jpayne@68
|
86 ielts = elts[1].split('-')
|
jpayne@68
|
87 if len(ielts) != 2: ValueError("Don't understand region string '%s'" % r)
|
jpayne@68
|
88 try: start, end = int(ielts[0])-1, int(ielts[1])
|
jpayne@68
|
89 except: raise ValueError("Don't understand region string '%s'" % r)
|
jpayne@68
|
90 else:
|
jpayne@68
|
91 raise ValueError("Don't understand region string '%s'" % r)
|
jpayne@68
|
92 result.append( (chrom,start,end) )
|
jpayne@68
|
93 return result
|
jpayne@68
|
94
|
jpayne@68
|
95
|
jpayne@68
|
96 FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')
|
jpayne@68
|
97
|
jpayne@68
|
98 ###########################################################################################################
|
jpayne@68
|
99 #
|
jpayne@68
|
100 # New class
|
jpayne@68
|
101 #
|
jpayne@68
|
102 ###########################################################################################################
|
jpayne@68
|
103
|
jpayne@68
|
104 cdef class VCFRecord(libctabixproxies.TupleProxy):
|
jpayne@68
|
105 '''vcf record.
|
jpayne@68
|
106
|
jpayne@68
|
107 initialized from data and vcf meta
|
jpayne@68
|
108 '''
|
jpayne@68
|
109
|
jpayne@68
|
110 cdef vcf
|
jpayne@68
|
111 cdef char * contig
|
jpayne@68
|
112 cdef uint32_t pos
|
jpayne@68
|
113
|
jpayne@68
|
114 def __init__(self, vcf):
|
jpayne@68
|
115 self.vcf = vcf
|
jpayne@68
|
116 self.encoding = vcf.encoding
|
jpayne@68
|
117
|
jpayne@68
|
118 # if len(data) != len(self.vcf._samples):
|
jpayne@68
|
119 # self.vcf.error(str(data),
|
jpayne@68
|
120 # self.BAD_NUMBER_OF_COLUMNS,
|
jpayne@68
|
121 # "expected %s for %s samples (%s), got %s" % \
|
jpayne@68
|
122 # (len(self.vcf._samples),
|
jpayne@68
|
123 # len(self.vcf._samples),
|
jpayne@68
|
124 # self.vcf._samples,
|
jpayne@68
|
125 # len(data)))
|
jpayne@68
|
126
|
jpayne@68
|
127 def __cinit__(self, vcf):
|
jpayne@68
|
128 # start indexed access at genotypes
|
jpayne@68
|
129 self.offset = 9
|
jpayne@68
|
130
|
jpayne@68
|
131 self.vcf = vcf
|
jpayne@68
|
132 self.encoding = vcf.encoding
|
jpayne@68
|
133
|
jpayne@68
|
134 def error(self, line, error, opt=None):
|
jpayne@68
|
135 '''raise error.'''
|
jpayne@68
|
136 # pass to vcf file for error handling
|
jpayne@68
|
137 return self.vcf.error(line, error, opt)
|
jpayne@68
|
138
|
jpayne@68
|
139 cdef update(self, char * buffer, size_t nbytes):
|
jpayne@68
|
140 '''update internal data.
|
jpayne@68
|
141
|
jpayne@68
|
142 nbytes does not include the terminal '\0'.
|
jpayne@68
|
143 '''
|
jpayne@68
|
144 libctabixproxies.TupleProxy.update(self, buffer, nbytes)
|
jpayne@68
|
145
|
jpayne@68
|
146 self.contig = self.fields[0]
|
jpayne@68
|
147 # vcf counts from 1 - correct here
|
jpayne@68
|
148 self.pos = atoi(self.fields[1]) - 1
|
jpayne@68
|
149
|
jpayne@68
|
150 def __len__(self):
|
jpayne@68
|
151 return max(0, self.nfields - 9)
|
jpayne@68
|
152
|
jpayne@68
|
153 property contig:
|
jpayne@68
|
154 def __get__(self): return self.contig
|
jpayne@68
|
155
|
jpayne@68
|
156 property pos:
|
jpayne@68
|
157 def __get__(self): return self.pos
|
jpayne@68
|
158
|
jpayne@68
|
159 property id:
|
jpayne@68
|
160 def __get__(self): return self.fields[2]
|
jpayne@68
|
161
|
jpayne@68
|
162 property ref:
|
jpayne@68
|
163 def __get__(self):
|
jpayne@68
|
164 return self.fields[3]
|
jpayne@68
|
165
|
jpayne@68
|
166 property alt:
|
jpayne@68
|
167 def __get__(self):
|
jpayne@68
|
168 # convert v3.3 to v4.0 alleles below
|
jpayne@68
|
169 alt = self.fields[4]
|
jpayne@68
|
170 if alt == ".": alt = []
|
jpayne@68
|
171 else: alt = alt.upper().split(',')
|
jpayne@68
|
172 return alt
|
jpayne@68
|
173
|
jpayne@68
|
174 property qual:
|
jpayne@68
|
175 def __get__(self):
|
jpayne@68
|
176 qual = self.fields[5]
|
jpayne@68
|
177 if qual == b".": qual = -1
|
jpayne@68
|
178 else:
|
jpayne@68
|
179 try: qual = float(qual)
|
jpayne@68
|
180 except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
|
jpayne@68
|
181 return qual
|
jpayne@68
|
182
|
jpayne@68
|
183 property filter:
|
jpayne@68
|
184 def __get__(self):
|
jpayne@68
|
185 f = self.fields[6]
|
jpayne@68
|
186 # postpone checking that filters exist. Encode missing filter or no filtering as empty list
|
jpayne@68
|
187 if f == b"." or f == b"PASS" or f == b"0": return []
|
jpayne@68
|
188 else: return f.split(';')
|
jpayne@68
|
189
|
jpayne@68
|
190 property info:
|
jpayne@68
|
191 def __get__(self):
|
jpayne@68
|
192 col = self.fields[7]
|
jpayne@68
|
193 # dictionary of keys, and list of values
|
jpayne@68
|
194 info = {}
|
jpayne@68
|
195 if col != b".":
|
jpayne@68
|
196 for blurp in col.split(';'):
|
jpayne@68
|
197 elts = blurp.split('=')
|
jpayne@68
|
198 if len(elts) == 1: v = None
|
jpayne@68
|
199 elif len(elts) == 2: v = elts[1]
|
jpayne@68
|
200 else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
|
jpayne@68
|
201 info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf))
|
jpayne@68
|
202 return info
|
jpayne@68
|
203
|
jpayne@68
|
204 property format:
|
jpayne@68
|
205 def __get__(self):
|
jpayne@68
|
206 return self.fields[8].split(':')
|
jpayne@68
|
207
|
jpayne@68
|
208 property samples:
|
jpayne@68
|
209 def __get__(self):
|
jpayne@68
|
210 return self.vcf._samples
|
jpayne@68
|
211
|
jpayne@68
|
212 def __getitem__(self, key):
|
jpayne@68
|
213
|
jpayne@68
|
214 # parse sample columns
|
jpayne@68
|
215 values = self.fields[self.vcf._sample2column[key]].split(':')
|
jpayne@68
|
216 alt = self.alt
|
jpayne@68
|
217 format = self.format
|
jpayne@68
|
218
|
jpayne@68
|
219 if len(values) > len(format):
|
jpayne@68
|
220 self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
|
jpayne@68
|
221 (len(values),key,len(format)))
|
jpayne@68
|
222
|
jpayne@68
|
223 result = {}
|
jpayne@68
|
224 for idx in range(len(format)):
|
jpayne@68
|
225 expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
|
jpayne@68
|
226 if idx < len(values): value = values[idx]
|
jpayne@68
|
227 else:
|
jpayne@68
|
228 if expected == -1: value = "."
|
jpayne@68
|
229 else: value = ",".join(["."]*expected)
|
jpayne@68
|
230
|
jpayne@68
|
231 result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
|
jpayne@68
|
232 if expected != -1 and len(result[format[idx]]) != expected:
|
jpayne@68
|
233 self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
|
jpayne@68
|
234 "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
|
jpayne@68
|
235 if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
|
jpayne@68
|
236 result[format[idx]] = result[format[idx]][:expected]
|
jpayne@68
|
237
|
jpayne@68
|
238 return result
|
jpayne@68
|
239
|
jpayne@68
|
240
|
jpayne@68
|
241 cdef class asVCFRecord(libctabix.Parser):
|
jpayne@68
|
242 '''converts a :term:`tabix row` into a VCF record.'''
|
jpayne@68
|
243 cdef vcffile
|
jpayne@68
|
244 def __init__(self, vcffile):
|
jpayne@68
|
245 self.vcffile = vcffile
|
jpayne@68
|
246
|
jpayne@68
|
247 cdef parse(self, char * buffer, int len):
|
jpayne@68
|
248 cdef VCFRecord r
|
jpayne@68
|
249 r = VCFRecord(self.vcffile)
|
jpayne@68
|
250 r.copy(buffer, len)
|
jpayne@68
|
251 return r
|
jpayne@68
|
252
|
jpayne@68
|
253 class VCF(object):
|
jpayne@68
|
254
|
jpayne@68
|
255 # types
|
jpayne@68
|
256 NT_UNKNOWN = 0
|
jpayne@68
|
257 NT_NUMBER = 1
|
jpayne@68
|
258 NT_ALLELES = 2
|
jpayne@68
|
259 NT_NR_ALLELES = 3
|
jpayne@68
|
260 NT_GENOTYPES = 4
|
jpayne@68
|
261 NT_PHASED_GENOTYPES = 5
|
jpayne@68
|
262
|
jpayne@68
|
263 _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
|
jpayne@68
|
264 1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
|
jpayne@68
|
265 2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
|
jpayne@68
|
266 3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
|
jpayne@68
|
267 4:"POS_NOT_NUMERICAL:Position column is not numerical",
|
jpayne@68
|
268 5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
|
jpayne@68
|
269 6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
|
jpayne@68
|
270 7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
|
jpayne@68
|
271 8:"POS_NOT_POSITIVE:Position field must be >0",
|
jpayne@68
|
272 9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
|
jpayne@68
|
273 10:"ERROR_INFO_STRING:Error while parsing info field",
|
jpayne@68
|
274 11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
|
jpayne@68
|
275 12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
|
jpayne@68
|
276 13:"ERROR_FORMAT_NOT_CHAR:Eexpected character in formatted field; got string",
|
jpayne@68
|
277 14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
|
jpayne@68
|
278 15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
|
jpayne@68
|
279 16:"BAD_NUMBER_OF_VALUES:Found too many of values in sample column (%s)",
|
jpayne@68
|
280 17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
|
jpayne@68
|
281 18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
|
jpayne@68
|
282 19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
|
jpayne@68
|
283 20:"MISSING_REF:Reference allele missing",
|
jpayne@68
|
284 21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
|
jpayne@68
|
285 22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not deliminted by angular brackets",
|
jpayne@68
|
286 23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
|
jpayne@68
|
287 24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
|
jpayne@68
|
288 25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
|
jpayne@68
|
289 26:"WRONG_REF:Wrong reference %s",
|
jpayne@68
|
290 27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
|
jpayne@68
|
291 28:"BAD_CHR_TAG:Error calculating chr tag for %s",
|
jpayne@68
|
292 29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
|
jpayne@68
|
293 30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
|
jpayne@68
|
294 31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
|
jpayne@68
|
295 32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
|
jpayne@68
|
296 33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
|
jpayne@68
|
297 }
|
jpayne@68
|
298
|
jpayne@68
|
299 # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
|
jpayne@68
|
300 _header = []
|
jpayne@68
|
301
|
jpayne@68
|
302 # version number; 33=v3.3; 40=v4.0
|
jpayne@68
|
303 _version = 40
|
jpayne@68
|
304
|
jpayne@68
|
305 # info, filter and format data
|
jpayne@68
|
306 _info = {}
|
jpayne@68
|
307 _filter = {}
|
jpayne@68
|
308 _format = {}
|
jpayne@68
|
309
|
jpayne@68
|
310 # header; and required columns
|
jpayne@68
|
311 _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
|
jpayne@68
|
312 _samples = []
|
jpayne@68
|
313
|
jpayne@68
|
314 # control behaviour
|
jpayne@68
|
315 _ignored_errors = set([11,31]) # ERROR_UNKNOWN_KEY, ERROR_ZERO_FOR_NON_FLAG_FIELD
|
jpayne@68
|
316 _warn_errors = set([])
|
jpayne@68
|
317 _leftalign = False
|
jpayne@68
|
318
|
jpayne@68
|
319 # reference sequence
|
jpayne@68
|
320 _reference = None
|
jpayne@68
|
321
|
jpayne@68
|
322 # regions to include; None includes everything
|
jpayne@68
|
323 _regions = None
|
jpayne@68
|
324
|
jpayne@68
|
325 # statefull stuff
|
jpayne@68
|
326 _lineno = -1
|
jpayne@68
|
327 _line = None
|
jpayne@68
|
328 _lines = None
|
jpayne@68
|
329
|
jpayne@68
|
330 def __init__(self, _copy=None, reference=None, regions=None,
|
jpayne@68
|
331 lines=None, leftalign=False):
|
jpayne@68
|
332 # make error identifiers accessible by name
|
jpayne@68
|
333 for id in self._errors.keys():
|
jpayne@68
|
334 self.__dict__[self._errors[id].split(':')[0]] = id
|
jpayne@68
|
335 if _copy != None:
|
jpayne@68
|
336 self._leftalign = _copy._leftalign
|
jpayne@68
|
337 self._header = _copy._header[:]
|
jpayne@68
|
338 self._version = _copy._version
|
jpayne@68
|
339 self._info = copy.deepcopy(_copy._info)
|
jpayne@68
|
340 self._filter = copy.deepcopy(_copy._filter)
|
jpayne@68
|
341 self._format = copy.deepcopy(_copy._format)
|
jpayne@68
|
342 self._samples = _copy._samples[:]
|
jpayne@68
|
343 self._sample2column = copy.deepcopy(_copy._sample2column)
|
jpayne@68
|
344 self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
|
jpayne@68
|
345 self._warn_errors = copy.deepcopy(_copy._warn_errors)
|
jpayne@68
|
346 self._reference = _copy._reference
|
jpayne@68
|
347 self._regions = _copy._regions
|
jpayne@68
|
348 if reference: self._reference = reference
|
jpayne@68
|
349 if regions: self._regions = regions
|
jpayne@68
|
350 if leftalign: self._leftalign = leftalign
|
jpayne@68
|
351 self._lines = lines
|
jpayne@68
|
352 self.encoding = "ascii"
|
jpayne@68
|
353 self.tabixfile = None
|
jpayne@68
|
354
|
jpayne@68
|
355 def error(self,line,error,opt=None):
|
jpayne@68
|
356 if error in self._ignored_errors: return
|
jpayne@68
|
357 errorlabel, errorstring = self._errors[error].split(':')
|
jpayne@68
|
358 if opt: errorstring = errorstring % opt
|
jpayne@68
|
359 errwarn = ["Error","Warning"][error in self._warn_errors]
|
jpayne@68
|
360 errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
|
jpayne@68
|
361 if error in self._warn_errors: return
|
jpayne@68
|
362 raise ValueError(errorstring)
|
jpayne@68
|
363
|
jpayne@68
|
364 def parse_format(self,line,format,filter=False):
|
jpayne@68
|
365 if self._version == 40:
|
jpayne@68
|
366 if not format.startswith('<'):
|
jpayne@68
|
367 self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
|
jpayne@68
|
368 format = "<"+format
|
jpayne@68
|
369 if not format.endswith('>'):
|
jpayne@68
|
370 self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
|
jpayne@68
|
371 format += ">"
|
jpayne@68
|
372 format = format[1:-1]
|
jpayne@68
|
373 data = {'id':None,'number':None,'type':None,'descr':None}
|
jpayne@68
|
374 idx = 0
|
jpayne@68
|
375 while len(format.strip())>0:
|
jpayne@68
|
376 elts = format.strip().split(',')
|
jpayne@68
|
377 first, rest = elts[0], ','.join(elts[1:])
|
jpayne@68
|
378 if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
|
jpayne@68
|
379 if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
|
jpayne@68
|
380 if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
381 first = ["ID=","Number=","Type=","Description="][idx] + first
|
jpayne@68
|
382 if first.startswith('ID='): data['id'] = first.split('=')[1]
|
jpayne@68
|
383 elif first.startswith('Number='): data['number'] = first.split('=')[1]
|
jpayne@68
|
384 elif first.startswith('Type='): data['type'] = first.split('=')[1]
|
jpayne@68
|
385 elif first.startswith('Description='):
|
jpayne@68
|
386 elts = format.split('"')
|
jpayne@68
|
387 if len(elts)<3:
|
jpayne@68
|
388 self.error(line,self.FORMAT_MISSING_QUOTES)
|
jpayne@68
|
389 elts = first.split('=') + [rest]
|
jpayne@68
|
390 data['descr'] = elts[1]
|
jpayne@68
|
391 rest = '"'.join(elts[2:])
|
jpayne@68
|
392 if rest.startswith(','): rest = rest[1:]
|
jpayne@68
|
393 else:
|
jpayne@68
|
394 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
395 format = rest
|
jpayne@68
|
396 idx += 1
|
jpayne@68
|
397 if filter and idx==1: idx=3 # skip number and type fields for FILTER format strings
|
jpayne@68
|
398 if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
399 if 'descr' not in data:
|
jpayne@68
|
400 # missing description
|
jpayne@68
|
401 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
402 data['descr'] = ""
|
jpayne@68
|
403 if not data['type'] and not data['number']:
|
jpayne@68
|
404 # fine, ##filter format
|
jpayne@68
|
405 return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
|
jpayne@68
|
406 if not data['type'] in ["Integer","Float","Character","String","Flag"]:
|
jpayne@68
|
407 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
408 # I would like a missing-value field, but it isn't there
|
jpayne@68
|
409 if data['type'] in ['Integer','Float']: data['missing'] = None # Do NOT use arbitrary int/float as missing value
|
jpayne@68
|
410 else: data['missing'] = '.'
|
jpayne@68
|
411 if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
412 try:
|
jpayne@68
|
413 n = int(data['number'])
|
jpayne@68
|
414 t = self.NT_NUMBER
|
jpayne@68
|
415 except ValueError:
|
jpayne@68
|
416 n = -1
|
jpayne@68
|
417 if data['number'] == '.': t = self.NT_UNKNOWN
|
jpayne@68
|
418 elif data['number'] == '#alleles': t = self.NT_ALLELES
|
jpayne@68
|
419 elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
|
jpayne@68
|
420 elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
|
jpayne@68
|
421 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
|
jpayne@68
|
422 elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
|
jpayne@68
|
423 # abbreviations added in VCF version v4.1
|
jpayne@68
|
424 elif data['number'] == 'A': t = self.NT_ALLELES
|
jpayne@68
|
425 elif data['number'] == 'G': t = self.NT_GENOTYPES
|
jpayne@68
|
426 else:
|
jpayne@68
|
427 self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
|
jpayne@68
|
428 # if number is 0 - type must be Flag
|
jpayne@68
|
429 if n == 0 and data['type'] != 'Flag':
|
jpayne@68
|
430 self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
|
jpayne@68
|
431 # force type 'Flag' if no number
|
jpayne@68
|
432 data['type'] = 'Flag'
|
jpayne@68
|
433
|
jpayne@68
|
434 return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])
|
jpayne@68
|
435
|
jpayne@68
|
436 def format_format( self, fmt, filter=False ):
|
jpayne@68
|
437 values = [('ID',fmt.id)]
|
jpayne@68
|
438 if fmt.number != None and not filter:
|
jpayne@68
|
439 if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
|
jpayne@68
|
440 elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
|
jpayne@68
|
441 elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
|
jpayne@68
|
442 elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
|
jpayne@68
|
443 elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
|
jpayne@68
|
444 elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
|
jpayne@68
|
445 else:
|
jpayne@68
|
446 raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
|
jpayne@68
|
447 values.append( ('Number',nmb) )
|
jpayne@68
|
448 values.append( ('Type', fmt.type) )
|
jpayne@68
|
449 values.append( ('Description', '"' + fmt.description + '"') )
|
jpayne@68
|
450 if self._version == 33:
|
jpayne@68
|
451 format = ",".join([v for k,v in values])
|
jpayne@68
|
452 else:
|
jpayne@68
|
453 format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
|
jpayne@68
|
454 return format
|
jpayne@68
|
455
|
jpayne@68
|
456 def get_expected(self, format, formatdict, alt):
|
jpayne@68
|
457 fmt = formatdict[format]
|
jpayne@68
|
458 if fmt.numbertype == self.NT_UNKNOWN: return -1
|
jpayne@68
|
459 if fmt.numbertype == self.NT_NUMBER: return fmt.number
|
jpayne@68
|
460 if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
|
jpayne@68
|
461 if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
|
jpayne@68
|
462 if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
|
jpayne@68
|
463 if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
|
jpayne@68
|
464 return 0
|
jpayne@68
|
465
|
jpayne@68
|
466
|
jpayne@68
|
467 def _add_definition(self, formatdict, key, data, line ):
|
jpayne@68
|
468 if key in formatdict: return
|
jpayne@68
|
469 self.error(line,self.ERROR_UNKNOWN_KEY,key)
|
jpayne@68
|
470 if data == None:
|
jpayne@68
|
471 formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
|
jpayne@68
|
472 return
|
jpayne@68
|
473 if data == []: data = [""] # unsure what type -- say string
|
jpayne@68
|
474 if type(data[0]) == type(0.0):
|
jpayne@68
|
475 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
|
jpayne@68
|
476 return
|
jpayne@68
|
477 if type(data[0]) == type(0):
|
jpayne@68
|
478 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
|
jpayne@68
|
479 return
|
jpayne@68
|
480 formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")
|
jpayne@68
|
481
|
jpayne@68
|
482
|
jpayne@68
|
483 # todo: trim trailing missing values
|
jpayne@68
|
484 def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
|
jpayne@68
|
485 output, sdata = [], []
|
jpayne@68
|
486 if type(data) == type([]): # for FORMAT field, make data with dummy values
|
jpayne@68
|
487 d = {}
|
jpayne@68
|
488 for k in data: d[k] = []
|
jpayne@68
|
489 data = d
|
jpayne@68
|
490 # convert missing values; and silently add definitions if required
|
jpayne@68
|
491 for k in data:
|
jpayne@68
|
492 self._add_definition( format, k, data[k], "(output)" )
|
jpayne@68
|
493 for idx,v in enumerate(data[k]):
|
jpayne@68
|
494 if v == format[k].missingvalue: data[k][idx] = "."
|
jpayne@68
|
495 # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
|
jpayne@68
|
496 for k in data:
|
jpayne@68
|
497 if k != 'GT': sdata.append( (k,data[k]) )
|
jpayne@68
|
498 sdata.sort()
|
jpayne@68
|
499 if 'GT' in data:
|
jpayne@68
|
500 sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
|
jpayne@68
|
501 for k,v in sdata:
|
jpayne@68
|
502 if v == []: v = None
|
jpayne@68
|
503 if key and value:
|
jpayne@68
|
504 if v != None: output.append( k+"="+','.join(map(str,v)) )
|
jpayne@68
|
505 else: output.append( k )
|
jpayne@68
|
506 elif key: output.append(k)
|
jpayne@68
|
507 elif value:
|
jpayne@68
|
508 if v != None: output.append( ','.join(map(str,v)) )
|
jpayne@68
|
509 else: output.append( "." ) # should not happen
|
jpayne@68
|
510 # snip off trailing missing data
|
jpayne@68
|
511 while len(output) > 1:
|
jpayne@68
|
512 last = output[-1].replace(',','').replace('.','')
|
jpayne@68
|
513 if len(last)>0: break
|
jpayne@68
|
514 output = output[:-1]
|
jpayne@68
|
515 return separator.join(output)
|
jpayne@68
|
516
|
jpayne@68
|
517
|
jpayne@68
|
518 def enter_default_format(self):
|
jpayne@68
|
519 for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
|
jpayne@68
|
520 FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
|
jpayne@68
|
521 FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
|
jpayne@68
|
522 FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
|
jpayne@68
|
523 FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
|
jpayne@68
|
524 FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
|
jpayne@68
|
525 FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
|
jpayne@68
|
526 FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
|
jpayne@68
|
527 FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
|
jpayne@68
|
528 FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1), # unknown number, since may be haploid
|
jpayne@68
|
529 FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
|
jpayne@68
|
530 FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
|
jpayne@68
|
531 FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allel counts',-1),
|
jpayne@68
|
532 FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
|
jpayne@68
|
533 ]:
|
jpayne@68
|
534 if f.id not in self._format:
|
jpayne@68
|
535 self._format[f.id] = f
|
jpayne@68
|
536
|
jpayne@68
|
537 def parse_header(self, line):
|
jpayne@68
|
538
|
jpayne@68
|
539 assert line.startswith('##')
|
jpayne@68
|
540 elts = line[2:].split('=')
|
jpayne@68
|
541 key = elts[0].strip()
|
jpayne@68
|
542 value = '='.join(elts[1:]).strip()
|
jpayne@68
|
543 if key == "fileformat":
|
jpayne@68
|
544 if value == "VCFv3.3":
|
jpayne@68
|
545 self._version = 33
|
jpayne@68
|
546 elif value == "VCFv4.0":
|
jpayne@68
|
547 self._version = 40
|
jpayne@68
|
548 elif value == "VCFv4.1":
|
jpayne@68
|
549 # AH - for testing
|
jpayne@68
|
550 self._version = 40
|
jpayne@68
|
551 elif value == "VCFv4.2":
|
jpayne@68
|
552 # AH - for testing
|
jpayne@68
|
553 self._version = 40
|
jpayne@68
|
554 else:
|
jpayne@68
|
555 self.error(line,self.UNKNOWN_FORMAT_STRING)
|
jpayne@68
|
556 elif key == "INFO":
|
jpayne@68
|
557 f = self.parse_format(line, value)
|
jpayne@68
|
558 self._info[ f.id ] = f
|
jpayne@68
|
559 elif key == "FILTER":
|
jpayne@68
|
560 f = self.parse_format(line, value, filter=True)
|
jpayne@68
|
561 self._filter[ f.id ] = f
|
jpayne@68
|
562 elif key == "FORMAT":
|
jpayne@68
|
563 f = self.parse_format(line, value)
|
jpayne@68
|
564 self._format[ f.id ] = f
|
jpayne@68
|
565 else:
|
jpayne@68
|
566 # keep other keys in the header field
|
jpayne@68
|
567 self._header.append( (key,value) )
|
jpayne@68
|
568
|
jpayne@68
|
569
|
jpayne@68
|
570 def write_header( self, stream ):
|
jpayne@68
|
571 stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
|
jpayne@68
|
572 for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
|
jpayne@68
|
573 for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
|
jpayne@68
|
574 for f in var.itervalues(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))
|
jpayne@68
|
575
|
jpayne@68
|
576
|
jpayne@68
|
577 def parse_heading( self, line ):
|
jpayne@68
|
578 assert line.startswith('#')
|
jpayne@68
|
579 assert not line.startswith('##')
|
jpayne@68
|
580 headings = line[1:].split('\t')
|
jpayne@68
|
581 # test for 8, as FORMAT field might be missing
|
jpayne@68
|
582 if len(headings)==1 and len(line[1:].split()) >= 8:
|
jpayne@68
|
583 self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
|
jpayne@68
|
584 headings = line[1:].split()
|
jpayne@68
|
585
|
jpayne@68
|
586 for i,s in enumerate(self._required):
|
jpayne@68
|
587
|
jpayne@68
|
588 if len(headings)<=i or headings[i] != s:
|
jpayne@68
|
589
|
jpayne@68
|
590 if len(headings) <= i:
|
jpayne@68
|
591 err = "(%sth entry not found)" % (i+1)
|
jpayne@68
|
592 else:
|
jpayne@68
|
593 err = "(found %s, expected %s)" % (headings[i],s)
|
jpayne@68
|
594
|
jpayne@68
|
595 #self.error(line,self.BADLY_FORMATTED_HEADING,err)
|
jpayne@68
|
596 # allow FORMAT column to be absent
|
jpayne@68
|
597 if len(headings) == 8:
|
jpayne@68
|
598 headings.append("FORMAT")
|
jpayne@68
|
599 else:
|
jpayne@68
|
600 self.error(line,self.BADLY_FORMATTED_HEADING,err)
|
jpayne@68
|
601
|
jpayne@68
|
602 self._samples = headings[9:]
|
jpayne@68
|
603 self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )
|
jpayne@68
|
604
|
jpayne@68
|
605 def write_heading( self, stream ):
|
jpayne@68
|
606 stream.write("#" + "\t".join(self._required + self._samples) + "\n")
|
jpayne@68
|
607
|
jpayne@68
|
608 def convertGT(self, GTstring):
|
jpayne@68
|
609 if GTstring == ".": return ["."]
|
jpayne@68
|
610 try:
|
jpayne@68
|
611 gts = gtsRegEx.split(GTstring)
|
jpayne@68
|
612 if len(gts) == 1: return [int(gts[0])]
|
jpayne@68
|
613 if len(gts) != 2: raise ValueError()
|
jpayne@68
|
614 if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
|
jpayne@68
|
615 return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
|
jpayne@68
|
616 except ValueError:
|
jpayne@68
|
617 self.error(self._line,self.BAD_GENOTYPE,GTstring)
|
jpayne@68
|
618 return [".","|","."]
|
jpayne@68
|
619
|
jpayne@68
|
620 def convertGTback(self, GTdata):
|
jpayne@68
|
621 return ''.join(map(str,GTdata))
|
jpayne@68
|
622
|
jpayne@68
|
623 def parse_formatdata( self, key, value, formatdict, line ):
|
jpayne@68
|
624 # To do: check that the right number of values is present
|
jpayne@68
|
625 f = formatdict.get(key,None)
|
jpayne@68
|
626 if f == None:
|
jpayne@68
|
627 self._add_definition(formatdict, key, value, line )
|
jpayne@68
|
628 f = formatdict[key]
|
jpayne@68
|
629 if f.type == "Flag":
|
jpayne@68
|
630 if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
|
jpayne@68
|
631 return []
|
jpayne@68
|
632 values = value.split(',')
|
jpayne@68
|
633 # deal with trailing data in some early VCF files
|
jpayne@68
|
634 if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
|
jpayne@68
|
635 self.error(line,self.ERROR_TRAILING_DATA,values[-1])
|
jpayne@68
|
636 values[-1] = values[-1].split(';')[0]
|
jpayne@68
|
637 if f.type == "Integer":
|
jpayne@68
|
638 for idx,v in enumerate(values):
|
jpayne@68
|
639 try:
|
jpayne@68
|
640 if v == ".": values[idx] = f.missingvalue
|
jpayne@68
|
641 else: values[idx] = int(v)
|
jpayne@68
|
642 except:
|
jpayne@68
|
643 self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
|
jpayne@68
|
644 return [0] * len(values)
|
jpayne@68
|
645 return values
|
jpayne@68
|
646 elif f.type == "String":
|
jpayne@68
|
647 self._line = line
|
jpayne@68
|
648 if f.id == "GT": values = list(map( self.convertGT, values ))
|
jpayne@68
|
649 return values
|
jpayne@68
|
650 elif f.type == "Character":
|
jpayne@68
|
651 for v in values:
|
jpayne@68
|
652 if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
|
jpayne@68
|
653 return values
|
jpayne@68
|
654 elif f.type == "Float":
|
jpayne@68
|
655 for idx,v in enumerate(values):
|
jpayne@68
|
656 if v == ".": values[idx] = f.missingvalue
|
jpayne@68
|
657 try: return list(map(float,values))
|
jpayne@68
|
658 except:
|
jpayne@68
|
659 self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
|
jpayne@68
|
660 return [0.0] * len(values)
|
jpayne@68
|
661 else:
|
jpayne@68
|
662 # can't happen
|
jpayne@68
|
663 self.error(line,self.ERROR_INFO_STRING)
|
jpayne@68
|
664
|
jpayne@68
|
665 def inregion(self, chrom, pos):
|
jpayne@68
|
666 if not self._regions: return True
|
jpayne@68
|
667 for r in self._regions:
|
jpayne@68
|
668 if r[0] == chrom and r[1] <= pos < r[2]: return True
|
jpayne@68
|
669 return False
|
jpayne@68
|
670
|
jpayne@68
|
671 def parse_data( self, line, lineparse=False ):
|
jpayne@68
|
672 cols = line.split('\t')
|
jpayne@68
|
673 if len(cols) != len(self._samples)+9:
|
jpayne@68
|
674 # gracefully deal with absent FORMAT column
|
jpayne@68
|
675 # and those missing samples
|
jpayne@68
|
676 if len(cols) == 8:
|
jpayne@68
|
677 cols.append("")
|
jpayne@68
|
678 else:
|
jpayne@68
|
679 self.error(line,
|
jpayne@68
|
680 self.BAD_NUMBER_OF_COLUMNS,
|
jpayne@68
|
681 "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))
|
jpayne@68
|
682
|
jpayne@68
|
683 chrom = cols[0]
|
jpayne@68
|
684
|
jpayne@68
|
685 # get 0-based position
|
jpayne@68
|
686 try: pos = int(cols[1])-1
|
jpayne@68
|
687 except: self.error(line,self.POS_NOT_NUMERICAL)
|
jpayne@68
|
688 if pos < 0: self.error(line,self.POS_NOT_POSITIVE)
|
jpayne@68
|
689
|
jpayne@68
|
690 # implement filtering
|
jpayne@68
|
691 if not self.inregion(chrom,pos): return None
|
jpayne@68
|
692
|
jpayne@68
|
693 # end of first-pass parse for sortedVCF
|
jpayne@68
|
694 if lineparse: return chrom, pos, line
|
jpayne@68
|
695
|
jpayne@68
|
696 id = cols[2]
|
jpayne@68
|
697
|
jpayne@68
|
698 ref = cols[3].upper()
|
jpayne@68
|
699 if ref == ".":
|
jpayne@68
|
700 self.error(line,self.MISSING_REF)
|
jpayne@68
|
701 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
|
jpayne@68
|
702 else: ref = ""
|
jpayne@68
|
703 else:
|
jpayne@68
|
704 for c in ref:
|
jpayne@68
|
705 if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
|
jpayne@68
|
706 if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)
|
jpayne@68
|
707
|
jpayne@68
|
708 # make sure reference is sane
|
jpayne@68
|
709 if self._reference:
|
jpayne@68
|
710 left = max(0,pos-100)
|
jpayne@68
|
711 faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
|
jpayne@68
|
712 faref = faref_leftflank[pos-left:]
|
jpayne@68
|
713 if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
|
jpayne@68
|
714 ref = faref
|
jpayne@68
|
715
|
jpayne@68
|
716 # convert v3.3 to v4.0 alleles below
|
jpayne@68
|
717 if cols[4] == ".": alt = []
|
jpayne@68
|
718 else: alt = cols[4].upper().split(',')
|
jpayne@68
|
719
|
jpayne@68
|
720 if cols[5] == ".": qual = -1
|
jpayne@68
|
721 else:
|
jpayne@68
|
722 try: qual = float(cols[5])
|
jpayne@68
|
723 except: self.error(line,self.QUAL_NOT_NUMERICAL)
|
jpayne@68
|
724
|
jpayne@68
|
725 # postpone checking that filters exist. Encode missing filter or no filtering as empty list
|
jpayne@68
|
726 if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
|
jpayne@68
|
727 else: filter = cols[6].split(';')
|
jpayne@68
|
728
|
jpayne@68
|
729 # dictionary of keys, and list of values
|
jpayne@68
|
730 info = {}
|
jpayne@68
|
731 if cols[7] != ".":
|
jpayne@68
|
732 for blurp in cols[7].split(';'):
|
jpayne@68
|
733 elts = blurp.split('=')
|
jpayne@68
|
734 if len(elts) == 1: v = None
|
jpayne@68
|
735 elif len(elts) == 2: v = elts[1]
|
jpayne@68
|
736 else: self.error(line,self.ERROR_INFO_STRING)
|
jpayne@68
|
737 info[elts[0]] = self.parse_formatdata(elts[0],
|
jpayne@68
|
738 v,
|
jpayne@68
|
739 self._info,
|
jpayne@68
|
740 line)
|
jpayne@68
|
741
|
jpayne@68
|
742 # Gracefully deal with absent FORMAT column
|
jpayne@68
|
743 if cols[8] == "": format = []
|
jpayne@68
|
744 else: format = cols[8].split(':')
|
jpayne@68
|
745
|
jpayne@68
|
746 # check: all filters are defined
|
jpayne@68
|
747 for f in filter:
|
jpayne@68
|
748 if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)
|
jpayne@68
|
749
|
jpayne@68
|
750 # check: format fields are defined
|
jpayne@68
|
751 if self._format:
|
jpayne@68
|
752 for f in format:
|
jpayne@68
|
753 if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)
|
jpayne@68
|
754
|
jpayne@68
|
755 # convert v3.3 alleles
|
jpayne@68
|
756 if self._version == 33:
|
jpayne@68
|
757 if len(ref) != 1: self.error(line,self.V33_BAD_REF)
|
jpayne@68
|
758 newalts = []
|
jpayne@68
|
759 have_deletions = False
|
jpayne@68
|
760 for a in alt:
|
jpayne@68
|
761 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
|
jpayne@68
|
762 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
|
jpayne@68
|
763 elif a.startswith('D'): # allow D<seq> and D<num>
|
jpayne@68
|
764 have_deletions = True
|
jpayne@68
|
765 try:
|
jpayne@68
|
766 l = int(a[1:]) # throws ValueError if sequence
|
jpayne@68
|
767 if len(ref) < l: # add to reference if necessary
|
jpayne@68
|
768 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
|
jpayne@68
|
769 ref += addns
|
jpayne@68
|
770 for i,na in enumerate(newalts): newalts[i] = na+addns
|
jpayne@68
|
771 a = ref[l:] # new deletion, deleting pos...pos+l
|
jpayne@68
|
772 except ValueError:
|
jpayne@68
|
773 s = a[1:]
|
jpayne@68
|
774 if len(ref) < len(s): # add Ns to reference if necessary
|
jpayne@68
|
775 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
|
jpayne@68
|
776 if not s.endswith(addns) and addns != 'N'*len(addns):
|
jpayne@68
|
777 self.error(line,self.V33_UNMATCHED_DELETION,
|
jpayne@68
|
778 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
|
jpayne@68
|
779 ref += addns
|
jpayne@68
|
780 for i,na in enumerate(newalts): newalts[i] = na+addns
|
jpayne@68
|
781 a = ref[len(s):] # new deletion, deleting from pos
|
jpayne@68
|
782 else:
|
jpayne@68
|
783 self.error(line,self.V33_BAD_ALLELE)
|
jpayne@68
|
784 newalts.append(a)
|
jpayne@68
|
785 alt = newalts
|
jpayne@68
|
786 # deletion alleles exist, add dummy 1st reference allele, and account for leading base
|
jpayne@68
|
787 if have_deletions:
|
jpayne@68
|
788 if pos == 0:
|
jpayne@68
|
789 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
|
jpayne@68
|
790 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
|
jpayne@68
|
791 ref += addn
|
jpayne@68
|
792 alt = [allele+addn for allele in alt]
|
jpayne@68
|
793 else:
|
jpayne@68
|
794 addn = get_sequence(chrom,pos-1,pos,self._reference)
|
jpayne@68
|
795 ref = addn + ref
|
jpayne@68
|
796 alt = [addn + allele for allele in alt]
|
jpayne@68
|
797 pos -= 1
|
jpayne@68
|
798 else:
|
jpayne@68
|
799 # format v4.0 -- just check for nucleotides
|
jpayne@68
|
800 for allele in alt:
|
jpayne@68
|
801 if not alleleRegEx.match(allele):
|
jpayne@68
|
802 self.error(line,self.V40_BAD_ALLELE,allele)
|
jpayne@68
|
803
|
jpayne@68
|
804 # check for leading nucleotide in indel calls
|
jpayne@68
|
805 for allele in alt:
|
jpayne@68
|
806 if len(allele) != len(ref):
|
jpayne@68
|
807 if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
|
jpayne@68
|
808 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
|
jpayne@68
|
809 self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)
|
jpayne@68
|
810
|
jpayne@68
|
811 # trim trailing bases in alleles
|
jpayne@68
|
812 # AH: not certain why trimming this needs to be added
|
jpayne@68
|
813 # disabled now for unit testing
|
jpayne@68
|
814 # if alt:
|
jpayne@68
|
815 # for i in range(1,min(len(ref),min(map(len,alt)))):
|
jpayne@68
|
816 # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
|
jpayne@68
|
817 # break
|
jpayne@68
|
818 # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
|
jpayne@68
|
819
|
jpayne@68
|
820 # left-align alleles, if a reference is available
|
jpayne@68
|
821 if self._leftalign and self._reference:
|
jpayne@68
|
822 while left < pos:
|
jpayne@68
|
823 movable = True
|
jpayne@68
|
824 for allele in alt:
|
jpayne@68
|
825 if len(allele) > len(ref):
|
jpayne@68
|
826 longest, shortest = allele, ref
|
jpayne@68
|
827 else:
|
jpayne@68
|
828 longest, shortest = ref, allele
|
jpayne@68
|
829 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
|
jpayne@68
|
830 movable = False
|
jpayne@68
|
831 if longest[-1].upper() != longest[len(shortest)-1].upper():
|
jpayne@68
|
832 movable = False
|
jpayne@68
|
833 if not movable:
|
jpayne@68
|
834 break
|
jpayne@68
|
835 ref = ref[:-1]
|
jpayne@68
|
836 alt = [allele[:-1] for allele in alt]
|
jpayne@68
|
837 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
|
jpayne@68
|
838 ref = faref_leftflank[pos-left-1] + ref
|
jpayne@68
|
839 alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
|
jpayne@68
|
840 pos -= 1
|
jpayne@68
|
841
|
jpayne@68
|
842 # parse sample columns
|
jpayne@68
|
843 samples = []
|
jpayne@68
|
844 for sample in cols[9:]:
|
jpayne@68
|
845 dict = {}
|
jpayne@68
|
846 values = sample.split(':')
|
jpayne@68
|
847 if len(values) > len(format):
|
jpayne@68
|
848 self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
|
jpayne@68
|
849 for idx in range(len(format)):
|
jpayne@68
|
850 expected = self.get_expected(format[idx], self._format, alt)
|
jpayne@68
|
851 if idx < len(values): value = values[idx]
|
jpayne@68
|
852 else:
|
jpayne@68
|
853 if expected == -1: value = "."
|
jpayne@68
|
854 else: value = ",".join(["."]*expected)
|
jpayne@68
|
855
|
jpayne@68
|
856 dict[format[idx]] = self.parse_formatdata(format[idx],
|
jpayne@68
|
857 value,
|
jpayne@68
|
858 self._format,
|
jpayne@68
|
859 line)
|
jpayne@68
|
860 if expected != -1 and len(dict[format[idx]]) != expected:
|
jpayne@68
|
861 self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
|
jpayne@68
|
862 "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
|
jpayne@68
|
863 if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
|
jpayne@68
|
864 dict[format[idx]] = dict[format[idx]][:expected]
|
jpayne@68
|
865 samples.append( dict )
|
jpayne@68
|
866
|
jpayne@68
|
867 # done
|
jpayne@68
|
868 d = {'chrom':chrom,
|
jpayne@68
|
869 'pos':pos, # return 0-based position
|
jpayne@68
|
870 'id':id,
|
jpayne@68
|
871 'ref':ref,
|
jpayne@68
|
872 'alt':alt,
|
jpayne@68
|
873 'qual':qual,
|
jpayne@68
|
874 'filter':filter,
|
jpayne@68
|
875 'info':info,
|
jpayne@68
|
876 'format':format}
|
jpayne@68
|
877 for key,value in zip(self._samples,samples):
|
jpayne@68
|
878 d[key] = value
|
jpayne@68
|
879
|
jpayne@68
|
880 return d
|
jpayne@68
|
881
|
jpayne@68
|
882
|
jpayne@68
|
883 def write_data(self, stream, data):
|
jpayne@68
|
884 required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
|
jpayne@68
|
885 for k in required:
|
jpayne@68
|
886 if k not in data: raise ValueError("Required key %s not found in data" % str(k))
|
jpayne@68
|
887 if data['alt'] == []: alt = "."
|
jpayne@68
|
888 else: alt = ",".join(data['alt'])
|
jpayne@68
|
889 if data['filter'] == None: filter = "."
|
jpayne@68
|
890 elif data['filter'] == []:
|
jpayne@68
|
891 if self._version == 33: filter = "0"
|
jpayne@68
|
892 else: filter = "PASS"
|
jpayne@68
|
893 else: filter = ';'.join(data['filter'])
|
jpayne@68
|
894 if data['qual'] == -1: qual = "."
|
jpayne@68
|
895 else: qual = str(data['qual'])
|
jpayne@68
|
896
|
jpayne@68
|
897 output = [data['chrom'],
|
jpayne@68
|
898 str(data['pos']+1), # change to 1-based position
|
jpayne@68
|
899 data['id'],
|
jpayne@68
|
900 data['ref'],
|
jpayne@68
|
901 alt,
|
jpayne@68
|
902 qual,
|
jpayne@68
|
903 filter,
|
jpayne@68
|
904 self.format_formatdata(
|
jpayne@68
|
905 data['info'], self._info, separator=";"),
|
jpayne@68
|
906 self.format_formatdata(
|
jpayne@68
|
907 data['format'], self._format, value=False)]
|
jpayne@68
|
908
|
jpayne@68
|
909 for s in self._samples:
|
jpayne@68
|
910 output.append(self.format_formatdata(
|
jpayne@68
|
911 data[s], self._format, key=False))
|
jpayne@68
|
912
|
jpayne@68
|
913 stream.write( "\t".join(output) + "\n" )
|
jpayne@68
|
914
|
jpayne@68
|
915 def _parse_header(self, stream):
|
jpayne@68
|
916 self._lineno = 0
|
jpayne@68
|
917 for line in stream:
|
jpayne@68
|
918 line = force_str(line, self.encoding)
|
jpayne@68
|
919 self._lineno += 1
|
jpayne@68
|
920 if line.startswith('##'):
|
jpayne@68
|
921 self.parse_header(line.strip())
|
jpayne@68
|
922 elif line.startswith('#'):
|
jpayne@68
|
923 self.parse_heading(line.strip())
|
jpayne@68
|
924 self.enter_default_format()
|
jpayne@68
|
925 else:
|
jpayne@68
|
926 break
|
jpayne@68
|
927 return line
|
jpayne@68
|
928
|
jpayne@68
|
929 def _parse(self, line, stream):
|
jpayne@68
|
930 # deal with files with header only
|
jpayne@68
|
931 if line.startswith("##"): return
|
jpayne@68
|
932 if len(line.strip()) > 0:
|
jpayne@68
|
933 d = self.parse_data( line.strip() )
|
jpayne@68
|
934 if d: yield d
|
jpayne@68
|
935 for line in stream:
|
jpayne@68
|
936 self._lineno += 1
|
jpayne@68
|
937 if self._lines and self._lineno > self._lines: raise StopIteration
|
jpayne@68
|
938 d = self.parse_data( line.strip() )
|
jpayne@68
|
939 if d: yield d
|
jpayne@68
|
940
|
jpayne@68
|
941 ######################################################################################################
|
jpayne@68
|
942 #
|
jpayne@68
|
943 # API follows
|
jpayne@68
|
944 #
|
jpayne@68
|
945 ######################################################################################################
|
jpayne@68
|
946
|
jpayne@68
|
947 def getsamples(self):
|
jpayne@68
|
948 """ List of samples in VCF file """
|
jpayne@68
|
949 return self._samples
|
jpayne@68
|
950
|
jpayne@68
|
951 def setsamples(self,samples):
|
jpayne@68
|
952 """ List of samples in VCF file """
|
jpayne@68
|
953 self._samples = samples
|
jpayne@68
|
954
|
jpayne@68
|
955 def getheader(self):
|
jpayne@68
|
956 """ List of header key-value pairs (strings) """
|
jpayne@68
|
957 return self._header
|
jpayne@68
|
958
|
jpayne@68
|
959 def setheader(self,header):
|
jpayne@68
|
960 """ List of header key-value pairs (strings) """
|
jpayne@68
|
961 self._header = header
|
jpayne@68
|
962
|
jpayne@68
|
963 def getinfo(self):
|
jpayne@68
|
964 """ Dictionary of ##INFO tags, as VCF.FORMAT values """
|
jpayne@68
|
965 return self._info
|
jpayne@68
|
966
|
jpayne@68
|
967 def setinfo(self,info):
|
jpayne@68
|
968 """ Dictionary of ##INFO tags, as VCF.FORMAT values """
|
jpayne@68
|
969 self._info = info
|
jpayne@68
|
970
|
jpayne@68
|
971 def getformat(self):
|
jpayne@68
|
972 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
|
jpayne@68
|
973 return self._format
|
jpayne@68
|
974
|
jpayne@68
|
975 def setformat(self,format):
|
jpayne@68
|
976 """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
|
jpayne@68
|
977 self._format = format
|
jpayne@68
|
978
|
jpayne@68
|
979 def getfilter(self):
|
jpayne@68
|
980 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
|
jpayne@68
|
981 return self._filter
|
jpayne@68
|
982
|
jpayne@68
|
983 def setfilter(self,filter):
|
jpayne@68
|
984 """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
|
jpayne@68
|
985 self._filter = filter
|
jpayne@68
|
986
|
jpayne@68
|
987 def setversion(self, version):
|
jpayne@68
|
988 if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
|
jpayne@68
|
989 self._version = version
|
jpayne@68
|
990
|
jpayne@68
|
991 def setregions(self, regions):
|
jpayne@68
|
992 self._regions = regions
|
jpayne@68
|
993
|
jpayne@68
|
994 def setreference(self, ref):
|
jpayne@68
|
995 """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
|
jpayne@68
|
996 self._reference = ref
|
jpayne@68
|
997
|
jpayne@68
|
998 def ignoreerror(self, errorstring):
|
jpayne@68
|
999 try: self._ignored_errors.add(self.__dict__[errorstring])
|
jpayne@68
|
1000 except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
|
jpayne@68
|
1001
|
jpayne@68
|
1002 def warnerror(self, errorstring):
|
jpayne@68
|
1003 try: self._warn_errors.add(self.__dict__[errorstring])
|
jpayne@68
|
1004 except KeyError: raise ValueError("Invalid error string: %s" % errorstring)
|
jpayne@68
|
1005
|
jpayne@68
|
1006 def parse(self, stream):
|
jpayne@68
|
1007 """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
|
jpayne@68
|
1008 last_line = self._parse_header(stream)
|
jpayne@68
|
1009 # now return a generator that does the actual work. In this way the pre-processing is done
|
jpayne@68
|
1010 # before the first piece of data is yielded
|
jpayne@68
|
1011 return self._parse(last_line, stream)
|
jpayne@68
|
1012
|
jpayne@68
|
1013 def write(self, stream, datagenerator):
|
jpayne@68
|
1014 """ Writes a VCF file to a stream, using a data generator (or list) """
|
jpayne@68
|
1015 self.write_header(stream)
|
jpayne@68
|
1016 self.write_heading(stream)
|
jpayne@68
|
1017 for data in datagenerator: self.write_data(stream,data)
|
jpayne@68
|
1018
|
jpayne@68
|
1019 def writeheader(self, stream):
|
jpayne@68
|
1020 """ Writes a VCF header """
|
jpayne@68
|
1021 self.write_header(stream)
|
jpayne@68
|
1022 self.write_heading(stream)
|
jpayne@68
|
1023
|
jpayne@68
|
1024 def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
|
jpayne@68
|
1025 """ Utility function: compares two calls for equality """
|
jpayne@68
|
1026 # a variant should always be assigned to a unique position, one base before
|
jpayne@68
|
1027 # the leftmost position of the alignment gap. If this rule is implemented
|
jpayne@68
|
1028 # correctly, the two positions must be equal for the calls to be identical.
|
jpayne@68
|
1029 if pos1 != pos2: return False
|
jpayne@68
|
1030 # from both calls, trim rightmost bases when identical. Do this safely, i.e.
|
jpayne@68
|
1031 # only when the reference bases are not Ns
|
jpayne@68
|
1032 while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
|
jpayne@68
|
1033 ref1 = ref1[:-1]
|
jpayne@68
|
1034 alt1 = alt1[:-1]
|
jpayne@68
|
1035 while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
|
jpayne@68
|
1036 ref2 = ref2[:-1]
|
jpayne@68
|
1037 alt2 = alt2[:-1]
|
jpayne@68
|
1038 # now, the alternative alleles must be identical
|
jpayne@68
|
1039 return alt1 == alt2
|
jpayne@68
|
1040
|
jpayne@68
|
1041 ###########################################################################################################
|
jpayne@68
|
1042 ###########################################################################################################
|
jpayne@68
|
1043 ## API functions added by Andreas
|
jpayne@68
|
1044 ###########################################################################################################
|
jpayne@68
|
1045
|
jpayne@68
|
1046 def connect(self, filename, encoding="ascii"):
|
jpayne@68
|
1047 '''connect to tabix file.'''
|
jpayne@68
|
1048 self.encoding=encoding
|
jpayne@68
|
1049 self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
|
jpayne@68
|
1050 self._parse_header(self.tabixfile.header)
|
jpayne@68
|
1051
|
jpayne@68
|
1052 def __del__(self):
|
jpayne@68
|
1053 self.close()
|
jpayne@68
|
1054 self.tabixfile = None
|
jpayne@68
|
1055
|
jpayne@68
|
1056 def close(self):
|
jpayne@68
|
1057 if self.tabixfile:
|
jpayne@68
|
1058 self.tabixfile.close()
|
jpayne@68
|
1059 self.tabixfile = None
|
jpayne@68
|
1060
|
jpayne@68
|
1061 def fetch(self,
|
jpayne@68
|
1062 reference=None,
|
jpayne@68
|
1063 start=None,
|
jpayne@68
|
1064 end=None,
|
jpayne@68
|
1065 region=None ):
|
jpayne@68
|
1066 """ Parse a stream of VCF-formatted lines.
|
jpayne@68
|
1067 Initializes class instance and return generator """
|
jpayne@68
|
1068 return self.tabixfile.fetch(
|
jpayne@68
|
1069 reference,
|
jpayne@68
|
1070 start,
|
jpayne@68
|
1071 end,
|
jpayne@68
|
1072 region,
|
jpayne@68
|
1073 parser = asVCFRecord(self))
|
jpayne@68
|
1074
|
jpayne@68
|
1075 def validate(self, record):
|
jpayne@68
|
1076 '''validate vcf record.
|
jpayne@68
|
1077
|
jpayne@68
|
1078 returns a validated record.
|
jpayne@68
|
1079 '''
|
jpayne@68
|
1080
|
jpayne@68
|
1081 raise NotImplementedError("needs to be checked")
|
jpayne@68
|
1082
|
jpayne@68
|
1083 chrom, pos = record.chrom, record.pos
|
jpayne@68
|
1084
|
jpayne@68
|
1085 # check reference
|
jpayne@68
|
1086 ref = record.ref
|
jpayne@68
|
1087 if ref == ".":
|
jpayne@68
|
1088 self.error(str(record),self.MISSING_REF)
|
jpayne@68
|
1089 if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
|
jpayne@68
|
1090 else: ref = ""
|
jpayne@68
|
1091 else:
|
jpayne@68
|
1092 for c in ref:
|
jpayne@68
|
1093 if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
|
jpayne@68
|
1094 if "N" in ref: ref = get_sequence(chrom,
|
jpayne@68
|
1095 pos,
|
jpayne@68
|
1096 pos+len(ref),
|
jpayne@68
|
1097 self._reference)
|
jpayne@68
|
1098
|
jpayne@68
|
1099 # make sure reference is sane
|
jpayne@68
|
1100 if self._reference:
|
jpayne@68
|
1101 left = max(0,self.pos-100)
|
jpayne@68
|
1102 faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
|
jpayne@68
|
1103 faref = faref_leftflank[pos-left:]
|
jpayne@68
|
1104 if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
|
jpayne@68
|
1105 ref = faref
|
jpayne@68
|
1106
|
jpayne@68
|
1107 # check: format fields are defined
|
jpayne@68
|
1108 for f in record.format:
|
jpayne@68
|
1109 if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)
|
jpayne@68
|
1110
|
jpayne@68
|
1111 # check: all filters are defined
|
jpayne@68
|
1112 for f in record.filter:
|
jpayne@68
|
1113 if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)
|
jpayne@68
|
1114
|
jpayne@68
|
1115 # convert v3.3 alleles
|
jpayne@68
|
1116 if self._version == 33:
|
jpayne@68
|
1117 if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
|
jpayne@68
|
1118 newalts = []
|
jpayne@68
|
1119 have_deletions = False
|
jpayne@68
|
1120 for a in alt:
|
jpayne@68
|
1121 if len(a) == 1: a = a + ref[1:] # SNP; add trailing reference
|
jpayne@68
|
1122 elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:] # insertion just beyond pos; add first and trailing reference
|
jpayne@68
|
1123 elif a.startswith('D'): # allow D<seq> and D<num>
|
jpayne@68
|
1124 have_deletions = True
|
jpayne@68
|
1125 try:
|
jpayne@68
|
1126 l = int(a[1:]) # throws ValueError if sequence
|
jpayne@68
|
1127 if len(ref) < l: # add to reference if necessary
|
jpayne@68
|
1128 addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
|
jpayne@68
|
1129 ref += addns
|
jpayne@68
|
1130 for i,na in enumerate(newalts): newalts[i] = na+addns
|
jpayne@68
|
1131 a = ref[l:] # new deletion, deleting pos...pos+l
|
jpayne@68
|
1132 except ValueError:
|
jpayne@68
|
1133 s = a[1:]
|
jpayne@68
|
1134 if len(ref) < len(s): # add Ns to reference if necessary
|
jpayne@68
|
1135 addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
|
jpayne@68
|
1136 if not s.endswith(addns) and addns != 'N'*len(addns):
|
jpayne@68
|
1137 self.error(str(record),self.V33_UNMATCHED_DELETION,
|
jpayne@68
|
1138 "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
|
jpayne@68
|
1139 ref += addns
|
jpayne@68
|
1140 for i,na in enumerate(newalts): newalts[i] = na+addns
|
jpayne@68
|
1141 a = ref[len(s):] # new deletion, deleting from pos
|
jpayne@68
|
1142 else:
|
jpayne@68
|
1143 self.error(str(record),self.V33_BAD_ALLELE)
|
jpayne@68
|
1144 newalts.append(a)
|
jpayne@68
|
1145 alt = newalts
|
jpayne@68
|
1146 # deletion alleles exist, add dummy 1st reference allele, and account for leading base
|
jpayne@68
|
1147 if have_deletions:
|
jpayne@68
|
1148 if pos == 0:
|
jpayne@68
|
1149 # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
|
jpayne@68
|
1150 addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
|
jpayne@68
|
1151 ref += addn
|
jpayne@68
|
1152 alt = [allele+addn for allele in alt]
|
jpayne@68
|
1153 else:
|
jpayne@68
|
1154 addn = get_sequence(chrom,pos-1,pos,self._reference)
|
jpayne@68
|
1155 ref = addn + ref
|
jpayne@68
|
1156 alt = [addn + allele for allele in alt]
|
jpayne@68
|
1157 pos -= 1
|
jpayne@68
|
1158 else:
|
jpayne@68
|
1159 # format v4.0 -- just check for nucleotides
|
jpayne@68
|
1160 for allele in alt:
|
jpayne@68
|
1161 if not alleleRegEx.match(allele):
|
jpayne@68
|
1162 self.error(str(record),self.V40_BAD_ALLELE,allele)
|
jpayne@68
|
1163
|
jpayne@68
|
1164
|
jpayne@68
|
1165 # check for leading nucleotide in indel calls
|
jpayne@68
|
1166 for allele in alt:
|
jpayne@68
|
1167 if len(allele) != len(ref):
|
jpayne@68
|
1168 if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
|
jpayne@68
|
1169 if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
|
jpayne@68
|
1170 self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)
|
jpayne@68
|
1171
|
jpayne@68
|
1172 # trim trailing bases in alleles
|
jpayne@68
|
1173 # AH: not certain why trimming this needs to be added
|
jpayne@68
|
1174 # disabled now for unit testing
|
jpayne@68
|
1175 # for i in range(1,min(len(ref),min(map(len,alt)))):
|
jpayne@68
|
1176 # if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
|
jpayne@68
|
1177 # break
|
jpayne@68
|
1178 # ref, alt = ref[:-1], [allele[:-1] for allele in alt]
|
jpayne@68
|
1179
|
jpayne@68
|
1180 # left-align alleles, if a reference is available
|
jpayne@68
|
1181 if self._leftalign and self._reference:
|
jpayne@68
|
1182 while left < pos:
|
jpayne@68
|
1183 movable = True
|
jpayne@68
|
1184 for allele in alt:
|
jpayne@68
|
1185 if len(allele) > len(ref):
|
jpayne@68
|
1186 longest, shortest = allele, ref
|
jpayne@68
|
1187 else:
|
jpayne@68
|
1188 longest, shortest = ref, allele
|
jpayne@68
|
1189 if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
|
jpayne@68
|
1190 movable = False
|
jpayne@68
|
1191 if longest[-1].upper() != longest[len(shortest)-1].upper():
|
jpayne@68
|
1192 movable = False
|
jpayne@68
|
1193 if not movable:
|
jpayne@68
|
1194 break
|
jpayne@68
|
1195 ref = ref[:-1]
|
jpayne@68
|
1196 alt = [allele[:-1] for allele in alt]
|
jpayne@68
|
1197 if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
|
jpayne@68
|
1198 ref = faref_leftflank[pos-left-1] + ref
|
jpayne@68
|
1199 alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
|
jpayne@68
|
1200 pos -= 1
|
jpayne@68
|
1201
|
jpayne@68
|
1202 __all__ = [
|
jpayne@68
|
1203 "VCF", "VCFRecord", ]
|