# cython: embedsignature=True
#
# Code to read, write and edit VCF files
#
# VCF lines are encoded as a dictionary with these keys (note: all lowercase):
# 'chrom': string
# 'pos': integer
# 'id': string
# 'ref': string
# 'alt': list of strings
# 'qual': integer
# 'filter': None (missing value), or list of keys (strings); empty list parsed as ["PASS"]
# 'info': dictionary of values (see below)
# 'format': list of keys (strings)
# sample keys: dictionary of values (see below)
#
# The sample keys are accessible through vcf.getsamples()
#
# A dictionary of values contains value keys (defined in ##INFO or
# ##FORMAT lines) which map to a list, containing integers, floats,
# strings, or characters. Missing values are replaced by a particular
# value, often -1 or .
#
# Genotypes are not stored as a string, but as a list of 1 or 3
# elements (for haploid and diploid samples), the first (and last) the
# integer representing an allele, and the second the separation
# character. Note that there is just one genotype per sample, but for
# consistency the single element is stored in a list.
#
# Header lines other than ##INFO, ##FORMAT and ##FILTER are stored as
# (key, value) pairs and are accessible through getheader()
#
# The VCF class can be instantiated with a 'regions' variable
# consisting of tuples (chrom,start,end) encoding 0-based half-open
# segments. Only variants with a position inside the segment will be
# parsed. A regions parser is available under parse_regions.
#
# When instantiated, a reference can be passed to the VCF class. This
# may be any class that supports a fetch(chrom, start, end) method.
#
# NOTE: the position that is returned to Python is 0-based, NOT
# 1-based as in the VCF file.
# NOTE: There is also preliminary VCF functionality in the VariantFile class.
#
# TODO:
#    only v4.0 writing is complete; alleles are not converted to v3.3 format
#

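# Example (a minimal usage sketch, not part of this module's API; the file
# name "calls.vcf" is only an illustrative assumption):
#
#   vcf = VCF()
#   with open("calls.vcf") as handle:
#       for record in vcf.parse(handle):
#           # 'pos' is 0-based here; add 1 to recover the VCF coordinate
#           print(record['chrom'], record['pos'] + 1, record['ref'], record['alt'])
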
from collections import namedtuple, defaultdict
from operator import itemgetter
import sys, re, copy, bisect

from libc.stdlib cimport atoi
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t

cimport pysam.libctabix as libctabix
cimport pysam.libctabixproxies as libctabixproxies

from pysam.libcutils cimport force_str

import pysam

gtsRegEx = re.compile("[|/\\\\]")
alleleRegEx = re.compile('^[ACGTN]+$')

# Utility function. Uses 0-based coordinates
def get_sequence(chrom, start, end, fa):
    # obtain sequence from .fa file, without truncation
    if end<=start: return ""
    if not fa: return "N"*(end-start)
    if start<0: return "N"*(-start) + get_sequence(chrom, 0, end, fa).upper()
    sequence = fa.fetch(chrom, start, end).upper()
    if len(sequence) < end-start: sequence += "N"*(end-start-len(sequence))
    return sequence

# Utility function. Parses a region string
def parse_regions( string ):
    result = []
    for r in string.split(','):
        elts = r.split(':')
        chrom, start, end = elts[0], 0, 3000000000
        if len(elts)==1: pass
        elif len(elts)==2:
            if len(elts[1])>0:
                ielts = elts[1].split('-')
                if len(ielts) != 2: raise ValueError("Don't understand region string '%s'" % r)
                try: start, end = int(ielts[0])-1, int(ielts[1])
                except: raise ValueError("Don't understand region string '%s'" % r)
        else:
            raise ValueError("Don't understand region string '%s'" % r)
        result.append( (chrom,start,end) )
    return result

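# Example for parse_regions() above (illustrative only):
# parse_regions("chr1:1001-2000,chr2") returns
# [('chr1', 1000, 2000), ('chr2', 0, 3000000000)], i.e. 0-based half-open
# segments; a chromosome given without coordinates spans the whole sequence.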

FORMAT = namedtuple('FORMAT','id numbertype number type description missingvalue')

###########################################################################################################
#
# New class
#
###########################################################################################################

cdef class VCFRecord(libctabixproxies.TupleProxy):
    '''vcf record.

    initialized from data and vcf meta
    '''

    cdef vcf
    cdef char * contig
    cdef uint32_t pos

    def __init__(self, vcf):
        self.vcf = vcf
        self.encoding = vcf.encoding

        # if len(data) != len(self.vcf._samples):
        #     self.vcf.error(str(data),
        #                    self.BAD_NUMBER_OF_COLUMNS,
        #                    "expected %s for %s samples (%s), got %s" % \
        #                        (len(self.vcf._samples),
        #                         len(self.vcf._samples),
        #                         self.vcf._samples,
        #                         len(data)))

    def __cinit__(self, vcf):
        # start indexed access at genotypes
        self.offset = 9

        self.vcf = vcf
        self.encoding = vcf.encoding

    def error(self, line, error, opt=None):
        '''raise error.'''
        # pass to vcf file for error handling
        return self.vcf.error(line, error, opt)

    cdef update(self, char * buffer, size_t nbytes):
        '''update internal data.

        nbytes does not include the terminal '\0'.
        '''
        libctabixproxies.TupleProxy.update(self, buffer, nbytes)

        self.contig = self.fields[0]
        # vcf counts from 1 - correct here
        self.pos = atoi(self.fields[1]) - 1

    def __len__(self):
        return max(0, self.nfields - 9)

    property contig:
        def __get__(self): return self.contig

    property pos:
        def __get__(self): return self.pos

    property id:
        def __get__(self): return self.fields[2]

    property ref:
        def __get__(self):
            return self.fields[3]

    property alt:
        def __get__(self):
            # convert v3.3 to v4.0 alleles below
            alt = self.fields[4]
            if alt == ".": alt = []
            else: alt = alt.upper().split(',')
            return alt

    property qual:
        def __get__(self):
            qual = self.fields[5]
            if qual == b".": qual = -1
            else:
                try: qual = float(qual)
                except: self.vcf.error(str(self),self.QUAL_NOT_NUMERICAL)
            return qual

    property filter:
        def __get__(self):
            f = self.fields[6]
            # postpone checking that filters exist. Encode missing filter or no filtering as empty list
            if f == b"." or f == b"PASS" or f == b"0": return []
            else: return f.split(';')

    property info:
        def __get__(self):
            col = self.fields[7]
            # dictionary of keys, and list of values
            info = {}
            if col != b".":
                for blurp in col.split(';'):
                    elts = blurp.split('=')
                    if len(elts) == 1: v = None
                    elif len(elts) == 2: v = elts[1]
                    else: self.vcf.error(str(self),self.ERROR_INFO_STRING)
                    info[elts[0]] = self.vcf.parse_formatdata(elts[0], v, self.vcf._info, str(self.vcf))
            return info

    property format:
        def __get__(self):
            return self.fields[8].split(':')

    property samples:
        def __get__(self):
            return self.vcf._samples

    def __getitem__(self, key):

        # parse sample columns
        values = self.fields[self.vcf._sample2column[key]].split(':')
        alt = self.alt
        format = self.format

        if len(values) > len(format):
            self.vcf.error(str(self.line),self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" %\
                           (len(values),key,len(format)))

        result = {}
        for idx in range(len(format)):
            expected = self.vcf.get_expected(format[idx], self.vcf._format, alt)
            if idx < len(values): value = values[idx]
            else:
                if expected == -1: value = "."
                else: value = ",".join(["."]*expected)

            result[format[idx]] = self.vcf.parse_formatdata(format[idx], value, self.vcf._format, str(self.data))
            if expected != -1 and len(result[format[idx]]) != expected:
                self.vcf.error(str(self.data),self.BAD_NUMBER_OF_PARAMETERS,
                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,result[format[idx]]))
                if len(result[format[idx]] ) < expected: result[format[idx]] += [result[format[idx]][-1]]*(expected-len(result[format[idx]]))
                result[format[idx]] = result[format[idx]][:expected]

        return result


cdef class asVCFRecord(libctabix.Parser):
    '''converts a :term:`tabix row` into a VCF record.'''
    cdef vcffile
    def __init__(self, vcffile):
        self.vcffile = vcffile

    cdef parse(self, char * buffer, int len):
        cdef VCFRecord r
        r = VCFRecord(self.vcffile)
        r.copy(buffer, len)
        return r

class VCF(object):

    # types
    NT_UNKNOWN = 0
    NT_NUMBER = 1
    NT_ALLELES = 2
    NT_NR_ALLELES = 3
    NT_GENOTYPES = 4
    NT_PHASED_GENOTYPES = 5

    _errors = { 0:"UNKNOWN_FORMAT_STRING:Unknown file format identifier",
                1:"BADLY_FORMATTED_FORMAT_STRING:Formatting error in the format string",
                2:"BADLY_FORMATTED_HEADING:Did not find 9 required headings (CHROM, POS, ..., FORMAT) %s",
                3:"BAD_NUMBER_OF_COLUMNS:Wrong number of columns found (%s)",
                4:"POS_NOT_NUMERICAL:Position column is not numerical",
                5:"UNKNOWN_CHAR_IN_REF:Unknown character in reference field",
                6:"V33_BAD_REF:Reference should be single-character in v3.3 VCF",
                7:"V33_BAD_ALLELE:Cannot interpret allele for v3.3 VCF",
                8:"POS_NOT_POSITIVE:Position field must be >0",
                9:"QUAL_NOT_NUMERICAL:Quality field must be numerical, or '.'",
                10:"ERROR_INFO_STRING:Error while parsing info field",
                11:"ERROR_UNKNOWN_KEY:Unknown key (%s) found in formatted field (info; format; or filter)",
                12:"ERROR_FORMAT_NOT_NUMERICAL:Expected integer or float in formatted field; got %s",
                13:"ERROR_FORMAT_NOT_CHAR:Expected character in formatted field; got string",
                14:"FILTER_NOT_DEFINED:Identifier (%s) in filter found which was not defined in header",
                15:"FORMAT_NOT_DEFINED:Identifier (%s) in format found which was not defined in header",
                16:"BAD_NUMBER_OF_VALUES:Found too many values in sample column (%s)",
                17:"BAD_NUMBER_OF_PARAMETERS:Found unexpected number of parameters (%s)",
                18:"BAD_GENOTYPE:Cannot parse genotype (%s)",
                19:"V40_BAD_ALLELE:Bad allele found for v4.0 VCF (%s)",
                20:"MISSING_REF:Reference allele missing",
                21:"V33_UNMATCHED_DELETION:Deleted sequence does not match reference (%s)",
                22:"V40_MISSING_ANGLE_BRACKETS:Format definition is not delimited by angle brackets",
                23:"FORMAT_MISSING_QUOTES:Description field in format definition is not surrounded by quotes",
                24:"V40_FORMAT_MUST_HAVE_NAMED_FIELDS:Fields in v4.0 VCF format definition must have named fields",
                25:"HEADING_NOT_SEPARATED_BY_TABS:Heading line appears separated by spaces, not tabs",
                26:"WRONG_REF:Wrong reference %s",
                27:"ERROR_TRAILING_DATA:Numerical field ('%s') has semicolon-separated trailing data",
                28:"BAD_CHR_TAG:Error calculating chr tag for %s",
                29:"ZERO_LENGTH_ALLELE:Found zero-length allele",
                30:"MISSING_INDEL_ALLELE_REF_BASE:Indel alleles must begin with single reference base",
                31:"ZERO_FOR_NON_FLAG_FIELD: number set to 0, but type is not 'FLAG'",
                32:"ERROR_FORMAT_NOT_INTEGER:Expected integer in formatted field; got %s",
                33:"ERROR_FLAG_HAS_VALUE:Flag fields should not have a value",
                }

    # tag-value pairs; tags are not unique; does not include fileformat, INFO, FILTER or FORMAT fields
    _header = []

    # version number; 33=v3.3; 40=v4.0
    _version = 40

    # info, filter and format data
    _info = {}
    _filter = {}
    _format = {}

    # header; and required columns
    _required = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
    _samples = []

    # control behaviour
    _ignored_errors = set([11,31])   # ERROR_UNKNOWN_KEY, ZERO_FOR_NON_FLAG_FIELD
    _warn_errors = set([])
    _leftalign = False

    # reference sequence
    _reference = None

    # regions to include; None includes everything
    _regions = None

    # stateful stuff
    _lineno = -1
    _line = None
    _lines = None

    def __init__(self, _copy=None, reference=None, regions=None,
                 lines=None, leftalign=False):
        # make error identifiers accessible by name
        for id in self._errors.keys():
            self.__dict__[self._errors[id].split(':')[0]] = id
        if _copy != None:
            self._leftalign = _copy._leftalign
            self._header = _copy._header[:]
            self._version = _copy._version
            self._info = copy.deepcopy(_copy._info)
            self._filter = copy.deepcopy(_copy._filter)
            self._format = copy.deepcopy(_copy._format)
            self._samples = _copy._samples[:]
            self._sample2column = copy.deepcopy(_copy._sample2column)
            self._ignored_errors = copy.deepcopy(_copy._ignored_errors)
            self._warn_errors = copy.deepcopy(_copy._warn_errors)
            self._reference = _copy._reference
            self._regions = _copy._regions
        if reference: self._reference = reference
        if regions: self._regions = regions
        if leftalign: self._leftalign = leftalign
        self._lines = lines
        self.encoding = "ascii"
        self.tabixfile = None

    def error(self,line,error,opt=None):
        if error in self._ignored_errors: return
        errorlabel, errorstring = self._errors[error].split(':')
        if opt: errorstring = errorstring % opt
        errwarn = ["Error","Warning"][error in self._warn_errors]
        errorstring += " in line %s: '%s'\n%s %s: %s\n" % (self._lineno,line,errwarn,errorlabel,errorstring)
        if error in self._warn_errors: return
        raise ValueError(errorstring)

    def parse_format(self,line,format,filter=False):
        if self._version == 40:
            if not format.startswith('<'):
                self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
                format = "<"+format
            if not format.endswith('>'):
                self.error(line,self.V40_MISSING_ANGLE_BRACKETS)
                format += ">"
            format = format[1:-1]
        data = {'id':None,'number':None,'type':None,'descr':None}
        idx = 0
        while len(format.strip())>0:
            elts = format.strip().split(',')
            first, rest = elts[0], ','.join(elts[1:])
            if first.find('=') == -1 or (first.find('"')>=0 and first.find('=') > first.find('"')):
                if self._version == 40: self.error(line,self.V40_FORMAT_MUST_HAVE_NAMED_FIELDS)
                if idx == 4: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
                first = ["ID=","Number=","Type=","Description="][idx] + first
            if first.startswith('ID='): data['id'] = first.split('=')[1]
            elif first.startswith('Number='): data['number'] = first.split('=')[1]
            elif first.startswith('Type='): data['type'] = first.split('=')[1]
            elif first.startswith('Description='):
                elts = format.split('"')
                if len(elts)<3:
                    self.error(line,self.FORMAT_MISSING_QUOTES)
                    elts = first.split('=') + [rest]
                data['descr'] = elts[1]
                rest = '"'.join(elts[2:])
                if rest.startswith(','): rest = rest[1:]
            else:
                self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
            format = rest
            idx += 1
            if filter and idx==1: idx=3  # skip number and type fields for FILTER format strings
        if not data['id']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
        if 'descr' not in data:
            # missing description
            self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
            data['descr'] = ""
        if not data['type'] and not data['number']:
            # fine, ##filter format
            return FORMAT(data['id'],self.NT_NUMBER,0,"Flag",data['descr'],'.')
        if not data['type'] in ["Integer","Float","Character","String","Flag"]:
            self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
        # I would like a missing-value field, but it isn't there
        if data['type'] in ['Integer','Float']: data['missing'] = None  # Do NOT use arbitrary int/float as missing value
        else: data['missing'] = '.'
        if not data['number']: self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
        try:
            n = int(data['number'])
            t = self.NT_NUMBER
        except ValueError:
            n = -1
            if data['number'] == '.': t = self.NT_UNKNOWN
            elif data['number'] == '#alleles': t = self.NT_ALLELES
            elif data['number'] == '#nonref_alleles': t = self.NT_NR_ALLELES
            elif data['number'] == '#genotypes': t = self.NT_GENOTYPES
            elif data['number'] == '#phased_genotypes': t = self.NT_PHASED_GENOTYPES
            # abbreviations added in VCF version v4.1
            elif data['number'] == 'A': t = self.NT_ALLELES
            elif data['number'] == 'G': t = self.NT_GENOTYPES
            else:
                self.error(line,self.BADLY_FORMATTED_FORMAT_STRING)
        # if number is 0 - type must be Flag
        if n == 0 and data['type'] != 'Flag':
            self.error( line, self.ZERO_FOR_NON_FLAG_FIELD)
            # force type 'Flag' if no number
            data['type'] = 'Flag'

        return FORMAT(data['id'],t,n,data['type'],data['descr'],data['missing'])

    def format_format( self, fmt, filter=False ):
        values = [('ID',fmt.id)]
        if fmt.number != None and not filter:
            if fmt.numbertype == self.NT_UNKNOWN: nmb = "."
            elif fmt.numbertype == self.NT_NUMBER: nmb = str(fmt.number)
            elif fmt.numbertype == self.NT_ALLELES: nmb = "#alleles"
            elif fmt.numbertype == self.NT_NR_ALLELES: nmb = "#nonref_alleles"
            elif fmt.numbertype == self.NT_GENOTYPES: nmb = "#genotypes"
            elif fmt.numbertype == self.NT_PHASED_GENOTYPES: nmb = "#phased_genotypes"
            else:
                raise ValueError("Unknown number type encountered: %s" % fmt.numbertype)
            values.append( ('Number',nmb) )
        values.append( ('Type', fmt.type) )
        values.append( ('Description', '"' + fmt.description + '"') )
        if self._version == 33:
            format = ",".join([v for k,v in values])
        else:
            format = "<" + (",".join( ["%s=%s" % (k,v) for (k,v) in values] )) + ">"
        return format

    def get_expected(self, format, formatdict, alt):
        fmt = formatdict[format]
        if fmt.numbertype == self.NT_UNKNOWN: return -1
        if fmt.numbertype == self.NT_NUMBER: return fmt.number
        if fmt.numbertype == self.NT_ALLELES: return len(alt)+1
        if fmt.numbertype == self.NT_NR_ALLELES: return len(alt)
        if fmt.numbertype == self.NT_GENOTYPES: return ((len(alt)+1)*(len(alt)+2)) // 2
        if fmt.numbertype == self.NT_PHASED_GENOTYPES: return (len(alt)+1)*(len(alt)+1)
        return 0

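    # Worked example for get_expected() above (illustrative only): at a
    # biallelic site (one ALT allele, so len(alt) == 1), NT_ALLELES expects 2
    # values, NT_NR_ALLELES expects 1, NT_GENOTYPES expects (2*3)//2 == 3
    # values (0/0, 0/1, 1/1), and NT_PHASED_GENOTYPES expects 2*2 == 4 values
    # (0|0, 0|1, 1|0, 1|1); NT_UNKNOWN (-1) means "any number of values".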

    def _add_definition(self, formatdict, key, data, line ):
        if key in formatdict: return
        self.error(line,self.ERROR_UNKNOWN_KEY,key)
        if data == None:
            formatdict[key] = FORMAT(key,self.NT_NUMBER,0,"Flag","(Undefined tag)",".")
            return
        if data == []: data = [""]  # unsure what type -- say string
        if type(data[0]) == type(0.0):
            formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Float","(Undefined tag)",None)
            return
        if type(data[0]) == type(0):
            formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"Integer","(Undefined tag)",None)
            return
        formatdict[key] = FORMAT(key,self.NT_UNKNOWN,-1,"String","(Undefined tag)",".")


    # todo: trim trailing missing values
    def format_formatdata( self, data, format, key=True, value=True, separator=":" ):
        output, sdata = [], []
        if type(data) == type([]): # for FORMAT field, make data with dummy values
            d = {}
            for k in data: d[k] = []
            data = d
        # convert missing values; and silently add definitions if required
        for k in data:
            self._add_definition( format, k, data[k], "(output)" )
            for idx,v in enumerate(data[k]):
                if v == format[k].missingvalue: data[k][idx] = "."
        # make sure GT comes first; and ensure fixed ordering; also convert GT data back to string
        for k in data:
            if k != 'GT': sdata.append( (k,data[k]) )
        sdata.sort()
        if 'GT' in data:
            sdata = [('GT',map(self.convertGTback,data['GT']))] + sdata
        for k,v in sdata:
            if v == []: v = None
            if key and value:
                if v != None: output.append( k+"="+','.join(map(str,v)) )
                else: output.append( k )
            elif key: output.append(k)
            elif value:
                if v != None: output.append( ','.join(map(str,v)) )
                else: output.append( "." )  # should not happen
        # snip off trailing missing data
        while len(output) > 1:
            last = output[-1].replace(',','').replace('.','')
            if len(last)>0: break
            output = output[:-1]
        return separator.join(output)


    def enter_default_format(self):
        for f in [FORMAT('GT',self.NT_NUMBER,1,'String','Genotype','.'),
                  FORMAT('DP',self.NT_NUMBER,1,'Integer','Read depth at this position for this sample',-1),
                  FORMAT('FT',self.NT_NUMBER,1,'String','Sample Genotype Filter','.'),
                  FORMAT('GL',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
                  FORMAT('GLE',self.NT_UNKNOWN,-1,'Float','Genotype likelihoods','.'),
                  FORMAT('GQ',self.NT_NUMBER,1,'Integer','Genotype Quality',-1),
                  FORMAT('PL',self.NT_GENOTYPES,-1,'Integer','Phred-scaled genotype likelihoods', '.'),
                  FORMAT('GP',self.NT_GENOTYPES,-1,'Float','Genotype posterior probabilities','.'),
                  FORMAT('GQ',self.NT_GENOTYPES,-1,'Integer','Conditional genotype quality','.'),
                  FORMAT('HQ',self.NT_UNKNOWN,-1,'Integer','Haplotype Quality',-1),  # unknown number, since may be haploid
                  FORMAT('PS',self.NT_UNKNOWN,-1,'Integer','Phase set','.'),
                  FORMAT('PQ',self.NT_NUMBER,1,'Integer','Phasing quality',-1),
                  FORMAT('EC',self.NT_ALLELES,1,'Integer','Expected alternate allele counts',-1),
                  FORMAT('MQ',self.NT_NUMBER,1,'Integer','RMS mapping quality',-1),
                  ]:
            if f.id not in self._format:
                self._format[f.id] = f

    def parse_header(self, line):

        assert line.startswith('##')
        elts = line[2:].split('=')
        key = elts[0].strip()
        value = '='.join(elts[1:]).strip()
        if key == "fileformat":
            if value == "VCFv3.3":
                self._version = 33
            elif value == "VCFv4.0":
                self._version = 40
            elif value == "VCFv4.1":
                # AH - for testing
                self._version = 40
            elif value == "VCFv4.2":
                # AH - for testing
                self._version = 40
            else:
                self.error(line,self.UNKNOWN_FORMAT_STRING)
        elif key == "INFO":
            f = self.parse_format(line, value)
            self._info[ f.id ] = f
        elif key == "FILTER":
            f = self.parse_format(line, value, filter=True)
            self._filter[ f.id ] = f
        elif key == "FORMAT":
            f = self.parse_format(line, value)
            self._format[ f.id ] = f
        else:
            # keep other keys in the header field
            self._header.append( (key,value) )


    def write_header( self, stream ):
        stream.write("##fileformat=VCFv%s.%s\n" % (self._version // 10, self._version % 10))
        for key,value in self._header: stream.write("##%s=%s\n" % (key,value))
        for var,label in [(self._info,"INFO"),(self._filter,"FILTER"),(self._format,"FORMAT")]:
            for f in var.values(): stream.write("##%s=%s\n" % (label,self.format_format(f,filter=(label=="FILTER"))))


    def parse_heading( self, line ):
        assert line.startswith('#')
        assert not line.startswith('##')
        headings = line[1:].split('\t')
        # test for 8, as FORMAT field might be missing
        if len(headings)==1 and len(line[1:].split()) >= 8:
            self.error(line,self.HEADING_NOT_SEPARATED_BY_TABS)
            headings = line[1:].split()

        for i,s in enumerate(self._required):

            if len(headings)<=i or headings[i] != s:

                if len(headings) <= i:
                    err = "(%sth entry not found)" % (i+1)
                else:
                    err = "(found %s, expected %s)" % (headings[i],s)

                #self.error(line,self.BADLY_FORMATTED_HEADING,err)
                # allow FORMAT column to be absent
                if len(headings) == 8:
                    headings.append("FORMAT")
                else:
                    self.error(line,self.BADLY_FORMATTED_HEADING,err)

        self._samples = headings[9:]
        self._sample2column = dict( [(y,x+9) for x,y in enumerate( self._samples ) ] )

    def write_heading( self, stream ):
        stream.write("#" + "\t".join(self._required + self._samples) + "\n")

    def convertGT(self, GTstring):
        if GTstring == ".": return ["."]
        try:
            gts = gtsRegEx.split(GTstring)
            if len(gts) == 1: return [int(gts[0])]
            if len(gts) != 2: raise ValueError()
            if gts[0] == "." and gts[1] == ".": return [gts[0],GTstring[len(gts[0]):-len(gts[1])],gts[1]]
            return [int(gts[0]),GTstring[len(gts[0]):-len(gts[1])],int(gts[1])]
        except ValueError:
            self.error(self._line,self.BAD_GENOTYPE,GTstring)
            return [".","|","."]

    def convertGTback(self, GTdata):
        return ''.join(map(str,GTdata))

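    # Example for convertGT()/convertGTback() above (illustrative only):
    # convertGT("0/1") returns [0, '/', 1] and convertGT("1|1") returns
    # [1, '|', 1]; convertGTback([0, '/', 1]) turns such a list back into the
    # string "0/1".
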
    def parse_formatdata( self, key, value, formatdict, line ):
        # To do: check that the right number of values is present
        f = formatdict.get(key,None)
        if f == None:
            self._add_definition(formatdict, key, value, line )
            f = formatdict[key]
        if f.type == "Flag":
            if value is not None: self.error(line,self.ERROR_FLAG_HAS_VALUE)
            return []
        values = value.split(',')
        # deal with trailing data in some early VCF files
        if f.type in ["Float","Integer"] and len(values)>0 and values[-1].find(';') > -1:
            self.error(line,self.ERROR_TRAILING_DATA,values[-1])
            values[-1] = values[-1].split(';')[0]
        if f.type == "Integer":
            for idx,v in enumerate(values):
                try:
                    if v == ".": values[idx] = f.missingvalue
                    else: values[idx] = int(v)
                except:
                    self.error(line,self.ERROR_FORMAT_NOT_INTEGER,"%s=%s" % (key, str(values)))
                    return [0] * len(values)
            return values
        elif f.type == "String":
            self._line = line
            if f.id == "GT": values = list(map( self.convertGT, values ))
            return values
        elif f.type == "Character":
            for v in values:
                if len(v) != 1: self.error(line,self.ERROR_FORMAT_NOT_CHAR)
            return values
        elif f.type == "Float":
            for idx,v in enumerate(values):
                if v == ".": values[idx] = f.missingvalue
            try: return list(map(float,values))
            except:
                self.error(line,self.ERROR_FORMAT_NOT_NUMERICAL,"%s=%s" % (key, str(values)))
                return [0.0] * len(values)
        else:
            # can't happen
            self.error(line,self.ERROR_INFO_STRING)

    def inregion(self, chrom, pos):
        if not self._regions: return True
        for r in self._regions:
            if r[0] == chrom and r[1] <= pos < r[2]: return True
        return False

    def parse_data( self, line, lineparse=False ):
        cols = line.split('\t')
        if len(cols) != len(self._samples)+9:
            # gracefully deal with absent FORMAT column
            # and those missing samples
            if len(cols) == 8:
                cols.append("")
            else:
                self.error(line,
                           self.BAD_NUMBER_OF_COLUMNS,
                           "expected %s for %s samples (%s), got %s" % (len(self._samples)+9, len(self._samples), self._samples, len(cols)))

        chrom = cols[0]

        # get 0-based position
        try: pos = int(cols[1])-1
        except: self.error(line,self.POS_NOT_NUMERICAL)
        if pos < 0: self.error(line,self.POS_NOT_POSITIVE)

        # implement filtering
        if not self.inregion(chrom,pos): return None

        # end of first-pass parse for sortedVCF
        if lineparse: return chrom, pos, line

        id = cols[2]

        ref = cols[3].upper()
        if ref == ".":
            self.error(line,self.MISSING_REF)
            if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
            else: ref = ""
        else:
            for c in ref:
                if c not in "ACGTN": self.error(line,self.UNKNOWN_CHAR_IN_REF)
            if "N" in ref: ref = get_sequence(chrom,pos,pos+len(ref),self._reference)

        # make sure reference is sane
        if self._reference:
            left = max(0,pos-100)
            faref_leftflank = get_sequence(chrom,left,pos+len(ref),self._reference)
            faref = faref_leftflank[pos-left:]
            if faref != ref: self.error(line,self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
            ref = faref

        # convert v3.3 to v4.0 alleles below
        if cols[4] == ".": alt = []
        else: alt = cols[4].upper().split(',')

        if cols[5] == ".": qual = -1
        else:
            try: qual = float(cols[5])
            except: self.error(line,self.QUAL_NOT_NUMERICAL)

        # postpone checking that filters exist. Encode missing filter or no filtering as empty list
        if cols[6] == "." or cols[6] == "PASS" or cols[6] == "0": filter = []
        else: filter = cols[6].split(';')

        # dictionary of keys, and list of values
        info = {}
        if cols[7] != ".":
            for blurp in cols[7].split(';'):
                elts = blurp.split('=')
                if len(elts) == 1: v = None
                elif len(elts) == 2: v = elts[1]
                else: self.error(line,self.ERROR_INFO_STRING)
                info[elts[0]] = self.parse_formatdata(elts[0],
                                                      v,
                                                      self._info,
                                                      line)

        # Gracefully deal with absent FORMAT column
        if cols[8] == "": format = []
        else: format = cols[8].split(':')

        # check: all filters are defined
        for f in filter:
            if f not in self._filter: self.error(line,self.FILTER_NOT_DEFINED, f)

        # check: format fields are defined
        if self._format:
            for f in format:
                if f not in self._format: self.error(line,self.FORMAT_NOT_DEFINED, f)

        # convert v3.3 alleles
        if self._version == 33:
            if len(ref) != 1: self.error(line,self.V33_BAD_REF)
            newalts = []
            have_deletions = False
            for a in alt:
                if len(a) == 1: a = a + ref[1:]                       # SNP; add trailing reference
                elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:]  # insertion just beyond pos; add first and trailing reference
                elif a.startswith('D'):  # allow D<seq> and D<num>
                    have_deletions = True
                    try:
                        l = int(a[1:])  # throws ValueError if sequence
                        if len(ref) < l:  # add to reference if necessary
                            addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
                            ref += addns
                            for i,na in enumerate(newalts): newalts[i] = na+addns
                        a = ref[l:]  # new deletion, deleting pos...pos+l
                    except ValueError:
                        s = a[1:]
                        if len(ref) < len(s):  # add Ns to reference if necessary
                            addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
                            if not s.endswith(addns) and addns != 'N'*len(addns):
                                self.error(line,self.V33_UNMATCHED_DELETION,
                                           "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
                            ref += addns
                            for i,na in enumerate(newalts): newalts[i] = na+addns
                        a = ref[len(s):]  # new deletion, deleting from pos
                else:
                    self.error(line,self.V33_BAD_ALLELE)
                newalts.append(a)
            alt = newalts
            # deletion alleles exist, add dummy 1st reference allele, and account for leading base
            if have_deletions:
                if pos == 0:
                    # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
                    addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
                    ref += addn
                    alt = [allele+addn for allele in alt]
                else:
                    addn = get_sequence(chrom,pos-1,pos,self._reference)
                    ref = addn + ref
                    alt = [addn + allele for allele in alt]
                    pos -= 1
        else:
            # format v4.0 -- just check for nucleotides
            for allele in alt:
                if not alleleRegEx.match(allele):
                    self.error(line,self.V40_BAD_ALLELE,allele)

        # check for leading nucleotide in indel calls
        for allele in alt:
            if len(allele) != len(ref):
                if len(allele) == 0: self.error(line,self.ZERO_LENGTH_ALLELE)
                if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
                    self.error(line,self.MISSING_INDEL_ALLELE_REF_BASE)

        # trim trailing bases in alleles
        # AH: not certain why trimming this needs to be added
        #     disabled now for unit testing
        # if alt:
        #     for i in range(1,min(len(ref),min(map(len,alt)))):
        #         if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
        #             break
        #         ref, alt = ref[:-1], [allele[:-1] for allele in alt]

        # left-align alleles, if a reference is available
        if self._leftalign and self._reference:
            while left < pos:
                movable = True
                for allele in alt:
                    if len(allele) > len(ref):
                        longest, shortest = allele, ref
                    else:
                        longest, shortest = ref, allele
                    if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
                        movable = False
                    if longest[-1].upper() != longest[len(shortest)-1].upper():
                        movable = False
                if not movable:
                    break
                ref = ref[:-1]
                alt = [allele[:-1] for allele in alt]
                if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
                    ref = faref_leftflank[pos-left-1] + ref
                    alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
                    pos -= 1

        # parse sample columns
        samples = []
        for sample in cols[9:]:
            dict = {}
            values = sample.split(':')
            if len(values) > len(format):
                self.error(line,self.BAD_NUMBER_OF_VALUES,"(found %s values in element %s; expected %s)" % (len(values),sample,len(format)))
            for idx in range(len(format)):
                expected = self.get_expected(format[idx], self._format, alt)
                if idx < len(values): value = values[idx]
                else:
                    if expected == -1: value = "."
                    else: value = ",".join(["."]*expected)

                dict[format[idx]] = self.parse_formatdata(format[idx],
                                                          value,
                                                          self._format,
                                                          line)
                if expected != -1 and len(dict[format[idx]]) != expected:
                    self.error(line,self.BAD_NUMBER_OF_PARAMETERS,
                               "id=%s, expected %s parameters, got %s" % (format[idx],expected,dict[format[idx]]))
                    if len(dict[format[idx]] ) < expected: dict[format[idx]] += [dict[format[idx]][-1]]*(expected-len(dict[format[idx]]))
                    dict[format[idx]] = dict[format[idx]][:expected]
            samples.append( dict )

        # done
        d = {'chrom':chrom,
             'pos':pos,  # return 0-based position
             'id':id,
             'ref':ref,
             'alt':alt,
             'qual':qual,
             'filter':filter,
             'info':info,
             'format':format}
        for key,value in zip(self._samples,samples):
            d[key] = value

        return d


    def write_data(self, stream, data):
        required = ['chrom','pos','id','ref','alt','qual','filter','info','format'] + self._samples
        for k in required:
            if k not in data: raise ValueError("Required key %s not found in data" % str(k))
        if data['alt'] == []: alt = "."
        else: alt = ",".join(data['alt'])
        if data['filter'] == None: filter = "."
        elif data['filter'] == []:
            if self._version == 33: filter = "0"
            else: filter = "PASS"
        else: filter = ';'.join(data['filter'])
        if data['qual'] == -1: qual = "."
        else: qual = str(data['qual'])

        output = [data['chrom'],
                  str(data['pos']+1),  # change to 1-based position
                  data['id'],
                  data['ref'],
                  alt,
                  qual,
                  filter,
                  self.format_formatdata(
                      data['info'], self._info, separator=";"),
                  self.format_formatdata(
                      data['format'], self._format, value=False)]

        for s in self._samples:
            output.append(self.format_formatdata(
                data[s], self._format, key=False))

        stream.write( "\t".join(output) + "\n" )

    def _parse_header(self, stream):
        self._lineno = 0
        for line in stream:
            line = force_str(line, self.encoding)
            self._lineno += 1
            if line.startswith('##'):
                self.parse_header(line.strip())
            elif line.startswith('#'):
                self.parse_heading(line.strip())
                self.enter_default_format()
            else:
                break
        return line

    def _parse(self, line, stream):
        # deal with files with header only
        if line.startswith("##"): return
        if len(line.strip()) > 0:
            d = self.parse_data( line.strip() )
            if d: yield d
        for line in stream:
            self._lineno += 1
            # PEP 479: returning ends the generator; raising StopIteration here
            # would become a RuntimeError in Python 3.7+
            if self._lines and self._lineno > self._lines: return
            d = self.parse_data( line.strip() )
            if d: yield d

    ######################################################################################################
    #
    # API follows
    #
    ######################################################################################################

    def getsamples(self):
        """ List of samples in VCF file """
        return self._samples

    def setsamples(self,samples):
        """ List of samples in VCF file """
        self._samples = samples

    def getheader(self):
        """ List of header key-value pairs (strings) """
        return self._header

    def setheader(self,header):
        """ List of header key-value pairs (strings) """
        self._header = header

    def getinfo(self):
        """ Dictionary of ##INFO tags, as VCF.FORMAT values """
        return self._info

    def setinfo(self,info):
        """ Dictionary of ##INFO tags, as VCF.FORMAT values """
        self._info = info

    def getformat(self):
        """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
        return self._format

    def setformat(self,format):
        """ Dictionary of ##FORMAT tags, as VCF.FORMAT values """
        self._format = format

    def getfilter(self):
        """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
        return self._filter

    def setfilter(self,filter):
        """ Dictionary of ##FILTER tags, as VCF.FORMAT values """
        self._filter = filter

    def setversion(self, version):
        if version != 33 and version != 40: raise ValueError("Can only handle v3.3 and v4.0 VCF files")
        self._version = version

    def setregions(self, regions):
        self._regions = regions

    def setreference(self, ref):
        """ Provide a reference sequence; a Python class supporting a fetch(chromosome, start, end) method, e.g. PySam.FastaFile """
        self._reference = ref

    def ignoreerror(self, errorstring):
        try: self._ignored_errors.add(self.__dict__[errorstring])
        except KeyError: raise ValueError("Invalid error string: %s" % errorstring)

    def warnerror(self, errorstring):
        try: self._warn_errors.add(self.__dict__[errorstring])
        except KeyError: raise ValueError("Invalid error string: %s" % errorstring)

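    # Example for ignoreerror()/warnerror() above (illustrative only): error
    # identifiers are exposed as attributes named after the labels in _errors,
    # so individual checks can be downgraded or silenced by label, e.g.
    #
    #   vcf = VCF()
    #   vcf.warnerror("FILTER_NOT_DEFINED")     # report, but keep parsing
    #   vcf.ignoreerror("FORMAT_NOT_DEFINED")   # silently accept undeclared FORMAT keys
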
    def parse(self, stream):
        """ Parse a stream of VCF-formatted lines. Initializes class instance and return generator """
        last_line = self._parse_header(stream)
        # now return a generator that does the actual work. In this way the pre-processing is done
        # before the first piece of data is yielded
        return self._parse(last_line, stream)

    def write(self, stream, datagenerator):
        """ Writes a VCF file to a stream, using a data generator (or list) """
        self.write_header(stream)
        self.write_heading(stream)
        for data in datagenerator: self.write_data(stream,data)

    def writeheader(self, stream):
        """ Writes a VCF header """
        self.write_header(stream)
        self.write_heading(stream)

    def compare_calls(self, pos1, ref1, alt1, pos2, ref2, alt2):
        """ Utility function: compares two calls for equality """
        # a variant should always be assigned to a unique position, one base before
        # the leftmost position of the alignment gap. If this rule is implemented
        # correctly, the two positions must be equal for the calls to be identical.
        if pos1 != pos2: return False
        # from both calls, trim rightmost bases when identical. Do this safely, i.e.
        # only when the reference bases are not Ns
        while len(ref1)>0 and len(alt1)>0 and ref1[-1] == alt1[-1]:
            ref1 = ref1[:-1]
            alt1 = alt1[:-1]
        while len(ref2)>0 and len(alt2)>0 and ref2[-1] == alt2[-1]:
            ref2 = ref2[:-1]
            alt2 = alt2[:-1]
        # now, the alternative alleles must be identical
        return alt1 == alt2

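    # Worked example for compare_calls() above (illustrative only): the same
    # 'T' insertion written as ref="A", alt="AT" and as ref="AC", alt="ATC" at
    # the same 0-based position compares equal, because trimming the shared
    # trailing base from the second pair leaves "A"/"AT" in both cases:
    #
    #   vcf.compare_calls(1000, "A", "AT", 1000, "AC", "ATC")   # -> True
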
    ###########################################################################################################
    ###########################################################################################################
    ## API functions added by Andreas
    ###########################################################################################################

    def connect(self, filename, encoding="ascii"):
        '''connect to tabix file.'''
        self.encoding = encoding
        self.tabixfile = pysam.Tabixfile(filename, encoding=encoding)
        self._parse_header(self.tabixfile.header)

    def __del__(self):
        self.close()
        self.tabixfile = None

    def close(self):
        if self.tabixfile:
            self.tabixfile.close()
            self.tabixfile = None

    def fetch(self,
              reference=None,
              start=None,
              end=None,
              region=None ):
1066 """ Parse a stream of VCF-formatted lines. | |
1067 Initializes class instance and return generator """ | |
        return self.tabixfile.fetch(
            reference,
            start,
            end,
            region,
            parser = asVCFRecord(self))

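    # Example for connect()/fetch() above (a minimal sketch; the file name
    # "calls.vcf.gz" is an assumption, and the file must be bgzip-compressed
    # and tabix-indexed so that pysam.Tabixfile can open it):
    #
    #   vcf = VCF()
    #   vcf.connect("calls.vcf.gz")
    #   for record in vcf.fetch("chr1", 999, 2000):   # 0-based half-open window
    #       print(record.contig, record.pos, record.ref, record.alt)
    #   vcf.close()
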
    def validate(self, record):
        '''validate vcf record.

        returns a validated record.
        '''

        raise NotImplementedError("needs to be checked")

        chrom, pos = record.chrom, record.pos

        # check reference
        ref = record.ref
        if ref == ".":
            self.error(str(record),self.MISSING_REF)
            if self._version == 33: ref = get_sequence(chrom,pos,pos+1,self._reference)
            else: ref = ""
        else:
            for c in ref:
                if c not in "ACGTN": self.error(str(record),self.UNKNOWN_CHAR_IN_REF)
            if "N" in ref: ref = get_sequence(chrom,
                                              pos,
                                              pos+len(ref),
                                              self._reference)

        # make sure reference is sane
        if self._reference:
            left = max(0,self.pos-100)
            faref_leftflank = get_sequence(chrom,left,self.pos+len(ref),self._reference)
            faref = faref_leftflank[pos-left:]
            if faref != ref: self.error(str(record),self.WRONG_REF,"(reference is %s, VCF says %s)" % (faref,ref))
            ref = faref

        # check: format fields are defined
        for f in record.format:
            if f not in self._format: self.error(str(record),self.FORMAT_NOT_DEFINED, f)

        # check: all filters are defined
        for f in record.filter:
            if f not in self._filter: self.error(str(record),self.FILTER_NOT_DEFINED, f)

        # convert v3.3 alleles
        if self._version == 33:
            if len(ref) != 1: self.error(str(record),self.V33_BAD_REF)
            newalts = []
            have_deletions = False
            for a in alt:
                if len(a) == 1: a = a + ref[1:]                       # SNP; add trailing reference
                elif a.startswith('I'): a = ref[0] + a[1:] + ref[1:]  # insertion just beyond pos; add first and trailing reference
                elif a.startswith('D'):  # allow D<seq> and D<num>
                    have_deletions = True
                    try:
                        l = int(a[1:])  # throws ValueError if sequence
                        if len(ref) < l:  # add to reference if necessary
                            addns = get_sequence(chrom,pos+len(ref),pos+l,self._reference)
                            ref += addns
                            for i,na in enumerate(newalts): newalts[i] = na+addns
                        a = ref[l:]  # new deletion, deleting pos...pos+l
                    except ValueError:
                        s = a[1:]
                        if len(ref) < len(s):  # add Ns to reference if necessary
                            addns = get_sequence(chrom,pos+len(ref),pos+len(s),self._reference)
                            if not s.endswith(addns) and addns != 'N'*len(addns):
                                self.error(str(record),self.V33_UNMATCHED_DELETION,
                                           "(deletion is %s, reference is %s)" % (a,get_sequence(chrom,pos,pos+len(s),self._reference)))
                            ref += addns
                            for i,na in enumerate(newalts): newalts[i] = na+addns
                        a = ref[len(s):]  # new deletion, deleting from pos
                else:
                    self.error(str(record),self.V33_BAD_ALLELE)
                newalts.append(a)
            alt = newalts
            # deletion alleles exist, add dummy 1st reference allele, and account for leading base
            if have_deletions:
                if pos == 0:
                    # Petr Danacek's: we can't have a leading nucleotide at (1-based) position 1
                    addn = get_sequence(chrom,pos+len(ref),pos+len(ref)+1,self._reference)
                    ref += addn
                    alt = [allele+addn for allele in alt]
                else:
                    addn = get_sequence(chrom,pos-1,pos,self._reference)
                    ref = addn + ref
                    alt = [addn + allele for allele in alt]
                    pos -= 1
        else:
            # format v4.0 -- just check for nucleotides
            for allele in alt:
                if not alleleRegEx.match(allele):
                    self.error(str(record),self.V40_BAD_ALLELE,allele)


        # check for leading nucleotide in indel calls
        for allele in alt:
            if len(allele) != len(ref):
                if len(allele) == 0: self.error(str(record),self.ZERO_LENGTH_ALLELE)
                if ref[0].upper() != allele[0].upper() and "N" not in (ref[0]+allele[0]).upper():
                    self.error(str(record),self.MISSING_INDEL_ALLELE_REF_BASE)

        # trim trailing bases in alleles
        # AH: not certain why trimming this needs to be added
        #     disabled now for unit testing
        # for i in range(1,min(len(ref),min(map(len,alt)))):
        #     if len(set(allele[-1].upper() for allele in alt)) > 1 or ref[-1].upper() != alt[0][-1].upper():
        #         break
        #     ref, alt = ref[:-1], [allele[:-1] for allele in alt]

        # left-align alleles, if a reference is available
        if self._leftalign and self._reference:
            while left < pos:
                movable = True
                for allele in alt:
                    if len(allele) > len(ref):
                        longest, shortest = allele, ref
                    else:
                        longest, shortest = ref, allele
                    if len(longest) == len(shortest) or longest[:len(shortest)].upper() != shortest.upper():
                        movable = False
                    if longest[-1].upper() != longest[len(shortest)-1].upper():
                        movable = False
                if not movable:
                    break
                ref = ref[:-1]
                alt = [allele[:-1] for allele in alt]
                if min([len(allele) for allele in alt]) == 0 or len(ref) == 0:
                    ref = faref_leftflank[pos-left-1] + ref
                    alt = [faref_leftflank[pos-left-1] + allele for allele in alt]
                    pos -= 1

__all__ = [
    "VCF", "VCFRecord", ]