comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pytools/lib/readSeq.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 #!/usr/bin/env python
2
3 import sys
4 import math
5 import random
6 import fileinput
7 import re
8 #import os
9 import copy
10
11 class readSeq:
12
13 def __init__(self, fileName, paired = False, fileType = 'fastq', compression = 'auto', sampleRate = None):
14 self.fileName = fileName
15 self.fileType = fileType
16 self.fileType = self.__detectFileFormat()
17 self.compression = compression
18 self.__readObj = self.readFile()
19 self.paired = paired
20 self.sampleRate = sampleRate
21
22 def __iter__(self):
23 return self
24
25 def next(self):
26 record = self.__readObj.next()
27 self.header = record['header']
28 self.seq = record['seq']
29 self.seqNum = record['seqNum']
30 if self.fileType == 'fastq':
31 self.qual = record['qual']
32
33 if self.paired is True:
34 self2 = copy.copy(self)
35 record = self.__readObj.next()
36 self2.header = record['header']
37 self2.seq = record['seq']
38 self2.seqNum = record['seqNum']
39 if self.fileType == 'fastq':
40 self2.qual = record['qual']
41 return self, self2
42 else:
43 return self
44
45 def __detectFileFormat(self):
46 if self.fileType != 'auto':
47 return self.fileType
48
49 if ( re.search('fasta$', self.fileName) or
50 re.search('fa$', self.fileName) or
51 re.search('fna$', self.fileName) ):
52 self.fileType = 'fasta'
53
54 elif ( re.search('fastq$', self.fileName) or
55 re.search('fq$', self.fileName) ) :
56 self.fileType = 'fastq'
57
58 return self.fileType
59
60 def readFile(self):
61
62 record = dict()
63
64 # allow user to specify if compression should be auto detected using file names
65 # specified by fileinput documentation
66 if self.compression == 'auto':
67 inFH = fileinput.FileInput(self.fileName, openhook=fileinput.hook_compressed)
68 else:
69 inFH = fileinput.FileInput(self.fileName)
70
71 #TODO RAISE exception if file doesn't exist
72
73 # read fastq
74 if self.fileType == 'fastq':
75 self.seqNum = 0
76 for record['header'] in inFH:
77
78 record['seqNum'] = int(math.ceil(inFH.lineno()/8.0))
79 record['seq'] = inFH.readline().strip()
80 record['r1Plus'] = inFH.readline()
81 record['qual'] = inFH.readline().strip()
82
83 if not re.search('^@', record['header'], flags=0):
84 pass
85 #TODO add exception for not being a fastq
86
87 record['header'] = re.sub('^@', '', record['header'].strip())
88 yield record
89
90 elif self.fileType == 'fasta':
91 record['header'] = None
92 record['seq'] = None
93 record['seqNum'] = 0
94
95 for line in inFH:
96
97 line = line.strip()
98 if re.search('^>', line):
99 line = re.sub('^>', '', line)
100 if record['header'] is None:
101 record['header'] = line
102 if not re.search('>', record['header']):
103 pass
104 # TODO add exception for not being fasta
105
106 if record['seq'] is not None:
107 record['seqNum'] = record['seqNum'] + 1
108
109 if self.sampleRate:
110 #subsampling of reads is desired
111 if random.random() < self.sampleRate:
112 yield record
113 else:
114 yield record
115 record['seq'] = None
116
117 record['header'] = line
118 else:
119 if record['seq'] is not None:
120 record['seq'] += line
121 else:
122 record['seq'] = line
123
124 record['seqNum'] = record['seqNum'] + 1
125
126 if self.sampleRate:
127 #subsampling of reads is desired
128 if random.random() < self.sampleRate:
129 yield record
130 else:
131 yield record
132
133 try:
134 inFH = fileinput.close()
135 except:
136 pass
137
138 #except:
139 sys.exc_info()[0]
140
141 def printRecord(self):
142 if self.fileType == 'fastq':
143 print("@%s" % self.header)
144 print(self.seq)
145 print("+")
146 print(self.qual)
147 elif self.fileType == 'fasta':
148 print(">%s" % self.header)
149 print(self.seq)
150
151 if __name__ == '__main__':
152
153 for inputfile in sys.argv[1:]:
154 #get a pair of reads
155 for r1, r2 in readSeq(inputfile, paired=True):
156 print r1.header
157 print r2.header