Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pytools/lib/readSeq.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import math | |
5 import random | |
6 import fileinput | |
7 import re | |
8 #import os | |
9 import copy | |
10 | |
11 class readSeq: | |
12 | |
13 def __init__(self, fileName, paired = False, fileType = 'fastq', compression = 'auto', sampleRate = None): | |
14 self.fileName = fileName | |
15 self.fileType = fileType | |
16 self.fileType = self.__detectFileFormat() | |
17 self.compression = compression | |
18 self.__readObj = self.readFile() | |
19 self.paired = paired | |
20 self.sampleRate = sampleRate | |
21 | |
22 def __iter__(self): | |
23 return self | |
24 | |
25 def next(self): | |
26 record = self.__readObj.next() | |
27 self.header = record['header'] | |
28 self.seq = record['seq'] | |
29 self.seqNum = record['seqNum'] | |
30 if self.fileType == 'fastq': | |
31 self.qual = record['qual'] | |
32 | |
33 if self.paired is True: | |
34 self2 = copy.copy(self) | |
35 record = self.__readObj.next() | |
36 self2.header = record['header'] | |
37 self2.seq = record['seq'] | |
38 self2.seqNum = record['seqNum'] | |
39 if self.fileType == 'fastq': | |
40 self2.qual = record['qual'] | |
41 return self, self2 | |
42 else: | |
43 return self | |
44 | |
45 def __detectFileFormat(self): | |
46 if self.fileType != 'auto': | |
47 return self.fileType | |
48 | |
49 if ( re.search('fasta$', self.fileName) or | |
50 re.search('fa$', self.fileName) or | |
51 re.search('fna$', self.fileName) ): | |
52 self.fileType = 'fasta' | |
53 | |
54 elif ( re.search('fastq$', self.fileName) or | |
55 re.search('fq$', self.fileName) ) : | |
56 self.fileType = 'fastq' | |
57 | |
58 return self.fileType | |
59 | |
60 def readFile(self): | |
61 | |
62 record = dict() | |
63 | |
64 # allow user to specify if compression should be auto detected using file names | |
65 # specified by fileinput documentation | |
66 if self.compression == 'auto': | |
67 inFH = fileinput.FileInput(self.fileName, openhook=fileinput.hook_compressed) | |
68 else: | |
69 inFH = fileinput.FileInput(self.fileName) | |
70 | |
71 #TODO RAISE exception if file doesn't exist | |
72 | |
73 # read fastq | |
74 if self.fileType == 'fastq': | |
75 self.seqNum = 0 | |
76 for record['header'] in inFH: | |
77 | |
78 record['seqNum'] = int(math.ceil(inFH.lineno()/8.0)) | |
79 record['seq'] = inFH.readline().strip() | |
80 record['r1Plus'] = inFH.readline() | |
81 record['qual'] = inFH.readline().strip() | |
82 | |
83 if not re.search('^@', record['header'], flags=0): | |
84 pass | |
85 #TODO add exception for not being a fastq | |
86 | |
87 record['header'] = re.sub('^@', '', record['header'].strip()) | |
88 yield record | |
89 | |
90 elif self.fileType == 'fasta': | |
91 record['header'] = None | |
92 record['seq'] = None | |
93 record['seqNum'] = 0 | |
94 | |
95 for line in inFH: | |
96 | |
97 line = line.strip() | |
98 if re.search('^>', line): | |
99 line = re.sub('^>', '', line) | |
100 if record['header'] is None: | |
101 record['header'] = line | |
102 if not re.search('>', record['header']): | |
103 pass | |
104 # TODO add exception for not being fasta | |
105 | |
106 if record['seq'] is not None: | |
107 record['seqNum'] = record['seqNum'] + 1 | |
108 | |
109 if self.sampleRate: | |
110 #subsampling of reads is desired | |
111 if random.random() < self.sampleRate: | |
112 yield record | |
113 else: | |
114 yield record | |
115 record['seq'] = None | |
116 | |
117 record['header'] = line | |
118 else: | |
119 if record['seq'] is not None: | |
120 record['seq'] += line | |
121 else: | |
122 record['seq'] = line | |
123 | |
124 record['seqNum'] = record['seqNum'] + 1 | |
125 | |
126 if self.sampleRate: | |
127 #subsampling of reads is desired | |
128 if random.random() < self.sampleRate: | |
129 yield record | |
130 else: | |
131 yield record | |
132 | |
133 try: | |
134 inFH = fileinput.close() | |
135 except: | |
136 pass | |
137 | |
138 #except: | |
139 sys.exc_info()[0] | |
140 | |
141 def printRecord(self): | |
142 if self.fileType == 'fastq': | |
143 print("@%s" % self.header) | |
144 print(self.seq) | |
145 print("+") | |
146 print(self.qual) | |
147 elif self.fileType == 'fasta': | |
148 print(">%s" % self.header) | |
149 print(self.seq) | |
150 | |
151 if __name__ == '__main__': | |
152 | |
153 for inputfile in sys.argv[1:]: | |
154 #get a pair of reads | |
155 for r1, r2 in readSeq(inputfile, paired=True): | |
156 print r1.header | |
157 print r2.header |