Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/ProkObject.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package prok; | |
2 | |
3 import java.io.File; | |
4 | |
5 import dna.AminoAcid; | |
6 import dna.Data; | |
7 import fileIO.FileFormat; | |
8 import fileIO.ReadWrite; | |
9 import shared.Parse; | |
10 import shared.Tools; | |
11 import stream.ConcurrentReadInputStream; | |
12 import stream.Read; | |
13 import stream.ReadInputStream; | |
14 import structures.ListNum; | |
15 import structures.LongHashSet; | |
16 | |
17 /** Contains a lot of statics and static methods for gene-calling */ | |
18 public abstract class ProkObject { | |
19 | |
20 public static boolean parse(String arg, String a, String b){ | |
21 if(a.equalsIgnoreCase("16sstartslop") || a.equalsIgnoreCase("ssustartslop")){ | |
22 ssuStartSlop=Integer.parseInt(b); | |
23 }else if(a.equalsIgnoreCase("23sstartslop") || a.equalsIgnoreCase("lsustartslop")){ | |
24 lsuStartSlop=Integer.parseInt(b); | |
25 }else if(a.equalsIgnoreCase("5sstartslop")){ | |
26 r5SStartSlop=Integer.parseInt(b); | |
27 }else if(a.equalsIgnoreCase("16sstopslop") || a.equalsIgnoreCase("ssustopslop")){ | |
28 ssuStopSlop=Integer.parseInt(b); | |
29 }else if(a.equalsIgnoreCase("23sstopslop") || a.equalsIgnoreCase("lsustopslop")){ | |
30 lsuStopSlop=Integer.parseInt(b); | |
31 }else if(a.equalsIgnoreCase("5sstopslop")){ | |
32 r5SStopSlop=Integer.parseInt(b); | |
33 }else if(a.equals("plus")){ | |
34 PROCESS_PLUS_STRAND=Parse.parseBoolean(b); | |
35 }else if(a.equals("minus")){ | |
36 PROCESS_MINUS_STRAND=Parse.parseBoolean(b); | |
37 } | |
38 | |
39 else if(a.equalsIgnoreCase("min16SIdentity") || a.equalsIgnoreCase("min16SId")) { | |
40 min16SIdentity=Float.parseFloat(b); | |
41 }else if(a.equalsIgnoreCase("min18SIdentity") || a.equalsIgnoreCase("min18SId")) { | |
42 min18SIdentity=Float.parseFloat(b); | |
43 }else if(a.equalsIgnoreCase("min23SIdentity") || a.equalsIgnoreCase("min23SId")) { | |
44 min23SIdentity=Float.parseFloat(b); | |
45 }else if(a.equalsIgnoreCase("min5SIdentity") || a.equalsIgnoreCase("min5SId")) { | |
46 min5SIdentity=Float.parseFloat(b); | |
47 } | |
48 | |
49 else if(a.equalsIgnoreCase("align16s") || a.equalsIgnoreCase("load16SSequence")){ | |
50 load16SSequence=Parse.parseBoolean(b); | |
51 }else if(a.equalsIgnoreCase("align23s") || a.equalsIgnoreCase("load23SSequence")){ | |
52 load23SSequence=Parse.parseBoolean(b); | |
53 }else if(a.equalsIgnoreCase("align18s") || a.equalsIgnoreCase("load18SSequence")){ | |
54 load18SSequence=Parse.parseBoolean(b); | |
55 }else if(a.equalsIgnoreCase("align5s") || a.equalsIgnoreCase("load5SSequence")){ | |
56 load5SSequence=Parse.parseBoolean(b); | |
57 } | |
58 | |
59 else if(a.equalsIgnoreCase("load16skmers") || a.equalsIgnoreCase("load18skmers") || a.equalsIgnoreCase("loadssukmers")){ | |
60 loadSSUkmers=Parse.parseBoolean(b); | |
61 }else if(a.equalsIgnoreCase("load23skmers") || a.equalsIgnoreCase("load28skmers") || a.equalsIgnoreCase("loadlsukmers")){ | |
62 loadLSUkmers=Parse.parseBoolean(b); | |
63 }else if(a.equalsIgnoreCase("load5skmers")){ | |
64 load5Skmers=Parse.parseBoolean(b); | |
65 }else if(a.equalsIgnoreCase("loadtrnakmers")){ | |
66 loadtRNAkmers=Parse.parseBoolean(b); | |
67 }else if(a.equalsIgnoreCase("klongtrna")){ | |
68 kLongTRna=Integer.parseInt(b); | |
69 }else if(a.equalsIgnoreCase("longkmers")){ | |
70 loadSSUkmers=loadLSUkmers=load5Skmers=loadtRNAkmers=Parse.parseBoolean(b); | |
71 }else if(a.equalsIgnoreCase("klong5s")){ | |
72 kLong5S=Integer.parseInt(b); | |
73 }else if(a.equalsIgnoreCase("klong16s") || a.equalsIgnoreCase("klong18s") || a.equalsIgnoreCase("klongssu")){ | |
74 kLongSSU=Integer.parseInt(b); | |
75 }else if(a.equalsIgnoreCase("klong23s") || a.equalsIgnoreCase("klong28s") || a.equalsIgnoreCase("klonglsu")){ | |
76 kLongLSU=Integer.parseInt(b); | |
77 }else if(a.equalsIgnoreCase("klongtrna")){ | |
78 kLongTRna=Integer.parseInt(b); | |
79 } | |
80 | |
81 else{ | |
82 return false; | |
83 } | |
84 return true; | |
85 } | |
86 | |
87 /*--------------------------------------------------------------*/ | |
88 | |
89 public static boolean processType(int type){ | |
90 return (type==CDS ? callCDS : type==r16S ? call16S : type==r23S ? call23S : type==r18S ? call18S : type==r5S ? call5S : type==tRNA ? calltRNA : true); | |
91 } | |
92 | |
93 public static int startSlop(int type) { | |
94 int slop=(type==r16S ? ssuStartSlop : type==r23S ? lsuStartSlop : type==r18S ? ssuStartSlop : type==r5S ? r5SStartSlop : 9999); | |
95 return slop; | |
96 } | |
97 | |
98 public static int stopSlop(int type) { | |
99 int slop=(type==r16S ? ssuStopSlop : type==r23S ? lsuStopSlop : type==r18S ? ssuStopSlop : type==r5S ? r5SStopSlop : 9999); | |
100 return slop; | |
101 } | |
102 | |
103 public static float minID(int type) { | |
104 float minIdentity=(type==r16S ? min16SIdentity : type==r23S ? min23SIdentity : type==r18S ? min18SIdentity : type==r5S ? min5SIdentity : 0); | |
105 return minIdentity; | |
106 } | |
107 | |
108 public static Read[] consensusReads(int type) { | |
109 Read[] consensusReads=(type==r16S ? r16SSequence : type==r23S ? r23SSequence : type==r18S ? r18SSequence : type==r5S ? r5SSequence : null); | |
110 return consensusReads; | |
111 } | |
112 | |
113 public static LongHashSet kmerSet(int type) { | |
114 LongHashSet set=(type==tRNA ? trnaKmers : type==r16S ? ssuKmers : type==r23S ? lsuKmers : type==r5S ? r5SKmers : type==r18S ? ssuKmers : null); | |
115 return set; | |
116 } | |
117 | |
118 public static int kLongLen(int type) { | |
119 int kLongLen=(type==tRNA ? kLongTRna : type==r16S ? kLongSSU : type==r23S ? kLongLSU : type==r5S ? kLong5S : type==r18S ? kLongSSU : -1); | |
120 return kLongLen; | |
121 } | |
122 | |
123 public static int flagToType(int flag) { | |
124 return Integer.numberOfTrailingZeros(flag)+1; | |
125 } | |
126 | |
127 public static byte typeToFlag(int type) { | |
128 assert(type<=6); | |
129 return (byte)(1<<(type-1)); | |
130 } | |
131 | |
132 public static boolean callType(int type){//TODO: Turn these functions into array lookups | |
133 if(type==CDS){return callCDS;} | |
134 else if(type==tRNA){return calltRNA;} | |
135 else if(type==r16S){return call16S;} | |
136 else if(type==r23S){return call23S;} | |
137 else if(type==r5S){return call5S;} | |
138 else if(type==r18S){return call18S;} | |
139 assert(false) : type; | |
140 return false; | |
141 } | |
142 | |
143 /*--------------------------------------------------------------*/ | |
144 /*---------------- Long Kmers ----------------*/ | |
145 /*--------------------------------------------------------------*/ | |
146 | |
147 public static synchronized void loadLongKmers(){ | |
148 // assert(ssuKmers==null); | |
149 // assert(false) : load5Skmers+", "+kLong5s; | |
150 if(loadedLongKmers){return;} | |
151 if(loadSSUkmers){ssuKmers=loadLongKmersByType(kLongSSU, "ssu");} | |
152 if(loadLSUkmers){lsuKmers=loadLongKmersByType(kLongLSU, "lsu");} | |
153 if(load5Skmers){r5SKmers=loadLongKmersByType(kLong5S, "5S");} | |
154 if(loadtRNAkmers){trnaKmers=loadLongKmersByType(kLongTRna, "tRNA");} | |
155 loadedLongKmers=true; | |
156 } | |
157 | |
158 // private static LongHashSet loadLongKmers(StatsContainer sc, int k, String prefix){ | |
159 // String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa"); | |
160 // if(!new File(fname).exists()){ | |
161 // fname=fname+".gz"; | |
162 // if(!new File(fname).exists()){ | |
163 // System.err.println("Can't find "+fname); | |
164 // return null; | |
165 // } | |
166 // } | |
167 // LongHashSet set=loadLongKmers(fname, k); | |
168 // sc.kmerSet=set; | |
169 // sc.kLongLen=k; | |
170 // return set; | |
171 // } | |
172 | |
173 private static LongHashSet loadLongKmersByType(int k, String prefix){ | |
174 String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa", true); | |
175 if(!new File(fname).exists()){ | |
176 fname=fname+".gz"; | |
177 if(!new File(fname).exists()){ | |
178 System.err.println("Can't find "+fname); | |
179 return null; | |
180 } | |
181 } | |
182 LongHashSet set=loadLongKmers(fname, k); | |
183 return set; | |
184 } | |
185 | |
186 private static LongHashSet loadLongKmers(String fname, int k){//TODO: Consider making this a LongHashSet. No reason not to... | |
187 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false); | |
188 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null); | |
189 cris.start(); //Start the stream | |
190 // if(verbose){outstream.println("Started cris");} | |
191 | |
192 LongHashSet set=new LongHashSet(1000); | |
193 ListNum<Read> ln=cris.nextList(); | |
194 while(ln!=null && ln.size()>0){ | |
195 processList(ln, set, k); | |
196 cris.returnList(ln); | |
197 ln=cris.nextList(); | |
198 } | |
199 if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} | |
200 ReadWrite.closeStream(cris); | |
201 return set; | |
202 } | |
203 | |
204 private static LongHashSet processList(ListNum<Read> ln, LongHashSet set, int k){ | |
205 final long mask=~((-1L)<<(2*k)); | |
206 for(Read r : ln){ | |
207 final byte[] bases=r.bases; | |
208 long kmer=0; | |
209 int len=0; | |
210 for(byte b : bases){ | |
211 final int num=AminoAcid.baseToNumber[b]; | |
212 if(num>=0){ | |
213 len++; | |
214 kmer=((kmer<<2)|num)&mask; | |
215 if(len>=k){ | |
216 set.add(kmer); | |
217 } | |
218 }else{ | |
219 len=0; | |
220 } | |
221 } | |
222 } | |
223 return set; | |
224 } | |
225 | |
226 /*--------------------------------------------------------------*/ | |
227 /*---------------- Consensus Sequence ----------------*/ | |
228 /*--------------------------------------------------------------*/ | |
229 | |
230 public static synchronized void loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro){ | |
231 if(loadedConsensusSequence){return;} | |
232 // assert(r16SSequence==null); | |
233 if(load16SSequence){r16SSequence=loadConsensusSequenceType("16S", removeMito, removeChloro);} | |
234 if(load18SSequence){r18SSequence=loadConsensusSequenceType("18S", removeMito, removeChloro);} | |
235 if(load23SSequence){r23SSequence=loadConsensusSequenceType("23S", removeMito, removeChloro);} | |
236 if(load5SSequence){r5SSequence=loadConsensusSequenceType("5S", removeMito, removeChloro);} | |
237 if(loadtRNASequence){trnaSequence=loadConsensusSequenceType("tRNA", removeMito, removeChloro);} | |
238 loadedConsensusSequence=true; | |
239 } | |
240 | |
241 public static Read[] loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro){ | |
242 String fname=null; | |
243 fname=Data.findPath("?"+prefix+"_consensus_sequence.fq", false); | |
244 if(fname!=null && (fname.endsWith(".jar") || new File(fname).exists())){ | |
245 fname=Tools.fixExtension(fname); | |
246 }else{ | |
247 fname=Data.findPath("?"+prefix+"_consensus_sequence.fa", true); | |
248 fname=Tools.fixExtension(fname); | |
249 if(!fname.endsWith(".jar") && !new File(fname).exists()){ | |
250 System.err.println("Can't find "+fname); | |
251 return null; | |
252 } | |
253 } | |
254 Read[] array=loadConsensusSequence(fname); | |
255 if(removeMito){array=stripOrganelle(array, "mito");} | |
256 if(removeChloro){array=stripOrganelle(array, "plastid");} | |
257 return array; | |
258 } | |
259 | |
260 private static Read[] loadConsensusSequence(String fname){ | |
261 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false); | |
262 Read[] array=ReadInputStream.toReadArray(ff, -1); | |
263 return array; | |
264 } | |
265 | |
266 private static Read[] stripOrganelle(Read[] array, String key){ | |
267 int removed=0; | |
268 for(int j=0; j<array.length; j++){ | |
269 if(array[j].id.toLowerCase().startsWith(key)) { | |
270 array[j]=null; | |
271 removed++; | |
272 } | |
273 } | |
274 if(removed>0){array=Tools.condenseStrict(array);} | |
275 return array; | |
276 } | |
277 | |
278 /*--------------------------------------------------------------*/ | |
279 | |
280 public static final int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; | |
281 public static String[] typeStrings=new String[] {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; | |
282 public static String[] typeStrings2=new String[] {"CDS", "tRNA", "rRNA", "rRNA", "rRNA", "rRNA", "rRNA", "RNA"}; | |
283 public static String[] specialTypeStrings=new String[] {null, "tRNA", "16S", "23S", "5S", "18S", "28S", null}; | |
284 public static boolean isSpecialType(String type){ | |
285 if(type==null){return false;} | |
286 for(String s : specialTypeStrings){ | |
287 if(type.equalsIgnoreCase(s)){return true;} | |
288 } | |
289 return false; | |
290 } | |
291 | |
292 public static int kInnerRNA=6; | |
293 public static int kStartRNA=3; | |
294 public static int kStopRNA=3; | |
295 | |
296 public static int kLongSSU=15; | |
297 public static int kLongLSU=15; | |
298 public static int kLong5S=15; | |
299 public static int kLongTRna=15; | |
300 | |
301 public static float min16SIdentity=0.62f; | |
302 public static float min23SIdentity=0.60f; | |
303 public static float min5SIdentity=0.60f; | |
304 public static float min18SIdentity=0.60f; | |
305 | |
306 static int ssuStartSlop=200; | |
307 static int ssuStopSlop=0; | |
308 static int lsuStartSlop=220; | |
309 static int lsuStopSlop=0; | |
310 static int r5SStartSlop=50; | |
311 static int r5SStopSlop=50; | |
312 | |
313 public static boolean callCDS=true; | |
314 public static boolean calltRNA=true; | |
315 public static boolean call16S=true; | |
316 public static boolean call23S=true; | |
317 public static boolean call5S=true; | |
318 public static boolean call18S=false; | |
319 | |
320 public static LongHashSet ssuKmers=null; | |
321 public static LongHashSet lsuKmers=null; | |
322 public static LongHashSet r5SKmers=null; | |
323 public static LongHashSet trnaKmers=null; | |
324 | |
325 public static Read[] trnaSequence=null; | |
326 public static Read[] r16SSequence=null; | |
327 public static Read[] r23SSequence=null; | |
328 public static Read[] r5SSequence=null; | |
329 public static Read[] r18SSequence=null; | |
330 | |
331 public static boolean PROCESS_PLUS_STRAND=true; | |
332 public static boolean PROCESS_MINUS_STRAND=true; | |
333 | |
334 public static boolean loadSSUkmers=true; | |
335 public static boolean loadLSUkmers=true; | |
336 public static boolean load5Skmers=true; | |
337 public static boolean loadtRNAkmers=true; | |
338 private static boolean loadedLongKmers=false; | |
339 | |
340 public static boolean loadtRNASequence=false; | |
341 public static boolean load16SSequence=true; | |
342 public static boolean load23SSequence=true; | |
343 public static boolean load5SSequence=true; | |
344 public static boolean load18SSequence=true; | |
345 private static boolean loadedConsensusSequence=false; | |
346 | |
347 } |