comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/ProkObject.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package prok;
2
3 import java.io.File;
4
5 import dna.AminoAcid;
6 import dna.Data;
7 import fileIO.FileFormat;
8 import fileIO.ReadWrite;
9 import shared.Parse;
10 import shared.Tools;
11 import stream.ConcurrentReadInputStream;
12 import stream.Read;
13 import stream.ReadInputStream;
14 import structures.ListNum;
15 import structures.LongHashSet;
16
17 /** Contains a lot of statics and static methods for gene-calling */
18 public abstract class ProkObject {
19
20 public static boolean parse(String arg, String a, String b){
21 if(a.equalsIgnoreCase("16sstartslop") || a.equalsIgnoreCase("ssustartslop")){
22 ssuStartSlop=Integer.parseInt(b);
23 }else if(a.equalsIgnoreCase("23sstartslop") || a.equalsIgnoreCase("lsustartslop")){
24 lsuStartSlop=Integer.parseInt(b);
25 }else if(a.equalsIgnoreCase("5sstartslop")){
26 r5SStartSlop=Integer.parseInt(b);
27 }else if(a.equalsIgnoreCase("16sstopslop") || a.equalsIgnoreCase("ssustopslop")){
28 ssuStopSlop=Integer.parseInt(b);
29 }else if(a.equalsIgnoreCase("23sstopslop") || a.equalsIgnoreCase("lsustopslop")){
30 lsuStopSlop=Integer.parseInt(b);
31 }else if(a.equalsIgnoreCase("5sstopslop")){
32 r5SStopSlop=Integer.parseInt(b);
33 }else if(a.equals("plus")){
34 PROCESS_PLUS_STRAND=Parse.parseBoolean(b);
35 }else if(a.equals("minus")){
36 PROCESS_MINUS_STRAND=Parse.parseBoolean(b);
37 }
38
39 else if(a.equalsIgnoreCase("min16SIdentity") || a.equalsIgnoreCase("min16SId")) {
40 min16SIdentity=Float.parseFloat(b);
41 }else if(a.equalsIgnoreCase("min18SIdentity") || a.equalsIgnoreCase("min18SId")) {
42 min18SIdentity=Float.parseFloat(b);
43 }else if(a.equalsIgnoreCase("min23SIdentity") || a.equalsIgnoreCase("min23SId")) {
44 min23SIdentity=Float.parseFloat(b);
45 }else if(a.equalsIgnoreCase("min5SIdentity") || a.equalsIgnoreCase("min5SId")) {
46 min5SIdentity=Float.parseFloat(b);
47 }
48
49 else if(a.equalsIgnoreCase("align16s") || a.equalsIgnoreCase("load16SSequence")){
50 load16SSequence=Parse.parseBoolean(b);
51 }else if(a.equalsIgnoreCase("align23s") || a.equalsIgnoreCase("load23SSequence")){
52 load23SSequence=Parse.parseBoolean(b);
53 }else if(a.equalsIgnoreCase("align18s") || a.equalsIgnoreCase("load18SSequence")){
54 load18SSequence=Parse.parseBoolean(b);
55 }else if(a.equalsIgnoreCase("align5s") || a.equalsIgnoreCase("load5SSequence")){
56 load5SSequence=Parse.parseBoolean(b);
57 }
58
59 else if(a.equalsIgnoreCase("load16skmers") || a.equalsIgnoreCase("load18skmers") || a.equalsIgnoreCase("loadssukmers")){
60 loadSSUkmers=Parse.parseBoolean(b);
61 }else if(a.equalsIgnoreCase("load23skmers") || a.equalsIgnoreCase("load28skmers") || a.equalsIgnoreCase("loadlsukmers")){
62 loadLSUkmers=Parse.parseBoolean(b);
63 }else if(a.equalsIgnoreCase("load5skmers")){
64 load5Skmers=Parse.parseBoolean(b);
65 }else if(a.equalsIgnoreCase("loadtrnakmers")){
66 loadtRNAkmers=Parse.parseBoolean(b);
67 }else if(a.equalsIgnoreCase("klongtrna")){
68 kLongTRna=Integer.parseInt(b);
69 }else if(a.equalsIgnoreCase("longkmers")){
70 loadSSUkmers=loadLSUkmers=load5Skmers=loadtRNAkmers=Parse.parseBoolean(b);
71 }else if(a.equalsIgnoreCase("klong5s")){
72 kLong5S=Integer.parseInt(b);
73 }else if(a.equalsIgnoreCase("klong16s") || a.equalsIgnoreCase("klong18s") || a.equalsIgnoreCase("klongssu")){
74 kLongSSU=Integer.parseInt(b);
75 }else if(a.equalsIgnoreCase("klong23s") || a.equalsIgnoreCase("klong28s") || a.equalsIgnoreCase("klonglsu")){
76 kLongLSU=Integer.parseInt(b);
77 }else if(a.equalsIgnoreCase("klongtrna")){
78 kLongTRna=Integer.parseInt(b);
79 }
80
81 else{
82 return false;
83 }
84 return true;
85 }
86
87 /*--------------------------------------------------------------*/
88
89 public static boolean processType(int type){
90 return (type==CDS ? callCDS : type==r16S ? call16S : type==r23S ? call23S : type==r18S ? call18S : type==r5S ? call5S : type==tRNA ? calltRNA : true);
91 }
92
93 public static int startSlop(int type) {
94 int slop=(type==r16S ? ssuStartSlop : type==r23S ? lsuStartSlop : type==r18S ? ssuStartSlop : type==r5S ? r5SStartSlop : 9999);
95 return slop;
96 }
97
98 public static int stopSlop(int type) {
99 int slop=(type==r16S ? ssuStopSlop : type==r23S ? lsuStopSlop : type==r18S ? ssuStopSlop : type==r5S ? r5SStopSlop : 9999);
100 return slop;
101 }
102
103 public static float minID(int type) {
104 float minIdentity=(type==r16S ? min16SIdentity : type==r23S ? min23SIdentity : type==r18S ? min18SIdentity : type==r5S ? min5SIdentity : 0);
105 return minIdentity;
106 }
107
108 public static Read[] consensusReads(int type) {
109 Read[] consensusReads=(type==r16S ? r16SSequence : type==r23S ? r23SSequence : type==r18S ? r18SSequence : type==r5S ? r5SSequence : null);
110 return consensusReads;
111 }
112
113 public static LongHashSet kmerSet(int type) {
114 LongHashSet set=(type==tRNA ? trnaKmers : type==r16S ? ssuKmers : type==r23S ? lsuKmers : type==r5S ? r5SKmers : type==r18S ? ssuKmers : null);
115 return set;
116 }
117
118 public static int kLongLen(int type) {
119 int kLongLen=(type==tRNA ? kLongTRna : type==r16S ? kLongSSU : type==r23S ? kLongLSU : type==r5S ? kLong5S : type==r18S ? kLongSSU : -1);
120 return kLongLen;
121 }
122
123 public static int flagToType(int flag) {
124 return Integer.numberOfTrailingZeros(flag)+1;
125 }
126
127 public static byte typeToFlag(int type) {
128 assert(type<=6);
129 return (byte)(1<<(type-1));
130 }
131
132 public static boolean callType(int type){//TODO: Turn these functions into array lookups
133 if(type==CDS){return callCDS;}
134 else if(type==tRNA){return calltRNA;}
135 else if(type==r16S){return call16S;}
136 else if(type==r23S){return call23S;}
137 else if(type==r5S){return call5S;}
138 else if(type==r18S){return call18S;}
139 assert(false) : type;
140 return false;
141 }
142
143 /*--------------------------------------------------------------*/
144 /*---------------- Long Kmers ----------------*/
145 /*--------------------------------------------------------------*/
146
147 public static synchronized void loadLongKmers(){
148 // assert(ssuKmers==null);
149 // assert(false) : load5Skmers+", "+kLong5s;
150 if(loadedLongKmers){return;}
151 if(loadSSUkmers){ssuKmers=loadLongKmersByType(kLongSSU, "ssu");}
152 if(loadLSUkmers){lsuKmers=loadLongKmersByType(kLongLSU, "lsu");}
153 if(load5Skmers){r5SKmers=loadLongKmersByType(kLong5S, "5S");}
154 if(loadtRNAkmers){trnaKmers=loadLongKmersByType(kLongTRna, "tRNA");}
155 loadedLongKmers=true;
156 }
157
158 // private static LongHashSet loadLongKmers(StatsContainer sc, int k, String prefix){
159 // String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa");
160 // if(!new File(fname).exists()){
161 // fname=fname+".gz";
162 // if(!new File(fname).exists()){
163 // System.err.println("Can't find "+fname);
164 // return null;
165 // }
166 // }
167 // LongHashSet set=loadLongKmers(fname, k);
168 // sc.kmerSet=set;
169 // sc.kLongLen=k;
170 // return set;
171 // }
172
173 private static LongHashSet loadLongKmersByType(int k, String prefix){
174 String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa", true);
175 if(!new File(fname).exists()){
176 fname=fname+".gz";
177 if(!new File(fname).exists()){
178 System.err.println("Can't find "+fname);
179 return null;
180 }
181 }
182 LongHashSet set=loadLongKmers(fname, k);
183 return set;
184 }
185
186 private static LongHashSet loadLongKmers(String fname, int k){//TODO: Consider making this a LongHashSet. No reason not to...
187 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false);
188 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null);
189 cris.start(); //Start the stream
190 // if(verbose){outstream.println("Started cris");}
191
192 LongHashSet set=new LongHashSet(1000);
193 ListNum<Read> ln=cris.nextList();
194 while(ln!=null && ln.size()>0){
195 processList(ln, set, k);
196 cris.returnList(ln);
197 ln=cris.nextList();
198 }
199 if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
200 ReadWrite.closeStream(cris);
201 return set;
202 }
203
204 private static LongHashSet processList(ListNum<Read> ln, LongHashSet set, int k){
205 final long mask=~((-1L)<<(2*k));
206 for(Read r : ln){
207 final byte[] bases=r.bases;
208 long kmer=0;
209 int len=0;
210 for(byte b : bases){
211 final int num=AminoAcid.baseToNumber[b];
212 if(num>=0){
213 len++;
214 kmer=((kmer<<2)|num)&mask;
215 if(len>=k){
216 set.add(kmer);
217 }
218 }else{
219 len=0;
220 }
221 }
222 }
223 return set;
224 }
225
226 /*--------------------------------------------------------------*/
227 /*---------------- Consensus Sequence ----------------*/
228 /*--------------------------------------------------------------*/
229
230 public static synchronized void loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro){
231 if(loadedConsensusSequence){return;}
232 // assert(r16SSequence==null);
233 if(load16SSequence){r16SSequence=loadConsensusSequenceType("16S", removeMito, removeChloro);}
234 if(load18SSequence){r18SSequence=loadConsensusSequenceType("18S", removeMito, removeChloro);}
235 if(load23SSequence){r23SSequence=loadConsensusSequenceType("23S", removeMito, removeChloro);}
236 if(load5SSequence){r5SSequence=loadConsensusSequenceType("5S", removeMito, removeChloro);}
237 if(loadtRNASequence){trnaSequence=loadConsensusSequenceType("tRNA", removeMito, removeChloro);}
238 loadedConsensusSequence=true;
239 }
240
241 public static Read[] loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro){
242 String fname=null;
243 fname=Data.findPath("?"+prefix+"_consensus_sequence.fq", false);
244 if(fname!=null && (fname.endsWith(".jar") || new File(fname).exists())){
245 fname=Tools.fixExtension(fname);
246 }else{
247 fname=Data.findPath("?"+prefix+"_consensus_sequence.fa", true);
248 fname=Tools.fixExtension(fname);
249 if(!fname.endsWith(".jar") && !new File(fname).exists()){
250 System.err.println("Can't find "+fname);
251 return null;
252 }
253 }
254 Read[] array=loadConsensusSequence(fname);
255 if(removeMito){array=stripOrganelle(array, "mito");}
256 if(removeChloro){array=stripOrganelle(array, "plastid");}
257 return array;
258 }
259
260 private static Read[] loadConsensusSequence(String fname){
261 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false);
262 Read[] array=ReadInputStream.toReadArray(ff, -1);
263 return array;
264 }
265
266 private static Read[] stripOrganelle(Read[] array, String key){
267 int removed=0;
268 for(int j=0; j<array.length; j++){
269 if(array[j].id.toLowerCase().startsWith(key)) {
270 array[j]=null;
271 removed++;
272 }
273 }
274 if(removed>0){array=Tools.condenseStrict(array);}
275 return array;
276 }
277
278 /*--------------------------------------------------------------*/
279
280 public static final int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
281 public static String[] typeStrings=new String[] {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
282 public static String[] typeStrings2=new String[] {"CDS", "tRNA", "rRNA", "rRNA", "rRNA", "rRNA", "rRNA", "RNA"};
283 public static String[] specialTypeStrings=new String[] {null, "tRNA", "16S", "23S", "5S", "18S", "28S", null};
284 public static boolean isSpecialType(String type){
285 if(type==null){return false;}
286 for(String s : specialTypeStrings){
287 if(type.equalsIgnoreCase(s)){return true;}
288 }
289 return false;
290 }
291
292 public static int kInnerRNA=6;
293 public static int kStartRNA=3;
294 public static int kStopRNA=3;
295
296 public static int kLongSSU=15;
297 public static int kLongLSU=15;
298 public static int kLong5S=15;
299 public static int kLongTRna=15;
300
301 public static float min16SIdentity=0.62f;
302 public static float min23SIdentity=0.60f;
303 public static float min5SIdentity=0.60f;
304 public static float min18SIdentity=0.60f;
305
306 static int ssuStartSlop=200;
307 static int ssuStopSlop=0;
308 static int lsuStartSlop=220;
309 static int lsuStopSlop=0;
310 static int r5SStartSlop=50;
311 static int r5SStopSlop=50;
312
313 public static boolean callCDS=true;
314 public static boolean calltRNA=true;
315 public static boolean call16S=true;
316 public static boolean call23S=true;
317 public static boolean call5S=true;
318 public static boolean call18S=false;
319
320 public static LongHashSet ssuKmers=null;
321 public static LongHashSet lsuKmers=null;
322 public static LongHashSet r5SKmers=null;
323 public static LongHashSet trnaKmers=null;
324
325 public static Read[] trnaSequence=null;
326 public static Read[] r16SSequence=null;
327 public static Read[] r23SSequence=null;
328 public static Read[] r5SSequence=null;
329 public static Read[] r18SSequence=null;
330
331 public static boolean PROCESS_PLUS_STRAND=true;
332 public static boolean PROCESS_MINUS_STRAND=true;
333
334 public static boolean loadSSUkmers=true;
335 public static boolean loadLSUkmers=true;
336 public static boolean load5Skmers=true;
337 public static boolean loadtRNAkmers=true;
338 private static boolean loadedLongKmers=false;
339
340 public static boolean loadtRNASequence=false;
341 public static boolean load16SSequence=true;
342 public static boolean load23SSequence=true;
343 public static boolean load5SSequence=true;
344 public static boolean load18SSequence=true;
345 private static boolean loadedConsensusSequence=false;
346
347 }