jpayne@68
|
1 package tax;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.io.File;
|
jpayne@68
|
4 import java.io.PrintStream;
|
jpayne@68
|
5
|
jpayne@68
|
6 import dna.Data;
|
jpayne@68
|
7 import fileIO.ByteFile;
|
jpayne@68
|
8 import fileIO.ByteFile1;
|
jpayne@68
|
9 import fileIO.ByteFile2;
|
jpayne@68
|
10 import fileIO.ByteStreamWriter;
|
jpayne@68
|
11 import fileIO.FileFormat;
|
jpayne@68
|
12 import fileIO.ReadWrite;
|
jpayne@68
|
13 import shared.Parse;
|
jpayne@68
|
14 import shared.Parser;
|
jpayne@68
|
15 import shared.PreParser;
|
jpayne@68
|
16 import shared.ReadStats;
|
jpayne@68
|
17 import shared.Shared;
|
jpayne@68
|
18 import shared.Timer;
|
jpayne@68
|
19 import shared.Tools;
|
jpayne@68
|
20 import stream.FastaReadInputStream;
|
jpayne@68
|
21 import structures.ByteBuilder;
|
jpayne@68
|
22
|
jpayne@68
|
23 /**
|
jpayne@68
|
24 * @author Brian Bushnell
|
jpayne@68
|
25 * @date April 4, 2017
|
jpayne@68
|
26 *
|
jpayne@68
|
27 */
|
jpayne@68
|
28 public class ShrinkAccession {
|
jpayne@68
|
29
|
jpayne@68
|
30 public static void main(String[] args){
|
jpayne@68
|
31 Timer t=new Timer();
|
jpayne@68
|
32 ShrinkAccession x=new ShrinkAccession(args);
|
jpayne@68
|
33 x.process(t);
|
jpayne@68
|
34
|
jpayne@68
|
35 //Close the print stream if it was redirected
|
jpayne@68
|
36 Shared.closeStream(x.outstream);
|
jpayne@68
|
37 }
|
jpayne@68
|
38
|
jpayne@68
|
39 public ShrinkAccession(String[] args){
|
jpayne@68
|
40
|
jpayne@68
|
41 {//Preparse block for help, config files, and outstream
|
jpayne@68
|
42 PreParser pp=new PreParser(args, getClass(), false);
|
jpayne@68
|
43 args=pp.args;
|
jpayne@68
|
44 outstream=pp.outstream;
|
jpayne@68
|
45 }
|
jpayne@68
|
46
|
jpayne@68
|
47 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
|
jpayne@68
|
48 ReadWrite.MAX_ZIP_THREADS=Shared.threads();
|
jpayne@68
|
49 if(Data.PIGZ()){
|
jpayne@68
|
50 ReadWrite.ZIPLEVEL=Tools.max(ReadWrite.ZIPLEVEL, 6);
|
jpayne@68
|
51 }
|
jpayne@68
|
52
|
jpayne@68
|
53 Parser parser=new Parser();
|
jpayne@68
|
54 for(int i=0; i<args.length; i++){
|
jpayne@68
|
55 String arg=args[i];
|
jpayne@68
|
56 String[] split=arg.split("=");
|
jpayne@68
|
57 String a=split[0].toLowerCase();
|
jpayne@68
|
58 String b=split.length>1 ? split[1] : null;
|
jpayne@68
|
59
|
jpayne@68
|
60 if(parser.parse(arg, a, b)){
|
jpayne@68
|
61 //do nothing
|
jpayne@68
|
62 }else if(a.equals("verbose")){
|
jpayne@68
|
63 verbose=Parse.parseBoolean(b);
|
jpayne@68
|
64 ByteFile1.verbose=verbose;
|
jpayne@68
|
65 ByteFile2.verbose=verbose;
|
jpayne@68
|
66 ReadWrite.verbose=verbose;
|
jpayne@68
|
67 }else if(a.equals("gi")){
|
jpayne@68
|
68 KEEP_GI_NUMBERS=Parse.parseBoolean(b);
|
jpayne@68
|
69 }else if(a.equals("outgi") || a.equals("giout") || a.equals("gi")){
|
jpayne@68
|
70 giOut=b;
|
jpayne@68
|
71 }else if(parser.in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){
|
jpayne@68
|
72 parser.in1=arg;
|
jpayne@68
|
73 }else if(parser.out1==null && i==1 && !arg.contains("=")){
|
jpayne@68
|
74 parser.out1=arg;
|
jpayne@68
|
75 }else{
|
jpayne@68
|
76 outstream.println("Unknown parameter "+args[i]);
|
jpayne@68
|
77 assert(false) : "Unknown parameter "+args[i];
|
jpayne@68
|
78 // throw new RuntimeException("Unknown parameter "+args[i]);
|
jpayne@68
|
79 }
|
jpayne@68
|
80 }
|
jpayne@68
|
81
|
jpayne@68
|
82 {//Process parser fields
|
jpayne@68
|
83 Parser.processQuality();
|
jpayne@68
|
84
|
jpayne@68
|
85 overwrite=ReadStats.overwrite=parser.overwrite;
|
jpayne@68
|
86 append=ReadStats.append=parser.append;
|
jpayne@68
|
87
|
jpayne@68
|
88 in=parser.in1;
|
jpayne@68
|
89
|
jpayne@68
|
90 out=parser.out1;
|
jpayne@68
|
91 }
|
jpayne@68
|
92
|
jpayne@68
|
93 assert(FastaReadInputStream.settingsOK());
|
jpayne@68
|
94
|
jpayne@68
|
95 if(in==null){throw new RuntimeException("Error - at least one input file is required.");}
|
jpayne@68
|
96 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2){
|
jpayne@68
|
97 ByteFile.FORCE_MODE_BF2=false;
|
jpayne@68
|
98 ByteFile.FORCE_MODE_BF1=true;
|
jpayne@68
|
99 }
|
jpayne@68
|
100
|
jpayne@68
|
101 if(out!=null && out.equalsIgnoreCase("null")){out=null;}
|
jpayne@68
|
102
|
jpayne@68
|
103 if(!Tools.testOutputFiles(overwrite, append, false, out)){
|
jpayne@68
|
104 outstream.println((out==null)+", "+out);
|
jpayne@68
|
105 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n");
|
jpayne@68
|
106 }
|
jpayne@68
|
107
|
jpayne@68
|
108 ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false);
|
jpayne@68
|
109 ffoutGi=FileFormat.testOutput(giOut, FileFormat.TXT, null, true, overwrite, append, false);
|
jpayne@68
|
110 ffin=FileFormat.testInput(in, FileFormat.TXT, null, true, true);
|
jpayne@68
|
111
|
jpayne@68
|
112 }
|
jpayne@68
|
113
|
jpayne@68
|
114 void process(Timer t){
|
jpayne@68
|
115
|
jpayne@68
|
116 ByteFile bf=ByteFile.makeByteFile(ffin);
|
jpayne@68
|
117 ByteStreamWriter bsw=new ByteStreamWriter(ffout);
|
jpayne@68
|
118 bsw.start();
|
jpayne@68
|
119
|
jpayne@68
|
120 long linesProcessed=0;
|
jpayne@68
|
121 long charsProcessed=0;
|
jpayne@68
|
122 long badLines=0;
|
jpayne@68
|
123
|
jpayne@68
|
124 byte[] line=bf.nextLine();
|
jpayne@68
|
125 ByteBuilder bb=new ByteBuilder(10000);
|
jpayne@68
|
126 int columns=4;
|
jpayne@68
|
127 while(line!=null){
|
jpayne@68
|
128 if(Tools.startsWith(line, "accession\t")){
|
jpayne@68
|
129 bb.append(line);
|
jpayne@68
|
130 bb.nl();
|
jpayne@68
|
131 }else if(Tools.startsWith(line, "accession.version\ttaxid")){
|
jpayne@68
|
132 columns=2;
|
jpayne@68
|
133 bb.append("accession\t\ttaxid\t");//dummy header
|
jpayne@68
|
134 bb.nl();
|
jpayne@68
|
135 }else{
|
jpayne@68
|
136 charsProcessed+=line.length+1;
|
jpayne@68
|
137 linesProcessed++;
|
jpayne@68
|
138
|
jpayne@68
|
139 final int tid=(columns==4 ? AccessionToTaxid.parseLineToTaxid(line, (byte)'\t') :
|
jpayne@68
|
140 AccessionToTaxid.parseLineToTaxid_2col(line, (byte)'\t'));
|
jpayne@68
|
141 if(tid<1){
|
jpayne@68
|
142 badLines++;
|
jpayne@68
|
143 }else{
|
jpayne@68
|
144 int i=0;
|
jpayne@68
|
145
|
jpayne@68
|
146 while(i<line.length){//Accession
|
jpayne@68
|
147 byte b=line[i];
|
jpayne@68
|
148 bb.append(b);
|
jpayne@68
|
149 i++;
|
jpayne@68
|
150 if(b=='\t'){break;}
|
jpayne@68
|
151 }
|
jpayne@68
|
152
|
jpayne@68
|
153 if(columns==4){
|
jpayne@68
|
154 while(i<line.length){//Accession with decimal
|
jpayne@68
|
155 byte b=line[i];
|
jpayne@68
|
156 // bb.append(b);
|
jpayne@68
|
157 i++;
|
jpayne@68
|
158 if(b=='\t'){break;}
|
jpayne@68
|
159 }
|
jpayne@68
|
160 }
|
jpayne@68
|
161 bb.append('\t');
|
jpayne@68
|
162
|
jpayne@68
|
163 while(i<line.length){//Taxid
|
jpayne@68
|
164 byte b=line[i];
|
jpayne@68
|
165 bb.append(b);
|
jpayne@68
|
166 i++;
|
jpayne@68
|
167 if(b=='\t'){break;}
|
jpayne@68
|
168 }
|
jpayne@68
|
169
|
jpayne@68
|
170 if(KEEP_GI_NUMBERS){
|
jpayne@68
|
171 if(line.length>i && Tools.isDigit(line[i])){//GI number or "na"
|
jpayne@68
|
172 while(i<line.length){
|
jpayne@68
|
173 byte b=line[i];
|
jpayne@68
|
174 bb.append(b);
|
jpayne@68
|
175 i++;
|
jpayne@68
|
176 // if(b=='\t'){break;}
|
jpayne@68
|
177 }
|
jpayne@68
|
178 }
|
jpayne@68
|
179 }
|
jpayne@68
|
180 bb.nl();
|
jpayne@68
|
181 }
|
jpayne@68
|
182
|
jpayne@68
|
183 // String[] split=new String(line).split("\t");
|
jpayne@68
|
184 // bb.append(split[0]);
|
jpayne@68
|
185 // bb.tab();
|
jpayne@68
|
186 // bb.tab();
|
jpayne@68
|
187 // bb.append(split[2]);
|
jpayne@68
|
188 // bb.tab();
|
jpayne@68
|
189 // bb.nl();
|
jpayne@68
|
190 }
|
jpayne@68
|
191 if(bb.length()>8000){
|
jpayne@68
|
192 bsw.print(bb);
|
jpayne@68
|
193 bb.clear();
|
jpayne@68
|
194 }
|
jpayne@68
|
195 line=bf.nextLine();
|
jpayne@68
|
196 }
|
jpayne@68
|
197 if(bb.length()>0){
|
jpayne@68
|
198 bsw.print(bb);
|
jpayne@68
|
199 bb.clear();
|
jpayne@68
|
200 }
|
jpayne@68
|
201
|
jpayne@68
|
202 errorState|=bf.close();
|
jpayne@68
|
203 if(bsw!=null){errorState|=bsw.poisonAndWait();}
|
jpayne@68
|
204
|
jpayne@68
|
205 t.stop();
|
jpayne@68
|
206 outstream.println("Discarded "+badLines+" lines.\n");
|
jpayne@68
|
207 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, charsProcessed, 8));
|
jpayne@68
|
208
|
jpayne@68
|
209 if(errorState){
|
jpayne@68
|
210 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
|
jpayne@68
|
211 }
|
jpayne@68
|
212 }
|
jpayne@68
|
213
|
jpayne@68
|
214 /*--------------------------------------------------------------*/
|
jpayne@68
|
215
|
jpayne@68
|
216
|
jpayne@68
|
217 /*--------------------------------------------------------------*/
|
jpayne@68
|
218
|
jpayne@68
|
219 private String in=null;
|
jpayne@68
|
220 private String out=null;
|
jpayne@68
|
221 private String giOut=null;
|
jpayne@68
|
222
|
jpayne@68
|
223 /*--------------------------------------------------------------*/
|
jpayne@68
|
224
|
jpayne@68
|
225 private final FileFormat ffin;
|
jpayne@68
|
226 private final FileFormat ffout;
|
jpayne@68
|
227 private final FileFormat ffoutGi;
|
jpayne@68
|
228
|
jpayne@68
|
229 /*--------------------------------------------------------------*/
|
jpayne@68
|
230
|
jpayne@68
|
231 private PrintStream outstream=System.err;
|
jpayne@68
|
232 public static boolean verbose=false;
|
jpayne@68
|
233 public static boolean KEEP_GI_NUMBERS=true;
|
jpayne@68
|
234 public boolean errorState=false;
|
jpayne@68
|
235 private boolean overwrite=false;
|
jpayne@68
|
236 private boolean append=false;
|
jpayne@68
|
237
|
jpayne@68
|
238 }
|