jpayne@68
|
1 package prok;
|
jpayne@68
|
2
|
jpayne@68
|
3 import java.util.ArrayList;
|
jpayne@68
|
4 import java.util.HashMap;
|
jpayne@68
|
5
|
jpayne@68
|
6 import fileIO.FileFormat;
|
jpayne@68
|
7 import fileIO.TextStreamWriter;
|
jpayne@68
|
8 import server.ServerTools;
|
jpayne@68
|
9 import shared.Parse;
|
jpayne@68
|
10 import shared.Tools;
|
jpayne@68
|
11 import template.ThreadWaiter;
|
jpayne@68
|
12
|
jpayne@68
|
13 /** Crawls ncbi's ftp site to download genomes and annotations */
|
jpayne@68
|
14 public class FetchProks {
|
jpayne@68
|
15
|
jpayne@68
|
16 public static void main(String[] args){
|
jpayne@68
|
17 //ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/
|
jpayne@68
|
18
|
jpayne@68
|
19 String baseAddress=args[0];
|
jpayne@68
|
20 String out=args.length>1 ? args[1] : "stdout";
|
jpayne@68
|
21 if(args.length>2){
|
jpayne@68
|
22 maxSpeciesPerGenus=Integer.parseInt(args[2]);
|
jpayne@68
|
23 System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus);
|
jpayne@68
|
24 }
|
jpayne@68
|
25 if(args.length>3){
|
jpayne@68
|
26 findBest=Parse.parseBoolean(args[3]);
|
jpayne@68
|
27 System.err.println("Set findBest="+findBest);
|
jpayne@68
|
28 }
|
jpayne@68
|
29 TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT);
|
jpayne@68
|
30 tsw.start();
|
jpayne@68
|
31
|
jpayne@68
|
32 // iterateOuter(baseAddress, tsw);
|
jpayne@68
|
33 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
|
jpayne@68
|
34
|
jpayne@68
|
35 int threads=7;
|
jpayne@68
|
36 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
|
jpayne@68
|
37 for(int i=0; i<threads; i++){
|
jpayne@68
|
38 alpt.add(new ProcessThread(contents, tsw, i, threads));
|
jpayne@68
|
39 }
|
jpayne@68
|
40 for(ProcessThread pt : alpt){pt.start();}
|
jpayne@68
|
41 boolean success=ThreadWaiter.waitForThreads(alpt);
|
jpayne@68
|
42
|
jpayne@68
|
43 for(ProcessThread pt : alpt){
|
jpayne@68
|
44 totalSpecies+=pt.totalSpeciesT;
|
jpayne@68
|
45 totalGenus+=pt.totalGenusT;
|
jpayne@68
|
46 totalGenomes+=pt.totalGenomesT;
|
jpayne@68
|
47 }
|
jpayne@68
|
48 System.err.println("Total Genomes: "+totalGenomes);
|
jpayne@68
|
49 System.err.println("Total Species: "+totalSpecies);
|
jpayne@68
|
50 System.err.println("Total Genuses: "+totalGenus);
|
jpayne@68
|
51
|
jpayne@68
|
52 tsw.poisonAndWait();
|
jpayne@68
|
53 assert(success);
|
jpayne@68
|
54 }
|
jpayne@68
|
55
|
jpayne@68
|
56 static class ProcessThread extends Thread {
|
jpayne@68
|
57
|
jpayne@68
|
58 ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){
|
jpayne@68
|
59 speciesList=speciesList_;
|
jpayne@68
|
60 tsw=tsw_;
|
jpayne@68
|
61 tid=tid_;
|
jpayne@68
|
62 threads=threads_;
|
jpayne@68
|
63 }
|
jpayne@68
|
64
|
jpayne@68
|
65 @Override
|
jpayne@68
|
66 public void run(){
|
jpayne@68
|
67 for(String s : speciesList){
|
jpayne@68
|
68 // if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) {
|
jpayne@68
|
69 // processSpecies(s);
|
jpayne@68
|
70 // }
|
jpayne@68
|
71
|
jpayne@68
|
72 //This way one thread handles an entire genus
|
jpayne@68
|
73 if(s!=null){
|
jpayne@68
|
74 String genus=getGenus(s);
|
jpayne@68
|
75 if(genus!=null){
|
jpayne@68
|
76 if((genus.hashCode()&Integer.MAX_VALUE)%threads==tid) {
|
jpayne@68
|
77 processSpecies(s);
|
jpayne@68
|
78 }
|
jpayne@68
|
79 }else{
|
jpayne@68
|
80 if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) {
|
jpayne@68
|
81 processSpecies(s);
|
jpayne@68
|
82 }
|
jpayne@68
|
83 }
|
jpayne@68
|
84 }
|
jpayne@68
|
85 }
|
jpayne@68
|
86 }
|
jpayne@68
|
87
|
jpayne@68
|
88 void processSpecies(String species){
|
jpayne@68
|
89 String genus=getGenus(species);
|
jpayne@68
|
90 if(genus!=null){
|
jpayne@68
|
91 final int count=seen(genus, seen);
|
jpayne@68
|
92
|
jpayne@68
|
93 if(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus){
|
jpayne@68
|
94 int found=examineSpecies(species, tsw);
|
jpayne@68
|
95 if(found>=1){
|
jpayne@68
|
96 totalSpeciesT++;
|
jpayne@68
|
97 totalGenomesT+=found;
|
jpayne@68
|
98 if(count==0){totalGenusT++;}
|
jpayne@68
|
99 put(genus, found, seen);
|
jpayne@68
|
100 }
|
jpayne@68
|
101 }else{
|
jpayne@68
|
102 if(verbose){System.err.println("same genus: "+species+"\n"+genus);}
|
jpayne@68
|
103 }
|
jpayne@68
|
104 }else{
|
jpayne@68
|
105 if(verbose){System.err.println("bad species: "+species+"\n"+genus);}
|
jpayne@68
|
106 }
|
jpayne@68
|
107 }
|
jpayne@68
|
108
|
jpayne@68
|
109 final ArrayList<String> speciesList;
|
jpayne@68
|
110 final int tid;
|
jpayne@68
|
111 final int threads;
|
jpayne@68
|
112 //This is OK now that threads work on a per-genus basis
|
jpayne@68
|
113 HashMap<String, Integer> seen=new HashMap<String, Integer>();
|
jpayne@68
|
114 final TextStreamWriter tsw;
|
jpayne@68
|
115
|
jpayne@68
|
116 int totalSpeciesT=0;
|
jpayne@68
|
117 int totalGenusT=0;
|
jpayne@68
|
118 int totalGenomesT=0;
|
jpayne@68
|
119 }
|
jpayne@68
|
120
|
jpayne@68
|
121 static String getGenus(String path){
|
jpayne@68
|
122 //Candidatus_Hamiltonella
|
jpayne@68
|
123 String name=path.substring(path.lastIndexOf('/')+1);
|
jpayne@68
|
124 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());}
|
jpayne@68
|
125 int under=name.indexOf('_');
|
jpayne@68
|
126 if(under>0){
|
jpayne@68
|
127 return name.substring(0, under);
|
jpayne@68
|
128 }else{
|
jpayne@68
|
129 return null;
|
jpayne@68
|
130 }
|
jpayne@68
|
131 }
|
jpayne@68
|
132
|
jpayne@68
|
133 static String getSpecies(String path){
|
jpayne@68
|
134 //Candidatus_Hamiltonella
|
jpayne@68
|
135 String name=path.substring(path.lastIndexOf('/')+1);
|
jpayne@68
|
136 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());}
|
jpayne@68
|
137 return name;
|
jpayne@68
|
138 }
|
jpayne@68
|
139
|
jpayne@68
|
140 static int examineSpecies(String baseAddress, TextStreamWriter tsw){
|
jpayne@68
|
141 if(verbose){System.err.println("examineSpecies: "+baseAddress);}
|
jpayne@68
|
142 String speciesName=getSpecies(baseAddress);
|
jpayne@68
|
143 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
|
jpayne@68
|
144 // System.err.println("B: "+contents);
|
jpayne@68
|
145 int found=0;
|
jpayne@68
|
146 for(String s : contents){
|
jpayne@68
|
147 // System.err.println(s);
|
jpayne@68
|
148 if(s.contains("reference")){
|
jpayne@68
|
149 // System.err.println("Looking at '"+s+"'");
|
jpayne@68
|
150 found+=examineAssemblies(s, tsw, speciesName);
|
jpayne@68
|
151 }
|
jpayne@68
|
152 }
|
jpayne@68
|
153 if(found>0){return found;}
|
jpayne@68
|
154 for(String s : contents){
|
jpayne@68
|
155 // System.err.println(s);
|
jpayne@68
|
156 if(s.contains("latest_assembly_versions")){
|
jpayne@68
|
157 // System.err.println("Looking at '"+s+"'");
|
jpayne@68
|
158 found+=examineAssemblies(s, tsw, speciesName);
|
jpayne@68
|
159 }
|
jpayne@68
|
160 }
|
jpayne@68
|
161 if(found>0){return found;}
|
jpayne@68
|
162 for(String s : contents){
|
jpayne@68
|
163 // System.err.println(s);
|
jpayne@68
|
164 if(s.contains("all_assembly_versions")){
|
jpayne@68
|
165 // System.err.println("Looking at '"+s+"'");
|
jpayne@68
|
166 found+=examineAssemblies(s, tsw, speciesName);
|
jpayne@68
|
167 }
|
jpayne@68
|
168 }
|
jpayne@68
|
169 return found;
|
jpayne@68
|
170 }
|
jpayne@68
|
171
|
jpayne@68
|
172 static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){
|
jpayne@68
|
173 if(verbose){System.err.println("examineAssemblies: "+baseAddress);}
|
jpayne@68
|
174 Stats stats=null;
|
jpayne@68
|
175 if(findBest){
|
jpayne@68
|
176 stats=findBestAssembly(baseAddress);
|
jpayne@68
|
177 if(stats!=null){
|
jpayne@68
|
178 stats.name=speciesName;
|
jpayne@68
|
179 int x=examineAssembly(stats, tsw, speciesName);
|
jpayne@68
|
180 if(x>0){return x;}
|
jpayne@68
|
181 }
|
jpayne@68
|
182 }
|
jpayne@68
|
183
|
jpayne@68
|
184 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
|
jpayne@68
|
185 // System.err.println("C: "+contents);
|
jpayne@68
|
186
|
jpayne@68
|
187 int found=0;
|
jpayne@68
|
188 for(String s : contents){
|
jpayne@68
|
189 stats=calcStats(s);
|
jpayne@68
|
190 if(stats!=null){
|
jpayne@68
|
191 stats.name=speciesName;
|
jpayne@68
|
192 found+=examineAssembly(stats, tsw, speciesName);
|
jpayne@68
|
193 if(found>0){break;}
|
jpayne@68
|
194 }
|
jpayne@68
|
195 }
|
jpayne@68
|
196 return found;
|
jpayne@68
|
197 }
|
jpayne@68
|
198
|
jpayne@68
|
199 /** Tries to find the assembly with the longest contig */
|
jpayne@68
|
200 static Stats findBestAssembly(String baseAddress){
|
jpayne@68
|
201 if(verbose){System.err.println("findBestAssembly: "+baseAddress);}
|
jpayne@68
|
202 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
|
jpayne@68
|
203 // System.err.println("C: "+contents);
|
jpayne@68
|
204 Stats best=null;
|
jpayne@68
|
205 for(String s : contents){
|
jpayne@68
|
206 // System.err.println(s);
|
jpayne@68
|
207 Stats stats=calcStats(s);
|
jpayne@68
|
208 if(stats!=null){
|
jpayne@68
|
209 if(best==null || stats.compareTo(best)>0){
|
jpayne@68
|
210 best=stats;
|
jpayne@68
|
211 }
|
jpayne@68
|
212 }
|
jpayne@68
|
213 }
|
jpayne@68
|
214 return best;
|
jpayne@68
|
215 }
|
jpayne@68
|
216
|
jpayne@68
|
217 static Stats calcStats(String baseAddress){
|
jpayne@68
|
218 if(verbose){System.err.println("calcStats: "+baseAddress);}
|
jpayne@68
|
219 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
|
jpayne@68
|
220 String report=null;
|
jpayne@68
|
221 for(String s : contents){
|
jpayne@68
|
222 if(s.endsWith("_assembly_report.txt")){
|
jpayne@68
|
223 report=s;
|
jpayne@68
|
224 break;
|
jpayne@68
|
225 }
|
jpayne@68
|
226 }
|
jpayne@68
|
227 if(report==null){
|
jpayne@68
|
228 if(verbose){System.err.println("Could not find report for "+baseAddress);}
|
jpayne@68
|
229 return null;
|
jpayne@68
|
230 }
|
jpayne@68
|
231 if(verbose){System.err.println("Report: "+report);}
|
jpayne@68
|
232 ArrayList<String> data=null;
|
jpayne@68
|
233 for(int i=0; i<=retries && data==null; i++){
|
jpayne@68
|
234 try {
|
jpayne@68
|
235 data = ServerTools.readFTPFile(report);
|
jpayne@68
|
236 } catch (Exception e) {
|
jpayne@68
|
237 // TODO Auto-generated catch block
|
jpayne@68
|
238 e.printStackTrace();
|
jpayne@68
|
239 try {
|
jpayne@68
|
240 Thread.sleep(Tools.mid(10000, i*1000, 1000));
|
jpayne@68
|
241 } catch (InterruptedException e1) {
|
jpayne@68
|
242 // TODO Auto-generated catch block
|
jpayne@68
|
243 e1.printStackTrace();
|
jpayne@68
|
244 }
|
jpayne@68
|
245 }
|
jpayne@68
|
246 }
|
jpayne@68
|
247 if(data==null){return null;}
|
jpayne@68
|
248 int contigs=0;
|
jpayne@68
|
249 long size=0;
|
jpayne@68
|
250 long max=0;
|
jpayne@68
|
251 int taxid=-1;
|
jpayne@68
|
252 for(String s : data){
|
jpayne@68
|
253 if(s!=null && s.length()>0){
|
jpayne@68
|
254 if(s.charAt(0)=='#'){
|
jpayne@68
|
255 if(s.startsWith("# Taxid:")){
|
jpayne@68
|
256 String[] split=Tools.whitespacePlus.split(s);
|
jpayne@68
|
257 try {
|
jpayne@68
|
258 taxid=Integer.parseInt(split[split.length-1]);
|
jpayne@68
|
259 } catch (NumberFormatException e) {
|
jpayne@68
|
260 e.printStackTrace();
|
jpayne@68
|
261 }
|
jpayne@68
|
262 assert(taxid>-1) : "Bad TaxID: '"+s+"'";
|
jpayne@68
|
263 }
|
jpayne@68
|
264 }else{
|
jpayne@68
|
265 String[] split=s.split("\t");
|
jpayne@68
|
266 contigs++;
|
jpayne@68
|
267 long len;
|
jpayne@68
|
268 try {
|
jpayne@68
|
269 len=Long.parseLong(split[8]);
|
jpayne@68
|
270 } catch (NumberFormatException e) {
|
jpayne@68
|
271 len=1;
|
jpayne@68
|
272 }
|
jpayne@68
|
273 size+=len;
|
jpayne@68
|
274 max=Tools.max(max, len);
|
jpayne@68
|
275 }
|
jpayne@68
|
276 }
|
jpayne@68
|
277 }
|
jpayne@68
|
278 return new Stats(baseAddress, max, size, contigs, taxid);
|
jpayne@68
|
279 }
|
jpayne@68
|
280
|
jpayne@68
|
281 static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){
|
jpayne@68
|
282 if(verbose){System.err.println("examineAssembly: "+stats.path);}
|
jpayne@68
|
283 ArrayList<String> contents=ServerTools.listDirectory(stats.path, retries);
|
jpayne@68
|
284 // System.err.println("D: "+contents);
|
jpayne@68
|
285 String gff=null;
|
jpayne@68
|
286 String fna=null;
|
jpayne@68
|
287 for(String s : contents){
|
jpayne@68
|
288 // System.err.println(s);
|
jpayne@68
|
289 if(!s.contains("_from_genomic")){
|
jpayne@68
|
290 if(s.endsWith("genomic.fna.gz")){fna=s;}
|
jpayne@68
|
291 else if(s.endsWith("genomic.gff.gz")){gff=s;}
|
jpayne@68
|
292 }
|
jpayne@68
|
293 }
|
jpayne@68
|
294 if(fna!=null && gff!=null){
|
jpayne@68
|
295 System.err.println("Printing: "+fna);
|
jpayne@68
|
296 String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : "");
|
jpayne@68
|
297
|
jpayne@68
|
298 synchronized(tsw){
|
jpayne@68
|
299 if(renameSequences){
|
jpayne@68
|
300 tsw.println("wget -q -O - "+fna+" | "
|
jpayne@68
|
301 + "gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".fna.gz");
|
jpayne@68
|
302 tsw.println("wget -q -O - "+gff+" | "
|
jpayne@68
|
303 + "gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".gff.gz");
|
jpayne@68
|
304 }else if(renameFiles){
|
jpayne@68
|
305 tsw.println("wget -q -O - "+fna+" > "+prefix+speciesName+".fna.gz");
|
jpayne@68
|
306 tsw.println("wget -q -O - "+gff+" > "+prefix+speciesName+".gff.gz");
|
jpayne@68
|
307 }else{
|
jpayne@68
|
308 tsw.println("wget -q "+fna);
|
jpayne@68
|
309 tsw.println("wget -q "+gff);
|
jpayne@68
|
310 }
|
jpayne@68
|
311 tsw.println();
|
jpayne@68
|
312 }
|
jpayne@68
|
313 return 1;
|
jpayne@68
|
314 }
|
jpayne@68
|
315 return 0;
|
jpayne@68
|
316 }
|
jpayne@68
|
317
|
jpayne@68
|
318 static String makeSubAddress(String baseAddress, String extension){
|
jpayne@68
|
319 if(!baseAddress.endsWith("/")){baseAddress=baseAddress+"/";}
|
jpayne@68
|
320 String subAddress=baseAddress+extension.substring(extension.indexOf('/')+1);
|
jpayne@68
|
321 return subAddress;
|
jpayne@68
|
322 }
|
jpayne@68
|
323
|
jpayne@68
|
324 static int seen(String s, HashMap<String, Integer> map){
|
jpayne@68
|
325 // synchronized(map){
|
jpayne@68
|
326 Integer x=map.get(s);
|
jpayne@68
|
327 return x==null ? 0 : x.intValue();
|
jpayne@68
|
328 // }
|
jpayne@68
|
329 }
|
jpayne@68
|
330 static void put(String s, int found, HashMap<String, Integer> map){
|
jpayne@68
|
331 // synchronized(map){
|
jpayne@68
|
332 int present=seen(s, map);
|
jpayne@68
|
333 map.put(s, present+found);
|
jpayne@68
|
334 // }
|
jpayne@68
|
335 }
|
jpayne@68
|
336
|
jpayne@68
|
337 static class Stats implements Comparable<Stats>{
|
jpayne@68
|
338
|
jpayne@68
|
339 public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){
|
jpayne@68
|
340 path=path_;
|
jpayne@68
|
341 maxContig=maxContig_;
|
jpayne@68
|
342 size=size_;
|
jpayne@68
|
343 contigs=contigs_;
|
jpayne@68
|
344 taxID=taxID_;
|
jpayne@68
|
345 }
|
jpayne@68
|
346
|
jpayne@68
|
347 @Override
|
jpayne@68
|
348 public int compareTo(Stats b) {//true if b is better
|
jpayne@68
|
349 if(b==null){return 1;}
|
jpayne@68
|
350 if(taxID>0 && b.taxID<1){return 1;}
|
jpayne@68
|
351 if(b.taxID>0 && taxID<1){return -1;}
|
jpayne@68
|
352
|
jpayne@68
|
353 if(size>2*b.size){return 1;}
|
jpayne@68
|
354 if(size<2*b.size){return -1;}
|
jpayne@68
|
355
|
jpayne@68
|
356 if(maxContig>b.maxContig){return 1;}
|
jpayne@68
|
357 if(maxContig<b.maxContig){return -1;}
|
jpayne@68
|
358
|
jpayne@68
|
359 return b.contigs-contigs;
|
jpayne@68
|
360 }
|
jpayne@68
|
361
|
jpayne@68
|
362 String path;
|
jpayne@68
|
363 String name;
|
jpayne@68
|
364 long maxContig;
|
jpayne@68
|
365 long size;
|
jpayne@68
|
366 int contigs;
|
jpayne@68
|
367 int taxID;
|
jpayne@68
|
368 }
|
jpayne@68
|
369
|
jpayne@68
|
370 static boolean verbose=true;
|
jpayne@68
|
371 // static boolean allowSameGenus=false;
|
jpayne@68
|
372 static int maxSpeciesPerGenus=1;
|
jpayne@68
|
373 static boolean renameFiles=true;
|
jpayne@68
|
374 static boolean renameSequences=true;
|
jpayne@68
|
375 static int retries=40;
|
jpayne@68
|
376 static boolean findBest=false;
|
jpayne@68
|
377
|
jpayne@68
|
378 static boolean tidInFilename=true;
|
jpayne@68
|
379
|
jpayne@68
|
380 // private static HashMap<String, Integer> seen=new HashMap<String, Integer>();
|
jpayne@68
|
381
|
jpayne@68
|
382 static int totalSpecies=0;
|
jpayne@68
|
383 static int totalGenus=0;
|
jpayne@68
|
384 static int totalGenomes=0;
|
jpayne@68
|
385
|
jpayne@68
|
386 private static final Integer one=1;
|
jpayne@68
|
387
|
jpayne@68
|
388 }
|