Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 68:5028fdace37b |
---|---|
1 package prok; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.HashMap; | |
5 | |
6 import fileIO.FileFormat; | |
7 import fileIO.TextStreamWriter; | |
8 import server.ServerTools; | |
9 import shared.Parse; | |
10 import shared.Tools; | |
11 import template.ThreadWaiter; | |
12 | |
13 /** Crawls ncbi's ftp site to download genomes and annotations */ | |
14 public class FetchProks { | |
15 | |
16 public static void main(String[] args){ | |
17 //ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/ | |
18 | |
19 String baseAddress=args[0]; | |
20 String out=args.length>1 ? args[1] : "stdout"; | |
21 if(args.length>2){ | |
22 maxSpeciesPerGenus=Integer.parseInt(args[2]); | |
23 System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus); | |
24 } | |
25 if(args.length>3){ | |
26 findBest=Parse.parseBoolean(args[3]); | |
27 System.err.println("Set findBest="+findBest); | |
28 } | |
29 TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT); | |
30 tsw.start(); | |
31 | |
32 // iterateOuter(baseAddress, tsw); | |
33 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); | |
34 | |
35 int threads=7; | |
36 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); | |
37 for(int i=0; i<threads; i++){ | |
38 alpt.add(new ProcessThread(contents, tsw, i, threads)); | |
39 } | |
40 for(ProcessThread pt : alpt){pt.start();} | |
41 boolean success=ThreadWaiter.waitForThreads(alpt); | |
42 | |
43 for(ProcessThread pt : alpt){ | |
44 totalSpecies+=pt.totalSpeciesT; | |
45 totalGenus+=pt.totalGenusT; | |
46 totalGenomes+=pt.totalGenomesT; | |
47 } | |
48 System.err.println("Total Genomes: "+totalGenomes); | |
49 System.err.println("Total Species: "+totalSpecies); | |
50 System.err.println("Total Genuses: "+totalGenus); | |
51 | |
52 tsw.poisonAndWait(); | |
53 assert(success); | |
54 } | |
55 | |
56 static class ProcessThread extends Thread { | |
57 | |
58 ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){ | |
59 speciesList=speciesList_; | |
60 tsw=tsw_; | |
61 tid=tid_; | |
62 threads=threads_; | |
63 } | |
64 | |
65 @Override | |
66 public void run(){ | |
67 for(String s : speciesList){ | |
68 // if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) { | |
69 // processSpecies(s); | |
70 // } | |
71 | |
72 //This way one thread handles an entire genus | |
73 if(s!=null){ | |
74 String genus=getGenus(s); | |
75 if(genus!=null){ | |
76 if((genus.hashCode()&Integer.MAX_VALUE)%threads==tid) { | |
77 processSpecies(s); | |
78 } | |
79 }else{ | |
80 if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) { | |
81 processSpecies(s); | |
82 } | |
83 } | |
84 } | |
85 } | |
86 } | |
87 | |
88 void processSpecies(String species){ | |
89 String genus=getGenus(species); | |
90 if(genus!=null){ | |
91 final int count=seen(genus, seen); | |
92 | |
93 if(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus){ | |
94 int found=examineSpecies(species, tsw); | |
95 if(found>=1){ | |
96 totalSpeciesT++; | |
97 totalGenomesT+=found; | |
98 if(count==0){totalGenusT++;} | |
99 put(genus, found, seen); | |
100 } | |
101 }else{ | |
102 if(verbose){System.err.println("same genus: "+species+"\n"+genus);} | |
103 } | |
104 }else{ | |
105 if(verbose){System.err.println("bad species: "+species+"\n"+genus);} | |
106 } | |
107 } | |
108 | |
109 final ArrayList<String> speciesList; | |
110 final int tid; | |
111 final int threads; | |
112 //This is OK now that threads work on a per-genus basis | |
113 HashMap<String, Integer> seen=new HashMap<String, Integer>(); | |
114 final TextStreamWriter tsw; | |
115 | |
116 int totalSpeciesT=0; | |
117 int totalGenusT=0; | |
118 int totalGenomesT=0; | |
119 } | |
120 | |
121 static String getGenus(String path){ | |
122 //Candidatus_Hamiltonella | |
123 String name=path.substring(path.lastIndexOf('/')+1); | |
124 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());} | |
125 int under=name.indexOf('_'); | |
126 if(under>0){ | |
127 return name.substring(0, under); | |
128 }else{ | |
129 return null; | |
130 } | |
131 } | |
132 | |
133 static String getSpecies(String path){ | |
134 //Candidatus_Hamiltonella | |
135 String name=path.substring(path.lastIndexOf('/')+1); | |
136 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());} | |
137 return name; | |
138 } | |
139 | |
140 static int examineSpecies(String baseAddress, TextStreamWriter tsw){ | |
141 if(verbose){System.err.println("examineSpecies: "+baseAddress);} | |
142 String speciesName=getSpecies(baseAddress); | |
143 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); | |
144 // System.err.println("B: "+contents); | |
145 int found=0; | |
146 for(String s : contents){ | |
147 // System.err.println(s); | |
148 if(s.contains("reference")){ | |
149 // System.err.println("Looking at '"+s+"'"); | |
150 found+=examineAssemblies(s, tsw, speciesName); | |
151 } | |
152 } | |
153 if(found>0){return found;} | |
154 for(String s : contents){ | |
155 // System.err.println(s); | |
156 if(s.contains("latest_assembly_versions")){ | |
157 // System.err.println("Looking at '"+s+"'"); | |
158 found+=examineAssemblies(s, tsw, speciesName); | |
159 } | |
160 } | |
161 if(found>0){return found;} | |
162 for(String s : contents){ | |
163 // System.err.println(s); | |
164 if(s.contains("all_assembly_versions")){ | |
165 // System.err.println("Looking at '"+s+"'"); | |
166 found+=examineAssemblies(s, tsw, speciesName); | |
167 } | |
168 } | |
169 return found; | |
170 } | |
171 | |
172 static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){ | |
173 if(verbose){System.err.println("examineAssemblies: "+baseAddress);} | |
174 Stats stats=null; | |
175 if(findBest){ | |
176 stats=findBestAssembly(baseAddress); | |
177 if(stats!=null){ | |
178 stats.name=speciesName; | |
179 int x=examineAssembly(stats, tsw, speciesName); | |
180 if(x>0){return x;} | |
181 } | |
182 } | |
183 | |
184 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); | |
185 // System.err.println("C: "+contents); | |
186 | |
187 int found=0; | |
188 for(String s : contents){ | |
189 stats=calcStats(s); | |
190 if(stats!=null){ | |
191 stats.name=speciesName; | |
192 found+=examineAssembly(stats, tsw, speciesName); | |
193 if(found>0){break;} | |
194 } | |
195 } | |
196 return found; | |
197 } | |
198 | |
199 /** Tries to find the assembly with the longest contig */ | |
200 static Stats findBestAssembly(String baseAddress){ | |
201 if(verbose){System.err.println("findBestAssembly: "+baseAddress);} | |
202 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); | |
203 // System.err.println("C: "+contents); | |
204 Stats best=null; | |
205 for(String s : contents){ | |
206 // System.err.println(s); | |
207 Stats stats=calcStats(s); | |
208 if(stats!=null){ | |
209 if(best==null || stats.compareTo(best)>0){ | |
210 best=stats; | |
211 } | |
212 } | |
213 } | |
214 return best; | |
215 } | |
216 | |
217 static Stats calcStats(String baseAddress){ | |
218 if(verbose){System.err.println("calcStats: "+baseAddress);} | |
219 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); | |
220 String report=null; | |
221 for(String s : contents){ | |
222 if(s.endsWith("_assembly_report.txt")){ | |
223 report=s; | |
224 break; | |
225 } | |
226 } | |
227 if(report==null){ | |
228 if(verbose){System.err.println("Could not find report for "+baseAddress);} | |
229 return null; | |
230 } | |
231 if(verbose){System.err.println("Report: "+report);} | |
232 ArrayList<String> data=null; | |
233 for(int i=0; i<=retries && data==null; i++){ | |
234 try { | |
235 data = ServerTools.readFTPFile(report); | |
236 } catch (Exception e) { | |
237 // TODO Auto-generated catch block | |
238 e.printStackTrace(); | |
239 try { | |
240 Thread.sleep(Tools.mid(10000, i*1000, 1000)); | |
241 } catch (InterruptedException e1) { | |
242 // TODO Auto-generated catch block | |
243 e1.printStackTrace(); | |
244 } | |
245 } | |
246 } | |
247 if(data==null){return null;} | |
248 int contigs=0; | |
249 long size=0; | |
250 long max=0; | |
251 int taxid=-1; | |
252 for(String s : data){ | |
253 if(s!=null && s.length()>0){ | |
254 if(s.charAt(0)=='#'){ | |
255 if(s.startsWith("# Taxid:")){ | |
256 String[] split=Tools.whitespacePlus.split(s); | |
257 try { | |
258 taxid=Integer.parseInt(split[split.length-1]); | |
259 } catch (NumberFormatException e) { | |
260 e.printStackTrace(); | |
261 } | |
262 assert(taxid>-1) : "Bad TaxID: '"+s+"'"; | |
263 } | |
264 }else{ | |
265 String[] split=s.split("\t"); | |
266 contigs++; | |
267 long len; | |
268 try { | |
269 len=Long.parseLong(split[8]); | |
270 } catch (NumberFormatException e) { | |
271 len=1; | |
272 } | |
273 size+=len; | |
274 max=Tools.max(max, len); | |
275 } | |
276 } | |
277 } | |
278 return new Stats(baseAddress, max, size, contigs, taxid); | |
279 } | |
280 | |
281 static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){ | |
282 if(verbose){System.err.println("examineAssembly: "+stats.path);} | |
283 ArrayList<String> contents=ServerTools.listDirectory(stats.path, retries); | |
284 // System.err.println("D: "+contents); | |
285 String gff=null; | |
286 String fna=null; | |
287 for(String s : contents){ | |
288 // System.err.println(s); | |
289 if(!s.contains("_from_genomic")){ | |
290 if(s.endsWith("genomic.fna.gz")){fna=s;} | |
291 else if(s.endsWith("genomic.gff.gz")){gff=s;} | |
292 } | |
293 } | |
294 if(fna!=null && gff!=null){ | |
295 System.err.println("Printing: "+fna); | |
296 String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : ""); | |
297 | |
298 synchronized(tsw){ | |
299 if(renameSequences){ | |
300 tsw.println("wget -q -O - "+fna+" | " | |
301 + "gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".fna.gz"); | |
302 tsw.println("wget -q -O - "+gff+" | " | |
303 + "gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".gff.gz"); | |
304 }else if(renameFiles){ | |
305 tsw.println("wget -q -O - "+fna+" > "+prefix+speciesName+".fna.gz"); | |
306 tsw.println("wget -q -O - "+gff+" > "+prefix+speciesName+".gff.gz"); | |
307 }else{ | |
308 tsw.println("wget -q "+fna); | |
309 tsw.println("wget -q "+gff); | |
310 } | |
311 tsw.println(); | |
312 } | |
313 return 1; | |
314 } | |
315 return 0; | |
316 } | |
317 | |
318 static String makeSubAddress(String baseAddress, String extension){ | |
319 if(!baseAddress.endsWith("/")){baseAddress=baseAddress+"/";} | |
320 String subAddress=baseAddress+extension.substring(extension.indexOf('/')+1); | |
321 return subAddress; | |
322 } | |
323 | |
324 static int seen(String s, HashMap<String, Integer> map){ | |
325 // synchronized(map){ | |
326 Integer x=map.get(s); | |
327 return x==null ? 0 : x.intValue(); | |
328 // } | |
329 } | |
330 static void put(String s, int found, HashMap<String, Integer> map){ | |
331 // synchronized(map){ | |
332 int present=seen(s, map); | |
333 map.put(s, present+found); | |
334 // } | |
335 } | |
336 | |
337 static class Stats implements Comparable<Stats>{ | |
338 | |
339 public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){ | |
340 path=path_; | |
341 maxContig=maxContig_; | |
342 size=size_; | |
343 contigs=contigs_; | |
344 taxID=taxID_; | |
345 } | |
346 | |
347 @Override | |
348 public int compareTo(Stats b) {//true if b is better | |
349 if(b==null){return 1;} | |
350 if(taxID>0 && b.taxID<1){return 1;} | |
351 if(b.taxID>0 && taxID<1){return -1;} | |
352 | |
353 if(size>2*b.size){return 1;} | |
354 if(size<2*b.size){return -1;} | |
355 | |
356 if(maxContig>b.maxContig){return 1;} | |
357 if(maxContig<b.maxContig){return -1;} | |
358 | |
359 return b.contigs-contigs; | |
360 } | |
361 | |
362 String path; | |
363 String name; | |
364 long maxContig; | |
365 long size; | |
366 int contigs; | |
367 int taxID; | |
368 } | |
369 | |
370 static boolean verbose=true; | |
371 // static boolean allowSameGenus=false; | |
372 static int maxSpeciesPerGenus=1; | |
373 static boolean renameFiles=true; | |
374 static boolean renameSequences=true; | |
375 static int retries=40; | |
376 static boolean findBest=false; | |
377 | |
378 static boolean tidInFilename=true; | |
379 | |
380 // private static HashMap<String, Integer> seen=new HashMap<String, Integer>(); | |
381 | |
382 static int totalSpecies=0; | |
383 static int totalGenus=0; | |
384 static int totalGenomes=0; | |
385 | |
386 private static final Integer one=1; | |
387 | |
388 } |