comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 package prok;
2
3 import java.util.ArrayList;
4 import java.util.HashMap;
5
6 import fileIO.FileFormat;
7 import fileIO.TextStreamWriter;
8 import server.ServerTools;
9 import shared.Parse;
10 import shared.Tools;
11 import template.ThreadWaiter;
12
13 /** Crawls NCBI's FTP site and writes wget commands for downloading genomes and annotations */
14 public class FetchProks {
15
16 public static void main(String[] args){
17 //ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/
18
19 String baseAddress=args[0];
20 String out=args.length>1 ? args[1] : "stdout";
21 if(args.length>2){
22 maxSpeciesPerGenus=Integer.parseInt(args[2]);
23 System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus);
24 }
25 if(args.length>3){
26 findBest=Parse.parseBoolean(args[3]);
27 System.err.println("Set findBest="+findBest);
28 }
29 TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT);
30 tsw.start();
31
32 // iterateOuter(baseAddress, tsw);
33 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
34
35 int threads=7;
36 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
37 for(int i=0; i<threads; i++){
38 alpt.add(new ProcessThread(contents, tsw, i, threads));
39 }
40 for(ProcessThread pt : alpt){pt.start();}
41 boolean success=ThreadWaiter.waitForThreads(alpt);
42
43 for(ProcessThread pt : alpt){
44 totalSpecies+=pt.totalSpeciesT;
45 totalGenus+=pt.totalGenusT;
46 totalGenomes+=pt.totalGenomesT;
47 }
48 System.err.println("Total Genomes: "+totalGenomes);
49 System.err.println("Total Species: "+totalSpecies);
50 System.err.println("Total Genuses: "+totalGenus);
51
52 tsw.poisonAndWait();
53 assert(success);
54 }
55
56 static class ProcessThread extends Thread {
57
58 ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){
59 speciesList=speciesList_;
60 tsw=tsw_;
61 tid=tid_;
62 threads=threads_;
63 }
64
65 @Override
66 public void run(){
67 for(String s : speciesList){
68 // if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) {
69 // processSpecies(s);
70 // }
71
72 //This way one thread handles an entire genus
73 if(s!=null){
74 String genus=getGenus(s);
75 if(genus!=null){
76 if((genus.hashCode()&Integer.MAX_VALUE)%threads==tid) {
77 processSpecies(s);
78 }
79 }else{
80 if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) {
81 processSpecies(s);
82 }
83 }
84 }
85 }
86 }
87
88 void processSpecies(String species){
89 String genus=getGenus(species);
90 if(genus!=null){
91 final int count=seen(genus, seen);
92
93 if(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus){
94 int found=examineSpecies(species, tsw);
95 if(found>=1){
96 totalSpeciesT++;
97 totalGenomesT+=found;
98 if(count==0){totalGenusT++;}
99 put(genus, found, seen);
100 }
101 }else{
102 if(verbose){System.err.println("same genus: "+species+"\n"+genus);}
103 }
104 }else{
105 if(verbose){System.err.println("bad species: "+species+"\n"+genus);}
106 }
107 }
108
109 final ArrayList<String> speciesList;
110 final int tid;
111 final int threads;
112 //This is OK now that threads work on a per-genus basis
113 HashMap<String, Integer> seen=new HashMap<String, Integer>();
114 final TextStreamWriter tsw;
115
116 int totalSpeciesT=0;
117 int totalGenusT=0;
118 int totalGenomesT=0;
119 }
120
121 static String getGenus(String path){
122 //Candidatus_Hamiltonella
123 String name=path.substring(path.lastIndexOf('/')+1);
124 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());}
125 int under=name.indexOf('_');
126 if(under>0){
127 return name.substring(0, under);
128 }else{
129 return null;
130 }
131 }
132
133 static String getSpecies(String path){
134 //Candidatus_Hamiltonella
135 String name=path.substring(path.lastIndexOf('/')+1);
136 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());}
137 return name;
138 }
139
140 static int examineSpecies(String baseAddress, TextStreamWriter tsw){
141 if(verbose){System.err.println("examineSpecies: "+baseAddress);}
142 String speciesName=getSpecies(baseAddress);
143 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
144 // System.err.println("B: "+contents);
145 int found=0;
146 for(String s : contents){
147 // System.err.println(s);
148 if(s.contains("reference")){
149 // System.err.println("Looking at '"+s+"'");
150 found+=examineAssemblies(s, tsw, speciesName);
151 }
152 }
153 if(found>0){return found;}
154 for(String s : contents){
155 // System.err.println(s);
156 if(s.contains("latest_assembly_versions")){
157 // System.err.println("Looking at '"+s+"'");
158 found+=examineAssemblies(s, tsw, speciesName);
159 }
160 }
161 if(found>0){return found;}
162 for(String s : contents){
163 // System.err.println(s);
164 if(s.contains("all_assembly_versions")){
165 // System.err.println("Looking at '"+s+"'");
166 found+=examineAssemblies(s, tsw, speciesName);
167 }
168 }
169 return found;
170 }
171
172 static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){
173 if(verbose){System.err.println("examineAssemblies: "+baseAddress);}
174 Stats stats=null;
175 if(findBest){
176 stats=findBestAssembly(baseAddress);
177 if(stats!=null){
178 stats.name=speciesName;
179 int x=examineAssembly(stats, tsw, speciesName);
180 if(x>0){return x;}
181 }
182 }
183
184 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
185 // System.err.println("C: "+contents);
186
187 int found=0;
188 for(String s : contents){
189 stats=calcStats(s);
190 if(stats!=null){
191 stats.name=speciesName;
192 found+=examineAssembly(stats, tsw, speciesName);
193 if(found>0){break;}
194 }
195 }
196 return found;
197 }
198
199 /** Tries to find the assembly with the longest contig */
200 static Stats findBestAssembly(String baseAddress){
201 if(verbose){System.err.println("findBestAssembly: "+baseAddress);}
202 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
203 // System.err.println("C: "+contents);
204 Stats best=null;
205 for(String s : contents){
206 // System.err.println(s);
207 Stats stats=calcStats(s);
208 if(stats!=null){
209 if(best==null || stats.compareTo(best)>0){
210 best=stats;
211 }
212 }
213 }
214 return best;
215 }
216
217 static Stats calcStats(String baseAddress){
218 if(verbose){System.err.println("calcStats: "+baseAddress);}
219 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
220 String report=null;
221 for(String s : contents){
222 if(s.endsWith("_assembly_report.txt")){
223 report=s;
224 break;
225 }
226 }
227 if(report==null){
228 if(verbose){System.err.println("Could not find report for "+baseAddress);}
229 return null;
230 }
231 if(verbose){System.err.println("Report: "+report);}
232 ArrayList<String> data=null;
233 for(int i=0; i<=retries && data==null; i++){
234 try {
235 data = ServerTools.readFTPFile(report);
236 } catch (Exception e) {
237 // TODO Auto-generated catch block
238 e.printStackTrace();
239 try {
240 Thread.sleep(Tools.mid(10000, i*1000, 1000));
241 } catch (InterruptedException e1) {
242 // TODO Auto-generated catch block
243 e1.printStackTrace();
244 }
245 }
246 }
247 if(data==null){return null;}
248 int contigs=0;
249 long size=0;
250 long max=0;
251 int taxid=-1;
252 for(String s : data){
253 if(s!=null && s.length()>0){
254 if(s.charAt(0)=='#'){
255 if(s.startsWith("# Taxid:")){
256 String[] split=Tools.whitespacePlus.split(s);
257 try {
258 taxid=Integer.parseInt(split[split.length-1]);
259 } catch (NumberFormatException e) {
260 e.printStackTrace();
261 }
262 assert(taxid>-1) : "Bad TaxID: '"+s+"'";
263 }
264 }else{
265 String[] split=s.split("\t");
266 contigs++;
267 long len;
268 try {
269 len=Long.parseLong(split[8]);
270 } catch (NumberFormatException e) {
271 len=1;
272 }
273 size+=len;
274 max=Tools.max(max, len);
275 }
276 }
277 }
278 return new Stats(baseAddress, max, size, contigs, taxid);
279 }
280
281 static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){
282 if(verbose){System.err.println("examineAssembly: "+stats.path);}
283 ArrayList<String> contents=ServerTools.listDirectory(stats.path, retries);
284 // System.err.println("D: "+contents);
285 String gff=null;
286 String fna=null;
287 for(String s : contents){
288 // System.err.println(s);
289 if(!s.contains("_from_genomic")){
290 if(s.endsWith("genomic.fna.gz")){fna=s;}
291 else if(s.endsWith("genomic.gff.gz")){gff=s;}
292 }
293 }
294 if(fna!=null && gff!=null){
295 System.err.println("Printing: "+fna);
296 String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : "");
297
298 synchronized(tsw){
299 if(renameSequences){
300 tsw.println("wget -q -O - "+fna+" | "
301 + "gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".fna.gz");
302 tsw.println("wget -q -O - "+gff+" | "
303 + "gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".gff.gz");
304 }else if(renameFiles){
305 tsw.println("wget -q -O - "+fna+" > "+prefix+speciesName+".fna.gz");
306 tsw.println("wget -q -O - "+gff+" > "+prefix+speciesName+".gff.gz");
307 }else{
308 tsw.println("wget -q "+fna);
309 tsw.println("wget -q "+gff);
310 }
311 tsw.println();
312 }
313 return 1;
314 }
315 return 0;
316 }
317
318 static String makeSubAddress(String baseAddress, String extension){
319 if(!baseAddress.endsWith("/")){baseAddress=baseAddress+"/";}
320 String subAddress=baseAddress+extension.substring(extension.indexOf('/')+1);
321 return subAddress;
322 }
323
324 static int seen(String s, HashMap<String, Integer> map){
325 // synchronized(map){
326 Integer x=map.get(s);
327 return x==null ? 0 : x.intValue();
328 // }
329 }
330 static void put(String s, int found, HashMap<String, Integer> map){
331 // synchronized(map){
332 int present=seen(s, map);
333 map.put(s, present+found);
334 // }
335 }
336
337 static class Stats implements Comparable<Stats>{
338
339 public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){
340 path=path_;
341 maxContig=maxContig_;
342 size=size_;
343 contigs=contigs_;
344 taxID=taxID_;
345 }
346
347 @Override
348 public int compareTo(Stats b) {//true if b is better
349 if(b==null){return 1;}
350 if(taxID>0 && b.taxID<1){return 1;}
351 if(b.taxID>0 && taxID<1){return -1;}
352
353 if(size>2*b.size){return 1;}
354 if(size<2*b.size){return -1;}
355
356 if(maxContig>b.maxContig){return 1;}
357 if(maxContig<b.maxContig){return -1;}
358
359 return b.contigs-contigs;
360 }
361
362 String path;
363 String name;
364 long maxContig;
365 long size;
366 int contigs;
367 int taxID;
368 }
369
370 static boolean verbose=true;
371 // static boolean allowSameGenus=false;
372 static int maxSpeciesPerGenus=1;
373 static boolean renameFiles=true;
374 static boolean renameSequences=true;
375 static int retries=40;
376 static boolean findBest=false;
377
378 static boolean tidInFilename=true;
379
380 // private static HashMap<String, Integer> seen=new HashMap<String, Integer>();
381
382 static int totalSpecies=0;
383 static int totalGenus=0;
384 static int totalGenomes=0;
385
386 private static final Integer one=1;
387
388 }