annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package prok;
jpayne@68 2
jpayne@68 3 import java.util.ArrayList;
jpayne@68 4 import java.util.HashMap;
jpayne@68 5
jpayne@68 6 import fileIO.FileFormat;
jpayne@68 7 import fileIO.TextStreamWriter;
jpayne@68 8 import server.ServerTools;
jpayne@68 9 import shared.Parse;
jpayne@68 10 import shared.Tools;
jpayne@68 11 import template.ThreadWaiter;
jpayne@68 12
/** Crawls NCBI's FTP site and writes wget commands that download genomes and their annotations. */
jpayne@68 14 public class FetchProks {
jpayne@68 15
jpayne@68 16 public static void main(String[] args){
jpayne@68 17 //ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/
jpayne@68 18
jpayne@68 19 String baseAddress=args[0];
jpayne@68 20 String out=args.length>1 ? args[1] : "stdout";
jpayne@68 21 if(args.length>2){
jpayne@68 22 maxSpeciesPerGenus=Integer.parseInt(args[2]);
jpayne@68 23 System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus);
jpayne@68 24 }
jpayne@68 25 if(args.length>3){
jpayne@68 26 findBest=Parse.parseBoolean(args[3]);
jpayne@68 27 System.err.println("Set findBest="+findBest);
jpayne@68 28 }
jpayne@68 29 TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT);
jpayne@68 30 tsw.start();
jpayne@68 31
jpayne@68 32 // iterateOuter(baseAddress, tsw);
jpayne@68 33 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
jpayne@68 34
jpayne@68 35 int threads=7;
jpayne@68 36 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
jpayne@68 37 for(int i=0; i<threads; i++){
jpayne@68 38 alpt.add(new ProcessThread(contents, tsw, i, threads));
jpayne@68 39 }
jpayne@68 40 for(ProcessThread pt : alpt){pt.start();}
jpayne@68 41 boolean success=ThreadWaiter.waitForThreads(alpt);
jpayne@68 42
jpayne@68 43 for(ProcessThread pt : alpt){
jpayne@68 44 totalSpecies+=pt.totalSpeciesT;
jpayne@68 45 totalGenus+=pt.totalGenusT;
jpayne@68 46 totalGenomes+=pt.totalGenomesT;
jpayne@68 47 }
jpayne@68 48 System.err.println("Total Genomes: "+totalGenomes);
jpayne@68 49 System.err.println("Total Species: "+totalSpecies);
jpayne@68 50 System.err.println("Total Genuses: "+totalGenus);
jpayne@68 51
jpayne@68 52 tsw.poisonAndWait();
jpayne@68 53 assert(success);
jpayne@68 54 }
jpayne@68 55
jpayne@68 56 static class ProcessThread extends Thread {
jpayne@68 57
jpayne@68 58 ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){
jpayne@68 59 speciesList=speciesList_;
jpayne@68 60 tsw=tsw_;
jpayne@68 61 tid=tid_;
jpayne@68 62 threads=threads_;
jpayne@68 63 }
jpayne@68 64
jpayne@68 65 @Override
jpayne@68 66 public void run(){
jpayne@68 67 for(String s : speciesList){
jpayne@68 68 // if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) {
jpayne@68 69 // processSpecies(s);
jpayne@68 70 // }
jpayne@68 71
jpayne@68 72 //This way one thread handles an entire genus
jpayne@68 73 if(s!=null){
jpayne@68 74 String genus=getGenus(s);
jpayne@68 75 if(genus!=null){
jpayne@68 76 if((genus.hashCode()&Integer.MAX_VALUE)%threads==tid) {
jpayne@68 77 processSpecies(s);
jpayne@68 78 }
jpayne@68 79 }else{
jpayne@68 80 if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) {
jpayne@68 81 processSpecies(s);
jpayne@68 82 }
jpayne@68 83 }
jpayne@68 84 }
jpayne@68 85 }
jpayne@68 86 }
jpayne@68 87
jpayne@68 88 void processSpecies(String species){
jpayne@68 89 String genus=getGenus(species);
jpayne@68 90 if(genus!=null){
jpayne@68 91 final int count=seen(genus, seen);
jpayne@68 92
jpayne@68 93 if(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus){
jpayne@68 94 int found=examineSpecies(species, tsw);
jpayne@68 95 if(found>=1){
jpayne@68 96 totalSpeciesT++;
jpayne@68 97 totalGenomesT+=found;
jpayne@68 98 if(count==0){totalGenusT++;}
jpayne@68 99 put(genus, found, seen);
jpayne@68 100 }
jpayne@68 101 }else{
jpayne@68 102 if(verbose){System.err.println("same genus: "+species+"\n"+genus);}
jpayne@68 103 }
jpayne@68 104 }else{
jpayne@68 105 if(verbose){System.err.println("bad species: "+species+"\n"+genus);}
jpayne@68 106 }
jpayne@68 107 }
jpayne@68 108
jpayne@68 109 final ArrayList<String> speciesList;
jpayne@68 110 final int tid;
jpayne@68 111 final int threads;
jpayne@68 112 //This is OK now that threads work on a per-genus basis
jpayne@68 113 HashMap<String, Integer> seen=new HashMap<String, Integer>();
jpayne@68 114 final TextStreamWriter tsw;
jpayne@68 115
jpayne@68 116 int totalSpeciesT=0;
jpayne@68 117 int totalGenusT=0;
jpayne@68 118 int totalGenomesT=0;
jpayne@68 119 }
jpayne@68 120
jpayne@68 121 static String getGenus(String path){
jpayne@68 122 //Candidatus_Hamiltonella
jpayne@68 123 String name=path.substring(path.lastIndexOf('/')+1);
jpayne@68 124 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());}
jpayne@68 125 int under=name.indexOf('_');
jpayne@68 126 if(under>0){
jpayne@68 127 return name.substring(0, under);
jpayne@68 128 }else{
jpayne@68 129 return null;
jpayne@68 130 }
jpayne@68 131 }
jpayne@68 132
jpayne@68 133 static String getSpecies(String path){
jpayne@68 134 //Candidatus_Hamiltonella
jpayne@68 135 String name=path.substring(path.lastIndexOf('/')+1);
jpayne@68 136 if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());}
jpayne@68 137 return name;
jpayne@68 138 }
jpayne@68 139
jpayne@68 140 static int examineSpecies(String baseAddress, TextStreamWriter tsw){
jpayne@68 141 if(verbose){System.err.println("examineSpecies: "+baseAddress);}
jpayne@68 142 String speciesName=getSpecies(baseAddress);
jpayne@68 143 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
jpayne@68 144 // System.err.println("B: "+contents);
jpayne@68 145 int found=0;
jpayne@68 146 for(String s : contents){
jpayne@68 147 // System.err.println(s);
jpayne@68 148 if(s.contains("reference")){
jpayne@68 149 // System.err.println("Looking at '"+s+"'");
jpayne@68 150 found+=examineAssemblies(s, tsw, speciesName);
jpayne@68 151 }
jpayne@68 152 }
jpayne@68 153 if(found>0){return found;}
jpayne@68 154 for(String s : contents){
jpayne@68 155 // System.err.println(s);
jpayne@68 156 if(s.contains("latest_assembly_versions")){
jpayne@68 157 // System.err.println("Looking at '"+s+"'");
jpayne@68 158 found+=examineAssemblies(s, tsw, speciesName);
jpayne@68 159 }
jpayne@68 160 }
jpayne@68 161 if(found>0){return found;}
jpayne@68 162 for(String s : contents){
jpayne@68 163 // System.err.println(s);
jpayne@68 164 if(s.contains("all_assembly_versions")){
jpayne@68 165 // System.err.println("Looking at '"+s+"'");
jpayne@68 166 found+=examineAssemblies(s, tsw, speciesName);
jpayne@68 167 }
jpayne@68 168 }
jpayne@68 169 return found;
jpayne@68 170 }
jpayne@68 171
jpayne@68 172 static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){
jpayne@68 173 if(verbose){System.err.println("examineAssemblies: "+baseAddress);}
jpayne@68 174 Stats stats=null;
jpayne@68 175 if(findBest){
jpayne@68 176 stats=findBestAssembly(baseAddress);
jpayne@68 177 if(stats!=null){
jpayne@68 178 stats.name=speciesName;
jpayne@68 179 int x=examineAssembly(stats, tsw, speciesName);
jpayne@68 180 if(x>0){return x;}
jpayne@68 181 }
jpayne@68 182 }
jpayne@68 183
jpayne@68 184 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
jpayne@68 185 // System.err.println("C: "+contents);
jpayne@68 186
jpayne@68 187 int found=0;
jpayne@68 188 for(String s : contents){
jpayne@68 189 stats=calcStats(s);
jpayne@68 190 if(stats!=null){
jpayne@68 191 stats.name=speciesName;
jpayne@68 192 found+=examineAssembly(stats, tsw, speciesName);
jpayne@68 193 if(found>0){break;}
jpayne@68 194 }
jpayne@68 195 }
jpayne@68 196 return found;
jpayne@68 197 }
jpayne@68 198
jpayne@68 199 /** Tries to find the assembly with the longest contig */
jpayne@68 200 static Stats findBestAssembly(String baseAddress){
jpayne@68 201 if(verbose){System.err.println("findBestAssembly: "+baseAddress);}
jpayne@68 202 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
jpayne@68 203 // System.err.println("C: "+contents);
jpayne@68 204 Stats best=null;
jpayne@68 205 for(String s : contents){
jpayne@68 206 // System.err.println(s);
jpayne@68 207 Stats stats=calcStats(s);
jpayne@68 208 if(stats!=null){
jpayne@68 209 if(best==null || stats.compareTo(best)>0){
jpayne@68 210 best=stats;
jpayne@68 211 }
jpayne@68 212 }
jpayne@68 213 }
jpayne@68 214 return best;
jpayne@68 215 }
jpayne@68 216
jpayne@68 217 static Stats calcStats(String baseAddress){
jpayne@68 218 if(verbose){System.err.println("calcStats: "+baseAddress);}
jpayne@68 219 ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
jpayne@68 220 String report=null;
jpayne@68 221 for(String s : contents){
jpayne@68 222 if(s.endsWith("_assembly_report.txt")){
jpayne@68 223 report=s;
jpayne@68 224 break;
jpayne@68 225 }
jpayne@68 226 }
jpayne@68 227 if(report==null){
jpayne@68 228 if(verbose){System.err.println("Could not find report for "+baseAddress);}
jpayne@68 229 return null;
jpayne@68 230 }
jpayne@68 231 if(verbose){System.err.println("Report: "+report);}
jpayne@68 232 ArrayList<String> data=null;
jpayne@68 233 for(int i=0; i<=retries && data==null; i++){
jpayne@68 234 try {
jpayne@68 235 data = ServerTools.readFTPFile(report);
jpayne@68 236 } catch (Exception e) {
jpayne@68 237 // TODO Auto-generated catch block
jpayne@68 238 e.printStackTrace();
jpayne@68 239 try {
jpayne@68 240 Thread.sleep(Tools.mid(10000, i*1000, 1000));
jpayne@68 241 } catch (InterruptedException e1) {
jpayne@68 242 // TODO Auto-generated catch block
jpayne@68 243 e1.printStackTrace();
jpayne@68 244 }
jpayne@68 245 }
jpayne@68 246 }
jpayne@68 247 if(data==null){return null;}
jpayne@68 248 int contigs=0;
jpayne@68 249 long size=0;
jpayne@68 250 long max=0;
jpayne@68 251 int taxid=-1;
jpayne@68 252 for(String s : data){
jpayne@68 253 if(s!=null && s.length()>0){
jpayne@68 254 if(s.charAt(0)=='#'){
jpayne@68 255 if(s.startsWith("# Taxid:")){
jpayne@68 256 String[] split=Tools.whitespacePlus.split(s);
jpayne@68 257 try {
jpayne@68 258 taxid=Integer.parseInt(split[split.length-1]);
jpayne@68 259 } catch (NumberFormatException e) {
jpayne@68 260 e.printStackTrace();
jpayne@68 261 }
jpayne@68 262 assert(taxid>-1) : "Bad TaxID: '"+s+"'";
jpayne@68 263 }
jpayne@68 264 }else{
jpayne@68 265 String[] split=s.split("\t");
jpayne@68 266 contigs++;
jpayne@68 267 long len;
jpayne@68 268 try {
jpayne@68 269 len=Long.parseLong(split[8]);
jpayne@68 270 } catch (NumberFormatException e) {
jpayne@68 271 len=1;
jpayne@68 272 }
jpayne@68 273 size+=len;
jpayne@68 274 max=Tools.max(max, len);
jpayne@68 275 }
jpayne@68 276 }
jpayne@68 277 }
jpayne@68 278 return new Stats(baseAddress, max, size, contigs, taxid);
jpayne@68 279 }
jpayne@68 280
jpayne@68 281 static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){
jpayne@68 282 if(verbose){System.err.println("examineAssembly: "+stats.path);}
jpayne@68 283 ArrayList<String> contents=ServerTools.listDirectory(stats.path, retries);
jpayne@68 284 // System.err.println("D: "+contents);
jpayne@68 285 String gff=null;
jpayne@68 286 String fna=null;
jpayne@68 287 for(String s : contents){
jpayne@68 288 // System.err.println(s);
jpayne@68 289 if(!s.contains("_from_genomic")){
jpayne@68 290 if(s.endsWith("genomic.fna.gz")){fna=s;}
jpayne@68 291 else if(s.endsWith("genomic.gff.gz")){gff=s;}
jpayne@68 292 }
jpayne@68 293 }
jpayne@68 294 if(fna!=null && gff!=null){
jpayne@68 295 System.err.println("Printing: "+fna);
jpayne@68 296 String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : "");
jpayne@68 297
jpayne@68 298 synchronized(tsw){
jpayne@68 299 if(renameSequences){
jpayne@68 300 tsw.println("wget -q -O - "+fna+" | "
jpayne@68 301 + "gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".fna.gz");
jpayne@68 302 tsw.println("wget -q -O - "+gff+" | "
jpayne@68 303 + "gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".gff.gz");
jpayne@68 304 }else if(renameFiles){
jpayne@68 305 tsw.println("wget -q -O - "+fna+" > "+prefix+speciesName+".fna.gz");
jpayne@68 306 tsw.println("wget -q -O - "+gff+" > "+prefix+speciesName+".gff.gz");
jpayne@68 307 }else{
jpayne@68 308 tsw.println("wget -q "+fna);
jpayne@68 309 tsw.println("wget -q "+gff);
jpayne@68 310 }
jpayne@68 311 tsw.println();
jpayne@68 312 }
jpayne@68 313 return 1;
jpayne@68 314 }
jpayne@68 315 return 0;
jpayne@68 316 }
jpayne@68 317
jpayne@68 318 static String makeSubAddress(String baseAddress, String extension){
jpayne@68 319 if(!baseAddress.endsWith("/")){baseAddress=baseAddress+"/";}
jpayne@68 320 String subAddress=baseAddress+extension.substring(extension.indexOf('/')+1);
jpayne@68 321 return subAddress;
jpayne@68 322 }
jpayne@68 323
jpayne@68 324 static int seen(String s, HashMap<String, Integer> map){
jpayne@68 325 // synchronized(map){
jpayne@68 326 Integer x=map.get(s);
jpayne@68 327 return x==null ? 0 : x.intValue();
jpayne@68 328 // }
jpayne@68 329 }
jpayne@68 330 static void put(String s, int found, HashMap<String, Integer> map){
jpayne@68 331 // synchronized(map){
jpayne@68 332 int present=seen(s, map);
jpayne@68 333 map.put(s, present+found);
jpayne@68 334 // }
jpayne@68 335 }
jpayne@68 336
jpayne@68 337 static class Stats implements Comparable<Stats>{
jpayne@68 338
jpayne@68 339 public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){
jpayne@68 340 path=path_;
jpayne@68 341 maxContig=maxContig_;
jpayne@68 342 size=size_;
jpayne@68 343 contigs=contigs_;
jpayne@68 344 taxID=taxID_;
jpayne@68 345 }
jpayne@68 346
jpayne@68 347 @Override
jpayne@68 348 public int compareTo(Stats b) {//true if b is better
jpayne@68 349 if(b==null){return 1;}
jpayne@68 350 if(taxID>0 && b.taxID<1){return 1;}
jpayne@68 351 if(b.taxID>0 && taxID<1){return -1;}
jpayne@68 352
jpayne@68 353 if(size>2*b.size){return 1;}
jpayne@68 354 if(size<2*b.size){return -1;}
jpayne@68 355
jpayne@68 356 if(maxContig>b.maxContig){return 1;}
jpayne@68 357 if(maxContig<b.maxContig){return -1;}
jpayne@68 358
jpayne@68 359 return b.contigs-contigs;
jpayne@68 360 }
jpayne@68 361
jpayne@68 362 String path;
jpayne@68 363 String name;
jpayne@68 364 long maxContig;
jpayne@68 365 long size;
jpayne@68 366 int contigs;
jpayne@68 367 int taxID;
jpayne@68 368 }
jpayne@68 369
jpayne@68 370 static boolean verbose=true;
jpayne@68 371 // static boolean allowSameGenus=false;
jpayne@68 372 static int maxSpeciesPerGenus=1;
jpayne@68 373 static boolean renameFiles=true;
jpayne@68 374 static boolean renameSequences=true;
jpayne@68 375 static int retries=40;
jpayne@68 376 static boolean findBest=false;
jpayne@68 377
jpayne@68 378 static boolean tidInFilename=true;
jpayne@68 379
jpayne@68 380 // private static HashMap<String, Integer> seen=new HashMap<String, Integer>();
jpayne@68 381
jpayne@68 382 static int totalSpecies=0;
jpayne@68 383 static int totalGenus=0;
jpayne@68 384 static int totalGenomes=0;
jpayne@68 385
jpayne@68 386 private static final Integer one=1;
jpayne@68 387
jpayne@68 388 }