Mercurial > repos > rliterman > csp2
view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line source
package prok; import java.util.ArrayList; import java.util.HashMap; import fileIO.FileFormat; import fileIO.TextStreamWriter; import server.ServerTools; import shared.Parse; import shared.Tools; import template.ThreadWaiter; /** Crawls ncbi's ftp site to download genomes and annotations */ public class FetchProks { public static void main(String[] args){ //ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/ String baseAddress=args[0]; String out=args.length>1 ? args[1] : "stdout"; if(args.length>2){ maxSpeciesPerGenus=Integer.parseInt(args[2]); System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus); } if(args.length>3){ findBest=Parse.parseBoolean(args[3]); System.err.println("Set findBest="+findBest); } TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT); tsw.start(); // iterateOuter(baseAddress, tsw); ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); int threads=7; ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); for(int i=0; i<threads; i++){ alpt.add(new ProcessThread(contents, tsw, i, threads)); } for(ProcessThread pt : alpt){pt.start();} boolean success=ThreadWaiter.waitForThreads(alpt); for(ProcessThread pt : alpt){ totalSpecies+=pt.totalSpeciesT; totalGenus+=pt.totalGenusT; totalGenomes+=pt.totalGenomesT; } System.err.println("Total Genomes: "+totalGenomes); System.err.println("Total Species: "+totalSpecies); System.err.println("Total Genuses: "+totalGenus); tsw.poisonAndWait(); assert(success); } static class ProcessThread extends Thread { ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){ speciesList=speciesList_; tsw=tsw_; tid=tid_; threads=threads_; } @Override public void run(){ for(String s : speciesList){ // if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) { // processSpecies(s); // } //This way one thread handles an entire genus if(s!=null){ String genus=getGenus(s); if(genus!=null){ if((genus.hashCode()&Integer.MAX_VALUE)%threads==tid) { processSpecies(s); } }else{ if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) { processSpecies(s); } } } } } void processSpecies(String species){ String genus=getGenus(species); if(genus!=null){ final int count=seen(genus, seen); if(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus){ int found=examineSpecies(species, tsw); if(found>=1){ totalSpeciesT++; totalGenomesT+=found; if(count==0){totalGenusT++;} put(genus, found, seen); } }else{ if(verbose){System.err.println("same genus: "+species+"\n"+genus);} } }else{ if(verbose){System.err.println("bad species: "+species+"\n"+genus);} } } final ArrayList<String> speciesList; final int tid; final int threads; //This is OK now that threads work on a per-genus basis HashMap<String, Integer> seen=new HashMap<String, Integer>(); final TextStreamWriter tsw; int totalSpeciesT=0; int totalGenusT=0; int totalGenomesT=0; } static String getGenus(String path){ //Candidatus_Hamiltonella String name=path.substring(path.lastIndexOf('/')+1); if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());} int under=name.indexOf('_'); if(under>0){ return name.substring(0, under); }else{ return null; } } static String getSpecies(String path){ //Candidatus_Hamiltonella String name=path.substring(path.lastIndexOf('/')+1); if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());} return name; } static int examineSpecies(String baseAddress, TextStreamWriter tsw){ if(verbose){System.err.println("examineSpecies: "+baseAddress);} String speciesName=getSpecies(baseAddress); ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); // System.err.println("B: "+contents); int found=0; for(String s : contents){ // System.err.println(s); if(s.contains("reference")){ // System.err.println("Looking at '"+s+"'"); found+=examineAssemblies(s, tsw, speciesName); } } if(found>0){return found;} for(String s : contents){ // System.err.println(s); if(s.contains("latest_assembly_versions")){ // System.err.println("Looking at '"+s+"'"); found+=examineAssemblies(s, tsw, speciesName); } } if(found>0){return found;} for(String s : contents){ // System.err.println(s); if(s.contains("all_assembly_versions")){ // System.err.println("Looking at '"+s+"'"); found+=examineAssemblies(s, tsw, speciesName); } } return found; } static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){ if(verbose){System.err.println("examineAssemblies: "+baseAddress);} Stats stats=null; if(findBest){ stats=findBestAssembly(baseAddress); if(stats!=null){ stats.name=speciesName; int x=examineAssembly(stats, tsw, speciesName); if(x>0){return x;} } } ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); // System.err.println("C: "+contents); int found=0; for(String s : contents){ stats=calcStats(s); if(stats!=null){ stats.name=speciesName; found+=examineAssembly(stats, tsw, speciesName); if(found>0){break;} } } return found; } /** Tries to find the assembly with the longest contig */ static Stats findBestAssembly(String baseAddress){ if(verbose){System.err.println("findBestAssembly: "+baseAddress);} ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); // System.err.println("C: "+contents); Stats best=null; for(String s : contents){ // System.err.println(s); Stats stats=calcStats(s); if(stats!=null){ if(best==null || stats.compareTo(best)>0){ best=stats; } } } return best; } static Stats calcStats(String baseAddress){ if(verbose){System.err.println("calcStats: "+baseAddress);} ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); String report=null; for(String s : contents){ if(s.endsWith("_assembly_report.txt")){ report=s; break; } } if(report==null){ if(verbose){System.err.println("Could not find report for "+baseAddress);} return null; } if(verbose){System.err.println("Report: "+report);} ArrayList<String> data=null; for(int i=0; i<=retries && data==null; i++){ try { data = ServerTools.readFTPFile(report); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); try { Thread.sleep(Tools.mid(10000, i*1000, 1000)); } catch (InterruptedException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } } if(data==null){return null;} int contigs=0; long size=0; long max=0; int taxid=-1; for(String s : data){ if(s!=null && s.length()>0){ if(s.charAt(0)=='#'){ if(s.startsWith("# Taxid:")){ String[] split=Tools.whitespacePlus.split(s); try { taxid=Integer.parseInt(split[split.length-1]); } catch (NumberFormatException e) { e.printStackTrace(); } assert(taxid>-1) : "Bad TaxID: '"+s+"'"; } }else{ String[] split=s.split("\t"); contigs++; long len; try { len=Long.parseLong(split[8]); } catch (NumberFormatException e) { len=1; } size+=len; max=Tools.max(max, len); } } } return new Stats(baseAddress, max, size, contigs, taxid); } static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){ if(verbose){System.err.println("examineAssembly: "+stats.path);} ArrayList<String> contents=ServerTools.listDirectory(stats.path, retries); // System.err.println("D: "+contents); String gff=null; String fna=null; for(String s : contents){ // System.err.println(s); if(!s.contains("_from_genomic")){ if(s.endsWith("genomic.fna.gz")){fna=s;} else if(s.endsWith("genomic.gff.gz")){gff=s;} } } if(fna!=null && gff!=null){ System.err.println("Printing: "+fna); String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : ""); synchronized(tsw){ if(renameSequences){ tsw.println("wget -q -O - "+fna+" | " + "gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".fna.gz"); tsw.println("wget -q -O - "+gff+" | " + "gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".gff.gz"); }else if(renameFiles){ tsw.println("wget -q -O - "+fna+" > "+prefix+speciesName+".fna.gz"); tsw.println("wget -q -O - "+gff+" > "+prefix+speciesName+".gff.gz"); }else{ tsw.println("wget -q "+fna); tsw.println("wget -q "+gff); } tsw.println(); } return 1; } return 0; } static String makeSubAddress(String baseAddress, String extension){ if(!baseAddress.endsWith("/")){baseAddress=baseAddress+"/";} String subAddress=baseAddress+extension.substring(extension.indexOf('/')+1); return subAddress; } static int seen(String s, HashMap<String, Integer> map){ // synchronized(map){ Integer x=map.get(s); return x==null ? 0 : x.intValue(); // } } static void put(String s, int found, HashMap<String, Integer> map){ // synchronized(map){ int present=seen(s, map); map.put(s, present+found); // } } static class Stats implements Comparable<Stats>{ public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){ path=path_; maxContig=maxContig_; size=size_; contigs=contigs_; taxID=taxID_; } @Override public int compareTo(Stats b) {//true if b is better if(b==null){return 1;} if(taxID>0 && b.taxID<1){return 1;} if(b.taxID>0 && taxID<1){return -1;} if(size>2*b.size){return 1;} if(size<2*b.size){return -1;} if(maxContig>b.maxContig){return 1;} if(maxContig<b.maxContig){return -1;} return b.contigs-contigs; } String path; String name; long maxContig; long size; int contigs; int taxID; } static boolean verbose=true; // static boolean allowSameGenus=false; static int maxSpeciesPerGenus=1; static boolean renameFiles=true; static boolean renameSequences=true; static int retries=40; static boolean findBest=false; static boolean tidInFilename=true; // private static HashMap<String, Integer> seen=new HashMap<String, Integer>(); static int totalSpecies=0; static int totalGenus=0; static int totalGenomes=0; private static final Integer one=1; }