Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,388 @@ +package prok; + +import java.util.ArrayList; +import java.util.HashMap; + +import fileIO.FileFormat; +import fileIO.TextStreamWriter; +import server.ServerTools; +import shared.Parse; +import shared.Tools; +import template.ThreadWaiter; + +/** Crawls ncbi's ftp site to download genomes and annotations */ +public class FetchProks { + + public static void main(String[] args){ + //ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/ + + String baseAddress=args[0]; + String out=args.length>1 ? args[1] : "stdout"; + if(args.length>2){ + maxSpeciesPerGenus=Integer.parseInt(args[2]); + System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus); + } + if(args.length>3){ + findBest=Parse.parseBoolean(args[3]); + System.err.println("Set findBest="+findBest); + } + TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT); + tsw.start(); + +// iterateOuter(baseAddress, tsw); + ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); + + int threads=7; + ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads); + for(int i=0; i<threads; i++){ + alpt.add(new ProcessThread(contents, tsw, i, threads)); + } + for(ProcessThread pt : alpt){pt.start();} + boolean success=ThreadWaiter.waitForThreads(alpt); + + for(ProcessThread pt : alpt){ + totalSpecies+=pt.totalSpeciesT; + totalGenus+=pt.totalGenusT; + totalGenomes+=pt.totalGenomesT; + } + System.err.println("Total Genomes: "+totalGenomes); + System.err.println("Total Species: "+totalSpecies); + System.err.println("Total Genuses: "+totalGenus); + + tsw.poisonAndWait(); + assert(success); + } + + static class ProcessThread extends Thread { + + ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){ + speciesList=speciesList_; + tsw=tsw_; + tid=tid_; + threads=threads_; + } + + @Override + public void run(){ + for(String s : speciesList){ +// if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) { +// processSpecies(s); +// } + + //This way one thread handles an entire genus + if(s!=null){ + String genus=getGenus(s); + if(genus!=null){ + if((genus.hashCode()&Integer.MAX_VALUE)%threads==tid) { + processSpecies(s); + } + }else{ + if((s.hashCode()&Integer.MAX_VALUE)%threads==tid) { + processSpecies(s); + } + } + } + } + } + + void processSpecies(String species){ + String genus=getGenus(species); + if(genus!=null){ + final int count=seen(genus, seen); + + if(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus){ + int found=examineSpecies(species, tsw); + if(found>=1){ + totalSpeciesT++; + totalGenomesT+=found; + if(count==0){totalGenusT++;} + put(genus, found, seen); + } + }else{ + if(verbose){System.err.println("same genus: "+species+"\n"+genus);} + } + }else{ + if(verbose){System.err.println("bad species: "+species+"\n"+genus);} + } + } + + final ArrayList<String> speciesList; + final int tid; + final int threads; + //This is OK now that threads work on a per-genus basis + HashMap<String, Integer> seen=new HashMap<String, Integer>(); + final TextStreamWriter tsw; + + int totalSpeciesT=0; + int totalGenusT=0; + int totalGenomesT=0; + } + + static String getGenus(String path){ + //Candidatus_Hamiltonella + String name=path.substring(path.lastIndexOf('/')+1); + if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());} + int under=name.indexOf('_'); + if(under>0){ + return name.substring(0, under); + }else{ + return null; + } + } + + static String getSpecies(String path){ + //Candidatus_Hamiltonella + String name=path.substring(path.lastIndexOf('/')+1); + if(name.startsWith("Candidatus_")){name=name.substring("Candidatus_".length());} + return name; + } + + static int examineSpecies(String baseAddress, TextStreamWriter tsw){ + if(verbose){System.err.println("examineSpecies: "+baseAddress);} + String speciesName=getSpecies(baseAddress); + ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); +// System.err.println("B: "+contents); + int found=0; + for(String s : contents){ +// System.err.println(s); + if(s.contains("reference")){ +// System.err.println("Looking at '"+s+"'"); + found+=examineAssemblies(s, tsw, speciesName); + } + } + if(found>0){return found;} + for(String s : contents){ +// System.err.println(s); + if(s.contains("latest_assembly_versions")){ +// System.err.println("Looking at '"+s+"'"); + found+=examineAssemblies(s, tsw, speciesName); + } + } + if(found>0){return found;} + for(String s : contents){ +// System.err.println(s); + if(s.contains("all_assembly_versions")){ +// System.err.println("Looking at '"+s+"'"); + found+=examineAssemblies(s, tsw, speciesName); + } + } + return found; + } + + static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){ + if(verbose){System.err.println("examineAssemblies: "+baseAddress);} + Stats stats=null; + if(findBest){ + stats=findBestAssembly(baseAddress); + if(stats!=null){ + stats.name=speciesName; + int x=examineAssembly(stats, tsw, speciesName); + if(x>0){return x;} + } + } + + ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); +// System.err.println("C: "+contents); + + int found=0; + for(String s : contents){ + stats=calcStats(s); + if(stats!=null){ + stats.name=speciesName; + found+=examineAssembly(stats, tsw, speciesName); + if(found>0){break;} + } + } + return found; + } + + /** Tries to find the assembly with the longest contig */ + static Stats findBestAssembly(String baseAddress){ + if(verbose){System.err.println("findBestAssembly: "+baseAddress);} + ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); +// System.err.println("C: "+contents); + Stats best=null; + for(String s : contents){ +// System.err.println(s); + Stats stats=calcStats(s); + if(stats!=null){ + if(best==null || stats.compareTo(best)>0){ + best=stats; + } + } + } + return best; + } + + static Stats calcStats(String baseAddress){ + if(verbose){System.err.println("calcStats: "+baseAddress);} + ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries); + String report=null; + for(String s : contents){ + if(s.endsWith("_assembly_report.txt")){ + report=s; + break; + } + } + if(report==null){ + if(verbose){System.err.println("Could not find report for "+baseAddress);} + return null; + } + if(verbose){System.err.println("Report: "+report);} + ArrayList<String> data=null; + for(int i=0; i<=retries && data==null; i++){ + try { + data = ServerTools.readFTPFile(report); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + try { + Thread.sleep(Tools.mid(10000, i*1000, 1000)); + } catch (InterruptedException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + } + } + if(data==null){return null;} + int contigs=0; + long size=0; + long max=0; + int taxid=-1; + for(String s : data){ + if(s!=null && s.length()>0){ + if(s.charAt(0)=='#'){ + if(s.startsWith("# Taxid:")){ + String[] split=Tools.whitespacePlus.split(s); + try { + taxid=Integer.parseInt(split[split.length-1]); + } catch (NumberFormatException e) { + e.printStackTrace(); + } + assert(taxid>-1) : "Bad TaxID: '"+s+"'"; + } + }else{ + String[] split=s.split("\t"); + contigs++; + long len; + try { + len=Long.parseLong(split[8]); + } catch (NumberFormatException e) { + len=1; + } + size+=len; + max=Tools.max(max, len); + } + } + } + return new Stats(baseAddress, max, size, contigs, taxid); + } + + static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){ + if(verbose){System.err.println("examineAssembly: "+stats.path);} + ArrayList<String> contents=ServerTools.listDirectory(stats.path, retries); +// System.err.println("D: "+contents); + String gff=null; + String fna=null; + for(String s : contents){ +// System.err.println(s); + if(!s.contains("_from_genomic")){ + if(s.endsWith("genomic.fna.gz")){fna=s;} + else if(s.endsWith("genomic.gff.gz")){gff=s;} + } + } + if(fna!=null && gff!=null){ + System.err.println("Printing: "+fna); + String prefix=(tidInFilename ? "tid_"+stats.taxID+"_" : ""); + + synchronized(tsw){ + if(renameSequences){ + tsw.println("wget -q -O - "+fna+" | " + + "gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".fna.gz"); + tsw.println("wget -q -O - "+gff+" | " + + "gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+prefix+speciesName+".gff.gz"); + }else if(renameFiles){ + tsw.println("wget -q -O - "+fna+" > "+prefix+speciesName+".fna.gz"); + tsw.println("wget -q -O - "+gff+" > "+prefix+speciesName+".gff.gz"); + }else{ + tsw.println("wget -q "+fna); + tsw.println("wget -q "+gff); + } + tsw.println(); + } + return 1; + } + return 0; + } + + static String makeSubAddress(String baseAddress, String extension){ + if(!baseAddress.endsWith("/")){baseAddress=baseAddress+"/";} + String subAddress=baseAddress+extension.substring(extension.indexOf('/')+1); + return subAddress; + } + + static int seen(String s, HashMap<String, Integer> map){ +// synchronized(map){ + Integer x=map.get(s); + return x==null ? 0 : x.intValue(); +// } + } + static void put(String s, int found, HashMap<String, Integer> map){ +// synchronized(map){ + int present=seen(s, map); + map.put(s, present+found); +// } + } + + static class Stats implements Comparable<Stats>{ + + public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){ + path=path_; + maxContig=maxContig_; + size=size_; + contigs=contigs_; + taxID=taxID_; + } + + @Override + public int compareTo(Stats b) {//true if b is better + if(b==null){return 1;} + if(taxID>0 && b.taxID<1){return 1;} + if(b.taxID>0 && taxID<1){return -1;} + + if(size>2*b.size){return 1;} + if(size<2*b.size){return -1;} + + if(maxContig>b.maxContig){return 1;} + if(maxContig<b.maxContig){return -1;} + + return b.contigs-contigs; + } + + String path; + String name; + long maxContig; + long size; + int contigs; + int taxID; + } + + static boolean verbose=true; +// static boolean allowSameGenus=false; + static int maxSpeciesPerGenus=1; + static boolean renameFiles=true; + static boolean renameSequences=true; + static int retries=40; + static boolean findBest=false; + + static boolean tidInFilename=true; + +// private static HashMap<String, Integer> seen=new HashMap<String, Integer>(); + + static int totalSpecies=0; + static int totalGenus=0; + static int totalGenomes=0; + + private static final Integer one=1; + +}