diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/prok/FetchProks.java	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,388 @@
+package prok;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import fileIO.FileFormat;
+import fileIO.TextStreamWriter;
+import server.ServerTools;
+import shared.Parse;
+import shared.Tools;
+import template.ThreadWaiter;
+
+/** Crawls ncbi's ftp site to download genomes and annotations */
+public class FetchProks {
+	
+	/**
+	 * Lists the directory at the base address and spreads its entries over worker threads;
+	 * the workers write download commands for the selected genomes to the output.
+	 * Once all workers finish, their counters are summed and printed to stderr.
+	 * @param args [0] base address to crawl (required);
+	 * [1] output destination, defaults to "stdout";
+	 * [2] value for maxSpeciesPerGenus (integer);
+	 * [3] value for findBest (boolean)
+	 */
+	public static void main(String[] args){
+		//ftp://ftp.ncbi.nih.gov:21/genomes/refseq/bacteria/
+		
+		//The base address is mandatory; report usage rather than dying with an ArrayIndexOutOfBoundsException.
+		if(args==null || args.length<1){
+			System.err.println("Usage: FetchProks <baseAddress> [out] [maxSpeciesPerGenus] [findBest]");
+			System.exit(1);
+		}
+		
+		String baseAddress=args[0];
+		String out=args.length>1 ? args[1] : "stdout";
+		if(args.length>2){
+			maxSpeciesPerGenus=Integer.parseInt(args[2]);
+			System.err.println("Set maxSpeciesPerGenus="+maxSpeciesPerGenus);
+		}
+		if(args.length>3){
+			findBest=Parse.parseBoolean(args[3]);
+			System.err.println("Set findBest="+findBest);
+		}
+		TextStreamWriter tsw=new TextStreamWriter(out, true, false, false, FileFormat.TEXT);
+		tsw.start();
+
+		ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
+		
+		//Every worker gets the full listing and picks its own share by hashing (see ProcessThread.run).
+		int threads=7;
+		ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(threads);
+		for(int i=0; i<threads; i++){
+			alpt.add(new ProcessThread(contents, tsw, i, threads));
+		}
+		for(ProcessThread pt : alpt){pt.start();}
+		boolean success=ThreadWaiter.waitForThreads(alpt);
+		
+		//Fold the per-thread counters into the static totals.
+		for(ProcessThread pt : alpt){
+			totalSpecies+=pt.totalSpeciesT;
+			totalGenus+=pt.totalGenusT;
+			totalGenomes+=pt.totalGenomesT;
+		}
+		System.err.println("Total Genomes: "+totalGenomes);
+		System.err.println("Total Species: "+totalSpecies);
+		System.err.println("Total Genuses: "+totalGenus);
+		
+		//Close the writer before asserting so the output is finished even if a worker failed.
+		tsw.poisonAndWait();
+		assert(success);
+	}
+	
+	/**
+	 * Worker that scans the listing and handles only the entries assigned to its thread id.
+	 * Assignment is by hash of the genus, so all species of one genus land on the same worker.
+	 */
+	static class ProcessThread extends Thread {
+		
+		/**
+		 * @param speciesList_ Listing to scan
+		 * @param tsw_ Writer that receives the generated commands
+		 * @param tid_ Id of this worker; compared against (hash % threads_)
+		 * @param threads_ Modulus used to assign entries to workers
+		 */
+		ProcessThread(ArrayList<String> speciesList_, TextStreamWriter tsw_, int tid_, int threads_){
+			speciesList=speciesList_;
+			tsw=tsw_;
+			tid=tid_;
+			threads=threads_;
+		}
+		
+		/** Walks the whole listing and processes the entries whose key hashes to this worker. */
+		@Override
+		public void run(){
+			for(String entry : speciesList){
+				if(entry==null){continue;}
+				
+				//Hash on the genus when there is one, so one thread handles an entire genus;
+				//entries without a genus are assigned by hashing the entry itself.
+				final String genus=getGenus(entry);
+				final String key=(genus==null ? entry : genus);
+				final int owner=(key.hashCode()&Integer.MAX_VALUE)%threads;
+				if(owner==tid){processSpecies(entry);}
+			}
+		}
+		
+		/**
+		 * Examines one species unless it has no genus or its genus already reached maxSpeciesPerGenus,
+		 * then updates this worker's counters and the per-genus tally.
+		 * @param species Entry from the listing
+		 */
+		void processSpecies(String species){
+			final String genus=getGenus(species);
+			if(genus==null){
+				if(verbose){System.err.println("bad species: "+species+"\n"+genus);}
+				return;
+			}
+			
+			final int count=seen(genus, seen);
+			final boolean underLimit=(maxSpeciesPerGenus<1 || count<maxSpeciesPerGenus);
+			if(!underLimit){
+				if(verbose){System.err.println("same genus: "+species+"\n"+genus);}
+				return;
+			}
+			
+			final int found=examineSpecies(species, tsw);
+			if(found<1){return;}
+			
+			totalSpeciesT++;
+			totalGenomesT+=found;
+			if(count==0){totalGenusT++;}//First hit for this genus
+			put(genus, found, seen);
+		}
+		
+		/** Listing scanned by run() */
+		final ArrayList<String> speciesList;
+		/** Id of this worker */
+		final int tid;
+		/** Modulus for entry assignment */
+		final int threads;
+		/** Per-genus tally of what this worker has found.
+		 * An unsynchronized per-thread map is OK now that threads work on a per-genus basis. */
+		HashMap<String, Integer> seen=new HashMap<String, Integer>();
+		/** Destination for generated commands */
+		final TextStreamWriter tsw;
+		
+		/** Species for which at least one genome was found */
+		int totalSpeciesT=0;
+		/** Genera seen for the first time by this worker */
+		int totalGenusT=0;
+		/** Sum of the values returned by examineSpecies */
+		int totalGenomesT=0;
+	}
+	
+	/**
+	 * Extracts the genus from a path whose last segment looks like Genus_species.
+	 * A leading "Candidatus_" is dropped first, so .../Candidatus_Hamiltonella_defensa yields Hamiltonella.
+	 * @param path Path or bare name; only the text after the last '/' is examined
+	 * @return The text before the first underscore, or null if no underscore occurs past index 0
+	 */
+	static String getGenus(String path){
+		final String prefix="Candidatus_";
+		final int slash=path.lastIndexOf('/');
+		String leaf=path.substring(slash+1);
+		if(leaf.startsWith(prefix)){leaf=leaf.substring(prefix.length());}
+		final int split=leaf.indexOf('_');
+		return split>0 ? leaf.substring(0, split) : null;
+	}
+	
+	/**
+	 * Returns the last path segment with any leading "Candidatus_" removed.
+	 * @param path Path or bare name; only the text after the last '/' is used
+	 * @return The species name, e.g. Hamiltonella_defensa for .../Candidatus_Hamiltonella_defensa
+	 */
+	static String getSpecies(String path){
+		final String prefix="Candidatus_";
+		final String leaf=path.substring(path.lastIndexOf('/')+1);
+		return leaf.startsWith(prefix) ? leaf.substring(prefix.length()) : leaf;
+	}
+	
+	/**
+	 * Lists a species directory and examines its assembly subdirectories in priority order:
+	 * entries containing "reference", then "latest_assembly_versions", then "all_assembly_versions".
+	 * Lower-priority entries are only tried when the higher ones yielded nothing.
+	 * @param baseAddress Address of the species directory
+	 * @param tsw Writer that receives the generated commands
+	 * @return Sum of the examineAssemblies results for the first priority level that found anything, else 0
+	 */
+	static int examineSpecies(String baseAddress, TextStreamWriter tsw){
+		if(verbose){System.err.println("examineSpecies: "+baseAddress);}
+		final String speciesName=getSpecies(baseAddress);
+		final ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
+		
+		//Markers in priority order; the first one that produces a hit wins.
+		final String[] markers={"reference", "latest_assembly_versions", "all_assembly_versions"};
+		int found=0;
+		for(String marker : markers){
+			for(String entry : contents){
+				if(entry.contains(marker)){
+					found+=examineAssemblies(entry, tsw, speciesName);
+				}
+			}
+			if(found>0){break;}
+		}
+		return found;
+	}
+	
+	/**
+	 * Examines the assemblies under a directory until one yields output.
+	 * With findBest set, the top-ranked assembly (see findBestAssembly) is tried first;
+	 * if that produces nothing, or findBest is off, the entries are tried in listing order.
+	 * @param baseAddress Address of a directory of assemblies
+	 * @param tsw Writer that receives the generated commands
+	 * @param speciesName Name stored in the Stats and passed on to examineAssembly
+	 * @return The examineAssembly result of the first assembly that produced output, else 0
+	 */
+	static int examineAssemblies(String baseAddress, TextStreamWriter tsw, String speciesName){
+		if(verbose){System.err.println("examineAssemblies: "+baseAddress);}
+		if(findBest){
+			final Stats best=findBestAssembly(baseAddress);
+			if(best!=null){
+				best.name=speciesName;
+				final int written=examineAssembly(best, tsw, speciesName);
+				if(written>0){return written;}
+			}
+		}
+		
+		final ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
+		
+		//Stop at the first assembly that yields output.
+		int found=0;
+		for(int i=0; i<contents.size() && found<=0; i++){
+			final Stats candidate=calcStats(contents.get(i));
+			if(candidate==null){continue;}
+			candidate.name=speciesName;
+			found+=examineAssembly(candidate, tsw, speciesName);
+		}
+		return found;
+	}
+	
+	/**
+	 * Computes Stats for every entry under the address and returns the one that ranks
+	 * highest according to Stats.compareTo; on ties the earlier entry is kept.
+	 * @param baseAddress Address of a directory of assemblies
+	 * @return The top-ranked Stats, or null if no entry produced Stats
+	 */
+	static Stats findBestAssembly(String baseAddress){
+		if(verbose){System.err.println("findBestAssembly: "+baseAddress);}
+		Stats best=null;
+		for(String candidatePath : ServerTools.listDirectory(baseAddress, retries)){
+			final Stats candidate=calcStats(candidatePath);
+			if(candidate==null){continue;}
+			if(best==null || candidate.compareTo(best)>0){best=candidate;}
+		}
+		return best;
+	}
+	
+	/**
+	 * Builds Stats for one assembly directory from its *_assembly_report.txt file.
+	 * Lines starting with '#' are headers, of which only "# Taxid:" is used;
+	 * every other non-empty line counts as one contig.
+	 * @param baseAddress Address of the assembly directory
+	 * @return Stats holding the longest length, total length, contig count and taxid (-1 if absent),
+	 * or null if there is no report, it could not be read, or the thread was interrupted while retrying
+	 */
+	static Stats calcStats(String baseAddress){
+		if(verbose){System.err.println("calcStats: "+baseAddress);}
+		ArrayList<String> contents=ServerTools.listDirectory(baseAddress, retries);
+		String report=null;
+		for(String s : contents){
+			if(s.endsWith("_assembly_report.txt")){
+				report=s;
+				break;
+			}
+		}
+		if(report==null){
+			if(verbose){System.err.println("Could not find report for "+baseAddress);}
+			return null;
+		}
+		if(verbose){System.err.println("Report: "+report);}
+		
+		//Read the report, pausing between failed attempts.
+		ArrayList<String> data=null;
+		for(int i=0; i<=retries && data==null; i++){
+			try {
+				data = ServerTools.readFTPFile(report);
+			} catch (Exception e) {
+				e.printStackTrace();
+				try {
+					//Presumably clamps the delay to 1-10 seconds - confirm against Tools.mid.
+					Thread.sleep(Tools.mid(10000, i*1000, 1000));
+				} catch (InterruptedException e1) {
+					//Keep the interrupt visible to the caller and stop retrying;
+					//with the flag set every further sleep would fail immediately.
+					Thread.currentThread().interrupt();
+					return null;
+				}
+			}
+		}
+		if(data==null){return null;}
+		
+		int contigs=0;
+		long size=0;
+		long max=0;
+		int taxid=-1;
+		for(String s : data){
+			if(s!=null && s.length()>0){
+				if(s.charAt(0)=='#'){
+					if(s.startsWith("# Taxid:")){
+						//The taxid is the last whitespace-delimited token of the line.
+						String[] split=Tools.whitespacePlus.split(s);
+						try {
+							taxid=Integer.parseInt(split[split.length-1]);
+						} catch (NumberFormatException e) {
+							e.printStackTrace();
+						}
+						assert(taxid>-1) : "Bad TaxID: '"+s+"'";
+					}
+				}else{
+					String[] split=s.split("\t");
+					contigs++;
+					//Column 9 (index 8) is expected to be the sequence length.
+					//Short or non-numeric lines count as length 1 instead of throwing.
+					long len=1;
+					if(split.length>8){
+						try {
+							len=Long.parseLong(split[8]);
+						} catch (NumberFormatException e) {
+							len=1;
+						}
+					}
+					size+=len;
+					max=Tools.max(max, len);
+				}
+			}
+		}
+		return new Stats(baseAddress, max, size, contigs, taxid);
+	}
+	
+	/**
+	 * Looks for the genomic fasta and gff files of one assembly and, when both exist,
+	 * writes the commands that download them, followed by a blank line.
+	 * Entries containing "_from_genomic" are ignored.
+	 * @param stats Stats of the assembly; path and taxID are read
+	 * @param tsw Writer that receives the commands; locked while one assembly's lines are written
+	 * @param speciesName Used for the output file names when files are renamed
+	 * @return 1 if commands were written, 0 if the fna or gff file is missing
+	 */
+	static int examineAssembly(Stats stats, TextStreamWriter tsw, String speciesName){
+		if(verbose){System.err.println("examineAssembly: "+stats.path);}
+		String fna=null;
+		String gff=null;
+		for(String file : ServerTools.listDirectory(stats.path, retries)){
+			if(file.contains("_from_genomic")){continue;}
+			if(file.endsWith("genomic.fna.gz")){
+				fna=file;
+			}else if(file.endsWith("genomic.gff.gz")){
+				gff=file;
+			}
+		}
+		if(fna==null || gff==null){return 0;}
+		
+		System.err.println("Printing: "+fna);
+		final String outBase=(tidInFilename ? "tid_"+stats.taxID+"_" : "")+speciesName;
+		
+		//renameSequences takes precedence over renameFiles.
+		final String fnaCommand;
+		final String gffCommand;
+		if(renameSequences){
+			fnaCommand="wget -q -O - "+fna+" | "
+					+"gi2taxid.sh in=stdin.fa.gz deleteinvalid zl=9 server -Xmx1g out="+outBase+".fna.gz";
+			gffCommand="wget -q -O - "+gff+" | "
+					+"gi2taxid.sh in=stdin.gff.gz deleteinvalid zl=9 server -Xmx1g out="+outBase+".gff.gz";
+		}else if(renameFiles){
+			fnaCommand="wget -q -O - "+fna+" > "+outBase+".fna.gz";
+			gffCommand="wget -q -O - "+gff+" > "+outBase+".gff.gz";
+		}else{
+			fnaCommand="wget -q "+fna;
+			gffCommand="wget -q "+gff;
+		}
+		
+		//Hold the writer so the lines of one assembly stay together.
+		synchronized(tsw){
+			tsw.println(fnaCommand);
+			tsw.println(gffCommand);
+			tsw.println();
+		}
+		return 1;
+	}
+	
+	/**
+	 * Joins a base address and an extension with exactly one '/' added if the base lacks a trailing one.
+	 * Everything in the extension up to and including its first '/' is discarded.
+	 * @param baseAddress Parent address, with or without a trailing '/'
+	 * @param extension Relative path; used whole when it contains no '/'
+	 * @return The combined address
+	 */
+	static String makeSubAddress(String baseAddress, String extension){
+		final String head=(baseAddress.endsWith("/") ? baseAddress : baseAddress+"/");
+		final int firstSlash=extension.indexOf('/');
+		final String tail=extension.substring(firstSlash+1);
+		return head+tail;
+	}
+	
+	/**
+	 * Looks up the tally stored for a key.
+	 * @param s Key to look up
+	 * @param map Tally map; not synchronized here
+	 * @return The stored value, or 0 when the key is absent or maps to null
+	 */
+	static int seen(String s, HashMap<String, Integer> map){
+		final Integer tally=map.get(s);
+		if(tally==null){return 0;}
+		return tally.intValue();
+	}
+	/**
+	 * Adds an amount to the tally stored for a key, starting from 0 when the key is absent or maps to null.
+	 * @param s Key to update
+	 * @param found Amount to add
+	 * @param map Tally map; not synchronized here
+	 */
+	static void put(String s, int found, HashMap<String, Integer> map){
+		final Integer prior=map.get(s);
+		final int base=(prior==null ? 0 : prior.intValue());
+		map.put(s, base+found);
+	}
+	
+	/** Summary of one assembly, used to rank the candidate assemblies of a species. */
+	static class Stats implements Comparable<Stats>{
+		
+		/**
+		 * @param path_ Address of the assembly directory
+		 * @param maxContig_ Length of the longest sequence
+		 * @param size_ Sum of all sequence lengths
+		 * @param contigs_ Number of sequences
+		 * @param taxID_ Taxonomy id; values below 1 are treated as unknown by compareTo
+		 */
+		public Stats(String path_, long maxContig_, long size_, int contigs_, int taxID_){
+			path=path_;
+			maxContig=maxContig_;
+			size=size_;
+			contigs=contigs_;
+			taxID=taxID_;
+		}
+
+		/**
+		 * Ranks assemblies; a positive result means this assembly is better than b.
+		 * Criteria in order: having a taxID, being more than twice the size of the other,
+		 * longer maximum contig, then fewer contigs.
+		 * @param b Assembly to compare against; null always ranks lower
+		 * @return Positive if this is better, negative if b is better, 0 if equivalent
+		 */
+		@Override
+		public int compareTo(Stats b) {
+			if(b==null){return 1;}
+			if(taxID>0 && b.taxID<1){return 1;}
+			if(b.taxID>0 && taxID<1){return -1;}
+			
+			//Size decides only when one assembly is more than double the other.
+			//The second test must mirror the first; otherwise equal-size assemblies each
+			//rank below the other and the tie-breakers below are never reached.
+			if(size>2*b.size){return 1;}
+			if(b.size>2*size){return -1;}
+
+			if(maxContig>b.maxContig){return 1;}
+			if(maxContig<b.maxContig){return -1;}
+			
+			//Fewer contigs is better; compare without subtraction to avoid overflow.
+			return Integer.compare(b.contigs, contigs);
+		}
+		
+		/** Address of the assembly directory */
+		String path;
+		/** Species name; not set by the constructor */
+		String name;
+		/** Length of the longest sequence */
+		long maxContig;
+		/** Sum of all sequence lengths */
+		long size;
+		/** Number of sequences */
+		int contigs;
+		/** Taxonomy id */
+		int taxID;
+	}
+	
+	/** Print progress and diagnostic messages to stderr */
+	static boolean verbose=true;
+//	static boolean allowSameGenus=false;
+	/** A genus is skipped once its tally reaches this value; values below 1 disable the limit.
+	 * The tally is increased by the number of genomes found, not by 1 per species. */
+	static int maxSpeciesPerGenus=1;
+	/** Redirect downloads to files named after the species; only consulted when renameSequences is false */
+	static boolean renameFiles=true;
+	/** Pipe downloads through gi2taxid.sh; takes precedence over renameFiles */
+	static boolean renameSequences=true;
+	/** Passed to ServerTools.listDirectory and used as the retry bound when reading assembly reports */
+	static int retries=40;
+	/** Try the top-ranked assembly (see findBestAssembly) before falling back to listing order */
+	static boolean findBest=false;
+	
+	/** Prefix output file names with tid_<taxID>_ */
+	static boolean tidInFilename=true;
+	
+//	private static HashMap<String, Integer> seen=new HashMap<String, Integer>();
+	
+	/** Sum of the per-thread species counts; filled in by main after the workers finish */
+	static int totalSpecies=0;
+	/** Sum of the per-thread genus counts; filled in by main after the workers finish */
+	static int totalGenus=0;
+	/** Sum of the per-thread genome counts; filled in by main after the workers finish */
+	static int totalGenomes=0;
+
+	/** NOTE(review): not referenced anywhere in this file - candidate for removal */
+	private static final Integer one=1;
+	
+}