annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/current/bbmin/Minimizer.java @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 package bbmin;
jpayne@68 2
jpayne@68 3 import java.util.Arrays;
jpayne@68 4
jpayne@68 5 /**
jpayne@68 6 * Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
jpayne@68 7 * The resulting array is guaranteed to contain the minimal hash code<br>
jpayne@68 8 * for every window, with no duplicates.
jpayne@68 9 * On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
jpayne@68 10 *
jpayne@68 11 * @author Brian Bushnell
jpayne@68 12 * @date October 8, 2021
jpayne@68 13 *
jpayne@68 14 */
jpayne@68 15 public class Minimizer {
jpayne@68 16
jpayne@68 17 public static void main(String[] args){
jpayne@68 18 int k=4, w=7;
jpayne@68 19 String seq="ACGTCTGAGCCTTGACACATGACT";
jpayne@68 20 try {
jpayne@68 21 k=Integer.parseInt(args[0]);
jpayne@68 22 w=Integer.parseInt(args[1]);
jpayne@68 23 seq=args[2];
jpayne@68 24 } catch (NumberFormatException e) {
jpayne@68 25 //e.printStackTrace();
jpayne@68 26 System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
jpayne@68 27 + "E.G.\n"
jpayne@68 28 + "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
jpayne@68 29 System.exit(1);
jpayne@68 30 }
jpayne@68 31 Minimizer minnow=new Minimizer(k, w);
jpayne@68 32 long[] array=minnow.minimize(seq.getBytes());
jpayne@68 33 System.err.println(Arrays.toString(array));
jpayne@68 34 }
jpayne@68 35
jpayne@68 36 public Minimizer(int k_, int window_){this(k_, window_, 2);}
jpayne@68 37 public Minimizer(int k_, int window_, int bitsPerSymbol_){
jpayne@68 38 k=k_;
jpayne@68 39 window=window_;
jpayne@68 40 bitsPerSymbol=bitsPerSymbol_;
jpayne@68 41 shift=bitsPerSymbol*k;
jpayne@68 42 shift2=shift-bitsPerSymbol;
jpayne@68 43 mask=(shift>63 ? -1L : ~((-1L)<<shift));
jpayne@68 44 }
jpayne@68 45
jpayne@68 46 public long[] minimize(String str){
jpayne@68 47 return minimize(str.getBytes());
jpayne@68 48 }
jpayne@68 49
jpayne@68 50 public long[] minimize(byte[] bases){
jpayne@68 51 return minimize(bases, new LongList(16), new LongHashSet(16));
jpayne@68 52 }
jpayne@68 53
jpayne@68 54 /** This method is typically faster since you don't need to construct a new set each time. */
jpayne@68 55 public long[] minimize(byte[] bases, LongList list, LongHashSet set){
jpayne@68 56 list.clear();
jpayne@68 57 //If the set is way too big, resize it
jpayne@68 58 if(set.capacity()*(long)window>100L+16L*bases.length){
jpayne@68 59 set.resizeDestructive(16);
jpayne@68 60 }else{
jpayne@68 61 set.clear();
jpayne@68 62 }
jpayne@68 63
jpayne@68 64 long kmersProcessed=0;
jpayne@68 65 long kmer=0;
jpayne@68 66 long rkmer=0;
jpayne@68 67 int len=0;
jpayne@68 68
jpayne@68 69 long bestCode=Long.MAX_VALUE;
jpayne@68 70 int bestPosition=-1;
jpayne@68 71 long bestKmer=-1;
jpayne@68 72 long bestRkmer=-1;
jpayne@68 73 int currentWindow=0;
jpayne@68 74
jpayne@68 75 for(int i=0; i<bases.length; i++){
jpayne@68 76 byte b=bases[i];
jpayne@68 77 long x=baseToNumber[b];
jpayne@68 78 long x2=baseToComplementNumber[b];
jpayne@68 79
jpayne@68 80 kmer=((kmer<<2)|x)&mask;
jpayne@68 81 rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
jpayne@68 82 if(x<0){
jpayne@68 83 len=0;
jpayne@68 84 rkmer=0;
jpayne@68 85 }else{
jpayne@68 86 len++;
jpayne@68 87 }
jpayne@68 88
jpayne@68 89 if(len>=k){
jpayne@68 90 kmersProcessed++;
jpayne@68 91 currentWindow++;
jpayne@68 92
jpayne@68 93 final long hashcode=hash(kmer, rkmer);
jpayne@68 94 System.err.println("i="+i+", code="+hashcode);
jpayne@68 95
jpayne@68 96 //Track the best code in the window and its state
jpayne@68 97 if(hashcode>=minCode && hashcode<=bestCode){
jpayne@68 98 bestCode=hashcode;
jpayne@68 99 bestPosition=i;
jpayne@68 100 bestKmer=kmer;
jpayne@68 101 bestRkmer=rkmer;
jpayne@68 102 }
jpayne@68 103
jpayne@68 104 //Once the window size is met, store the best code,
jpayne@68 105 //and backtrack to its position to start the next window
jpayne@68 106 if(currentWindow>=window && bestPosition>=0){
jpayne@68 107 if(!set.contains(bestCode)){
jpayne@68 108 set.add(bestCode);
jpayne@68 109 list.add(bestCode);
jpayne@68 110 }
jpayne@68 111 i=bestPosition;
jpayne@68 112 kmer=bestKmer;
jpayne@68 113 rkmer=bestRkmer;
jpayne@68 114 len=k;
jpayne@68 115
jpayne@68 116 bestCode=Long.MAX_VALUE;
jpayne@68 117 bestPosition=-1;
jpayne@68 118 currentWindow=0;
jpayne@68 119 }
jpayne@68 120 }
jpayne@68 121 }
jpayne@68 122 list.sort();//optional
jpayne@68 123 return list.toArray();
jpayne@68 124 }
jpayne@68 125
jpayne@68 126 public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
jpayne@68 127 public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
jpayne@68 128 public static long hash(long key) {
jpayne@68 129 key = (~key) + (key << 21); // key = (key << 21) - key - 1;
jpayne@68 130 key = key ^ (key >>> 24);
jpayne@68 131 key = (key + (key << 3)) + (key << 8); // key * 265
jpayne@68 132 key = key ^ (key >>> 14);
jpayne@68 133 key = (key + (key << 2)) + (key << 4); // key * 21
jpayne@68 134 key = key ^ (key >>> 28);
jpayne@68 135 key = key + (key << 31);
jpayne@68 136 return key;
jpayne@68 137 }
jpayne@68 138 private static final long max(long x, long y){return x>y ? x : y;}
jpayne@68 139
jpayne@68 140 public final int k;
jpayne@68 141 public final int window;
jpayne@68 142 public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
jpayne@68 143 private final int shift;
jpayne@68 144 private final int shift2;
jpayne@68 145 private final long mask;
jpayne@68 146 private final long minCode=0;
jpayne@68 147
jpayne@68 148 static final byte[] baseToNumber = new byte[128];
jpayne@68 149 static final byte[] baseToComplementNumber = new byte[128];
jpayne@68 150
jpayne@68 151 static {
jpayne@68 152 Arrays.fill(baseToNumber, (byte)-1);
jpayne@68 153 Arrays.fill(baseToComplementNumber, (byte)-1);
jpayne@68 154 baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
jpayne@68 155 baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
jpayne@68 156 baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
jpayne@68 157 baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
jpayne@68 158 }
jpayne@68 159 }