annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/Sketch.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 // Copyright © 2015, Battelle National Biodefense Institute (BNBI);
jpayne@69 2 // all rights reserved. Authored by: Brian Ondov, Todd Treangen,
jpayne@69 3 // Sergey Koren, and Adam Phillippy
jpayne@69 4 //
jpayne@69 5 // See the LICENSE.txt file included with this software for license information.
jpayne@69 6
jpayne@69 7 #ifndef Sketch_h
jpayne@69 8 #define Sketch_h
jpayne@69 9
jpayne@69 10 #include "mash/capnp/MinHash.capnp.h"
jpayne@69 11 #include "robin_hood.h"
jpayne@69 12 #include <map>
jpayne@69 13 #include <vector>
jpayne@69 14 #include <string>
jpayne@69 15 #include <string.h>
jpayne@69 16 #include "MinHashHeap.h"
jpayne@69 17 #include "ThreadPool.h"
jpayne@69 18
jpayne@69 19 static const char * capnpHeader = "Cap'n Proto";
jpayne@69 20 static const int capnpHeaderLength = strlen(capnpHeader);
jpayne@69 21
jpayne@69 22 static const char * suffixSketch = ".msh";
jpayne@69 23 static const char * suffixSketchWindowed = ".msw";
jpayne@69 24
jpayne@69 25 static const char * alphabetNucleotide = "ACGT";
jpayne@69 26 static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY";
jpayne@69 27
jpayne@69 28 class Sketch
jpayne@69 29 {
jpayne@69 30 public:
jpayne@69 31
jpayne@69 32 typedef uint64_t hash_t;
jpayne@69 33
jpayne@69 34 struct Parameters
jpayne@69 35 {
jpayne@69 36 Parameters()
jpayne@69 37 :
jpayne@69 38 parallelism(1),
jpayne@69 39 kmerSize(0),
jpayne@69 40 alphabetSize(0),
jpayne@69 41 preserveCase(false),
jpayne@69 42 use64(false),
jpayne@69 43 seed(0),
jpayne@69 44 error(0),
jpayne@69 45 warning(0),
jpayne@69 46 minHashesPerWindow(0),
jpayne@69 47 windowSize(0),
jpayne@69 48 windowed(false),
jpayne@69 49 concatenated(false),
jpayne@69 50 noncanonical(false),
jpayne@69 51 reads(false),
jpayne@69 52 memoryBound(0),
jpayne@69 53 minCov(1),
jpayne@69 54 targetCov(0),
jpayne@69 55 genomeSize(0)
jpayne@69 56 {
jpayne@69 57 memset(alphabet, 0, 256);
jpayne@69 58 }
jpayne@69 59
jpayne@69 60 Parameters(const Parameters & other)
jpayne@69 61 :
jpayne@69 62 parallelism(other.parallelism),
jpayne@69 63 kmerSize(other.kmerSize),
jpayne@69 64 alphabetSize(other.alphabetSize),
jpayne@69 65 preserveCase(other.preserveCase),
jpayne@69 66 use64(other.use64),
jpayne@69 67 seed(other.seed),
jpayne@69 68 error(other.error),
jpayne@69 69 warning(other.warning),
jpayne@69 70 minHashesPerWindow(other.minHashesPerWindow),
jpayne@69 71 windowSize(other.windowSize),
jpayne@69 72 windowed(other.windowed),
jpayne@69 73 concatenated(other.concatenated),
jpayne@69 74 noncanonical(other.noncanonical),
jpayne@69 75 reads(other.reads),
jpayne@69 76 memoryBound(other.memoryBound),
jpayne@69 77 minCov(other.minCov),
jpayne@69 78 targetCov(other.targetCov),
jpayne@69 79 genomeSize(other.genomeSize)
jpayne@69 80 {
jpayne@69 81 memcpy(alphabet, other.alphabet, 256);
jpayne@69 82 }
jpayne@69 83
jpayne@69 84 int parallelism;
jpayne@69 85 int kmerSize;
jpayne@69 86 bool alphabet[256];
jpayne@69 87 uint32_t alphabetSize;
jpayne@69 88 bool preserveCase;
jpayne@69 89 bool use64;
jpayne@69 90 uint32_t seed;
jpayne@69 91 double error;
jpayne@69 92 double warning;
jpayne@69 93 uint64_t minHashesPerWindow;
jpayne@69 94 uint64_t windowSize;
jpayne@69 95 bool windowed;
jpayne@69 96 bool concatenated;
jpayne@69 97 bool noncanonical;
jpayne@69 98 bool reads;
jpayne@69 99 uint64_t memoryBound;
jpayne@69 100 uint32_t minCov;
jpayne@69 101 double targetCov;
jpayne@69 102 uint64_t genomeSize;
jpayne@69 103 };
jpayne@69 104
jpayne@69 105 struct PositionHash
jpayne@69 106 {
jpayne@69 107 PositionHash(uint32_t positionNew, hash_t hashNew) :
jpayne@69 108 position(positionNew),
jpayne@69 109 hash(hashNew)
jpayne@69 110 {}
jpayne@69 111
jpayne@69 112 uint32_t position;
jpayne@69 113 hash_t hash;
jpayne@69 114 };
jpayne@69 115
jpayne@69 116 struct Locus
jpayne@69 117 {
jpayne@69 118 Locus(uint32_t sequenceNew, uint32_t positionNew)
jpayne@69 119 :
jpayne@69 120 sequence(sequenceNew),
jpayne@69 121 position(positionNew)
jpayne@69 122 {}
jpayne@69 123
jpayne@69 124 uint32_t sequence;
jpayne@69 125 uint32_t position;
jpayne@69 126 };
jpayne@69 127
jpayne@69 128 struct Reference
jpayne@69 129 {
jpayne@69 130 // no sequence for now
jpayne@69 131
jpayne@69 132 std::string name;
jpayne@69 133 std::string comment;
jpayne@69 134 uint64_t length;
jpayne@69 135 HashList hashesSorted;
jpayne@69 136 std::vector<uint32_t> counts;
jpayne@69 137 };
jpayne@69 138
jpayne@69 139 struct SketchInput
jpayne@69 140 {
jpayne@69 141 SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
jpayne@69 142 :
jpayne@69 143 fileNames(fileNamesNew),
jpayne@69 144 seq(seqNew),
jpayne@69 145 length(lengthNew),
jpayne@69 146 name(nameNew),
jpayne@69 147 comment(commentNew),
jpayne@69 148 parameters(parametersNew)
jpayne@69 149 {}
jpayne@69 150
jpayne@69 151 ~SketchInput()
jpayne@69 152 {
jpayne@69 153 if ( seq != 0 )
jpayne@69 154 {
jpayne@69 155 delete [] seq;
jpayne@69 156 }
jpayne@69 157 }
jpayne@69 158
jpayne@69 159 std::vector<std::string> fileNames;
jpayne@69 160
jpayne@69 161 char * seq;
jpayne@69 162
jpayne@69 163 uint64_t length;
jpayne@69 164
jpayne@69 165 std::string name;
jpayne@69 166 std::string comment;
jpayne@69 167
jpayne@69 168 Sketch::Parameters parameters;
jpayne@69 169 };
jpayne@69 170
jpayne@69 171 struct SketchOutput
jpayne@69 172 {
jpayne@69 173 std::vector<Reference> references;
jpayne@69 174 std::vector<std::vector<PositionHash>> positionHashesByReference;
jpayne@69 175 };
jpayne@69 176
jpayne@69 177 void getAlphabetAsString(std::string & alphabet) const;
jpayne@69 178 uint32_t getAlphabetSize() const {return parameters.alphabetSize;}
jpayne@69 179 bool getConcatenated() const {return parameters.concatenated;}
jpayne@69 180 float getError() const {return parameters.error;}
jpayne@69 181 int getHashCount() const {return lociByHash.size();}
jpayne@69 182 uint32_t getHashSeed() const {return parameters.seed;}
jpayne@69 183 const std::vector<Locus> & getLociByHash(hash_t hash) const;
jpayne@69 184 float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
jpayne@69 185 int getMinKmerSize(uint64_t reference) const;
jpayne@69 186 bool getPreserveCase() const {return parameters.preserveCase;}
jpayne@69 187 double getRandomKmerChance(uint64_t reference) const;
jpayne@69 188 const Reference & getReference(uint64_t index) const {return references.at(index);}
jpayne@69 189 uint64_t getReferenceCount() const {return references.size();}
jpayne@69 190 void getReferenceHistogram(uint64_t index, std::map<uint32_t, uint64_t> & histogram) const;
jpayne@69 191 uint64_t getReferenceIndex(std::string id) const;
jpayne@69 192 int getKmerSize() const {return parameters.kmerSize;}
jpayne@69 193 double getKmerSpace() const {return kmerSpace;}
jpayne@69 194 bool getUse64() const {return parameters.use64;}
jpayne@69 195 uint64_t getWindowSize() const {return parameters.windowSize;}
jpayne@69 196 bool getNoncanonical() const {return parameters.noncanonical;}
jpayne@69 197 bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
jpayne@69 198 bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
jpayne@69 199 int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
jpayne@69 200 void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew);
jpayne@69 201 uint64_t initParametersFromCapnp(const char * file);
jpayne@69 202 void setReferenceName(int i, const std::string name) {references[i].name = name;}
jpayne@69 203 void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;}
jpayne@69 204 bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
jpayne@69 205 void useThreadOutput(SketchOutput * output);
jpayne@69 206 void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
jpayne@69 207 bool writeToFile() const;
jpayne@69 208 int writeToCapnp(const char * file) const;
jpayne@69 209
jpayne@69 210 private:
jpayne@69 211
jpayne@69 212 void createIndex();
jpayne@69 213
jpayne@69 214 std::vector<Reference> references;
jpayne@69 215 robin_hood::unordered_map<std::string, int> referenceIndecesById;
jpayne@69 216 std::vector<std::vector<PositionHash>> positionHashesByReference;
jpayne@69 217 robin_hood::unordered_map<hash_t, std::vector<Locus>> lociByHash;
jpayne@69 218
jpayne@69 219 Parameters parameters;
jpayne@69 220 double kmerSpace;
jpayne@69 221 std::string file;
jpayne@69 222 };
jpayne@69 223
jpayne@69 224 void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters);
jpayne@69 225 void getMinHashPositions(std::vector<Sketch::PositionHash> & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0);
jpayne@69 226 bool hasSuffix(std::string const & whole, std::string const & suffix);
jpayne@69 227 Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input);
jpayne@69 228 void reverseComplement(const char * src, char * dest, int length);
jpayne@69 229 void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters);
jpayne@69 230 void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes);
jpayne@69 231 Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input);
jpayne@69 232 Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input);
jpayne@69 233
jpayne@69 234 int def(int fdSource, int fdDest, int level);
jpayne@69 235 int inf(int fdSource, int fdDest);
jpayne@69 236 void zerr(int ret);
jpayne@69 237
jpayne@69 238 #endif