Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/Sketch.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // Copyright © 2015, Battelle National Biodefense Institute (BNBI); | |
2 // all rights reserved. Authored by: Brian Ondov, Todd Treangen, | |
3 // Sergey Koren, and Adam Phillippy | |
4 // | |
5 // See the LICENSE.txt file included with this software for license information. | |
6 | |
7 #ifndef Sketch_h | |
8 #define Sketch_h | |
9 | |
10 #include "mash/capnp/MinHash.capnp.h" | |
11 #include "robin_hood.h" | |
12 #include <map> | |
13 #include <vector> | |
14 #include <string> | |
15 #include <string.h> | |
16 #include "MinHashHeap.h" | |
17 #include "ThreadPool.h" | |
18 | |
19 static const char * capnpHeader = "Cap'n Proto"; | |
20 static const int capnpHeaderLength = strlen(capnpHeader); | |
21 | |
22 static const char * suffixSketch = ".msh"; | |
23 static const char * suffixSketchWindowed = ".msw"; | |
24 | |
25 static const char * alphabetNucleotide = "ACGT"; | |
26 static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY"; | |
27 | |
28 class Sketch | |
29 { | |
30 public: | |
31 | |
32 typedef uint64_t hash_t; | |
33 | |
34 struct Parameters | |
35 { | |
36 Parameters() | |
37 : | |
38 parallelism(1), | |
39 kmerSize(0), | |
40 alphabetSize(0), | |
41 preserveCase(false), | |
42 use64(false), | |
43 seed(0), | |
44 error(0), | |
45 warning(0), | |
46 minHashesPerWindow(0), | |
47 windowSize(0), | |
48 windowed(false), | |
49 concatenated(false), | |
50 noncanonical(false), | |
51 reads(false), | |
52 memoryBound(0), | |
53 minCov(1), | |
54 targetCov(0), | |
55 genomeSize(0) | |
56 { | |
57 memset(alphabet, 0, 256); | |
58 } | |
59 | |
60 Parameters(const Parameters & other) | |
61 : | |
62 parallelism(other.parallelism), | |
63 kmerSize(other.kmerSize), | |
64 alphabetSize(other.alphabetSize), | |
65 preserveCase(other.preserveCase), | |
66 use64(other.use64), | |
67 seed(other.seed), | |
68 error(other.error), | |
69 warning(other.warning), | |
70 minHashesPerWindow(other.minHashesPerWindow), | |
71 windowSize(other.windowSize), | |
72 windowed(other.windowed), | |
73 concatenated(other.concatenated), | |
74 noncanonical(other.noncanonical), | |
75 reads(other.reads), | |
76 memoryBound(other.memoryBound), | |
77 minCov(other.minCov), | |
78 targetCov(other.targetCov), | |
79 genomeSize(other.genomeSize) | |
80 { | |
81 memcpy(alphabet, other.alphabet, 256); | |
82 } | |
83 | |
84 int parallelism; | |
85 int kmerSize; | |
86 bool alphabet[256]; | |
87 uint32_t alphabetSize; | |
88 bool preserveCase; | |
89 bool use64; | |
90 uint32_t seed; | |
91 double error; | |
92 double warning; | |
93 uint64_t minHashesPerWindow; | |
94 uint64_t windowSize; | |
95 bool windowed; | |
96 bool concatenated; | |
97 bool noncanonical; | |
98 bool reads; | |
99 uint64_t memoryBound; | |
100 uint32_t minCov; | |
101 double targetCov; | |
102 uint64_t genomeSize; | |
103 }; | |
104 | |
105 struct PositionHash | |
106 { | |
107 PositionHash(uint32_t positionNew, hash_t hashNew) : | |
108 position(positionNew), | |
109 hash(hashNew) | |
110 {} | |
111 | |
112 uint32_t position; | |
113 hash_t hash; | |
114 }; | |
115 | |
116 struct Locus | |
117 { | |
118 Locus(uint32_t sequenceNew, uint32_t positionNew) | |
119 : | |
120 sequence(sequenceNew), | |
121 position(positionNew) | |
122 {} | |
123 | |
124 uint32_t sequence; | |
125 uint32_t position; | |
126 }; | |
127 | |
128 struct Reference | |
129 { | |
130 // no sequence for now | |
131 | |
132 std::string name; | |
133 std::string comment; | |
134 uint64_t length; | |
135 HashList hashesSorted; | |
136 std::vector<uint32_t> counts; | |
137 }; | |
138 | |
139 struct SketchInput | |
140 { | |
141 SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew) | |
142 : | |
143 fileNames(fileNamesNew), | |
144 seq(seqNew), | |
145 length(lengthNew), | |
146 name(nameNew), | |
147 comment(commentNew), | |
148 parameters(parametersNew) | |
149 {} | |
150 | |
151 ~SketchInput() | |
152 { | |
153 if ( seq != 0 ) | |
154 { | |
155 delete [] seq; | |
156 } | |
157 } | |
158 | |
159 std::vector<std::string> fileNames; | |
160 | |
161 char * seq; | |
162 | |
163 uint64_t length; | |
164 | |
165 std::string name; | |
166 std::string comment; | |
167 | |
168 Sketch::Parameters parameters; | |
169 }; | |
170 | |
171 struct SketchOutput | |
172 { | |
173 std::vector<Reference> references; | |
174 std::vector<std::vector<PositionHash>> positionHashesByReference; | |
175 }; | |
176 | |
177 void getAlphabetAsString(std::string & alphabet) const; | |
178 uint32_t getAlphabetSize() const {return parameters.alphabetSize;} | |
179 bool getConcatenated() const {return parameters.concatenated;} | |
180 float getError() const {return parameters.error;} | |
181 int getHashCount() const {return lociByHash.size();} | |
182 uint32_t getHashSeed() const {return parameters.seed;} | |
183 const std::vector<Locus> & getLociByHash(hash_t hash) const; | |
184 float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;} | |
185 int getMinKmerSize(uint64_t reference) const; | |
186 bool getPreserveCase() const {return parameters.preserveCase;} | |
187 double getRandomKmerChance(uint64_t reference) const; | |
188 const Reference & getReference(uint64_t index) const {return references.at(index);} | |
189 uint64_t getReferenceCount() const {return references.size();} | |
190 void getReferenceHistogram(uint64_t index, std::map<uint32_t, uint64_t> & histogram) const; | |
191 uint64_t getReferenceIndex(std::string id) const; | |
192 int getKmerSize() const {return parameters.kmerSize;} | |
193 double getKmerSpace() const {return kmerSpace;} | |
194 bool getUse64() const {return parameters.use64;} | |
195 uint64_t getWindowSize() const {return parameters.windowSize;} | |
196 bool getNoncanonical() const {return parameters.noncanonical;} | |
197 bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;} | |
198 bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);} | |
199 int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false); | |
200 void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew); | |
201 uint64_t initParametersFromCapnp(const char * file); | |
202 void setReferenceName(int i, const std::string name) {references[i].name = name;} | |
203 void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;} | |
204 bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool); | |
205 void useThreadOutput(SketchOutput * output); | |
206 void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const; | |
207 bool writeToFile() const; | |
208 int writeToCapnp(const char * file) const; | |
209 | |
210 private: | |
211 | |
212 void createIndex(); | |
213 | |
214 std::vector<Reference> references; | |
215 robin_hood::unordered_map<std::string, int> referenceIndecesById; | |
216 std::vector<std::vector<PositionHash>> positionHashesByReference; | |
217 robin_hood::unordered_map<hash_t, std::vector<Locus>> lociByHash; | |
218 | |
219 Parameters parameters; | |
220 double kmerSpace; | |
221 std::string file; | |
222 }; | |
223 | |
224 void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters); | |
225 void getMinHashPositions(std::vector<Sketch::PositionHash> & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0); | |
226 bool hasSuffix(std::string const & whole, std::string const & suffix); | |
227 Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input); | |
228 void reverseComplement(const char * src, char * dest, int length); | |
229 void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters); | |
230 void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes); | |
231 Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input); | |
232 Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input); | |
233 | |
234 int def(int fdSource, int fdDest, int level); | |
235 int inf(int fdSource, int fdDest); | |
236 void zerr(int ret); | |
237 | |
238 #endif |