comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/Sketch.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 // Copyright © 2015, Battelle National Biodefense Institute (BNBI);
2 // all rights reserved. Authored by: Brian Ondov, Todd Treangen,
3 // Sergey Koren, and Adam Phillippy
4 //
5 // See the LICENSE.txt file included with this software for license information.
6
7 #ifndef Sketch_h
8 #define Sketch_h
9
10 #include "mash/capnp/MinHash.capnp.h"
11 #include "robin_hood.h"
12 #include <map>
13 #include <vector>
14 #include <string>
15 #include <string.h>
16 #include "MinHashHeap.h"
17 #include "ThreadPool.h"
18
19 static const char * capnpHeader = "Cap'n Proto";
20 static const int capnpHeaderLength = strlen(capnpHeader);
21
22 static const char * suffixSketch = ".msh";
23 static const char * suffixSketchWindowed = ".msw";
24
25 static const char * alphabetNucleotide = "ACGT";
26 static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY";
27
28 class Sketch
29 {
30 public:
31
32 typedef uint64_t hash_t;
33
34 struct Parameters
35 {
36 Parameters()
37 :
38 parallelism(1),
39 kmerSize(0),
40 alphabetSize(0),
41 preserveCase(false),
42 use64(false),
43 seed(0),
44 error(0),
45 warning(0),
46 minHashesPerWindow(0),
47 windowSize(0),
48 windowed(false),
49 concatenated(false),
50 noncanonical(false),
51 reads(false),
52 memoryBound(0),
53 minCov(1),
54 targetCov(0),
55 genomeSize(0)
56 {
57 memset(alphabet, 0, 256);
58 }
59
60 Parameters(const Parameters & other)
61 :
62 parallelism(other.parallelism),
63 kmerSize(other.kmerSize),
64 alphabetSize(other.alphabetSize),
65 preserveCase(other.preserveCase),
66 use64(other.use64),
67 seed(other.seed),
68 error(other.error),
69 warning(other.warning),
70 minHashesPerWindow(other.minHashesPerWindow),
71 windowSize(other.windowSize),
72 windowed(other.windowed),
73 concatenated(other.concatenated),
74 noncanonical(other.noncanonical),
75 reads(other.reads),
76 memoryBound(other.memoryBound),
77 minCov(other.minCov),
78 targetCov(other.targetCov),
79 genomeSize(other.genomeSize)
80 {
81 memcpy(alphabet, other.alphabet, 256);
82 }
83
84 int parallelism;
85 int kmerSize;
86 bool alphabet[256];
87 uint32_t alphabetSize;
88 bool preserveCase;
89 bool use64;
90 uint32_t seed;
91 double error;
92 double warning;
93 uint64_t minHashesPerWindow;
94 uint64_t windowSize;
95 bool windowed;
96 bool concatenated;
97 bool noncanonical;
98 bool reads;
99 uint64_t memoryBound;
100 uint32_t minCov;
101 double targetCov;
102 uint64_t genomeSize;
103 };
104
105 struct PositionHash
106 {
107 PositionHash(uint32_t positionNew, hash_t hashNew) :
108 position(positionNew),
109 hash(hashNew)
110 {}
111
112 uint32_t position;
113 hash_t hash;
114 };
115
116 struct Locus
117 {
118 Locus(uint32_t sequenceNew, uint32_t positionNew)
119 :
120 sequence(sequenceNew),
121 position(positionNew)
122 {}
123
124 uint32_t sequence;
125 uint32_t position;
126 };
127
128 struct Reference
129 {
130 // no sequence for now
131
132 std::string name;
133 std::string comment;
134 uint64_t length;
135 HashList hashesSorted;
136 std::vector<uint32_t> counts;
137 };
138
139 struct SketchInput
140 {
141 SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
142 :
143 fileNames(fileNamesNew),
144 seq(seqNew),
145 length(lengthNew),
146 name(nameNew),
147 comment(commentNew),
148 parameters(parametersNew)
149 {}
150
151 ~SketchInput()
152 {
153 if ( seq != 0 )
154 {
155 delete [] seq;
156 }
157 }
158
159 std::vector<std::string> fileNames;
160
161 char * seq;
162
163 uint64_t length;
164
165 std::string name;
166 std::string comment;
167
168 Sketch::Parameters parameters;
169 };
170
171 struct SketchOutput
172 {
173 std::vector<Reference> references;
174 std::vector<std::vector<PositionHash>> positionHashesByReference;
175 };
176
177 void getAlphabetAsString(std::string & alphabet) const;
178 uint32_t getAlphabetSize() const {return parameters.alphabetSize;}
179 bool getConcatenated() const {return parameters.concatenated;}
180 float getError() const {return parameters.error;}
181 int getHashCount() const {return lociByHash.size();}
182 uint32_t getHashSeed() const {return parameters.seed;}
183 const std::vector<Locus> & getLociByHash(hash_t hash) const;
184 float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
185 int getMinKmerSize(uint64_t reference) const;
186 bool getPreserveCase() const {return parameters.preserveCase;}
187 double getRandomKmerChance(uint64_t reference) const;
188 const Reference & getReference(uint64_t index) const {return references.at(index);}
189 uint64_t getReferenceCount() const {return references.size();}
190 void getReferenceHistogram(uint64_t index, std::map<uint32_t, uint64_t> & histogram) const;
191 uint64_t getReferenceIndex(std::string id) const;
192 int getKmerSize() const {return parameters.kmerSize;}
193 double getKmerSpace() const {return kmerSpace;}
194 bool getUse64() const {return parameters.use64;}
195 uint64_t getWindowSize() const {return parameters.windowSize;}
196 bool getNoncanonical() const {return parameters.noncanonical;}
197 bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
198 bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
199 int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
200 void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew);
201 uint64_t initParametersFromCapnp(const char * file);
202 void setReferenceName(int i, const std::string name) {references[i].name = name;}
203 void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;}
204 bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
205 void useThreadOutput(SketchOutput * output);
206 void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
207 bool writeToFile() const;
208 int writeToCapnp(const char * file) const;
209
210 private:
211
212 void createIndex();
213
214 std::vector<Reference> references;
215 robin_hood::unordered_map<std::string, int> referenceIndecesById;
216 std::vector<std::vector<PositionHash>> positionHashesByReference;
217 robin_hood::unordered_map<hash_t, std::vector<Locus>> lociByHash;
218
219 Parameters parameters;
220 double kmerSpace;
221 std::string file;
222 };
223
224 void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters);
225 void getMinHashPositions(std::vector<Sketch::PositionHash> & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0);
226 bool hasSuffix(std::string const & whole, std::string const & suffix);
227 Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input);
228 void reverseComplement(const char * src, char * dest, int length);
229 void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters);
230 void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes);
231 Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input);
232 Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input);
233
234 int def(int fdSource, int fdDest, int level);
235 int inf(int fdSource, int fdDest);
236 void zerr(int ret);
237
238 #endif