jpayne@69
|
1 // Copyright © 2015, Battelle National Biodefense Institute (BNBI);
|
jpayne@69
|
2 // all rights reserved. Authored by: Brian Ondov, Todd Treangen,
|
jpayne@69
|
3 // Sergey Koren, and Adam Phillippy
|
jpayne@69
|
4 //
|
jpayne@69
|
5 // See the LICENSE.txt file included with this software for license information.
|
jpayne@69
|
6
|
jpayne@69
|
7 #ifndef Sketch_h
|
jpayne@69
|
8 #define Sketch_h
|
jpayne@69
|
9
|
jpayne@69
|
10 #include "mash/capnp/MinHash.capnp.h"
|
jpayne@69
|
11 #include "robin_hood.h"
|
jpayne@69
|
12 #include <map>
|
jpayne@69
|
13 #include <vector>
|
jpayne@69
|
14 #include <string>
|
jpayne@69
|
15 #include <string.h>
|
jpayne@69
|
16 #include "MinHashHeap.h"
|
jpayne@69
|
17 #include "ThreadPool.h"
|
jpayne@69
|
18
|
jpayne@69
|
19 static const char * capnpHeader = "Cap'n Proto";
|
jpayne@69
|
20 static const int capnpHeaderLength = strlen(capnpHeader);
|
jpayne@69
|
21
|
jpayne@69
|
22 static const char * suffixSketch = ".msh";
|
jpayne@69
|
23 static const char * suffixSketchWindowed = ".msw";
|
jpayne@69
|
24
|
jpayne@69
|
25 static const char * alphabetNucleotide = "ACGT";
|
jpayne@69
|
26 static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY";
|
jpayne@69
|
27
|
jpayne@69
|
28 class Sketch
|
jpayne@69
|
29 {
|
jpayne@69
|
30 public:
|
jpayne@69
|
31
|
jpayne@69
|
32 typedef uint64_t hash_t;
|
jpayne@69
|
33
|
jpayne@69
|
34 struct Parameters
|
jpayne@69
|
35 {
|
jpayne@69
|
36 Parameters()
|
jpayne@69
|
37 :
|
jpayne@69
|
38 parallelism(1),
|
jpayne@69
|
39 kmerSize(0),
|
jpayne@69
|
40 alphabetSize(0),
|
jpayne@69
|
41 preserveCase(false),
|
jpayne@69
|
42 use64(false),
|
jpayne@69
|
43 seed(0),
|
jpayne@69
|
44 error(0),
|
jpayne@69
|
45 warning(0),
|
jpayne@69
|
46 minHashesPerWindow(0),
|
jpayne@69
|
47 windowSize(0),
|
jpayne@69
|
48 windowed(false),
|
jpayne@69
|
49 concatenated(false),
|
jpayne@69
|
50 noncanonical(false),
|
jpayne@69
|
51 reads(false),
|
jpayne@69
|
52 memoryBound(0),
|
jpayne@69
|
53 minCov(1),
|
jpayne@69
|
54 targetCov(0),
|
jpayne@69
|
55 genomeSize(0)
|
jpayne@69
|
56 {
|
jpayne@69
|
57 memset(alphabet, 0, 256);
|
jpayne@69
|
58 }
|
jpayne@69
|
59
|
jpayne@69
|
60 Parameters(const Parameters & other)
|
jpayne@69
|
61 :
|
jpayne@69
|
62 parallelism(other.parallelism),
|
jpayne@69
|
63 kmerSize(other.kmerSize),
|
jpayne@69
|
64 alphabetSize(other.alphabetSize),
|
jpayne@69
|
65 preserveCase(other.preserveCase),
|
jpayne@69
|
66 use64(other.use64),
|
jpayne@69
|
67 seed(other.seed),
|
jpayne@69
|
68 error(other.error),
|
jpayne@69
|
69 warning(other.warning),
|
jpayne@69
|
70 minHashesPerWindow(other.minHashesPerWindow),
|
jpayne@69
|
71 windowSize(other.windowSize),
|
jpayne@69
|
72 windowed(other.windowed),
|
jpayne@69
|
73 concatenated(other.concatenated),
|
jpayne@69
|
74 noncanonical(other.noncanonical),
|
jpayne@69
|
75 reads(other.reads),
|
jpayne@69
|
76 memoryBound(other.memoryBound),
|
jpayne@69
|
77 minCov(other.minCov),
|
jpayne@69
|
78 targetCov(other.targetCov),
|
jpayne@69
|
79 genomeSize(other.genomeSize)
|
jpayne@69
|
80 {
|
jpayne@69
|
81 memcpy(alphabet, other.alphabet, 256);
|
jpayne@69
|
82 }
|
jpayne@69
|
83
|
jpayne@69
|
84 int parallelism;
|
jpayne@69
|
85 int kmerSize;
|
jpayne@69
|
86 bool alphabet[256];
|
jpayne@69
|
87 uint32_t alphabetSize;
|
jpayne@69
|
88 bool preserveCase;
|
jpayne@69
|
89 bool use64;
|
jpayne@69
|
90 uint32_t seed;
|
jpayne@69
|
91 double error;
|
jpayne@69
|
92 double warning;
|
jpayne@69
|
93 uint64_t minHashesPerWindow;
|
jpayne@69
|
94 uint64_t windowSize;
|
jpayne@69
|
95 bool windowed;
|
jpayne@69
|
96 bool concatenated;
|
jpayne@69
|
97 bool noncanonical;
|
jpayne@69
|
98 bool reads;
|
jpayne@69
|
99 uint64_t memoryBound;
|
jpayne@69
|
100 uint32_t minCov;
|
jpayne@69
|
101 double targetCov;
|
jpayne@69
|
102 uint64_t genomeSize;
|
jpayne@69
|
103 };
|
jpayne@69
|
104
|
jpayne@69
|
105 struct PositionHash
|
jpayne@69
|
106 {
|
jpayne@69
|
107 PositionHash(uint32_t positionNew, hash_t hashNew) :
|
jpayne@69
|
108 position(positionNew),
|
jpayne@69
|
109 hash(hashNew)
|
jpayne@69
|
110 {}
|
jpayne@69
|
111
|
jpayne@69
|
112 uint32_t position;
|
jpayne@69
|
113 hash_t hash;
|
jpayne@69
|
114 };
|
jpayne@69
|
115
|
jpayne@69
|
116 struct Locus
|
jpayne@69
|
117 {
|
jpayne@69
|
118 Locus(uint32_t sequenceNew, uint32_t positionNew)
|
jpayne@69
|
119 :
|
jpayne@69
|
120 sequence(sequenceNew),
|
jpayne@69
|
121 position(positionNew)
|
jpayne@69
|
122 {}
|
jpayne@69
|
123
|
jpayne@69
|
124 uint32_t sequence;
|
jpayne@69
|
125 uint32_t position;
|
jpayne@69
|
126 };
|
jpayne@69
|
127
|
jpayne@69
|
128 struct Reference
|
jpayne@69
|
129 {
|
jpayne@69
|
130 // no sequence for now
|
jpayne@69
|
131
|
jpayne@69
|
132 std::string name;
|
jpayne@69
|
133 std::string comment;
|
jpayne@69
|
134 uint64_t length;
|
jpayne@69
|
135 HashList hashesSorted;
|
jpayne@69
|
136 std::vector<uint32_t> counts;
|
jpayne@69
|
137 };
|
jpayne@69
|
138
|
jpayne@69
|
139 struct SketchInput
|
jpayne@69
|
140 {
|
jpayne@69
|
141 SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
|
jpayne@69
|
142 :
|
jpayne@69
|
143 fileNames(fileNamesNew),
|
jpayne@69
|
144 seq(seqNew),
|
jpayne@69
|
145 length(lengthNew),
|
jpayne@69
|
146 name(nameNew),
|
jpayne@69
|
147 comment(commentNew),
|
jpayne@69
|
148 parameters(parametersNew)
|
jpayne@69
|
149 {}
|
jpayne@69
|
150
|
jpayne@69
|
151 ~SketchInput()
|
jpayne@69
|
152 {
|
jpayne@69
|
153 if ( seq != 0 )
|
jpayne@69
|
154 {
|
jpayne@69
|
155 delete [] seq;
|
jpayne@69
|
156 }
|
jpayne@69
|
157 }
|
jpayne@69
|
158
|
jpayne@69
|
159 std::vector<std::string> fileNames;
|
jpayne@69
|
160
|
jpayne@69
|
161 char * seq;
|
jpayne@69
|
162
|
jpayne@69
|
163 uint64_t length;
|
jpayne@69
|
164
|
jpayne@69
|
165 std::string name;
|
jpayne@69
|
166 std::string comment;
|
jpayne@69
|
167
|
jpayne@69
|
168 Sketch::Parameters parameters;
|
jpayne@69
|
169 };
|
jpayne@69
|
170
|
jpayne@69
|
171 struct SketchOutput
|
jpayne@69
|
172 {
|
jpayne@69
|
173 std::vector<Reference> references;
|
jpayne@69
|
174 std::vector<std::vector<PositionHash>> positionHashesByReference;
|
jpayne@69
|
175 };
|
jpayne@69
|
176
|
jpayne@69
|
177 void getAlphabetAsString(std::string & alphabet) const;
|
jpayne@69
|
178 uint32_t getAlphabetSize() const {return parameters.alphabetSize;}
|
jpayne@69
|
179 bool getConcatenated() const {return parameters.concatenated;}
|
jpayne@69
|
180 float getError() const {return parameters.error;}
|
jpayne@69
|
181 int getHashCount() const {return lociByHash.size();}
|
jpayne@69
|
182 uint32_t getHashSeed() const {return parameters.seed;}
|
jpayne@69
|
183 const std::vector<Locus> & getLociByHash(hash_t hash) const;
|
jpayne@69
|
184 float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
|
jpayne@69
|
185 int getMinKmerSize(uint64_t reference) const;
|
jpayne@69
|
186 bool getPreserveCase() const {return parameters.preserveCase;}
|
jpayne@69
|
187 double getRandomKmerChance(uint64_t reference) const;
|
jpayne@69
|
188 const Reference & getReference(uint64_t index) const {return references.at(index);}
|
jpayne@69
|
189 uint64_t getReferenceCount() const {return references.size();}
|
jpayne@69
|
190 void getReferenceHistogram(uint64_t index, std::map<uint32_t, uint64_t> & histogram) const;
|
jpayne@69
|
191 uint64_t getReferenceIndex(std::string id) const;
|
jpayne@69
|
192 int getKmerSize() const {return parameters.kmerSize;}
|
jpayne@69
|
193 double getKmerSpace() const {return kmerSpace;}
|
jpayne@69
|
194 bool getUse64() const {return parameters.use64;}
|
jpayne@69
|
195 uint64_t getWindowSize() const {return parameters.windowSize;}
|
jpayne@69
|
196 bool getNoncanonical() const {return parameters.noncanonical;}
|
jpayne@69
|
197 bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
|
jpayne@69
|
198 bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
|
jpayne@69
|
199 int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
|
jpayne@69
|
200 void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew);
|
jpayne@69
|
201 uint64_t initParametersFromCapnp(const char * file);
|
jpayne@69
|
202 void setReferenceName(int i, const std::string name) {references[i].name = name;}
|
jpayne@69
|
203 void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;}
|
jpayne@69
|
204 bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
|
jpayne@69
|
205 void useThreadOutput(SketchOutput * output);
|
jpayne@69
|
206 void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
|
jpayne@69
|
207 bool writeToFile() const;
|
jpayne@69
|
208 int writeToCapnp(const char * file) const;
|
jpayne@69
|
209
|
jpayne@69
|
210 private:
|
jpayne@69
|
211
|
jpayne@69
|
212 void createIndex();
|
jpayne@69
|
213
|
jpayne@69
|
214 std::vector<Reference> references;
|
jpayne@69
|
215 robin_hood::unordered_map<std::string, int> referenceIndecesById;
|
jpayne@69
|
216 std::vector<std::vector<PositionHash>> positionHashesByReference;
|
jpayne@69
|
217 robin_hood::unordered_map<hash_t, std::vector<Locus>> lociByHash;
|
jpayne@69
|
218
|
jpayne@69
|
219 Parameters parameters;
|
jpayne@69
|
220 double kmerSpace;
|
jpayne@69
|
221 std::string file;
|
jpayne@69
|
222 };
|
jpayne@69
|
223
|
jpayne@69
|
224 void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters);
|
jpayne@69
|
225 void getMinHashPositions(std::vector<Sketch::PositionHash> & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0);
|
jpayne@69
|
226 bool hasSuffix(std::string const & whole, std::string const & suffix);
|
jpayne@69
|
227 Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input);
|
jpayne@69
|
228 void reverseComplement(const char * src, char * dest, int length);
|
jpayne@69
|
229 void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters);
|
jpayne@69
|
230 void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes);
|
jpayne@69
|
231 Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input);
|
jpayne@69
|
232 Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input);
|
jpayne@69
|
233
|
jpayne@69
|
234 int def(int fdSource, int fdDest, int level);
|
jpayne@69
|
235 int inf(int fdSource, int fdDest);
|
jpayne@69
|
236 void zerr(int ret);
|
jpayne@69
|
237
|
jpayne@69
|
238 #endif
|