jpayne@69: // Copyright © 2015, Battelle National Biodefense Institute (BNBI); jpayne@69: // all rights reserved. Authored by: Brian Ondov, Todd Treangen, jpayne@69: // Sergey Koren, and Adam Phillippy jpayne@69: // jpayne@69: // See the LICENSE.txt file included with this software for license information. jpayne@69: jpayne@69: #ifndef Sketch_h jpayne@69: #define Sketch_h jpayne@69: jpayne@69: #include "mash/capnp/MinHash.capnp.h" jpayne@69: #include "robin_hood.h" jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: #include "MinHashHeap.h" jpayne@69: #include "ThreadPool.h" jpayne@69: jpayne@69: static const char * capnpHeader = "Cap'n Proto"; jpayne@69: static const int capnpHeaderLength = strlen(capnpHeader); jpayne@69: jpayne@69: static const char * suffixSketch = ".msh"; jpayne@69: static const char * suffixSketchWindowed = ".msw"; jpayne@69: jpayne@69: static const char * alphabetNucleotide = "ACGT"; jpayne@69: static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY"; jpayne@69: jpayne@69: class Sketch jpayne@69: { jpayne@69: public: jpayne@69: jpayne@69: typedef uint64_t hash_t; jpayne@69: jpayne@69: struct Parameters jpayne@69: { jpayne@69: Parameters() jpayne@69: : jpayne@69: parallelism(1), jpayne@69: kmerSize(0), jpayne@69: alphabetSize(0), jpayne@69: preserveCase(false), jpayne@69: use64(false), jpayne@69: seed(0), jpayne@69: error(0), jpayne@69: warning(0), jpayne@69: minHashesPerWindow(0), jpayne@69: windowSize(0), jpayne@69: windowed(false), jpayne@69: concatenated(false), jpayne@69: noncanonical(false), jpayne@69: reads(false), jpayne@69: memoryBound(0), jpayne@69: minCov(1), jpayne@69: targetCov(0), jpayne@69: genomeSize(0) jpayne@69: { jpayne@69: memset(alphabet, 0, 256); jpayne@69: } jpayne@69: jpayne@69: Parameters(const Parameters & other) jpayne@69: : jpayne@69: parallelism(other.parallelism), jpayne@69: kmerSize(other.kmerSize), jpayne@69: alphabetSize(other.alphabetSize), jpayne@69: preserveCase(other.preserveCase), jpayne@69: use64(other.use64), jpayne@69: seed(other.seed), jpayne@69: error(other.error), jpayne@69: warning(other.warning), jpayne@69: minHashesPerWindow(other.minHashesPerWindow), jpayne@69: windowSize(other.windowSize), jpayne@69: windowed(other.windowed), jpayne@69: concatenated(other.concatenated), jpayne@69: noncanonical(other.noncanonical), jpayne@69: reads(other.reads), jpayne@69: memoryBound(other.memoryBound), jpayne@69: minCov(other.minCov), jpayne@69: targetCov(other.targetCov), jpayne@69: genomeSize(other.genomeSize) jpayne@69: { jpayne@69: memcpy(alphabet, other.alphabet, 256); jpayne@69: } jpayne@69: jpayne@69: int parallelism; jpayne@69: int kmerSize; jpayne@69: bool alphabet[256]; jpayne@69: uint32_t alphabetSize; jpayne@69: bool preserveCase; jpayne@69: bool use64; jpayne@69: uint32_t seed; jpayne@69: double error; jpayne@69: double warning; jpayne@69: uint64_t minHashesPerWindow; jpayne@69: uint64_t windowSize; jpayne@69: bool windowed; jpayne@69: bool concatenated; jpayne@69: bool noncanonical; jpayne@69: bool reads; jpayne@69: uint64_t memoryBound; jpayne@69: uint32_t minCov; jpayne@69: double targetCov; jpayne@69: uint64_t genomeSize; jpayne@69: }; jpayne@69: jpayne@69: struct PositionHash jpayne@69: { jpayne@69: PositionHash(uint32_t positionNew, hash_t hashNew) : jpayne@69: position(positionNew), jpayne@69: hash(hashNew) jpayne@69: {} jpayne@69: jpayne@69: uint32_t position; jpayne@69: hash_t hash; jpayne@69: }; jpayne@69: jpayne@69: struct Locus jpayne@69: { jpayne@69: Locus(uint32_t sequenceNew, uint32_t positionNew) jpayne@69: : jpayne@69: sequence(sequenceNew), jpayne@69: position(positionNew) jpayne@69: {} jpayne@69: jpayne@69: uint32_t sequence; jpayne@69: uint32_t position; jpayne@69: }; jpayne@69: jpayne@69: struct Reference jpayne@69: { jpayne@69: // no sequence for now jpayne@69: jpayne@69: std::string name; jpayne@69: std::string comment; jpayne@69: uint64_t length; jpayne@69: HashList hashesSorted; jpayne@69: std::vector counts; jpayne@69: }; jpayne@69: jpayne@69: struct SketchInput jpayne@69: { jpayne@69: SketchInput(std::vector fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew) jpayne@69: : jpayne@69: fileNames(fileNamesNew), jpayne@69: seq(seqNew), jpayne@69: length(lengthNew), jpayne@69: name(nameNew), jpayne@69: comment(commentNew), jpayne@69: parameters(parametersNew) jpayne@69: {} jpayne@69: jpayne@69: ~SketchInput() jpayne@69: { jpayne@69: if ( seq != 0 ) jpayne@69: { jpayne@69: delete [] seq; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: std::vector fileNames; jpayne@69: jpayne@69: char * seq; jpayne@69: jpayne@69: uint64_t length; jpayne@69: jpayne@69: std::string name; jpayne@69: std::string comment; jpayne@69: jpayne@69: Sketch::Parameters parameters; jpayne@69: }; jpayne@69: jpayne@69: struct SketchOutput jpayne@69: { jpayne@69: std::vector references; jpayne@69: std::vector> positionHashesByReference; jpayne@69: }; jpayne@69: jpayne@69: void getAlphabetAsString(std::string & alphabet) const; jpayne@69: uint32_t getAlphabetSize() const {return parameters.alphabetSize;} jpayne@69: bool getConcatenated() const {return parameters.concatenated;} jpayne@69: float getError() const {return parameters.error;} jpayne@69: int getHashCount() const {return lociByHash.size();} jpayne@69: uint32_t getHashSeed() const {return parameters.seed;} jpayne@69: const std::vector & getLociByHash(hash_t hash) const; jpayne@69: float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;} jpayne@69: int getMinKmerSize(uint64_t reference) const; jpayne@69: bool getPreserveCase() const {return parameters.preserveCase;} jpayne@69: double getRandomKmerChance(uint64_t reference) const; jpayne@69: const Reference & getReference(uint64_t index) const {return references.at(index);} jpayne@69: uint64_t getReferenceCount() const {return references.size();} jpayne@69: void getReferenceHistogram(uint64_t index, std::map & histogram) const; jpayne@69: uint64_t getReferenceIndex(std::string id) const; jpayne@69: int getKmerSize() const {return parameters.kmerSize;} jpayne@69: double getKmerSpace() const {return kmerSpace;} jpayne@69: bool getUse64() const {return parameters.use64;} jpayne@69: uint64_t getWindowSize() const {return parameters.windowSize;} jpayne@69: bool getNoncanonical() const {return parameters.noncanonical;} jpayne@69: bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;} jpayne@69: bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);} jpayne@69: int initFromFiles(const std::vector & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false); jpayne@69: void initFromReads(const std::vector & files, const Parameters & parametersNew); jpayne@69: uint64_t initParametersFromCapnp(const char * file); jpayne@69: void setReferenceName(int i, const std::string name) {references[i].name = name;} jpayne@69: void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;} jpayne@69: bool sketchFileBySequence(FILE * file, ThreadPool * threadPool); jpayne@69: void useThreadOutput(SketchOutput * output); jpayne@69: void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const; jpayne@69: bool writeToFile() const; jpayne@69: int writeToCapnp(const char * file) const; jpayne@69: jpayne@69: private: jpayne@69: jpayne@69: void createIndex(); jpayne@69: jpayne@69: std::vector references; jpayne@69: robin_hood::unordered_map referenceIndecesById; jpayne@69: std::vector> positionHashesByReference; jpayne@69: robin_hood::unordered_map> lociByHash; jpayne@69: jpayne@69: Parameters parameters; jpayne@69: double kmerSpace; jpayne@69: std::string file; jpayne@69: }; jpayne@69: jpayne@69: void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters); jpayne@69: void getMinHashPositions(std::vector & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0); jpayne@69: bool hasSuffix(std::string const & whole, std::string const & suffix); jpayne@69: Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input); jpayne@69: void reverseComplement(const char * src, char * dest, int length); jpayne@69: void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters); jpayne@69: void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes); jpayne@69: Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input); jpayne@69: Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input); jpayne@69: jpayne@69: int def(int fdSource, int fdDest, int level); jpayne@69: int inf(int fdSource, int fdDest); jpayne@69: void zerr(int ret); jpayne@69: jpayne@69: #endif