Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/MinHashHeap.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/MinHashHeap.h Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,50 @@ +#ifndef HashHeapCounted_h +#define HashHeapCounted_h + +#include "HashList.h" +#include "HashPriorityQueue.h" +#include "HashSet.h" +#include <math.h> +#include "bloom_filter.hpp" + +class MinHashHeap +{ +public: + + MinHashHeap(bool use64New, uint64_t cardinalityMaximumNew, uint64_t multiplicityMinimumNew = 1, uint64_t memoryBoundBytes = 0); + ~MinHashHeap(); + void computeStats(); + void clear(); + double estimateMultiplicity() const; + double estimateSetSize() const; + void toCounts(std::vector<uint32_t> & counts) const; + void toHashList(HashList & hashList) const; + void tryInsert(hash_u hash); + +private: + + bool use64; + + HashSet hashes; + HashPriorityQueue hashesQueue; + + HashSet hashesPending; + HashPriorityQueue hashesQueuePending; + + uint64_t cardinalityMaximum; + uint64_t multiplicityMinimum; + + uint64_t multiplicitySum; + + bloom_filter * bloomFilter; + + uint64_t kmersTotal; + uint64_t kmersUsed; +}; + +inline double MinHashHeap::estimateMultiplicity() const {return hashes.size() ? (double)multiplicitySum / hashes.size() : 0;} +inline double MinHashHeap::estimateSetSize() const {return hashes.size() ? pow(2.0, use64 ? 64.0 : 32.0) * (double)hashes.size() / (use64 ? (double)hashesQueue.top().hash64 : (double)hashesQueue.top().hash32) : 0;} +inline void MinHashHeap::toHashList(HashList & hashList) const {hashes.toHashList(hashList);} +inline void MinHashHeap::toCounts(std::vector<uint32_t> & counts) const {hashes.toCounts(counts);} + +#endif