diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/MinHashHeap.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/MinHashHeap.h	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,50 @@
+#ifndef HashHeapCounted_h
+#define HashHeapCounted_h
+
+#include "HashList.h"
+#include "HashPriorityQueue.h"
+#include "HashSet.h"
+#include <math.h>
+#include "bloom_filter.hpp"
+
+class MinHashHeap // Takes hashes via tryInsert() and exposes multiplicity / set-size estimates computed from its HashSet and HashPriorityQueue members.
+{
+public:
+	// Only the four inline methods after this class are defined in this header; every other member function is defined elsewhere.
+	MinHashHeap(bool use64New, uint64_t cardinalityMaximumNew, uint64_t multiplicityMinimumNew = 1, uint64_t memoryBoundBytes = 0); // multiplicityMinimumNew defaults to 1, memoryBoundBytes to 0
+	~MinHashHeap(); // NOTE(review): no copy constructor/assignment is declared; if this destructor frees bloomFilter, copying an instance would be unsafe -- confirm against the definition
+	void computeStats(); // defined out of line; presumably reports kmersTotal/kmersUsed -- TODO confirm
+	void clear(); // defined out of line
+	double estimateMultiplicity() const; // multiplicitySum / hashes.size(), or 0 when hashes is empty (inline, below the class)
+	double estimateSetSize() const; // 2^64 (use64) or 2^32 times hashes.size(), divided by the hash at hashesQueue.top(); 0 when hashes is empty (inline, below the class)
+	void toCounts(std::vector<uint32_t> & counts) const; // forwards to hashes.toCounts(counts)
+    void toHashList(HashList & hashList) const; // forwards to hashes.toHashList(hashList)
+	void tryInsert(hash_u hash); // defined out of line; the name suggests the hash may be rejected -- see the definition
+
+private:
+
+	bool use64; // picks hash64 (true) or hash32 (false) when estimateSetSize() reads hashesQueue.top()
+	
+	HashSet hashes; // size() is the denominator of estimateMultiplicity(); also the source for toCounts()/toHashList()
+	HashPriorityQueue hashesQueue; // top() supplies the divisor in estimateSetSize()
+	
+	HashSet hashesPending; // presumably hashes not yet seen multiplicityMinimum times -- TODO confirm
+	HashPriorityQueue hashesQueuePending; // presumably the queue paired with hashesPending -- TODO confirm
+	
+	uint64_t cardinalityMaximum; // presumably the cap on retained hashes (see constructor argument cardinalityMaximumNew) -- TODO confirm
+	uint64_t multiplicityMinimum; // presumably set from constructor argument multiplicityMinimumNew -- TODO confirm
+	
+	uint64_t multiplicitySum; // numerator of estimateMultiplicity()
+	
+    bloom_filter * bloomFilter; // raw pointer; allocation and ownership are not visible in this header
+    
+    uint64_t kmersTotal; // not read by any code in this header; presumably a counter -- TODO confirm
+    uint64_t kmersUsed; // not read by any code in this header; presumably a counter -- TODO confirm
+};
+
+inline double MinHashHeap::estimateMultiplicity() const {if ( !hashes.size() ) return 0; return (double)multiplicitySum / hashes.size();} // multiplicitySum divided by the number of entries in hashes; 0 when hashes is empty
+inline double MinHashHeap::estimateSetSize() const {if ( !hashes.size() ) return 0; const double hashSpace = pow(2.0, use64 ? 64.0 : 32.0); const double hashTop = use64 ? (double)hashesQueue.top().hash64 : (double)hashesQueue.top().hash32; return hashSpace * (double)hashes.size() / hashTop;} // size of the hash space times hashes.size(), divided by the hash at the top of hashesQueue; 0 when hashes is empty
+inline void MinHashHeap::toHashList(HashList & hashList) const {hashes.toHashList(hashList);} // forwards to HashSet::toHashList on the hashes member; hashList is passed through by reference
+inline void MinHashHeap::toCounts(std::vector<uint32_t> & counts) const {hashes.toCounts(counts);} // forwards to HashSet::toCounts on the hashes member; counts is passed through by reference
+
+#endif