diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/Sketch.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/Sketch.h	Tue Mar 18 17:55:14 2025 -0400
@@ -0,0 +1,238 @@
+// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
+// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
+// Sergey Koren, and Adam Phillippy
+//
+// See the LICENSE.txt file included with this software for license information.
+
+#ifndef Sketch_h
+#define Sketch_h
+
+#include "mash/capnp/MinHash.capnp.h"
+#include "robin_hood.h"
+#include <map>
+#include <vector>
+#include <string>
+#include <string.h>
+#include "MinHashHeap.h"
+#include "ThreadPool.h"
+
+static const char * capnpHeader = "Cap'n Proto";
+static const int capnpHeaderLength = strlen(capnpHeader);
+
+static const char * suffixSketch = ".msh";
+static const char * suffixSketchWindowed = ".msw";
+
+static const char * alphabetNucleotide = "ACGT";
+static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY";
+
+class Sketch
+{
+public:
+    
+    typedef uint64_t hash_t;
+    
+    struct Parameters
+    {
+        Parameters()
+            :
+            parallelism(1),
+            kmerSize(0),
+            alphabetSize(0),
+            preserveCase(false),
+            use64(false),
+            seed(0),
+            error(0),
+            warning(0),
+            minHashesPerWindow(0),
+            windowSize(0),
+            windowed(false),
+            concatenated(false),
+            noncanonical(false),
+            reads(false),
+            memoryBound(0),
+            minCov(1),
+            targetCov(0),
+            genomeSize(0)
+        {
+        	memset(alphabet, 0, 256);
+        }
+        
+        Parameters(const Parameters & other)
+            :
+            parallelism(other.parallelism),
+            kmerSize(other.kmerSize),
+            alphabetSize(other.alphabetSize),
+            preserveCase(other.preserveCase),
+            use64(other.use64),
+            seed(other.seed),
+            error(other.error),
+            warning(other.warning),
+            minHashesPerWindow(other.minHashesPerWindow),
+            windowSize(other.windowSize),
+            windowed(other.windowed),
+            concatenated(other.concatenated),
+            noncanonical(other.noncanonical),
+            reads(other.reads),
+            memoryBound(other.memoryBound),
+            minCov(other.minCov),
+            targetCov(other.targetCov),
+            genomeSize(other.genomeSize)
+		{
+			memcpy(alphabet, other.alphabet, 256);
+		}
+        
+        int parallelism;
+        int kmerSize;
+        bool alphabet[256];
+        uint32_t alphabetSize;
+        bool preserveCase;
+        bool use64;
+        uint32_t seed;
+        double error;
+        double warning;
+        uint64_t minHashesPerWindow;
+        uint64_t windowSize;
+        bool windowed;
+        bool concatenated;
+        bool noncanonical;
+        bool reads;
+        uint64_t memoryBound;
+        uint32_t minCov;
+        double targetCov;
+        uint64_t genomeSize;
+    };
+    
+    struct PositionHash
+    {
+        PositionHash(uint32_t positionNew, hash_t hashNew) :
+            position(positionNew),
+            hash(hashNew)
+            {}
+
+        uint32_t position;
+        hash_t hash;
+    };
+    
+    struct Locus
+    {
+        Locus(uint32_t sequenceNew, uint32_t positionNew)
+            :
+            sequence(sequenceNew),
+            position(positionNew)
+            {}
+        
+        uint32_t sequence;
+        uint32_t position;
+    };
+    
+    struct Reference
+    {
+        // no sequence for now
+        
+        std::string name;
+        std::string comment;
+        uint64_t length;
+        HashList hashesSorted;
+        std::vector<uint32_t> counts;
+    };
+    
+    struct SketchInput
+    {
+    	SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
+    	:
+    	fileNames(fileNamesNew),
+    	seq(seqNew),
+    	length(lengthNew),
+    	name(nameNew),
+    	comment(commentNew),
+    	parameters(parametersNew)
+    	{}
+    	
+    	~SketchInput()
+    	{
+    		if ( seq != 0 )
+    		{
+	    		delete [] seq;
+	    	}
+    	}
+    	
+    	std::vector<std::string> fileNames;
+    	
+    	char * seq;
+    	
+    	uint64_t length;
+    	
+    	std::string name;
+    	std::string comment;
+    	
+    	Sketch::Parameters parameters;
+    };
+    
+    struct SketchOutput
+    {
+    	std::vector<Reference> references;
+	    std::vector<std::vector<PositionHash>> positionHashesByReference;
+    };
+    
+    void getAlphabetAsString(std::string & alphabet) const;
+    uint32_t getAlphabetSize() const {return parameters.alphabetSize;}
+    bool getConcatenated() const {return parameters.concatenated;}
+    float getError() const {return parameters.error;}
+    int getHashCount() const {return lociByHash.size();}
+    uint32_t getHashSeed() const {return parameters.seed;}
+    const std::vector<Locus> & getLociByHash(hash_t hash) const;
+    float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
+	int getMinKmerSize(uint64_t reference) const;
+	bool getPreserveCase() const {return parameters.preserveCase;}
+	double getRandomKmerChance(uint64_t reference) const;
+    const Reference & getReference(uint64_t index) const {return references.at(index);}
+    uint64_t getReferenceCount() const {return references.size();}
+    void getReferenceHistogram(uint64_t index, std::map<uint32_t, uint64_t> & histogram) const;
+    uint64_t getReferenceIndex(std::string id) const;
+    int getKmerSize() const {return parameters.kmerSize;}
+    double getKmerSpace() const {return kmerSpace;}
+    bool getUse64() const {return parameters.use64;}
+    uint64_t getWindowSize() const {return parameters.windowSize;}
+    bool getNoncanonical() const {return parameters.noncanonical;}
+    bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
+    bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
+    int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
+    void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew);
+    uint64_t initParametersFromCapnp(const char * file);
+    void setReferenceName(int i, const std::string name) {references[i].name = name;}
+    void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;}
+	bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
+	void useThreadOutput(SketchOutput * output);
+    void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
+    bool writeToFile() const;
+    int writeToCapnp(const char * file) const;
+    
+private:
+    
+    void createIndex();
+    
+    std::vector<Reference> references;
+    robin_hood::unordered_map<std::string, int> referenceIndecesById;
+    std::vector<std::vector<PositionHash>> positionHashesByReference;
+    robin_hood::unordered_map<hash_t, std::vector<Locus>> lociByHash;
+    
+    Parameters parameters;
+    double kmerSpace;
+    std::string file;
+};
+
+void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters);
+void getMinHashPositions(std::vector<Sketch::PositionHash> & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0);
+bool hasSuffix(std::string const & whole, std::string const & suffix);
+Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input);
+void reverseComplement(const char * src, char * dest, int length);
+void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters);
+void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes);
+Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input);
+Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input);
+
+int def(int fdSource, int fdDest, int level);
+int inf(int fdSource, int fdDest);
+void zerr(int ret);
+
+#endif