view CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/mash/Sketch.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
line wrap: on
line source
// Copyright © 2015, Battelle National Biodefense Institute (BNBI);
// all rights reserved. Authored by: Brian Ondov, Todd Treangen,
// Sergey Koren, and Adam Phillippy
//
// See the LICENSE.txt file included with this software for license information.

#ifndef Sketch_h
#define Sketch_h

#include "mash/capnp/MinHash.capnp.h"
#include "robin_hood.h"
#include <map>
#include <vector>
#include <string>
#include <string.h>
#include "MinHashHeap.h"
#include "ThreadPool.h"

static const char * capnpHeader = "Cap'n Proto";
static const int capnpHeaderLength = strlen(capnpHeader);

static const char * suffixSketch = ".msh";
static const char * suffixSketchWindowed = ".msw";

static const char * alphabetNucleotide = "ACGT";
static const char * alphabetProtein = "ACDEFGHIKLMNPQRSTVWY";

class Sketch
{
public:
    
    typedef uint64_t hash_t;
    
    struct Parameters
    {
        Parameters()
            :
            parallelism(1),
            kmerSize(0),
            alphabetSize(0),
            preserveCase(false),
            use64(false),
            seed(0),
            error(0),
            warning(0),
            minHashesPerWindow(0),
            windowSize(0),
            windowed(false),
            concatenated(false),
            noncanonical(false),
            reads(false),
            memoryBound(0),
            minCov(1),
            targetCov(0),
            genomeSize(0)
        {
        	memset(alphabet, 0, 256);
        }
        
        Parameters(const Parameters & other)
            :
            parallelism(other.parallelism),
            kmerSize(other.kmerSize),
            alphabetSize(other.alphabetSize),
            preserveCase(other.preserveCase),
            use64(other.use64),
            seed(other.seed),
            error(other.error),
            warning(other.warning),
            minHashesPerWindow(other.minHashesPerWindow),
            windowSize(other.windowSize),
            windowed(other.windowed),
            concatenated(other.concatenated),
            noncanonical(other.noncanonical),
            reads(other.reads),
            memoryBound(other.memoryBound),
            minCov(other.minCov),
            targetCov(other.targetCov),
            genomeSize(other.genomeSize)
		{
			memcpy(alphabet, other.alphabet, 256);
		}
        
        int parallelism;
        int kmerSize;
        bool alphabet[256];
        uint32_t alphabetSize;
        bool preserveCase;
        bool use64;
        uint32_t seed;
        double error;
        double warning;
        uint64_t minHashesPerWindow;
        uint64_t windowSize;
        bool windowed;
        bool concatenated;
        bool noncanonical;
        bool reads;
        uint64_t memoryBound;
        uint32_t minCov;
        double targetCov;
        uint64_t genomeSize;
    };
    
    struct PositionHash
    {
        PositionHash(uint32_t positionNew, hash_t hashNew) :
            position(positionNew),
            hash(hashNew)
            {}

        uint32_t position;
        hash_t hash;
    };
    
    struct Locus
    {
        Locus(uint32_t sequenceNew, uint32_t positionNew)
            :
            sequence(sequenceNew),
            position(positionNew)
            {}
        
        uint32_t sequence;
        uint32_t position;
    };
    
    struct Reference
    {
        // no sequence for now
        
        std::string name;
        std::string comment;
        uint64_t length;
        HashList hashesSorted;
        std::vector<uint32_t> counts;
    };
    
    struct SketchInput
    {
    	SketchInput(std::vector<std::string> fileNamesNew, char * seqNew, uint64_t lengthNew, const std::string & nameNew, const std::string & commentNew, const Sketch::Parameters & parametersNew)
    	:
    	fileNames(fileNamesNew),
    	seq(seqNew),
    	length(lengthNew),
    	name(nameNew),
    	comment(commentNew),
    	parameters(parametersNew)
    	{}
    	
    	~SketchInput()
    	{
    		if ( seq != 0 )
    		{
	    		delete [] seq;
	    	}
    	}
    	
    	std::vector<std::string> fileNames;
    	
    	char * seq;
    	
    	uint64_t length;
    	
    	std::string name;
    	std::string comment;
    	
    	Sketch::Parameters parameters;
    };
    
    struct SketchOutput
    {
    	std::vector<Reference> references;
	    std::vector<std::vector<PositionHash>> positionHashesByReference;
    };
    
    void getAlphabetAsString(std::string & alphabet) const;
    uint32_t getAlphabetSize() const {return parameters.alphabetSize;}
    bool getConcatenated() const {return parameters.concatenated;}
    float getError() const {return parameters.error;}
    int getHashCount() const {return lociByHash.size();}
    uint32_t getHashSeed() const {return parameters.seed;}
    const std::vector<Locus> & getLociByHash(hash_t hash) const;
    float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;}
	int getMinKmerSize(uint64_t reference) const;
	bool getPreserveCase() const {return parameters.preserveCase;}
	double getRandomKmerChance(uint64_t reference) const;
    const Reference & getReference(uint64_t index) const {return references.at(index);}
    uint64_t getReferenceCount() const {return references.size();}
    void getReferenceHistogram(uint64_t index, std::map<uint32_t, uint64_t> & histogram) const;
    uint64_t getReferenceIndex(std::string id) const;
    int getKmerSize() const {return parameters.kmerSize;}
    double getKmerSpace() const {return kmerSpace;}
    bool getUse64() const {return parameters.use64;}
    uint64_t getWindowSize() const {return parameters.windowSize;}
    bool getNoncanonical() const {return parameters.noncanonical;}
    bool hasHashCounts() const {return references.size() > 0 && references.at(0).counts.size() > 0;}
    bool hasLociByHash(hash_t hash) const {return lociByHash.count(hash);}
    int initFromFiles(const std::vector<std::string> & files, const Parameters & parametersNew, int verbosity = 0, bool enforceParameters = false, bool contain = false);
    void initFromReads(const std::vector<std::string> & files, const Parameters & parametersNew);
    uint64_t initParametersFromCapnp(const char * file);
    void setReferenceName(int i, const std::string name) {references[i].name = name;}
    void setReferenceComment(int i, const std::string comment) {references[i].comment = comment;}
	bool sketchFileBySequence(FILE * file, ThreadPool<Sketch::SketchInput, Sketch::SketchOutput> * threadPool);
	void useThreadOutput(SketchOutput * output);
    void warnKmerSize(uint64_t lengthMax, const std::string & lengthMaxName, double randomChance, int kMin, int warningCount) const;
    bool writeToFile() const;
    int writeToCapnp(const char * file) const;
    
private:
    
    void createIndex();
    
    std::vector<Reference> references;
    robin_hood::unordered_map<std::string, int> referenceIndecesById;
    std::vector<std::vector<PositionHash>> positionHashesByReference;
    robin_hood::unordered_map<hash_t, std::vector<Locus>> lociByHash;
    
    Parameters parameters;
    double kmerSpace;
    std::string file;
};

void addMinHashes(MinHashHeap & minHashHeap, char * seq, uint64_t length, const Sketch::Parameters & parameters);
void getMinHashPositions(std::vector<Sketch::PositionHash> & loci, char * seq, uint32_t length, const Sketch::Parameters & parameters, int verbosity = 0);
bool hasSuffix(std::string const & whole, std::string const & suffix);
Sketch::SketchOutput * loadCapnp(Sketch::SketchInput * input);
void reverseComplement(const char * src, char * dest, int length);
void setAlphabetFromString(Sketch::Parameters & parameters, const char * characters);
void setMinHashesForReference(Sketch::Reference & reference, const MinHashHeap & hashes);
Sketch::SketchOutput * sketchFile(Sketch::SketchInput * input);
Sketch::SketchOutput * sketchSequence(Sketch::SketchInput * input);

int def(int fdSource, int fdDest, int level);
int inf(int fdSource, int fdDest);
void zerr(int ret);

#endif