rliterman@0
|
# Every target in this file is a command, not a file to be built. Declaring
# them phony keeps a stray file or directory with the same name (e.g. `test`,
# `help`) from making the target look "up to date" and skipping its recipe.
.PHONY: usage version versions help config screen snp snpdiffs test_screen test_snp test

# Run each recipe in a single shell invocation. The `config` recipe relies on
# this because its here-document spans several recipe lines.
.ONESHELL:
|
rliterman@0
|
4
|
rliterman@0
|
5
|
rliterman@0
|
6
|
rliterman@0
|
# usage: self-documenting menu. Scans the makefiles in $(MAKEFILE_LIST) for
# rule lines of the form `name: ... ## description` and prints each target
# name (ANSI colour code 36, padded to 30 columns) followed by its
# description. `$$` is Make's escape for a literal `$` handed to grep/awk.
# NOTE(review): this is the first ordinary target in the visible file, so it
# is presumably what a bare `make` runs -- confirm no .DEFAULT_GOAL is set.
usage: ## Show this menu
	@grep -E '^[a-zA-Z_-]+:.*?##.*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?##"}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
|
rliterman@0
|
9
|
rliterman@0
|
# version: print the CSP2 version and branch. Both values are read by the
# shell (note the `$$`) from the CSP2_VER and CSP2_BRANCH environment
# variables at run time; an unset variable prints as an empty string.
version: ## Show version and branch
	@echo "CSP2 v$${CSP2_VER}/$${CSP2_BRANCH}"
|
rliterman@0
|
12
|
rliterman@0
|
# Environment variables the recipes below report as "container says ...".
# NOTE(review): presumably these mirror the ENV lines of the container build
# file -- confirm against the image definition.
# ENV CSP2_VER=${CSP2_VER}
# ENV BEDTOOLS_VER=${BEDTOOLS_VER}
# ENV MUMMER_VER=${MUMMER_VER}
# ENV SKESA_VER=${SKESA_VER}
# ENV MASH_VER=${MASH_VER}
# ENV BBMAP_VER=${BBMAP_VER}
# ENV PYTHON_VER=${PYTHON_VER}

# versions: after printing the CSP2 version (prerequisite `version`), run each
# tool's own version command and print it next to the corresponding *_VER
# value. The ${..._VER} references are expanded by Make, which picks them up
# from the environment; an unset one expands to nothing. skesa and bbmap.sh
# have stderr folded into stdout (2>&1) so that output is captured as well.
versions: version ## Show versions of key installed dependencies
	@echo `nextflow -v`
	@echo `python3 --version` " (container says ${PYTHON_VER})"
	@echo `bedtools --version` " (container says ${BEDTOOLS_VER})"
	@echo "mummer " `mummer --version` " (container says ${MUMMER_VER})"
	@echo `skesa --version 2>&1` " (container says ${SKESA_VER})"
	@echo "mash " `mash --version` " (container says ${MASH_VER})"
	@echo `bbmap.sh --version 2>&1` " (container says ${BBMAP_VER})"
|
rliterman@0
|
29
|
rliterman@0
|
# help: print the citation, a short description of CSP2, its two run modes
# and the command-line options accepted by `screen` / `snp`.
#
# Quoting notes:
#   * `$$` hands a literal `$` to the shell; `\$$CWD` therefore prints the
#     text "$CWD" instead of letting Make expand `$C` followed by "WD".
#   * Double quotes inside a double-quoted echo argument are escaped (\") so
#     the shell prints them instead of treating them as string delimiters.
#   * The `\t` sequences are passed through to echo unchanged; whether they
#     render as tabs depends on the echo of the shell running the recipe.
help: ## Show help
	@echo "Citation: CFSAN SNP Pipeline 2, v$${CSP2_VER}, Literman et al. 2024"
	@echo
	@echo "CSP2 is a Nextflow pipeline for rapid, accurate SNP distance estimation"
	@echo "from assembly data."
	@echo
	@echo "Please see: https://github.com/CFSAN-Biostatistics/CSP2"
	@echo
	@echo "CSP2 runs are managed via Nextflow, providing the user with an array of"
	@echo "customizations while also facilitating module development and additions in"
	@echo "future releases."
	@echo
	@echo "Important Note: The software continues to be focused on the analysis of"
	@echo "groups of bacterial genomes with limited evolutionary differences (<1000"
	@echo "SNPs). Testing is underway to determine how the underlying cluster"
	@echo "diversity impacts distances estimates."
	@echo
	@echo "CSP2 has two main run modes:"
	@echo "1) \"Screening Mode\" (screen): Used to determine whether query isolates are"
	@echo "close to a set of reference isolates (e.g., lab control strains, strains"
	@echo "related to an outbreak, etc.) Given one or more user-provided reference"
	@echo "isolates (--ref_reads; --ref_fasta), get alignment statistics and SNP"
	@echo "distances between all reference and query isolates (--reads; --fasta)"
	@echo
	@echo "2) \"SNP Pipeline Mode\" (snp): Used to generate pairwise distances and"
	@echo "alignments for a set of query isolates Generate pairwise SNP distances and"
	@echo "alignments for 2+ isolates (--reads; --fasta) based on comparisons to:"
	@echo
	@echo "One or more user-provided references (--ref_reads; --ref_fasta), or One or"
	@echo "more reference isolates selected by RefChooser (--n_ref)"
	@echo
	@echo "Usage: screen [options] {--fasta PATH {--reads=PATH | --forward=STR --reverse=STR} --out=PATH}"
	@echo " or snp [options] {--fasta {--reads=PATH | --forward=STR --reverse=STR} --out=PATH}"
	@echo
	@echo "Options:"
	@echo " --outroot=PATH\tBase directory to create output folder [default=\$$CWD] "
	@echo " --out=PATH\t\tName of the output folder to create (must not exist)"
	@echo "\t\t\t [default=CSP2_<current_datetime>]"
	@echo " --forward=STR\t\tFull file extension for forward/left reads of query"
	@echo "\t\t\t [default='_1.fastq.gz']"
	@echo " --reverse=STR\t\tFull file extension for reverse/right reads of query"
	@echo "\t\t\t [default='_2.fastq.gz']"
	@echo " --ref_forward=STR\tFull file extension for forward/left reads of reference"
	@echo "\t\t\t [default='_1.fastq.gz']"
	@echo " --ref_reverse=STR\tFull file extension for reverse/right reads of reference"
	@echo "\t\t\t [default='_2.fastq.gz']"
	@echo " --readext=STR\t\tExtension for single-end reads for query [default='fastq.gz']"
	@echo " --ref_readext=STR\tExtension for single-end reads for reference"
	@echo "\t\t\t [default='fastq.gz']"
	@echo " --min_cov=NUM\t\tDo not analyze queries that cover less than <min_cov>% of the"
	@echo "\t\t\treference assembly [default=85]"
	@echo " --min_iden=NUM\tOnly consider alignments where the percent identity is at least"
	@echo "\t\t\t <min_iden> [default=99]"
	@echo " --min_len=NUM\t\tOnly consider alignments that span at least <min_len> in bp"
	@echo "\t\t\t [default=500]"
	@echo " --dwin=LIST\t\tA comma-separated list of windows to check SNP densities"
	@echo "\t\t\t [default=1000,125,15]"
	@echo " --wsnps=LIST\t\tThe maximum number of SNPs allowed in the corresponding window from"
	@echo "\t\t\t --dwin [default=3,2,1]"
	@echo " --query_edge=NUM\tOnly consider SNPs that occur within <query_edge>bp of the end"
	@echo "\t\t\t of a query contig [default=250]"
	@echo " --ref_edge=NUM\tOnly consider SNPs that occur within <ref_edge>bp of the end"
	@echo "\t\t\t of a reference contig [default=250]"
	@echo " --n_ref=NUM\t\tThe number of RefChooser reference isolates to consider (only"
	@echo "\t\t\t applied if using RefChooser) [default=3]"
	@echo " --reads=PATH\t\tLocation of query read data (Path to directory, or path to file with"
	@echo "\t\t\t multiple directories)"
	@echo " --fasta=PATH\t\tLocation of query assembly data (Path to directory containing"
	@echo "\t\t\t FASTAs, path to FASTA, path to multiple FASTAs)"
	@echo " --ref_reads=PATH\tLocation of reference read data (Path to directory, or path to"
	@echo "\t\t\t file with multiple directories)"
	@echo " --ref_fasta=PATH\tLocation of reference assembly data (Path to directory"
	@echo "\t\t\t containing FASTAs, path to FASTA, path to multiple FASTAs)"
	@echo " --trim_name=STR\tA string in assembly file names that you want to remove from"
	@echo "\t\t\t sample IDs (e.g., _contigs_skesa)"
|
rliterman@0
|
105
|
rliterman@0
|
# config: (re)write ~/.nextflow/config with a `standard` profile that uses the
# local executor and sets params.cores to the output of `nproc --all`.
#
# * The whole recipe must run in one shell (.ONESHELL is set in this file),
#   otherwise the here-document body would be split across separate shells.
# * The output redirection belongs on the `cat` line: the closing delimiter
#   of a here-document has to be the word EOF alone on its line (with `<<-`,
#   optionally preceded by tabs), so nothing may follow it.
# * The delimiter is unquoted on purpose, so the shell performs the
#   `nproc --all` command substitution while writing the file.
# * Any existing ~/.nextflow/config is overwritten.
config:
	@mkdir -p ~/.nextflow
	cat <<- EOF > ~/.nextflow/config
	profiles {
	  standard {
	    process.executor = 'local'
	    params.cores = `nproc --all`
	  }
	}
	EOF
|
rliterman@0
|
115
|
rliterman@0
|
116
|
rliterman@0
|
# Argument forwarding for the run targets. When the first goal named on the
# command line is `screen` or `snp`, every remaining goal is captured in
# `runargs` (later appended to the nextflow command line) and is also given a
# do-nothing rule, so Make does not fail with "No rule to make target" for
# words that are really pipeline arguments.
ifneq (,$(filter screen snp,$(firstword $(MAKECMDGOALS))))
  runargs := $(wordlist 2,$(words $(MAKECMDGOALS)),$(MAKECMDGOALS))
  $(eval $(runargs):;@true)
endif
|
rliterman@0
|
126
|
rliterman@0
|
# Run targets. Each one first refreshes the Nextflow config (prerequisite
# `config`) and then launches CSP2.nf with the `standard` profile, forwarding
# any extra command-line words collected in $(runargs). The two header lines
# carry the `##` descriptions shown by `usage`; the recipe is shared, with
# `$@` supplying the run mode (the name of the target being built).
screen: config ## determine whether query isolates are close to a reference
snp: config ## generate pairwise distances for a set of query isolates

screen snp:
	nextflow run CSP2.nf -profile standard --runmode $@ $(runargs)
|
rliterman@0
|
132
|
rliterman@0
|
# snpdiffs: has no recipe of its own; building it only triggers `config`.
# NOTE(review): looks like a placeholder for a future run mode -- confirm.
snpdiffs: config
|
rliterman@0
|
134
|
rliterman@0
|
# Smoke tests against the bundled assets.
#
# Both pipeline tests run with `-profile standard`, so each declares `config`
# as its own prerequisite. That way the ordering holds under `make -j` and
# when a test target is invoked directly, instead of depending on the order
# of the prerequisites listed on `test`.

# test_screen: screening-mode run on the example assembly and reads, writing
# to ./CSP2_Test_Screen.
test_screen: config
	nextflow run CSP2.nf -profile standard --runmode screen --fasta assets/Screen/Assembly/Week_42_Assembly.fasta --reads assets/Screen/Reads/ --ref_fasta assets/Screen/Assembly/Lab_Control.fasta --out ./CSP2_Test_Screen --readext fq.gz --forward _1.fq.gz --reverse _2.fq.gz

# test_snp: SNP-pipeline-mode run on the example assemblies with three
# RefChooser references, writing to ./CSP2_Test_SNP.
test_snp: config
	nextflow run CSP2.nf -profile standard --runmode snp --fasta assets/SNP/ --n_ref 3 --out ./CSP2_Test_SNP --max_missing 50

# test: run both pipeline tests, list the reference screening output, then
# diff the freshly generated snpdiffs against the stored expected output
# (-b ignore whitespace changes, -u unified format, -r recursive). The diff is
# the last command of the recipe, so its exit status decides pass/fail.
test: config test_screen test_snp
	ls -lah assets/Screen/Output/Contamination_Screen/
	diff -bur ./CSP2_Test_SNP/snpdiffs assets/SNP/Output/Soil_Analysis/snpdiffs