comparison CSP2/CSP2.nf @ 0:01431fa12065

"planemo upload"
author rliterman
date Mon, 02 Dec 2024 10:40:55 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:01431fa12065
1 #! /usr/bin/env nextflow
2 nextflow.enable.dsl=2
3
4 // CSP2 Main Script
5 // Params are read in from command line or from nextflow.config and/or conf/profiles.config
6
7 // Check if help flag was passed
8 help1 = "${params.help}" == "nohelp" ? "nohelp" : "help"
9 help2 = "${params.h}" == "nohelp" ? "nohelp" : "help"
10
11 def printHelp() {
12 println """
13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
14 CSP2
15 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16
17 Global default params:
18
19 --out Set name for output folder/file prefixes (Default: CSP2_<timestamp>)
20 --outroot Set output parent directory (Default: CWD; Useful to hardset in nextflow.config if
21 you want all output go to the same parent folder, with unique IDs set by --out)
22 --tmp_dir Manually specify a TMP directory for pybedtools output
23 --help/--h Display this help menu
24
25
26 CSP2 can run in the following run modes:
27
28 --runmode Run mode for CSP2:
29
30 - assemble: Assemble read data (--reads/--ref_reads) into FASTA using SKESA
31
32 - align: Given query data (--reads/--fasta) and reference data (--ref_reads/--ref_fasta),
33 run MUMmer alignment analysis for each query/ref combination
34
35 - screen: Given query data (--reads/--fasta) and reference data (--ref_reads/--ref_fasta)
36 and/or MUMmer output (.snpdiffs), create a report for raw SNP
37 distances between each query and reference assembly
38
39 - snp: Given query data (--reads/--fasta) and reference data (--ref_reads/--ref_fasta)
40 and/or MUMmer output (.snpdiffs), generate alignments and pairwise
41 distances for all queries based on each reference dataset
42
43 Input Data:
44
45 --fasta Location for query isolate assembly data (.fasta/.fa/.fna). Can be a list of files, a path
46 to a signle single FASTA, or a path to a directories with assemblies.
47 --ref_fasta Location for reference isolate assembly data (.fasta/.fa/.fna). Can be a list of files, a
48 path to a signle single FASTA, or a path to a directories with assemblies.
49
50 --reads Directory or list of directories containing query isolate read data
51 --readext Read file extension (Default: fastq.gz)
52 --forward Forward read file suffix (Default: _1.fastq.gz)
53 --reverse Reverse read file suffix (Default: _2.fastq.gz)
54
55 --ref_reads Directory or list of directories containing reference isolate read data
56 --ref_readext Reference read file extension (Default: fastq.gz)
57 --ref_forward Reference forward read file suffix (Default: _1.fastq.gz)
58 --ref_reverse Reference reverse read file suffix (Default: _2.fastq.gz)
59
60 --snpdiffs Location for pre-generated snpdiffs files (List of snpdiffs files, directory with snpdiffs)
61
62 --ref_id IDs to specify reference sequences (Comma-separated list; e.g., Sample_A,Sample_B,Sample_C)
63
64 --trim_name A common string to remove from all sample IDs (Default: ''; Useful if all assemblies end in
65 something like "_contigs_skesa.fasta")
66
67 --n_ref If running in --runmode snp, the number of reference genomes for CSP2 to select if none are provided (Default: 1)
68
69 --exclude A comma-separated list of IDs to remove prior to analysis (Useful for removing low quality
70 isolates in combination with --snpdiffs)
71
72 QC variables:
73
74 --min_cov Only consider queries if the reference genome is covered by at least <min_cov>% (Default: 85)
75 --min_len Only consider SNPs from contig alignments longer than <min_len> bp (Default: 500)
76 --min_iden Only consider SNPs from alignments with at least <min_iden> percent identity (Default: 99)
77 --dwin A comma-separated set of window sizes for SNP density filters (Default: 1000,125,15; Set --dwin 0 to disable density filtering)
78 --wsnps A comma-separated set of maximum SNP counts per window above (Default: 3,2,1)
79 --max_missing If running in --runmode snp, mask SNPs where data is missing or purged from <max_missing>% of isolates (Default: 50)
80
81 Edge Trimming:
82
83 --ref_edge Don't include SNPs that fall within <ref_edge>bp of a reference contig edge (Default: 150)
84 --query_edge Don't include SNPs that fall within <query_edge>bp of a query contig edge (Default: 150)
85 --rescue If flagged (Default: not flagged), sites that were filtered out due solely to query edge proximity are rescued if
86 the same reference position is covered more centrally by another query
87 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
88
89 Example Commands:
90
91 1) Run CSP2 in SNP Pipeline mode using all the FASTA from /my/data/dir, and choose 3 references
92
93 nextflow run CSP2.nf --runmode snp --fasta /my/data/dir --n_ref 3
94
95 2) Screen all the paired-end .fastq files from /my/read/dir against the reference isolate in /my/reference/isolates.txt
96
97 nextflow run CSP2.nf --runmode screen --ref_fasta /my/reference/isolates.txt --reads /my/read/dir --readext .fastq --forward _1.fastq --reverse _2.fastq
98
99 3) Re-run the SNP pipeline using old snpdiffs files after changing the density filters and removing a bad sample
100
101 nextflow run CSP2.nf --runmode snp --snpdiffs /my/old/analysis/snpdiffs --dwin 5000,2500,1000 --wsnps 6,4,2 --ref_id Sample_A --exclude Sample_Q --out HQ_Density
102
103 4) Run in assembly mode and use HPC modules specified in profiles.config (NOTE: Setting the profile in nextflow uses a single hyphen (-) as compared to other arguments (--))
104
105 nextflow run CSP2.nf -profile myHPC --runmode assemble --reads /my/read/dir --out Assemblies
106
107 5) Run in SNP pipeline mode using SLURM and use the built in conda environment (NOTE: For local jobs using conda, use -profile standard_conda)
108
109 nextflow run CSP2.nf -profile slurm_conda --runmode snp --fasta /my/data/dir --out CSP2_Conda
110
111 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
112 """
113 System.exit(0)
114 }
115
116 if (help1 == "help") {
117 printHelp()
118 } else if(help2 =="help"){
119 printHelp()
120 }
121
122 // Assess run mode
123 if (params.runmode == "") {
124 error "--runmode must be specified..."
125 } else if (!['align','assemble', 'screen', 'snp','conda_init'].contains(params.runmode)){
126 error "--runmode must be 'align','assemble', 'screen', or 'snp', not ${params.runmode}..."
127 }
128
129 // If runmode is conda_init, launch a local process to spurn the generation of the conda environment and exit
130 if (params.runmode != "conda_init") {
131
132 // Ensure necessary data is provided given the run mode
133 // Runmode 'assemble'
134 // - Requires: --reads/--ref_reads
135 // - Runs SKESA and summarzies output FASTA
136 if (params.runmode == "assemble"){
137 if((params.reads == "") && (params.ref_reads == "")){
138 error "Runmode is --assemble but no read data provided via --reads/--ref_reads"
139 }
140 }
141
142 // Runmode 'align'
143 // - Requires: --reads/--fasta/--snpdiffs
144 // - Optional: --ref_reads/--ref_fasta/--ref_id
145 // - Runs MUMmer, generates .snpdiffs, and alignment summary.
146 // - If references are provided via --ref_reads/--ref_fasta/--ref_id, non-reference samples are aligned to each reference
147 // - If no references are provided, alignments are all-vs-all
148 // - If --snpdiffs are provided, their FASTAs will be autodetected and, if present, used as queries or references as specified by --ref_reads/--ref_fasta/--ref_id
149 // - Does NOT perform QC filtering
150
151 else if (params.runmode == "align"){
152 if((params.fasta == "") && (params.reads == "") && (params.snpdiffs == "")){
153 error "Runmode is --align but no query data provided via --fasta/--reads/--snpdiffs"
154 }
155 }
156
157 // Runmode 'screen'
158 // - Requires: --reads/--fasta/--snpdiffs
159 // - Optional: --ref_reads/--ref_fasta/--ref_id
160 // - Generates .snpdiffs files (if needed), applies QC, and generates alignment summaries and SNP distance estimates
161 // - If references are provided via --ref_reads/--ref_fasta/--ref_id, non-reference samples are aligned to each reference
162 // - If no references are provided, alignments are all-vs-all
163 // - If --snpdiffs are provided, (1) they will be QC filtered and included in the output report and (2) their FASTAs will be autodetected and, if present, used as queries or references as specified by --ref_reads/--ref_fasta/--ref_id
164
165 else if (params.runmode == "screen"){
166 if((params.fasta == "") && (params.reads == "") && (params.snpdiffs == "")){
167 error "Runmode is --screen but no query data provided via --snpdiffs/--reads/--fasta"
168 }
169 }
170
171 // Runmode 'snp'
172 // - Requires: --reads/--fasta/--snpdiffs
173 // - Optional: --ref_reads/--ref_fasta/--ref_id
174 // - If references are not provided, runs RefChooser using all FASTAs to choose references (--n_ref sets how many references to choose)
175 // - Each query is aligned to each reference, and pairwise SNP distances for all queries are generated based on that reference
176 // - Generates .snpdiffs files (if needed), applies QC, and generates SNP distance data between all queries based on their alignment to each reference
177 else if (params.runmode == "snp"){
178 if((params.snpdiffs == "") && (params.fasta == "") && (params.reads == "")) {
179 error "Runmode is --snp but no query data provided via --snpdiffs/--reads/--fasta"
180 }
181 }
182
183 // Set directory structure
184 if (params.outroot == "") {
185 output_directory = file(params.out)
186 } else {
187 out_root = file(params.outroot)
188 output_directory = file("${out_root}/${params.out}")
189 }
190
191 // If the output directory exists, create a new subdirectory with the default output name ("CSP2_<TIME>")
192 if(!output_directory.getParent().isDirectory()){
193 error "Parent directory for output (--outroot) is not a valid directory [${output_directory.getParent()}]..."
194 } else if(output_directory.isDirectory()){
195 output_directory = file("${output_directory}/CSP2_${new java.util.Date().getTime()}")
196 output_directory.mkdirs()
197 } else{
198 output_directory.mkdirs()
199 }
200
201 if(params.tmp_dir != ""){
202 tempdir = file(params.tmp_dir)
203
204 if(tempdir.isDirectory()){
205 temp_dir = file("${tempdir}/CSP2_${new java.util.Date().getTime()}_tmp")
206 temp_dir.mkdirs()
207 params.temp_dir = file(temp_dir)
208
209 } else if(tempdir.getParent().isDirectory()){
210 tempdir.mkdirs()
211 params.temp_dir = tempdir
212 } else{
213 error "Parent directory for temp directory --tmp_dir (${params.tmp_dir}) is not a valid directory..."
214 }
215 } else{
216 params.temp_dir = ""
217 }
218
219 // Set MUMmer and SNP directories
220 mummer_directory = file("${output_directory}/MUMmer_Output")
221 snpdiffs_directory = file("${output_directory}/snpdiffs")
222 snp_directory = file("${output_directory}/SNP_Analysis")
223
224 // Set paths for output files
225 isolate_data_file = file("${output_directory}/Isolate_Data.tsv")
226 screening_results_file = file("${output_directory}/Screening_Results.tsv")
227
228 // In --runmode assembly, results save to output_directory
229 if(params.runmode == "assemble"){
230 ref_mode = false
231
232 log_directory = file("${output_directory}")
233 assembly_directory = file("${output_directory}")
234
235 // Set dummy paths for log files
236 screen_log_dir = file("${log_directory}/Screening_Logs")
237 snp_log_dir = file("${log_directory}/SNP_Logs")
238 ref_id_file = file("${log_directory}/Reference_IDs.txt")
239 mummer_log_directory = file("${log_directory}/MUMmer_Logs")
240 mash_dir = file("${log_directory}/sketch_dir")
241
242 } else{
243
244 // Get reference mode
245 if(params.ref_reads == "" && params.ref_fasta == "" && params.ref_id == ""){
246 ref_mode = false
247 } else{
248 ref_mode = true
249 }
250
251 // Set directories
252 log_directory = file("${output_directory}/logs")
253 assembly_directory = file("${output_directory}/Assemblies")
254 mummer_log_directory = file("${log_directory}/MUMmer_Logs")
255 ref_id_file = file("${log_directory}/Reference_IDs.txt")
256
257 // Create directories
258 log_directory.mkdirs()
259 mummer_directory.mkdirs()
260 mummer_log_directory.mkdirs()
261 snpdiffs_directory.mkdirs()
262
263 // Touch Reference_IDs.txt to establish it
264 file(ref_id_file).text = ''
265
266 // Set paths for log subdirectories
267 screen_log_dir = file("${log_directory}/Screening_Logs")
268 snp_log_dir = file("${log_directory}/SNP_Logs")
269 mash_dir = file("${log_directory}/sketch_dir")
270
271 // If --reads/--ref_reads are provided, prepare a directory for assemblies
272 if((params.reads != "") || (params.ref_reads != "")){
273 assembly_directory.mkdirs()
274 }
275
276 // If runmode is snp, prepare a directory for SNP analysis + logs
277 if(params.runmode == "snp"){
278 snp_directory.mkdirs()
279 snp_log_dir.mkdirs()
280 if(!ref_mode){
281 mash_dir.mkdirs()
282 }
283 }
284
285 // If runmode is screen, prepare a directory for screening logs
286 if(params.runmode == "screen"){
287 screen_log_dir.mkdirs()
288 }
289 }
290
291 // Parameterize variables to pass between scripts
292 params.output_directory = file(output_directory)
293 params.log_directory = file(log_directory)
294 params.screen_log_dir = file(screen_log_dir)
295 params.snp_log_dir = file(snp_log_dir)
296 params.assembly_directory = file(assembly_directory)
297 params.mummer_directory = file(mummer_directory)
298 params.mummer_log_directory = file(mummer_log_directory)
299 params.snpdiffs_directory = file(snpdiffs_directory)
300 params.snp_directory = file(snp_directory)
301 params.ref_id_file = file(ref_id_file)
302 params.mash_directory = file(mash_dir)
303
304 params.ref_mode = ref_mode
305
306 // Set up modules if needed
307 params.load_python_module = params.python_module == "" ? "" : "module load -s ${params.python_module}"
308 params.load_skesa_module = params.skesa_module == "" ? "" : "module load -s ${params.skesa_module}"
309 params.load_bedtools_module = params.bedtools_module == "" ? "" : "module load -s ${params.bedtools_module}"
310 params.load_bbtools_module = params.bbtools_module == "" ? "" : "module load -s ${params.bbtools_module}"
311 params.load_mummer_module = params.mummer_module == "" ? "" : "module load -s ${params.mummer_module}"
312 params.load_mash_module = params.mash_module == "" ? "" : "module load -s ${params.mash_module}"
313
314 // Save params to log file
315 params.each { key, value ->
316 file("${log_directory}/CSP2_Params.txt") << "$key = $value\n"
317 }
318 } else{
319 params.output_directory = "./"
320 params.log_directory = "./"
321 params.screen_log_dir = "./"
322 params.snp_log_dir = "./"
323 params.assembly_directory = "./"
324 params.mummer_directory = "./"
325 params.mummer_log_directory = "./"
326 params.snpdiffs_directory = "./"
327 params.snp_directory = "./"
328 params.ref_id_file = "./"
329 params.mash_directory = "./"
330 params.ref_mode = false
331 params.load_python_module = ""
332 params.load_skesa_module = ""
333 params.load_bedtools_module = ""
334 params.load_bbtools_module = ""
335 params.load_mummer_module = ""
336 params.load_mash_module = ""
337 }
338
339 //////////////////////////////////////////////////////////////////////////////////////////
340
341 // Import modules
342 include {fetchData} from "./subworkflows/fetchData/main.nf"
343 include {alignGenomes} from "./subworkflows/alignData/main.nf"
344 include {runScreen;runSNPPipeline} from "./subworkflows/snpdiffs/main.nf"
345 include {runRefChooser} from "./subworkflows/refchooser/main.nf"
346
347 workflow{
348
349 if(params.runmode == "conda_init"){
350 conda_init()
351 } else{
352 // Read in data
353 input_data = fetchData()
354
355 query_data = input_data.query_data
356 reference_data = input_data.reference_data
357 snpdiffs_data = input_data.snpdiff_data
358
359 // Create channel for pre-aligned data [(Query_1,Query_2),SNPDiffs_File]
360 prealigned = snpdiffs_data
361 .map { it -> tuple([it[0], it[1]].sort().join(',').toString(), it[2]) }
362 .unique{it -> it[0]}
363
364 // If run mode is 'assemble', tasks are complete
365 if((params.runmode == "align") || (params.runmode == "screen") || (params.runmode == "snp")){
366
367 // If there is no reference data, align all query_data against each other
368 if(!ref_mode){
369
370 if((params.runmode == "align") || (params.runmode == "screen")){
371 seen_combinations = []
372
373 to_align = query_data.combine(query_data) // Self-combine query data
374 .collect().flatten().collate(4)
375 .filter{it -> (it[1].toString() != "null") && (it[3].toString() != "null")} // Can't align without FASTA
376 .filter{ it -> // Get unique combinations
377
378 combination = ["${it[0]}", "${it[2]}"].sort()
379
380 if(combination in seen_combinations) {
381 return false
382 } else {
383 seen_combinations << combination
384 return true
385 }}
386 }
387
388 // If running SNP pipeline without references, run RefChooser to choose references
389 else if(params.runmode == "snp"){
390 reference_data = runRefChooser(query_data)
391
392 to_align = query_data
393 .combine(reference_data)
394 .filter{it -> (it[1].toString() != "null") && (it[3].toString() != "null")} // Can't align without FASTA
395 }
396 }
397
398 // If references are provided, align all queries against all references
399 else{
400 to_align = query_data
401 .combine(reference_data)
402 .filter{it -> (it[1].toString() != "null") && (it[3].toString() != "null")} // Can't align without FASTA
403 }
404
405 // Don't align things that are already aligned via --snpdiffs
406 unaligned = to_align
407 .map { it -> tuple([it[0], it[2]].sort().join(',').toString(),it[0], it[1], it[2], it[3]) }
408 .unique{it -> it[0]}
409 .join(prealigned, by:0, remainder:true)
410 .filter{it -> it[5].toString() == "null"}
411 .map{it -> [it[1], it[2], it[3], it[4]]}
412 | collect | flatten | collate(4)
413
414 all_snpdiffs = alignGenomes(unaligned,snpdiffs_data)
415 .ifEmpty { error "No .snpdiffs to process..." }
416 .collect().flatten().collate(3)
417
418 if(params.runmode == "screen"){
419 runScreen(all_snpdiffs)
420 } else if(params.runmode == "snp"){
421 runSNPPipeline(all_snpdiffs,reference_data)
422 }
423 }
424 }
425 }
426
427 // Dummy process to stimulate conda env generation
428 process conda_init {
429 executor = 'local'
430 cpus = 1
431 maxForks = 1
432
433 script:
434 """
435 """
436 }