Mercurial > repos > rliterman > csp2
view CSP2/CSP2.nf @ 18:eda1a8a50746
"planemo upload"
author | rliterman |
---|---|
date | Tue, 03 Dec 2024 12:44:32 -0500 |
parents | 01431fa12065 |
children |
line wrap: on
line source
#! /usr/bin/env nextflow nextflow.enable.dsl=2 // CSP2 Main Script // Params are read in from command line or from nextflow.config and/or conf/profiles.config // Check if help flag was passed help1 = "${params.help}" == "nohelp" ? "nohelp" : "help" help2 = "${params.h}" == "nohelp" ? "nohelp" : "help" def printHelp() { println """ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CSP2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Global default params: --out Set name for output folder/file prefixes (Default: CSP2_<timestamp>) --outroot Set output parent directory (Default: CWD; Useful to hardset in nextflow.config if you want all output go to the same parent folder, with unique IDs set by --out) --tmp_dir Manually specify a TMP directory for pybedtools output --help/--h Display this help menu CSP2 can run in the following run modes: --runmode Run mode for CSP2: - assemble: Assemble read data (--reads/--ref_reads) into FASTA using SKESA - align: Given query data (--reads/--fasta) and reference data (--ref_reads/--ref_fasta), run MUMmer alignment analysis for each query/ref combination - screen: Given query data (--reads/--fasta) and reference data (--ref_reads/--ref_fasta) and/or MUMmer output (.snpdiffs), create a report for raw SNP distances between each query and reference assembly - snp: Given query data (--reads/--fasta) and reference data (--ref_reads/--ref_fasta) and/or MUMmer output (.snpdiffs), generate alignments and pairwise distances for all queries based on each reference dataset Input Data: --fasta Location for query isolate assembly data (.fasta/.fa/.fna). Can be a list of files, a path to a signle single FASTA, or a path to a directories with assemblies. --ref_fasta Location for reference isolate assembly data (.fasta/.fa/.fna). Can be a list of files, a path to a signle single FASTA, or a path to a directories with assemblies. --reads Directory or list of directories containing query isolate read data --readext Read file extension (Default: fastq.gz) --forward Forward read file suffix (Default: _1.fastq.gz) --reverse Reverse read file suffix (Default: _2.fastq.gz) --ref_reads Directory or list of directories containing reference isolate read data --ref_readext Reference read file extension (Default: fastq.gz) --ref_forward Reference forward read file suffix (Default: _1.fastq.gz) --ref_reverse Reference reverse read file suffix (Default: _2.fastq.gz) --snpdiffs Location for pre-generated snpdiffs files (List of snpdiffs files, directory with snpdiffs) --ref_id IDs to specify reference sequences (Comma-separated list; e.g., Sample_A,Sample_B,Sample_C) --trim_name A common string to remove from all sample IDs (Default: ''; Useful if all assemblies end in something like "_contigs_skesa.fasta") --n_ref If running in --runmode snp, the number of reference genomes for CSP2 to select if none are provided (Default: 1) --exclude A comma-separated list of IDs to remove prior to analysis (Useful for removing low quality isolates in combination with --snpdiffs) QC variables: --min_cov Only consider queries if the reference genome is covered by at least <min_cov>% (Default: 85) --min_len Only consider SNPs from contig alignments longer than <min_len> bp (Default: 500) --min_iden Only consider SNPs from alignments with at least <min_iden> percent identity (Default: 99) --dwin A comma-separated set of window sizes for SNP density filters (Default: 1000,125,15; Set --dwin 0 to disable density filtering) --wsnps A comma-separated set of maximum SNP counts per window above (Default: 3,2,1) --max_missing If running in --runmode snp, mask SNPs where data is missing or purged from <max_missing>% of isolates (Default: 50) Edge Trimming: --ref_edge Don't include SNPs that fall within <ref_edge>bp of a reference contig edge (Default: 150) --query_edge Don't include SNPs that fall within <query_edge>bp of a query contig edge (Default: 150) --rescue If flagged (Default: not flagged), sites that were filtered out due solely to query edge proximity are rescued if the same reference position is covered more centrally by another query ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Example Commands: 1) Run CSP2 in SNP Pipeline mode using all the FASTA from /my/data/dir, and choose 3 references nextflow run CSP2.nf --runmode snp --fasta /my/data/dir --n_ref 3 2) Screen all the paired-end .fastq files from /my/read/dir against the reference isolate in /my/reference/isolates.txt nextflow run CSP2.nf --runmode screen --ref_fasta /my/reference/isolates.txt --reads /my/read/dir --readext .fastq --forward _1.fastq --reverse _2.fastq 3) Re-run the SNP pipeline using old snpdiffs files after changing the density filters and removing a bad sample nextflow run CSP2.nf --runmode snp --snpdiffs /my/old/analysis/snpdiffs --dwin 5000,2500,1000 --wsnps 6,4,2 --ref_id Sample_A --exclude Sample_Q --out HQ_Density 4) Run in assembly mode and use HPC modules specified in profiles.config (NOTE: Setting the profile in nextflow uses a single hyphen (-) as compared to other arguments (--)) nextflow run CSP2.nf -profile myHPC --runmode assemble --reads /my/read/dir --out Assemblies 5) Run in SNP pipeline mode using SLURM and use the built in conda environment (NOTE: For local jobs using conda, use -profile standard_conda) nextflow run CSP2.nf -profile slurm_conda --runmode snp --fasta /my/data/dir --out CSP2_Conda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ """ System.exit(0) } if (help1 == "help") { printHelp() } else if(help2 =="help"){ printHelp() } // Assess run mode if (params.runmode == "") { error "--runmode must be specified..." } else if (!['align','assemble', 'screen', 'snp','conda_init'].contains(params.runmode)){ error "--runmode must be 'align','assemble', 'screen', or 'snp', not ${params.runmode}..." } // If runmode is conda_init, launch a local process to spurn the generation of the conda environment and exit if (params.runmode != "conda_init") { // Ensure necessary data is provided given the run mode // Runmode 'assemble' // - Requires: --reads/--ref_reads // - Runs SKESA and summarzies output FASTA if (params.runmode == "assemble"){ if((params.reads == "") && (params.ref_reads == "")){ error "Runmode is --assemble but no read data provided via --reads/--ref_reads" } } // Runmode 'align' // - Requires: --reads/--fasta/--snpdiffs // - Optional: --ref_reads/--ref_fasta/--ref_id // - Runs MUMmer, generates .snpdiffs, and alignment summary. // - If references are provided via --ref_reads/--ref_fasta/--ref_id, non-reference samples are aligned to each reference // - If no references are provided, alignments are all-vs-all // - If --snpdiffs are provided, their FASTAs will be autodetected and, if present, used as queries or references as specified by --ref_reads/--ref_fasta/--ref_id // - Does NOT perform QC filtering else if (params.runmode == "align"){ if((params.fasta == "") && (params.reads == "") && (params.snpdiffs == "")){ error "Runmode is --align but no query data provided via --fasta/--reads/--snpdiffs" } } // Runmode 'screen' // - Requires: --reads/--fasta/--snpdiffs // - Optional: --ref_reads/--ref_fasta/--ref_id // - Generates .snpdiffs files (if needed), applies QC, and generates alignment summaries and SNP distance estimates // - If references are provided via --ref_reads/--ref_fasta/--ref_id, non-reference samples are aligned to each reference // - If no references are provided, alignments are all-vs-all // - If --snpdiffs are provided, (1) they will be QC filtered and included in the output report and (2) their FASTAs will be autodetected and, if present, used as queries or references as specified by --ref_reads/--ref_fasta/--ref_id else if (params.runmode == "screen"){ if((params.fasta == "") && (params.reads == "") && (params.snpdiffs == "")){ error "Runmode is --screen but no query data provided via --snpdiffs/--reads/--fasta" } } // Runmode 'snp' // - Requires: --reads/--fasta/--snpdiffs // - Optional: --ref_reads/--ref_fasta/--ref_id // - If references are not provided, runs RefChooser using all FASTAs to choose references (--n_ref sets how many references to choose) // - Each query is aligned to each reference, and pairwise SNP distances for all queries are generated based on that reference // - Generates .snpdiffs files (if needed), applies QC, and generates SNP distance data between all queries based on their alignment to each reference else if (params.runmode == "snp"){ if((params.snpdiffs == "") && (params.fasta == "") && (params.reads == "")) { error "Runmode is --snp but no query data provided via --snpdiffs/--reads/--fasta" } } // Set directory structure if (params.outroot == "") { output_directory = file(params.out) } else { out_root = file(params.outroot) output_directory = file("${out_root}/${params.out}") } // If the output directory exists, create a new subdirectory with the default output name ("CSP2_<TIME>") if(!output_directory.getParent().isDirectory()){ error "Parent directory for output (--outroot) is not a valid directory [${output_directory.getParent()}]..." } else if(output_directory.isDirectory()){ output_directory = file("${output_directory}/CSP2_${new java.util.Date().getTime()}") output_directory.mkdirs() } else{ output_directory.mkdirs() } if(params.tmp_dir != ""){ tempdir = file(params.tmp_dir) if(tempdir.isDirectory()){ temp_dir = file("${tempdir}/CSP2_${new java.util.Date().getTime()}_tmp") temp_dir.mkdirs() params.temp_dir = file(temp_dir) } else if(tempdir.getParent().isDirectory()){ tempdir.mkdirs() params.temp_dir = tempdir } else{ error "Parent directory for temp directory --tmp_dir (${params.tmp_dir}) is not a valid directory..." } } else{ params.temp_dir = "" } // Set MUMmer and SNP directories mummer_directory = file("${output_directory}/MUMmer_Output") snpdiffs_directory = file("${output_directory}/snpdiffs") snp_directory = file("${output_directory}/SNP_Analysis") // Set paths for output files isolate_data_file = file("${output_directory}/Isolate_Data.tsv") screening_results_file = file("${output_directory}/Screening_Results.tsv") // In --runmode assembly, results save to output_directory if(params.runmode == "assemble"){ ref_mode = false log_directory = file("${output_directory}") assembly_directory = file("${output_directory}") // Set dummy paths for log files screen_log_dir = file("${log_directory}/Screening_Logs") snp_log_dir = file("${log_directory}/SNP_Logs") ref_id_file = file("${log_directory}/Reference_IDs.txt") mummer_log_directory = file("${log_directory}/MUMmer_Logs") mash_dir = file("${log_directory}/sketch_dir") } else{ // Get reference mode if(params.ref_reads == "" && params.ref_fasta == "" && params.ref_id == ""){ ref_mode = false } else{ ref_mode = true } // Set directories log_directory = file("${output_directory}/logs") assembly_directory = file("${output_directory}/Assemblies") mummer_log_directory = file("${log_directory}/MUMmer_Logs") ref_id_file = file("${log_directory}/Reference_IDs.txt") // Create directories log_directory.mkdirs() mummer_directory.mkdirs() mummer_log_directory.mkdirs() snpdiffs_directory.mkdirs() // Touch Reference_IDs.txt to establish it file(ref_id_file).text = '' // Set paths for log subdirectories screen_log_dir = file("${log_directory}/Screening_Logs") snp_log_dir = file("${log_directory}/SNP_Logs") mash_dir = file("${log_directory}/sketch_dir") // If --reads/--ref_reads are provided, prepare a directory for assemblies if((params.reads != "") || (params.ref_reads != "")){ assembly_directory.mkdirs() } // If runmode is snp, prepare a directory for SNP analysis + logs if(params.runmode == "snp"){ snp_directory.mkdirs() snp_log_dir.mkdirs() if(!ref_mode){ mash_dir.mkdirs() } } // If runmode is screen, prepare a directory for screening logs if(params.runmode == "screen"){ screen_log_dir.mkdirs() } } // Parameterize variables to pass between scripts params.output_directory = file(output_directory) params.log_directory = file(log_directory) params.screen_log_dir = file(screen_log_dir) params.snp_log_dir = file(snp_log_dir) params.assembly_directory = file(assembly_directory) params.mummer_directory = file(mummer_directory) params.mummer_log_directory = file(mummer_log_directory) params.snpdiffs_directory = file(snpdiffs_directory) params.snp_directory = file(snp_directory) params.ref_id_file = file(ref_id_file) params.mash_directory = file(mash_dir) params.ref_mode = ref_mode // Set up modules if needed params.load_python_module = params.python_module == "" ? "" : "module load -s ${params.python_module}" params.load_skesa_module = params.skesa_module == "" ? "" : "module load -s ${params.skesa_module}" params.load_bedtools_module = params.bedtools_module == "" ? "" : "module load -s ${params.bedtools_module}" params.load_bbtools_module = params.bbtools_module == "" ? "" : "module load -s ${params.bbtools_module}" params.load_mummer_module = params.mummer_module == "" ? "" : "module load -s ${params.mummer_module}" params.load_mash_module = params.mash_module == "" ? "" : "module load -s ${params.mash_module}" // Save params to log file params.each { key, value -> file("${log_directory}/CSP2_Params.txt") << "$key = $value\n" } } else{ params.output_directory = "./" params.log_directory = "./" params.screen_log_dir = "./" params.snp_log_dir = "./" params.assembly_directory = "./" params.mummer_directory = "./" params.mummer_log_directory = "./" params.snpdiffs_directory = "./" params.snp_directory = "./" params.ref_id_file = "./" params.mash_directory = "./" params.ref_mode = false params.load_python_module = "" params.load_skesa_module = "" params.load_bedtools_module = "" params.load_bbtools_module = "" params.load_mummer_module = "" params.load_mash_module = "" } ////////////////////////////////////////////////////////////////////////////////////////// // Import modules include {fetchData} from "./subworkflows/fetchData/main.nf" include {alignGenomes} from "./subworkflows/alignData/main.nf" include {runScreen;runSNPPipeline} from "./subworkflows/snpdiffs/main.nf" include {runRefChooser} from "./subworkflows/refchooser/main.nf" workflow{ if(params.runmode == "conda_init"){ conda_init() } else{ // Read in data input_data = fetchData() query_data = input_data.query_data reference_data = input_data.reference_data snpdiffs_data = input_data.snpdiff_data // Create channel for pre-aligned data [(Query_1,Query_2),SNPDiffs_File] prealigned = snpdiffs_data .map { it -> tuple([it[0], it[1]].sort().join(',').toString(), it[2]) } .unique{it -> it[0]} // If run mode is 'assemble', tasks are complete if((params.runmode == "align") || (params.runmode == "screen") || (params.runmode == "snp")){ // If there is no reference data, align all query_data against each other if(!ref_mode){ if((params.runmode == "align") || (params.runmode == "screen")){ seen_combinations = [] to_align = query_data.combine(query_data) // Self-combine query data .collect().flatten().collate(4) .filter{it -> (it[1].toString() != "null") && (it[3].toString() != "null")} // Can't align without FASTA .filter{ it -> // Get unique combinations combination = ["${it[0]}", "${it[2]}"].sort() if(combination in seen_combinations) { return false } else { seen_combinations << combination return true }} } // If running SNP pipeline without references, run RefChooser to choose references else if(params.runmode == "snp"){ reference_data = runRefChooser(query_data) to_align = query_data .combine(reference_data) .filter{it -> (it[1].toString() != "null") && (it[3].toString() != "null")} // Can't align without FASTA } } // If references are provided, align all queries against all references else{ to_align = query_data .combine(reference_data) .filter{it -> (it[1].toString() != "null") && (it[3].toString() != "null")} // Can't align without FASTA } // Don't align things that are already aligned via --snpdiffs unaligned = to_align .map { it -> tuple([it[0], it[2]].sort().join(',').toString(),it[0], it[1], it[2], it[3]) } .unique{it -> it[0]} .join(prealigned, by:0, remainder:true) .filter{it -> it[5].toString() == "null"} .map{it -> [it[1], it[2], it[3], it[4]]} | collect | flatten | collate(4) all_snpdiffs = alignGenomes(unaligned,snpdiffs_data) .ifEmpty { error "No .snpdiffs to process..." } .collect().flatten().collate(3) if(params.runmode == "screen"){ runScreen(all_snpdiffs) } else if(params.runmode == "snp"){ runSNPPipeline(all_snpdiffs,reference_data) } } } } // Dummy process to stimulate conda env generation process conda_init { executor = 'local' cpus = 1 maxForks = 1 script: """ """ }