diff 0.5.0/subworkflows/process_fastq.nf @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0.5.0/subworkflows/process_fastq.nf	Mon Jun 05 18:48:51 2023 -0400
@@ -0,0 +1,144 @@
+// Include any necessary methods and modules
+include { stopNow; validateParamsForFASTQ } from "${params.routines}"
+include { GEN_SAMPLESHEET                 } from "${params.modules}${params.fs}gen_samplesheet${params.fs}main"
+include { SAMPLESHEET_CHECK               } from "${params.modules}${params.fs}samplesheet_check${params.fs}main"
+include { CAT_FASTQ                       } from "${params.modules}${params.fs}cat${params.fs}fastq${params.fs}main"
+include { SEQKIT_SEQ                      } from "${params.modules}${params.fs}seqkit${params.fs}seq${params.fs}main"
+
+// Validate 4 required workflow parameters if
+// FASTQ files are the input for the
+// entry point.
+validateParamsForFASTQ()
+
+// Start the subworkflow
+workflow PROCESS_FASTQ {
+    main:
+        versions = Channel.empty()
+        input_ch = Channel.empty()
+        reads = Channel.empty()
+
+        def input = file( (params.input ?: params.metadata) )
+
+        if (params.input) {
+            def fastq_files = []
+
+            if (params.fq_suffix == null) {
+            stopNow("We need to know what suffix the FASTQ files ends with inside the\n" +
+                "directory. Please use the --fq_suffix option to indicate the file\n" +
+                "suffix by which the files are to be collected to run the pipeline on.")
+            }
+
+            if (params.fq_strandedness == null) {
+                stopNow("We need to know if the FASTQ files inside the directory\n" +
+                    "are sequenced using stranded or non-stranded sequencing. This is generally\n" +
+                    "required if the sequencing experiment is RNA-SEQ. For almost all of the other\n" +
+                    "cases, you can probably use the --fq_strandedness unstranded option to indicate\n" +
+                    "that the reads are unstranded.")
+            }
+
+            if (params.fq_filename_delim == null || params.fq_filename_delim_idx == null) {
+                stopNow("We need to know the delimiter of the filename of the FASTQ files.\n" +
+                    "By default the filename delimiter is _ (underscore). This delimiter character\n" +
+                    "is used to split and assign a group name. The group name can be controlled by\n" +
+                    "using the --fq_filename_delim_idx option (1-based). For example, if the FASTQ\n" +
+                    "filename is WT_REP1_001.fastq, then to create a group WT, use the following\n" +
+                    "options: --fq_filename_delim _ --fq_filename_delim_idx 1")
+            }
+
+            if (!input.exists()) {
+                stopNow("The input directory,\n${params.input}\ndoes not exist!")
+            }
+
+            input.eachFileRecurse {
+                it.name.endsWith("${params.fq_suffix}") ? fastq_files << it : fastq_files << null
+            }
+
+            if (fastq_files.findAll{ it != null }.size() == 0) {
+                stopNow("The input directory,\n${params.input}\nis empty! or does not " +
+                    "have FASTQ files ending with the suffix: ${params.fq_suffix}")
+            }
+            
+            GEN_SAMPLESHEET( Channel.fromPath(params.input, type: 'dir') )
+            GEN_SAMPLESHEET.out.csv.set{ input_ch }
+            versions.mix( GEN_SAMPLESHEET.out.versions )
+                .set { versions }
+        } else if (params.metadata) {
+            if (!input.exists()) {
+                stopNow("The metadata CSV file,\n${params.metadata}\ndoes not exist!")
+            }
+
+            if (input.size() <= 0) {
+                stopNow("The metadata CSV file,\n${params.metadata}\nis empty!")
+            }
+
+            Channel.fromPath(params.metadata, type: 'file')
+                .set { input_ch }
+        }
+
+        SAMPLESHEET_CHECK( input_ch )
+            .csv
+            .splitCsv( header: true, sep: ',')
+            .map { create_fastq_channel(it) }
+            .groupTuple(by: [0])
+            .branch {
+                meta, fastq ->
+                    single   : fastq.size() == 1
+                        return [ meta, fastq.flatten() ]
+                    multiple : fastq.size() > 1
+                        return [ meta, fastq.flatten() ]
+            }
+            .set { reads }
+
+        CAT_FASTQ( reads.multiple )
+            .catted_reads
+            .mix( reads.single )
+            .set { processed_reads }
+
+        if (params.fq_filter_by_len.toInteger() > 0) {
+            SEQKIT_SEQ( processed_reads )
+                .fastx
+                .set { processed_reads }
+
+            versions.mix( SEQKIT_SEQ.out.versions.first().ifEmpty(null) )
+                .set { versions }
+        }
+
+        versions.mix(
+            SAMPLESHEET_CHECK.out.versions,
+            CAT_FASTQ.out.versions.first().ifEmpty(null)
+        )
+        .set { versions }
+
+    emit:
+        processed_reads
+        versions
+}
+
+// Function to get list of [ meta, [ fq1, fq2 ] ]
+def create_fastq_channel(LinkedHashMap row) {
+
+    def meta = [:]
+    meta.id           = row.sample
+    meta.single_end   = row.single_end.toBoolean()
+    meta.strandedness = row.strandedness
+    meta.id = meta.id.split(params.fq_filename_delim)[0..params.fq_filename_delim_idx.toInteger() - 1]
+        .join(params.fq_filename_delim)
+    meta.id = (meta.id =~ /\./ ? meta.id.take(meta.id.indexOf('.')) : meta.id)
+
+    def array = []
+
+    if (!file(row.fq1).exists()) {
+        stopNow("Please check input metadata CSV. The following Read 1 FASTQ file does not exist!" +
+            "\n${row.fq1}")
+    }
+    if (meta.single_end) {
+        array = [ meta, [ file(row.fq1) ] ]
+    } else {
+        if (!file(row.fq2).exists()) {
+            stopNow("Please check input metadata CSV. The following Read 2 FASTQ file does not exist!" +
+                "\n${row.fq2}")
+        }
+        array = [ meta, [ file(row.fq1), file(row.fq2) ] ]
+    }
+    return array
+}
\ No newline at end of file