diff 0.5.0/bin/fastq_dir_to_samplesheet.py @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0.5.0/bin/fastq_dir_to_samplesheet.py	Mon Jun 05 18:48:51 2023 -0400
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import glob
+import argparse
+import re
+
+
+def parse_args(args=None):
+    Description = "Generate samplesheet from a directory of FastQ files."
+    Epilog = "Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
+    parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")
+    parser.add_argument(
+        "-st",
+        "--strandedness",
+        type=str,
+        dest="STRANDEDNESS",
+        default="unstranded",
+        help="Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.",
+    )
+    parser.add_argument(
+        "-r1",
+        "--read1_extension",
+        type=str,
+        dest="READ1_EXTENSION",
+        default="_R1_001.fastq.gz",
+        help="File extension for read 1.",
+    )
+    parser.add_argument(
+        "-r2",
+        "--read2_extension",
+        type=str,
+        dest="READ2_EXTENSION",
+        default="_R2_001.fastq.gz",
+        help="File extension for read 2.",
+    )
+    parser.add_argument(
+        "-se",
+        "--single_end",
+        dest="SINGLE_END",
+        action="store_true",
+        help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
+    )
+    parser.add_argument(
+        "-sn",
+        "--sanitise_name",
+        dest="SANITISE_NAME",
+        action="store_true",
+        help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
+    )
+    parser.add_argument(
+        "-sd",
+        "--sanitise_name_delimiter",
+        type=str,
+        dest="SANITISE_NAME_DELIMITER",
+        default="_",
+        help="Delimiter to use to sanitise sample name.",
+    )
+    parser.add_argument(
+        "-si",
+        "--sanitise_name_index",
+        type=int,
+        dest="SANITISE_NAME_INDEX",
+        default=1,
+        help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
+    )
+    return parser.parse_args(args)
+
+
+def fastq_dir_to_samplesheet(
+    fastq_dir,
+    samplesheet_file,
+    strandedness="unstranded",
+    read1_extension="_R1_001.fastq.gz",
+    read2_extension="_R2_001.fastq.gz",
+    single_end=False,
+    sanitise_name=False,
+    sanitise_name_delimiter="_",
+    sanitise_name_index=1,
+):
+    def sanitize_sample(path, extension):
+        """Retrieve sample id from filename"""
+        sample = os.path.basename(path).replace(extension, "")
+        if sanitise_name:
+            if sanitise_name_index > 0:
+                sample = sanitise_name_delimiter.join(
+                    os.path.basename(path).split(sanitise_name_delimiter)[
+                        :sanitise_name_index
+                    ]
+                )
+            # elif sanitise_name_index == -1:
+            #     sample = os.path.basename(path)[ :os.path.basename(path).index('.') ]
+        return sample
+
+    def get_fastqs(extension):
+        """
+        Needs to be sorted to ensure R1 and R2 are in the same order
+        when merging technical replicates. Glob is not guaranteed to produce
+        sorted results.
+        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
+        """
+        abs_fq_files = glob.glob(os.path.join(fastq_dir, f"**", f"*{extension}"), recursive=True)
+        return sorted(
+            [
+                fq for _, fq in enumerate(abs_fq_files) if re.match('^((?!undetermined|unclassified|downloads).)*$', fq, flags=re.IGNORECASE)
+            ]
+        )
+
+    read_dict = {}
+
+    ## Get read 1 files
+    for read1_file in get_fastqs(read1_extension):
+        sample = sanitize_sample(read1_file, read1_extension)
+        if sample not in read_dict:
+            read_dict[sample] = {"R1": [], "R2": []}
+        read_dict[sample]["R1"].append(read1_file)
+
+    ## Get read 2 files
+    if not single_end:
+        for read2_file in get_fastqs(read2_extension):
+            sample = sanitize_sample(read2_file, read2_extension)
+            read_dict[sample]["R2"].append(read2_file)
+
+    ## Write to file
+    if len(read_dict) > 0:
+        out_dir = os.path.dirname(samplesheet_file)
+        if out_dir and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+
+        with open(samplesheet_file, "w") as fout:
+            header = ["sample", "fq1", "fq2", "strandedness"]
+            fout.write(",".join(header) + "\n")
+            for sample, reads in sorted(read_dict.items()):
+                for idx, read_1 in enumerate(reads["R1"]):
+                    read_2 = ""
+                    if idx < len(reads["R2"]):
+                        read_2 = reads["R2"][idx]
+                    sample_info = ",".join([sample, read_1, read_2, strandedness])
+                    fout.write(f"{sample_info}\n")
+    else:
+        error_str = (
+            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
+        )
+        error_str += "Please check the values provided for the:\n"
+        error_str += "  - Path to the directory containing the FastQ files\n"
+        error_str += "  - '--read1_extension' parameter\n"
+        error_str += "  - '--read2_extension' parameter\n"
+        print(error_str)
+        sys.exit(1)
+
+
+def main(args=None):
+    args = parse_args(args)
+
+    strandedness = "unstranded"
+    if args.STRANDEDNESS in ["unstranded", "forward", "reverse"]:
+        strandedness = args.STRANDEDNESS
+
+    fastq_dir_to_samplesheet(
+        fastq_dir=args.FASTQ_DIR,
+        samplesheet_file=args.SAMPLESHEET_FILE,
+        strandedness=strandedness,
+        read1_extension=args.READ1_EXTENSION,
+        read2_extension=args.READ2_EXTENSION,
+        single_end=args.SINGLE_END,
+        sanitise_name=args.SANITISE_NAME,
+        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
+        sanitise_name_index=args.SANITISE_NAME_INDEX,
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())