Mercurial > repos > kkonganti > cfsan_bettercallsal
diff 0.5.0/bin/fastq_dir_to_samplesheet.py @ 1:365849f031fd
"planemo upload"
author | kkonganti |
---|---|
date | Mon, 05 Jun 2023 18:48:51 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/0.5.0/bin/fastq_dir_to_samplesheet.py Mon Jun 05 18:48:51 2023 -0400 @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 + +import os +import sys +import glob +import argparse +import re + + +def parse_args(args=None): + Description = "Generate samplesheet from a directory of FastQ files." + Epilog = "Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>" + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.") + parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.") + parser.add_argument( + "-st", + "--strandedness", + type=str, + dest="STRANDEDNESS", + default="unstranded", + help="Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.", + ) + parser.add_argument( + "-r1", + "--read1_extension", + type=str, + dest="READ1_EXTENSION", + default="_R1_001.fastq.gz", + help="File extension for read 1.", + ) + parser.add_argument( + "-r2", + "--read2_extension", + type=str, + dest="READ2_EXTENSION", + default="_R2_001.fastq.gz", + help="File extension for read 2.", + ) + parser.add_argument( + "-se", + "--single_end", + dest="SINGLE_END", + action="store_true", + help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.", + ) + parser.add_argument( + "-sn", + "--sanitise_name", + dest="SANITISE_NAME", + action="store_true", + help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.", + ) + parser.add_argument( + "-sd", + "--sanitise_name_delimiter", + type=str, + dest="SANITISE_NAME_DELIMITER", + default="_", + help="Delimiter to use to sanitise sample name.", + ) + parser.add_argument( + "-si", + "--sanitise_name_index", + type=int, + dest="SANITISE_NAME_INDEX", + default=1, + help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.", + ) + return parser.parse_args(args) + + +def fastq_dir_to_samplesheet( + fastq_dir, + samplesheet_file, + strandedness="unstranded", + read1_extension="_R1_001.fastq.gz", + read2_extension="_R2_001.fastq.gz", + single_end=False, + sanitise_name=False, + sanitise_name_delimiter="_", + sanitise_name_index=1, +): + def sanitize_sample(path, extension): + """Retrieve sample id from filename""" + sample = os.path.basename(path).replace(extension, "") + if sanitise_name: + if sanitise_name_index > 0: + sample = sanitise_name_delimiter.join( + os.path.basename(path).split(sanitise_name_delimiter)[ + :sanitise_name_index + ] + ) + # elif sanitise_name_index == -1: + # sample = os.path.basename(path)[ :os.path.basename(path).index('.') ] + return sample + + def get_fastqs(extension): + """ + Needs to be sorted to ensure R1 and R2 are in the same order + when merging technical replicates. Glob is not guaranteed to produce + sorted results. + See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered + """ + abs_fq_files = glob.glob(os.path.join(fastq_dir, f"**", f"*{extension}"), recursive=True) + return sorted( + [ + fq for _, fq in enumerate(abs_fq_files) if re.match('^((?!undetermined|unclassified|downloads).)*$', fq, flags=re.IGNORECASE) + ] + ) + + read_dict = {} + + ## Get read 1 files + for read1_file in get_fastqs(read1_extension): + sample = sanitize_sample(read1_file, read1_extension) + if sample not in read_dict: + read_dict[sample] = {"R1": [], "R2": []} + read_dict[sample]["R1"].append(read1_file) + + ## Get read 2 files + if not single_end: + for read2_file in get_fastqs(read2_extension): + sample = sanitize_sample(read2_file, read2_extension) + read_dict[sample]["R2"].append(read2_file) + + ## Write to file + if len(read_dict) > 0: + out_dir = os.path.dirname(samplesheet_file) + if out_dir and not os.path.exists(out_dir): + os.makedirs(out_dir) + + with open(samplesheet_file, "w") as fout: + header = ["sample", "fq1", "fq2", "strandedness"] + fout.write(",".join(header) + "\n") + for sample, reads in sorted(read_dict.items()): + for idx, read_1 in enumerate(reads["R1"]): + read_2 = "" + if idx < len(reads["R2"]): + read_2 = reads["R2"][idx] + sample_info = ",".join([sample, read_1, read_2, strandedness]) + fout.write(f"{sample_info}\n") + else: + error_str = ( + "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n" + ) + error_str += "Please check the values provided for the:\n" + error_str += " - Path to the directory containing the FastQ files\n" + error_str += " - '--read1_extension' parameter\n" + error_str += " - '--read2_extension' parameter\n" + print(error_str) + sys.exit(1) + + +def main(args=None): + args = parse_args(args) + + strandedness = "unstranded" + if args.STRANDEDNESS in ["unstranded", "forward", "reverse"]: + strandedness = args.STRANDEDNESS + + fastq_dir_to_samplesheet( + fastq_dir=args.FASTQ_DIR, + samplesheet_file=args.SAMPLESHEET_FILE, + strandedness=strandedness, + read1_extension=args.READ1_EXTENSION, + read2_extension=args.READ2_EXTENSION, + single_end=args.SINGLE_END, + sanitise_name=args.SANITISE_NAME, + sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER, + sanitise_name_index=args.SANITISE_NAME_INDEX, + ) + + +if __name__ == "__main__": + sys.exit(main())