#!/usr/bin/env python3
"""Generate a samplesheet (CSV) from a directory tree of FastQ files."""

import os
import sys
import glob
import argparse
import re

# A path is kept only if it contains none of these words (case-insensitive).
# The pattern is hoisted to module level so it is compiled once.
_ALLOWED_PATH_RE = re.compile(
    r"^((?!undetermined|unclassified|fail|class|qc|downloads).)*$",
    flags=re.IGNORECASE,
)


def parse_args(args=None):
    """Parse command-line arguments.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            reads ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    description = "Generate samplesheet from a directory of FastQ files."
    epilog = (
        "Example usage: python fastq_dir_to_samplesheet.py "
        "<FASTQ_DIR> <SAMPLESHEET_FILE>"
    )

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
    parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")
    parser.add_argument(
        "-st",
        "--strandedness",
        type=str,
        dest="STRANDEDNESS",
        default="unstranded",
        help="Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.",
    )
    parser.add_argument(
        "-r1",
        "--read1_extension",
        type=str,
        dest="READ1_EXTENSION",
        default="_R1_001.fastq.gz",
        help="File extension for read 1.",
    )
    parser.add_argument(
        "-r2",
        "--read2_extension",
        type=str,
        dest="READ2_EXTENSION",
        default="_R2_001.fastq.gz",
        help="File extension for read 2.",
    )
    parser.add_argument(
        "-se",
        "--single_end",
        dest="SINGLE_END",
        action="store_true",
        help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
    )
    parser.add_argument(
        "-sn",
        "--sanitise_name",
        dest="SANITISE_NAME",
        action="store_true",
        help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
    )
    parser.add_argument(
        "-sd",
        "--sanitise_name_delimiter",
        type=str,
        dest="SANITISE_NAME_DELIMITER",
        default="_",
        help="Delimiter to use to sanitise sample name.",
    )
    parser.add_argument(
        "-si",
        "--sanitise_name_index",
        type=int,
        dest="SANITISE_NAME_INDEX",
        default=1,
        help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
    )
    return parser.parse_args(args)


def fastq_dir_to_samplesheet(
    fastq_dir,
    samplesheet_file,
    strandedness="unstranded",
    read1_extension="_R1_001.fastq.gz",
    read2_extension="_R2_001.fastq.gz",
    single_end=False,
    sanitise_name=False,
    sanitise_name_delimiter="_",
    sanitise_name_index=1,
):
    """Scan ``fastq_dir`` recursively and write a samplesheet CSV.

    The samplesheet has the columns ``sample,fq1,fq2,strandedness``. One row
    is written per read 1 file; the read 2 file at the same position in the
    sample's sorted read 2 list is placed in ``fq2``, or an empty string when
    there is none.

    Files whose path *relative to* ``fastq_dir`` contains any of
    ``undetermined``, ``unclassified``, ``fail``, ``class``, ``qc`` or
    ``downloads`` (case-insensitive) are ignored. The caller's own directory
    name is not tested, so a ``fastq_dir`` such as ``/data/qc_run`` still
    works.

    Args:
        fastq_dir: Directory searched recursively for FastQ files.
        samplesheet_file: Output CSV path; its parent directory is created
            when missing.
        strandedness: Value written verbatim to the ``strandedness`` column.
        read1_extension: File name suffix identifying read 1 files.
        read2_extension: File name suffix identifying read 2 files.
        single_end: When true, read 2 files are not collected at all.
        sanitise_name: When true and ``sanitise_name_index`` is positive, the
            sample id is the first ``sanitise_name_index`` pieces of the file
            name split on ``sanitise_name_delimiter``, re-joined with it.
        sanitise_name_delimiter: Delimiter used for the split/join above.
        sanitise_name_index: Number of leading pieces to keep. Non-positive
            values fall back to the extension-stripped file name.

    Raises:
        SystemExit: With status 1 when no read 1 FastQ files are found; a
            diagnostic message is printed first and no file is written.
    """

    def sanitize_sample(path, extension):
        """Retrieve sample id from filename"""
        file_name = os.path.basename(path)
        if sanitise_name and sanitise_name_index > 0:
            pieces = file_name.split(sanitise_name_delimiter)
            return sanitise_name_delimiter.join(pieces[:sanitise_name_index])
        return file_name.replace(extension, "")

    def get_fastqs(extension):
        """
        Return the sorted FastQ paths under ``fastq_dir`` ending in
        ``extension``, minus the excluded ones.

        Needs to be sorted to ensure R1 and R2 are in the same order
        when merging technical replicates. Glob is not guaranteed to produce
        sorted results.
        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
        """
        pattern = os.path.join(fastq_dir, "**", f"*{extension}")
        return sorted(
            fq
            for fq in glob.glob(pattern, recursive=True)
            # Test only the part below fastq_dir so that the name of the
            # input directory itself can never exclude every file.
            if _ALLOWED_PATH_RE.match(os.path.relpath(fq, fastq_dir))
        )

    read_dict = {}

    ## Get read 1 files
    for read1_file in get_fastqs(read1_extension):
        sample = sanitize_sample(read1_file, read1_extension)
        read_dict.setdefault(sample, {"R1": [], "R2": []})["R1"].append(read1_file)

    ## Get read 2 files
    if not single_end:
        for read2_file in get_fastqs(read2_extension):
            sample = sanitize_sample(read2_file, read2_extension)
            if sample not in read_dict:
                # A read 2 file without a read 1 partner cannot form a row
                # (fq1 is mandatory); report it instead of raising KeyError.
                print(
                    "WARNING: Skipping read 2 file without a matching "
                    f"read 1 file: {read2_file}",
                    file=sys.stderr,
                )
                continue
            read_dict[sample]["R2"].append(read2_file)

    ## Write to file
    if read_dict:
        out_dir = os.path.dirname(samplesheet_file)
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)

        with open(samplesheet_file, "w") as fout:
            header = ["sample", "fq1", "fq2", "strandedness"]
            fout.write(",".join(header) + "\n")
            for sample, reads in sorted(read_dict.items()):
                for idx, read_1 in enumerate(reads["R1"]):
                    # Pair by position; leave fq2 empty when there is no
                    # read 2 file at this position (single-end auto-detect).
                    read_2 = reads["R2"][idx] if idx < len(reads["R2"]) else ""
                    sample_info = ",".join([sample, read_1, read_2, strandedness])
                    fout.write(f"{sample_info}\n")
    else:
        error_str = (
            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
        )
        error_str += "Please check the values provided for the:\n"
        error_str += " - Path to the directory containing the FastQ files\n"
        error_str += " - '--read1_extension' parameter\n"
        error_str += " - '--read2_extension' parameter\n"
        print(error_str)
        sys.exit(1)


def main(args=None):
    """Command-line entry point: parse arguments and write the samplesheet.

    An unrecognised ``--strandedness`` value is silently replaced by
    ``"unstranded"``.
    """
    args = parse_args(args)

    strandedness = "unstranded"
    if args.STRANDEDNESS in ["unstranded", "forward", "reverse"]:
        strandedness = args.STRANDEDNESS

    fastq_dir_to_samplesheet(
        fastq_dir=args.FASTQ_DIR,
        samplesheet_file=args.SAMPLESHEET_FILE,
        strandedness=strandedness,
        read1_extension=args.READ1_EXTENSION,
        read2_extension=args.READ2_EXTENSION,
        single_end=args.SINGLE_END,
        sanitise_name=args.SANITISE_NAME,
        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
        sanitise_name_index=args.SANITISE_NAME_INDEX,
    )


if __name__ == "__main__":
    sys.exit(main())