annotate 0.4.0/bin/fastq_dir_to_samplesheet.py @ 101:ce6d9548fe89

"planemo upload"
author kkonganti
date Thu, 04 Aug 2022 10:45:55 -0400
parents
children 17890124001d
rev   line source
kkonganti@101 1 #!/usr/bin/env python3
kkonganti@101 2
kkonganti@101 3 import os
kkonganti@101 4 import sys
kkonganti@101 5 import glob
kkonganti@101 6 import argparse
kkonganti@101 7 import re
kkonganti@101 8
kkonganti@101 9
def parse_args(args=None):
    """Parse the command-line arguments for the samplesheet generator.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``FASTQ_DIR``,
        ``SAMPLESHEET_FILE``, ``STRANDEDNESS``, ``READ1_EXTENSION``,
        ``READ2_EXTENSION``, ``SINGLE_END``, ``SANITISE_NAME``,
        ``SANITISE_NAME_DELIMITER`` and ``SANITISE_NAME_INDEX``.
    """
    cli = argparse.ArgumentParser(
        description="Generate samplesheet from a directory of FastQ files.",
        epilog="Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>",
    )

    # Required positional arguments.
    cli.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
    cli.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")

    # Optional arguments as (flags, add_argument keyword settings), kept in
    # the order in which they should appear in the --help output.
    option_specs = [
        (
            ("-st", "--strandedness"),
            {
                "type": str,
                "dest": "STRANDEDNESS",
                "default": "unstranded",
                "help": "Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.",
            },
        ),
        (
            ("-r1", "--read1_extension"),
            {
                "type": str,
                "dest": "READ1_EXTENSION",
                "default": "_R1_001.fastq.gz",
                "help": "File extension for read 1.",
            },
        ),
        (
            ("-r2", "--read2_extension"),
            {
                "type": str,
                "dest": "READ2_EXTENSION",
                "default": "_R2_001.fastq.gz",
                "help": "File extension for read 2.",
            },
        ),
        (
            ("-se", "--single_end"),
            {
                "dest": "SINGLE_END",
                "action": "store_true",
                "help": "Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
            },
        ),
        (
            ("-sn", "--sanitise_name"),
            {
                "dest": "SANITISE_NAME",
                "action": "store_true",
                "help": "Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
            },
        ),
        (
            ("-sd", "--sanitise_name_delimiter"),
            {
                "type": str,
                "dest": "SANITISE_NAME_DELIMITER",
                "default": "_",
                "help": "Delimiter to use to sanitise sample name.",
            },
        ),
        (
            ("-si", "--sanitise_name_index"),
            {
                "type": int,
                "dest": "SANITISE_NAME_INDEX",
                "default": 1,
                "help": "After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
            },
        ),
    ]
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args(args)
kkonganti@101 72
kkonganti@101 73
def fastq_dir_to_samplesheet(
    fastq_dir,
    samplesheet_file,
    strandedness="unstranded",
    read1_extension="_R1_001.fastq.gz",
    read2_extension="_R2_001.fastq.gz",
    single_end=False,
    sanitise_name=False,
    sanitise_name_delimiter="_",
    sanitise_name_index=1,
):
    """Write a CSV samplesheet listing the FastQ files found under a directory.

    The directory is searched recursively for files ending in
    ``read1_extension`` (and ``read2_extension`` unless ``single_end`` is
    set). Files are grouped by sample id and written as rows of
    ``sample,fq1,fq2,strandedness``; R1 and R2 files of a sample are paired
    by their position in the sorted file lists.

    Args:
        fastq_dir: Directory to search for FastQ files.
        samplesheet_file: Path of the CSV file to write. Missing parent
            directories are created.
        strandedness: Value written verbatim to the ``strandedness`` column.
        read1_extension: Filename suffix identifying read 1 files.
        read2_extension: Filename suffix identifying read 2 files.
        single_end: When true, read 2 files are not collected, so the
            ``fq2`` column stays empty.
        sanitise_name: When true (and ``sanitise_name_index`` > 0), the
            sample id is built from the leading delimiter-separated fields
            of the file name instead of from stripping the extension.
        sanitise_name_delimiter: Delimiter used to split the file name.
        sanitise_name_index: Number of leading fields joined to form the
            sample id.

    Raises:
        SystemExit: With status 1 when no read 1 files are found; a warning
            is printed and no samplesheet is written.
    """
    # A path is kept only if none of these words occurs anywhere in it
    # (case-insensitive).
    keep_path = re.compile(
        r"^((?!undetermined|unclassified|fail|class|qc|downloads).)*$",
        flags=re.IGNORECASE,
    )

    def sanitize_sample(path, extension):
        """Retrieve sample id from filename"""
        file_name = os.path.basename(path)
        if sanitise_name and sanitise_name_index > 0:
            fields = file_name.split(sanitise_name_delimiter)
            return sanitise_name_delimiter.join(fields[:sanitise_name_index])
        return file_name.replace(extension, "")

    def get_fastqs(extension):
        """
        Needs to be sorted to ensure R1 and R2 are in the same order
        when merging technical replicates. Glob is not guaranteed to produce
        sorted results.
        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
        """
        # Escape the directory so that names containing glob metacharacters
        # ('[', '*', '?') are matched literally.
        pattern = os.path.join(
            glob.escape(os.fspath(fastq_dir)), "**", f"*{extension}"
        )
        # Apply the exclusion filter to the path *below* fastq_dir only, so
        # that the location of the input directory itself (e.g.
        # ~/Downloads/run1) cannot exclude every file.
        return sorted(
            fq
            for fq in glob.glob(pattern, recursive=True)
            if keep_path.match(os.path.relpath(fq, fastq_dir))
        )

    read_dict = {}

    ## Get read 1 files
    for read1_file in get_fastqs(read1_extension):
        sample = sanitize_sample(read1_file, read1_extension)
        read_dict.setdefault(sample, {"R1": [], "R2": []})["R1"].append(read1_file)

    ## Get read 2 files
    if not single_end:
        for read2_file in get_fastqs(read2_extension):
            sample = sanitize_sample(read2_file, read2_extension)
            if sample not in read_dict:
                # Rows are driven by read 1 files, so an unmatched read 2
                # file cannot be written; report it instead of failing with
                # a bare KeyError.
                print(
                    f"WARNING: No read 1 file found for sample '{sample}'; "
                    f"skipping read 2 file: {read2_file}",
                    file=sys.stderr,
                )
                continue
            read_dict[sample]["R2"].append(read2_file)

    if not read_dict:
        error_str = (
            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
            "Please check the values provided for the:\n"
            "  - Path to the directory containing the FastQ files\n"
            "  - '--read1_extension' parameter\n"
            "  - '--read2_extension' parameter\n"
        )
        print(error_str)
        sys.exit(1)

    ## Write to file
    out_dir = os.path.dirname(samplesheet_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(samplesheet_file, "w") as fout:
        fout.write(",".join(["sample", "fq1", "fq2", "strandedness"]) + "\n")
        for sample, reads in sorted(read_dict.items()):
            read2_files = reads["R2"]
            for idx, read_1 in enumerate(reads["R1"]):
                # fq2 stays empty when there is no read 2 file at this position.
                read_2 = read2_files[idx] if idx < len(read2_files) else ""
                fout.write(",".join([sample, read_1, read_2, strandedness]) + "\n")
kkonganti@101 154
kkonganti@101 155
def main(args=None):
    """Command-line entry point: parse arguments and write the samplesheet.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.
    """
    opts = parse_args(args)

    # Any value outside the recognised set silently falls back to "unstranded".
    recognised = ["unstranded", "forward", "reverse"]
    strandedness = opts.STRANDEDNESS if opts.STRANDEDNESS in recognised else "unstranded"

    fastq_dir_to_samplesheet(
        fastq_dir=opts.FASTQ_DIR,
        samplesheet_file=opts.SAMPLESHEET_FILE,
        strandedness=strandedness,
        read1_extension=opts.READ1_EXTENSION,
        read2_extension=opts.READ2_EXTENSION,
        single_end=opts.SINGLE_END,
        sanitise_name=opts.SANITISE_NAME,
        sanitise_name_delimiter=opts.SANITISE_NAME_DELIMITER,
        sanitise_name_index=opts.SANITISE_NAME_INDEX,
    )
kkonganti@101 174
kkonganti@101 175
if __name__ == "__main__":
    # Run the CLI only when executed as a script; main()'s return value is
    # passed to sys.exit() as the process exit status.
    exit_status = main()
    sys.exit(exit_status)