Mercurial > repos > galaxytrakr > hfp_centriflaken_awsbatch
comparison 0.4.2/bin/fastq_dir_to_samplesheet.py @ 0:082e0091e813 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Fri, 29 May 2026 13:27:47 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:082e0091e813 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 import os | |
| 4 import sys | |
| 5 import glob | |
| 6 import argparse | |
| 7 import re | |
| 8 | |
| 9 | |
def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet generator.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes FASTQ_DIR, SAMPLESHEET_FILE,
        STRANDEDNESS, READ1_EXTENSION, READ2_EXTENSION, SINGLE_END,
        SANITISE_NAME, SANITISE_NAME_DELIMITER and SANITISE_NAME_INDEX.
    """
    description = "Generate samplesheet from a directory of FastQ files."
    epilog = "Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
    parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")
    # NOTE: the value is not restricted with `choices` here; the parser
    # accepts any string for --strandedness.
    parser.add_argument(
        "-st",
        "--strandedness",
        type=str,
        dest="STRANDEDNESS",
        default="unstranded",
        help="Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.",
    )
    parser.add_argument(
        "-r1",
        "--read1_extension",
        type=str,
        dest="READ1_EXTENSION",
        default="_R1_001.fastq.gz",
        help="File extension for read 1.",
    )
    parser.add_argument(
        "-r2",
        "--read2_extension",
        type=str,
        dest="READ2_EXTENSION",
        default="_R2_001.fastq.gz",
        help="File extension for read 2.",
    )
    parser.add_argument(
        "-se",
        "--single_end",
        dest="SINGLE_END",
        action="store_true",
        help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
    )
    parser.add_argument(
        "-sn",
        "--sanitise_name",
        dest="SANITISE_NAME",
        action="store_true",
        help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
    )
    parser.add_argument(
        "-sd",
        "--sanitise_name_delimiter",
        type=str,
        dest="SANITISE_NAME_DELIMITER",
        default="_",
        help="Delimiter to use to sanitise sample name.",
    )
    parser.add_argument(
        "-si",
        "--sanitise_name_index",
        type=int,
        dest="SANITISE_NAME_INDEX",
        default=1,
        help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
    )
    return parser.parse_args(args)
| 72 | |
| 73 | |
def fastq_dir_to_samplesheet(
    fastq_dir,
    samplesheet_file,
    strandedness="unstranded",
    read1_extension="_R1_001.fastq.gz",
    read2_extension="_R2_001.fastq.gz",
    single_end=False,
    sanitise_name=False,
    sanitise_name_delimiter="_",
    sanitise_name_index=1,
):
    """Write a CSV samplesheet describing the FastQ files under a directory.

    The directory is searched recursively for files ending in the read 1
    (and, unless ``single_end`` is set, read 2) extension. Files are grouped
    by sample id and one row ``sample,fq1,fq2,strandedness`` is written per
    read 1 file; ``fq2`` is left empty when no read 2 file exists at the same
    position in the sample's sorted file list.

    Args:
        fastq_dir: Directory that is searched recursively for FastQ files.
        samplesheet_file: Path of the CSV file to create. Missing parent
            directories are created.
        strandedness: Value written verbatim into the last column.
        read1_extension: Filename suffix identifying read 1 files.
        read2_extension: Filename suffix identifying read 2 files.
        single_end: When true, read 2 files are not collected at all.
        sanitise_name: When true, derive the sample id by splitting the file
            name on ``sanitise_name_delimiter``.
        sanitise_name_delimiter: Delimiter used when ``sanitise_name`` is set.
        sanitise_name_index: Number of leading delimiter-separated elements
            kept for the sample id; values <= 0 disable the splitting.

    Raises:
        SystemExit: With status 1 when no read 1 file was found; a warning is
            printed and no samplesheet is written.
    """
    # A path is kept only if none of these words occurs anywhere in it
    # (case-insensitive). The pattern is tested against the whole path
    # returned by glob, not just the file name.
    keep_path = re.compile(
        r"^((?!undetermined|unclassified|downloads).)*$", flags=re.IGNORECASE
    )

    def sanitize_sample(path, extension):
        """Retrieve sample id from filename"""
        basename = os.path.basename(path)
        sample = basename.replace(extension, "")
        if sanitise_name and sanitise_name_index > 0:
            # NOTE: the split runs on the full basename (extension included),
            # so an index reaching past the sample portion pulls read/extension
            # tokens into the sample id.
            sample = sanitise_name_delimiter.join(
                basename.split(sanitise_name_delimiter)[:sanitise_name_index]
            )
        return sample

    def get_fastqs(extension):
        """
        Needs to be sorted to ensure R1 and R2 are in the same order
        when merging technical replicates. Glob is not guaranteed to produce
        sorted results.
        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
        """
        pattern = os.path.join(fastq_dir, "**", f"*{extension}")
        return sorted(
            fq for fq in glob.glob(pattern, recursive=True) if keep_path.match(fq)
        )

    read_dict = {}

    ## Get read 1 files
    for read1_file in get_fastqs(read1_extension):
        sample = sanitize_sample(read1_file, read1_extension)
        read_dict.setdefault(sample, {"R1": [], "R2": []})["R1"].append(read1_file)

    ## Get read 2 files
    if not single_end:
        for read2_file in get_fastqs(read2_extension):
            sample = sanitize_sample(read2_file, read2_extension)
            if sample not in read_dict:
                # A read 2 file without a read 1 partner can never produce a
                # row (rows are driven by R1), so report and skip it instead
                # of failing with a bare KeyError.
                print(
                    f"WARNING: Skipping read 2 file without matching read 1 file: {read2_file}",
                    file=sys.stderr,
                )
                continue
            read_dict[sample]["R2"].append(read2_file)

    if not read_dict:
        error_str = (
            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
        )
        error_str += "Please check the values provided for the:\n"
        error_str += " - Path to the directory containing the FastQ files\n"
        error_str += " - '--read1_extension' parameter\n"
        error_str += " - '--read2_extension' parameter\n"
        print(error_str)
        sys.exit(1)

    ## Write to file
    out_dir = os.path.dirname(samplesheet_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(samplesheet_file, "w") as fout:
        header = ["sample", "fq1", "fq2", "strandedness"]
        fout.write(",".join(header) + "\n")
        for sample, reads in sorted(read_dict.items()):
            for idx, read_1 in enumerate(reads["R1"]):
                # R1 and R2 lists are both sorted, so files pair up by position.
                read_2 = reads["R2"][idx] if idx < len(reads["R2"]) else ""
                sample_info = ",".join([sample, read_1, read_2, strandedness])
                fout.write(f"{sample_info}\n")
| 154 | |
| 155 | |
def main(args=None):
    """Command-line entry point: parse arguments and write the samplesheet.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.
    """
    args = parse_args(args)

    valid_strandedness = ("unstranded", "forward", "reverse")
    strandedness = args.STRANDEDNESS
    if strandedness not in valid_strandedness:
        # Keep the historical fallback to "unstranded", but say so instead of
        # silently discarding what the user asked for.
        print(
            f"WARNING: Invalid strandedness '{strandedness}'; using 'unstranded' instead.",
            file=sys.stderr,
        )
        strandedness = "unstranded"

    fastq_dir_to_samplesheet(
        fastq_dir=args.FASTQ_DIR,
        samplesheet_file=args.SAMPLESHEET_FILE,
        strandedness=strandedness,
        read1_extension=args.READ1_EXTENSION,
        read2_extension=args.READ2_EXTENSION,
        single_end=args.SINGLE_END,
        sanitise_name=args.SANITISE_NAME,
        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
        sanitise_name_index=args.SANITISE_NAME_INDEX,
    )
| 174 | |
| 175 | |
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
