annotate 1.0.0/bin/fastq_dir_to_samplesheet.py @ 0:0a8dda29956e draft default tip

planemo upload
author galaxytrakr
date Thu, 28 May 2026 20:41:10 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
2
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
3 import os
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
4 import sys
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
5 import glob
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
6 import argparse
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
7 import re
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
8
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
9
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
def parse_args(args=None):
    """Parse the command line for the samplesheet generator.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes FASTQ_DIR, SAMPLESHEET_FILE,
        STRANDEDNESS, READ1_EXTENSION, READ2_EXTENSION, SINGLE_END,
        SANITISE_NAME, SANITISE_NAME_DELIMITER and SANITISE_NAME_INDEX.
    """
    parser = argparse.ArgumentParser(
        description="Generate samplesheet from a directory of FastQ files.",
        epilog="Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>",
    )

    # Required positional arguments: (name, help text).
    positionals = (
        ("FASTQ_DIR", "Folder containing raw FastQ files."),
        ("SAMPLESHEET_FILE", "Output samplesheet file."),
    )
    for name, help_text in positionals:
        parser.add_argument(name, help=help_text)

    # Optional arguments: (short flag, long flag, add_argument keywords).
    # The order here is the order in which they appear in ``--help``.
    options = (
        (
            "-st",
            "--strandedness",
            {
                "type": str,
                "dest": "STRANDEDNESS",
                "default": "unstranded",
                "help": "Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.",
            },
        ),
        (
            "-r1",
            "--read1_extension",
            {
                "type": str,
                "dest": "READ1_EXTENSION",
                "default": "_R1_001.fastq.gz",
                "help": "File extension for read 1.",
            },
        ),
        (
            "-r2",
            "--read2_extension",
            {
                "type": str,
                "dest": "READ2_EXTENSION",
                "default": "_R2_001.fastq.gz",
                "help": "File extension for read 2.",
            },
        ),
        (
            "-se",
            "--single_end",
            {
                "dest": "SINGLE_END",
                "action": "store_true",
                "help": "Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
            },
        ),
        (
            "-sn",
            "--sanitise_name",
            {
                "dest": "SANITISE_NAME",
                "action": "store_true",
                "help": "Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
            },
        ),
        (
            "-sd",
            "--sanitise_name_delimiter",
            {
                "type": str,
                "dest": "SANITISE_NAME_DELIMITER",
                "default": "_",
                "help": "Delimiter to use to sanitise sample name.",
            },
        ),
        (
            "-si",
            "--sanitise_name_index",
            {
                "type": int,
                "dest": "SANITISE_NAME_INDEX",
                "default": 1,
                "help": "After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
            },
        ),
    )
    for short_flag, long_flag, settings in options:
        parser.add_argument(short_flag, long_flag, **settings)

    return parser.parse_args(args)
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
72
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
73
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
def fastq_dir_to_samplesheet(
    fastq_dir,
    samplesheet_file,
    strandedness="unstranded",
    read1_extension="_R1_001.fastq.gz",
    read2_extension="_R2_001.fastq.gz",
    single_end=False,
    sanitise_name=False,
    sanitise_name_delimiter="_",
    sanitise_name_index=1,
):
    """Write a CSV samplesheet for the FastQ files found under ``fastq_dir``.

    The directory is searched recursively for files ending in
    ``read1_extension`` (and ``read2_extension`` unless ``single_end``).
    Files are grouped by sample id and written as rows with the columns
    ``sample,fq1,fq2,strandedness``.

    Args:
        fastq_dir: Directory to search; taken literally (glob
            metacharacters in it are escaped).
        samplesheet_file: Output path; missing parent directories are
            created.
        strandedness: Value written verbatim in the last column of each row.
        read1_extension: Filename suffix identifying read 1 files. It is
            inserted into the glob pattern as-is.
        read2_extension: Filename suffix identifying read 2 files.
        single_end: When true, read 2 files are not collected and ``fq2``
            is left empty.
        sanitise_name: When true, shorten the sample id using the two
            parameters below.
        sanitise_name_delimiter: Delimiter used to split the
            extension-stripped file name.
        sanitise_name_index: Number of leading delimiter-separated fields
            kept for the sample id; values <= 0 leave the id unchanged.

    Raises:
        SystemExit: With status 1 (after printing a message) when no read 1
            files are found; no samplesheet is written in that case.

    Notes:
        Read 2 files are paired with read 1 files of the same sample by
        their position in the sorted file lists, not by name, so a missing
        read 2 file among several replicates can shift the pairing.
    """
    # Matches only paths that contain none of the listed words (case-insensitive).
    # The check is applied to the complete path returned by glob, so it
    # also covers directory names, including ``fastq_dir`` itself.
    allowed_path = re.compile(
        "^((?!undetermined|unclassified|downloads).)*$", flags=re.IGNORECASE
    )

    def sanitize_sample(path, extension):
        """Retrieve sample id from filename"""
        sample = os.path.basename(path)
        # Remove the extension only where it is a suffix; str.replace would
        # also delete occurrences in the middle of the name.
        if extension and sample.endswith(extension):
            sample = sample[: -len(extension)]
        if sanitise_name and sanitise_name_index > 0:
            # Split the extension-stripped name so that R1 and R2 files of
            # one sample always yield the same id, whatever the index.
            fields = sample.split(sanitise_name_delimiter)
            sample = sanitise_name_delimiter.join(fields[:sanitise_name_index])
        return sample

    def get_fastqs(extension):
        """
        Needs to be sorted to ensure R1 and R2 are in the same order
        when merging technical replicates. Glob is not guaranteed to produce
        sorted results.
        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
        """
        # Escape the directory so characters such as "[" in its name are
        # not interpreted as glob syntax.
        pattern = os.path.join(glob.escape(fastq_dir), "**", f"*{extension}")
        return sorted(
            fq for fq in glob.glob(pattern, recursive=True) if allowed_path.match(fq)
        )

    read_dict = {}

    ## Get read 1 files
    for read1_file in get_fastqs(read1_extension):
        sample = sanitize_sample(read1_file, read1_extension)
        read_dict.setdefault(sample, {"R1": [], "R2": []})["R1"].append(read1_file)

    ## Get read 2 files
    if not single_end:
        for read2_file in get_fastqs(read2_extension):
            sample = sanitize_sample(read2_file, read2_extension)
            if sample not in read_dict:
                # Rows are driven by read 1 files, so an unpaired read 2
                # file cannot be represented; report it instead of
                # failing with a KeyError.
                print(
                    f"WARNING: Skipping read 2 file without a matching read 1 file: {read2_file}",
                    file=sys.stderr,
                )
                continue
            read_dict[sample]["R2"].append(read2_file)

    if not read_dict:
        error_str = (
            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
        )
        error_str += "Please check the values provided for the:\n"
        error_str += " - Path to the directory containing the FastQ files\n"
        error_str += " - '--read1_extension' parameter\n"
        error_str += " - '--read2_extension' parameter\n"
        print(error_str)
        sys.exit(1)

    ## Write to file
    out_dir = os.path.dirname(samplesheet_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(samplesheet_file, "w") as fout:
        header = ["sample", "fq1", "fq2", "strandedness"]
        fout.write(",".join(header) + "\n")
        for sample, reads in sorted(read_dict.items()):
            for idx, read_1 in enumerate(reads["R1"]):
                # Fewer R2 than R1 files: leave fq2 empty for the remainder.
                read_2 = reads["R2"][idx] if idx < len(reads["R2"]) else ""
                sample_info = ",".join([sample, read_1, read_2, strandedness])
                fout.write(f"{sample_info}\n")
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
154
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
155
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
def main(args=None):
    """Command-line entry point: parse arguments and write the samplesheet.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``;
            ``None`` makes argparse read ``sys.argv``.

    Returns:
        None.
    """
    args = parse_args(args)

    strandedness = args.STRANDEDNESS
    if strandedness not in ("unstranded", "forward", "reverse"):
        # Keep the historical fallback to "unstranded", but tell the user
        # that the supplied value was not used.
        print(
            f"WARNING: Invalid strandedness '{strandedness}'; using 'unstranded' instead.",
            file=sys.stderr,
        )
        strandedness = "unstranded"

    fastq_dir_to_samplesheet(
        fastq_dir=args.FASTQ_DIR,
        samplesheet_file=args.SAMPLESHEET_FILE,
        strandedness=strandedness,
        read1_extension=args.READ1_EXTENSION,
        read2_extension=args.READ2_EXTENSION,
        single_end=args.SINGLE_END,
        sanitise_name=args.SANITISE_NAME,
        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
        sanitise_name_index=args.SANITISE_NAME_INDEX,
    )
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
174
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
175
0a8dda29956e planemo upload
galaxytrakr
parents:
diff changeset
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)