comparison 0.5.0/bin/fastq_dir_to_samplesheet.py @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents
children
comparison
equal deleted inserted replaced
0:a4b1ee4b68b1 1:365849f031fd
1 #!/usr/bin/env python3
2
3 import os
4 import sys
5 import glob
6 import argparse
7 import re
8
9
def parse_args(args=None):
    """Build the command-line parser and parse ``args``.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes FASTQ_DIR, SAMPLESHEET_FILE,
        STRANDEDNESS, READ1_EXTENSION, READ2_EXTENSION, SINGLE_END,
        SANITISE_NAME, SANITISE_NAME_DELIMITER and SANITISE_NAME_INDEX.
    """
    cli = argparse.ArgumentParser(
        description="Generate samplesheet from a directory of FastQ files.",
        epilog="Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>",
    )

    # Required positionals, registered in the order they must be supplied.
    positionals = (
        ("FASTQ_DIR", "Folder containing raw FastQ files."),
        ("SAMPLESHEET_FILE", "Output samplesheet file."),
    )
    for name, help_text in positionals:
        cli.add_argument(name, help=help_text)

    # Optional flags as (option strings, add_argument keyword settings).
    # The order here is the order shown in ``--help``.
    option_specs = (
        (
            ("-st", "--strandedness"),
            {
                "type": str,
                "dest": "STRANDEDNESS",
                "default": "unstranded",
                "help": "Value for 'strandedness' in samplesheet. Must be one of 'unstranded', 'forward', 'reverse'.",
            },
        ),
        (
            ("-r1", "--read1_extension"),
            {
                "type": str,
                "dest": "READ1_EXTENSION",
                "default": "_R1_001.fastq.gz",
                "help": "File extension for read 1.",
            },
        ),
        (
            ("-r2", "--read2_extension"),
            {
                "type": str,
                "dest": "READ2_EXTENSION",
                "default": "_R2_001.fastq.gz",
                "help": "File extension for read 2.",
            },
        ),
        (
            ("-se", "--single_end"),
            {
                "dest": "SINGLE_END",
                "action": "store_true",
                "help": "Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
            },
        ),
        (
            ("-sn", "--sanitise_name"),
            {
                "dest": "SANITISE_NAME",
                "action": "store_true",
                "help": "Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
            },
        ),
        (
            ("-sd", "--sanitise_name_delimiter"),
            {
                "type": str,
                "dest": "SANITISE_NAME_DELIMITER",
                "default": "_",
                "help": "Delimiter to use to sanitise sample name.",
            },
        ),
        (
            ("-si", "--sanitise_name_index"),
            {
                "type": int,
                "dest": "SANITISE_NAME_INDEX",
                "default": 1,
                "help": "After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
            },
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli.parse_args(args)
72
73
def fastq_dir_to_samplesheet(
    fastq_dir,
    samplesheet_file,
    strandedness="unstranded",
    read1_extension="_R1_001.fastq.gz",
    read2_extension="_R2_001.fastq.gz",
    single_end=False,
    sanitise_name=False,
    sanitise_name_delimiter="_",
    sanitise_name_index=1,
):
    """Write a CSV samplesheet describing the FastQ files under ``fastq_dir``.

    ``fastq_dir`` is searched recursively for files ending in
    ``read1_extension`` (and ``read2_extension`` unless ``single_end`` is
    set). Files whose path *below* ``fastq_dir`` contains "undetermined",
    "unclassified" or "downloads" (case-insensitive) are ignored. Files are
    grouped by sample id; within a sample, the i-th read 1 file is paired
    with the i-th read 2 file (both lists are sorted).

    Args:
        fastq_dir: Directory to search for FastQ files.
        samplesheet_file: Path of the CSV file to write. Missing parent
            directories are created.
        strandedness: Value written verbatim to the ``strandedness`` column.
        read1_extension: File name suffix identifying read 1 files.
        read2_extension: File name suffix identifying read 2 files.
        single_end: If true, read 2 files are not collected and the ``fq2``
            column is left empty.
        sanitise_name: If true and ``sanitise_name_index`` > 0, the sample id
            is shortened to the first ``sanitise_name_index`` fields obtained
            by splitting the extension-stripped file name on
            ``sanitise_name_delimiter``.
        sanitise_name_delimiter: Delimiter used when ``sanitise_name`` is set.
        sanitise_name_index: Number of leading fields to keep (1-based).

    Raises:
        SystemExit: With exit code 1 when no read 1 FastQ file is found; a
            diagnostic is printed and no samplesheet is written.
    """
    # Matches only strings that do NOT contain any of the excluded words.
    allowed_path = re.compile(
        r"^((?!undetermined|unclassified|downloads).)*$", flags=re.IGNORECASE
    )

    def sanitize_sample(path, extension):
        """Retrieve sample id from filename"""
        sample = os.path.basename(path)
        # Strip the extension only where it is a suffix, so an identical
        # substring earlier in the file name is left intact.
        if extension and sample.endswith(extension):
            sample = sample[: -len(extension)]
        if sanitise_name and sanitise_name_index > 0:
            # Split the extension-stripped name so the extension (and the
            # read designator it carries) can never leak into the sample id
            # and make read 1 / read 2 ids disagree.
            sample = sanitise_name_delimiter.join(
                sample.split(sanitise_name_delimiter)[:sanitise_name_index]
            )
        return sample

    def get_fastqs(extension):
        """
        Needs to be sorted to ensure R1 and R2 are in the same order
        when merging technical replicates. Glob is not guaranteed to produce
        sorted results.
        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
        """
        pattern = os.path.join(fastq_dir, "**", f"*{extension}")
        # Filter on the path relative to fastq_dir: otherwise an input
        # directory that itself lives under e.g. ".../Downloads/" would
        # cause every single file to be rejected.
        return sorted(
            fq
            for fq in glob.glob(pattern, recursive=True)
            if allowed_path.match(os.path.relpath(fq, fastq_dir))
        )

    read_dict = {}

    ## Get read 1 files
    for read1_file in get_fastqs(read1_extension):
        sample = sanitize_sample(read1_file, read1_extension)
        read_dict.setdefault(sample, {"R1": [], "R2": []})["R1"].append(read1_file)

    ## Get read 2 files
    if not single_end:
        for read2_file in get_fastqs(read2_extension):
            sample = sanitize_sample(read2_file, read2_extension)
            if sample not in read_dict:
                # Rows are driven by read 1 files, so an orphan read 2 file
                # cannot be emitted; skip it instead of raising KeyError.
                print(
                    f"WARNING: Skipping read 2 file without a matching read 1 file: {read2_file}",
                    file=sys.stderr,
                )
                continue
            read_dict[sample]["R2"].append(read2_file)

    ## Write to file
    if read_dict:
        out_dir = os.path.dirname(samplesheet_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(samplesheet_file, "w") as fout:
            header = ["sample", "fq1", "fq2", "strandedness"]
            fout.write(",".join(header) + "\n")
            for sample, reads in sorted(read_dict.items()):
                read2_files = reads["R2"]
                for idx, read_1 in enumerate(reads["R1"]):
                    # Leave fq2 empty when there is no read 2 at this position.
                    read_2 = read2_files[idx] if idx < len(read2_files) else ""
                    sample_info = ",".join([sample, read_1, read_2, strandedness])
                    fout.write(f"{sample_info}\n")
    else:
        error_str = (
            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
        )
        error_str += "Please check the values provided for the:\n"
        error_str += "  - Path to the directory containing the FastQ files\n"
        error_str += "  - '--read1_extension' parameter\n"
        error_str += "  - '--read2_extension' parameter\n"
        print(error_str)
        sys.exit(1)
154
155
def main(args=None):
    """Command-line entry point: parse arguments and write the samplesheet.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.

    An unrecognised ``--strandedness`` value is replaced by "unstranded";
    a warning is written to stderr so the substitution is not silent.
    """
    args = parse_args(args)

    valid_strandedness = ("unstranded", "forward", "reverse")
    strandedness = args.STRANDEDNESS
    if strandedness not in valid_strandedness:
        print(
            f"WARNING: Unrecognised strandedness '{strandedness}'; "
            "using 'unstranded' instead.",
            file=sys.stderr,
        )
        strandedness = "unstranded"

    fastq_dir_to_samplesheet(
        fastq_dir=args.FASTQ_DIR,
        samplesheet_file=args.SAMPLESHEET_FILE,
        strandedness=strandedness,
        read1_extension=args.READ1_EXTENSION,
        read2_extension=args.READ2_EXTENSION,
        single_end=args.SINGLE_END,
        sanitise_name=args.SANITISE_NAME,
        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
        sanitise_name_index=args.SANITISE_NAME_INDEX,
    )
174
175
# Run the CLI only when executed as a script, and hand main()'s return
# value to the interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)