annotate 0.2.0/bin/check_samplesheet.py @ 0:9e8b1c747a6a draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:32:17 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
2
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
3 import os
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
4 import sys
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
5 import errno
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
6 import argparse
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
7
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
8
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings. It is handed straight to
            ``ArgumentParser.parse_args``, so ``None`` means "use the
            process command line".

    Returns:
        The ``argparse.Namespace`` holding ``FILE_IN`` and ``FILE_OUT``.
    """
    cli = argparse.ArgumentParser(
        description="Reformat samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    # Both arguments are required positionals, declared in CLI order.
    positionals = (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    )
    for dest, help_text in positionals:
        cli.add_argument(dest, help=help_text)
    return cli.parse_args(args)
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
17
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
18
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
def make_dir(path):
    """Create directory ``path`` (and any missing parents) if it is non-empty.

    An empty ``path`` is a no-op, which lets callers pass the result of
    ``os.path.dirname`` for a bare file name. An already existing directory
    is accepted silently.

    Args:
        path: Directory to create, or ``""`` to do nothing.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: For any other failure reported by ``os.makedirs``.
    """
    if not path:
        return
    # exist_ok=True tolerates an existing directory but, unlike swallowing
    # every EEXIST, still fails loudly when the path is a regular file.
    os.makedirs(path, exist_ok=True)
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
26
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
27
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
def print_error(error, context="Line", context_str=""):
    """Print a samplesheet error message and terminate with exit status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending item (e.g. "Line" or "Sample").
        context_str: The offending text. The context line is only printed
            when neither ``context`` nor ``context_str`` is the empty string.

    Raises:
        SystemExit: Always, with code 1.
    """
    message = f"ERROR: Please check samplesheet -> {error}"
    missing_context = context == "" or context_str == ""
    if not missing_context:
        message = message + f"\n{context.strip()}: '{context_str.strip()}'"
    print(message)
    sys.exit(1)
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
34
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
35
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
def check_samplesheet(file_in, file_out):
    """
    Validate a samplesheet and write a reformatted copy of it.

    The input must be comma-separated and follow this structure:

    sample,fq1,fq2,strandedness
    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,forward
    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz,forward
    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq,,forward
    SAMPLE_SE,SAMPLE_SE_RUN1_2.fastq.gz,,forward

    For an example see:
    https://github.com/nf-core/test-datasets/blob/rnaseq/samplesheet/v3.1/samplesheet_test.csv

    The output has the columns ``sample,single_end,fq1,fq2,strandedness``.
    ``single_end`` is "1" when ``fq2`` is empty and "0" otherwise. Rows are
    grouped by sample (sorted by name) and every run of a sample is written
    as ``<sample>_T<n>``, with ``n`` counting from 1 in input order.

    Every validation failure is reported via ``print_error`` (or an
    equivalent print + ``sys.exit(1)`` for the header), so the function does
    not return on invalid input. All validation happens before the output
    file is created, so a failed run leaves no partial output behind.

    Args:
        file_in: Path of the samplesheet to validate.
        file_out: Path of the validated samplesheet to write; its parent
            directory is created if needed.
    """
    MIN_COLS = 3  # minimum number of non-empty fields in a row
    HEADER = ["sample", "fq1", "fq2", "strandedness"]
    strandednesses = ["unstranded", "forward", "reverse"]

    ## sample_mapping_dict = {sample: [[single_end, fq1, fq2, strandedness], ...]}
    sample_mapping_dict = {}

    ## "utf-8-sig" drops a leading byte-order mark so it cannot corrupt the header check
    with open(file_in, "r", encoding="utf-8-sig") as fin:

        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            if not line.strip():
                continue  ## blank lines are ignored

            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    f"Invalid number of columns (minimum = {len(HEADER)})!",
                    "Line",
                    line,
                )

            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    f"Invalid number of populated columns (minimum = {MIN_COLS})!",
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, fq1, fq2, strandedness = lspl[: len(HEADER)]
            if sample.find(" ") != -1:
                print(
                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
                )
                sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file paths.
            ## The file extension is deliberately not enforced: uncompressed
            ## ".fastq" files are accepted (see the docstring example).
            for fastq in [fq1, fq2]:
                if fastq and fastq.find(" ") != -1:
                    print_error("FastQ file contains spaces!", "Line", line)

            ## Check strandedness
            if not strandedness:
                print_error(
                    f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.",
                    "Line",
                    line,
                )
            if strandedness not in strandednesses:
                print_error(
                    f"Strandedness must be one of '{', '.join(strandednesses)}'!",
                    "Line",
                    line,
                )

            ## Auto-detect paired-end/single-end (sample is known to be non-empty here)
            if not fq1:
                print_error("Invalid combination of columns provided!", "Line", line)
            single_end = "0" if fq2 else "1"
            sample_info = [single_end, fq1, fq2, strandedness]

            ## Record the run, rejecting exact duplicates within a sample
            runs = sample_mapping_dict.setdefault(sample, [])
            if sample_info in runs:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            runs.append(sample_info)

    if not sample_mapping_dict:
        ## Pass the file name as context_str: print_error only prints the
        ## context line when both the label and the text are non-empty.
        print_error("No entries to process!", "Samplesheet", file_in)

    ## Validate every sample group before touching the output file
    samples = sorted(sample_mapping_dict)
    for sample in samples:
        runs = sample_mapping_dict[sample]
        first_run = runs[0]

        ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
        if not all(run[0] == first_run[0] for run in runs):
            print_error(
                "Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
                "Sample",
                sample,
            )

        ## Check that multiple runs of the same sample are of the same strandedness
        if not all(run[-1] == first_run[-1] for run in runs):
            print_error(
                "Multiple runs of a sample must have the same strandedness!",
                "Sample",
                sample,
            )

    ## Write validated samplesheet with appropriate columns
    make_dir(os.path.dirname(file_out))
    with open(file_out, "w") as fout:
        fout.write(
            ",".join(["sample", "single_end", "fq1", "fq2", "strandedness"]) + "\n"
        )
        for sample in samples:
            for idx, val in enumerate(sample_mapping_dict[sample], start=1):
                fout.write(",".join([f"{sample}_T{idx}"] + val) + "\n")
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
177
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
178
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
def main(args=None):
    """Script entry point: parse the arguments and check the samplesheet.

    Args:
        args: Optional list of argument strings forwarded to ``parse_args``.
    """
    options = parse_args(args)
    file_in, file_out = options.FILE_IN, options.FILE_OUT
    check_samplesheet(file_in, file_out)
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
182
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
183
9e8b1c747a6a planemo upload
galaxytrakr
parents:
diff changeset
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the exit status.
    exit_status = main()
    sys.exit(exit_status)