kkonganti@0
|
1 #!/usr/bin/env python3
|
kkonganti@0
|
2
|
kkonganti@0
|
3 import argparse
|
kkonganti@0
|
4 import errno
|
kkonganti@0
|
5 import os
|
kkonganti@0
|
6 import sys
|
kkonganti@0
|
7
|
kkonganti@0
|
8
|
kkonganti@0
|
def parse_args(args=None):
    """Parse the command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings. It is handed to
            ``ArgumentParser.parse_args`` unchanged, so ``None`` makes
            argparse read the process command line.

    Returns:
        The parsed namespace, exposing the two positional arguments as
        ``FILE_IN`` and ``FILE_OUT``.
    """
    cli = argparse.ArgumentParser(
        description="Reformat samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    # Both arguments are required positionals; register them from a table.
    positionals = (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    return cli.parse_args(args)
|
kkonganti@0
|
17
|
kkonganti@0
|
18
|
kkonganti@0
|
def make_dir(path):
    """Create directory *path*, including any missing parent directories.

    An empty *path* is a no-op, so the result of ``os.path.dirname()`` on a
    bare file name can be passed straight in. An already existing directory
    is not an error.

    Args:
        path: Directory to create; may be an empty string.

    Raises:
        OSError: If the directory cannot be created. This includes
            ``FileExistsError`` when *path* exists but is not a directory.
    """
    if path:
        # exist_ok=True replaces the manual errno.EEXIST check and still
        # fails loudly when a non-directory is in the way.
        os.makedirs(path, exist_ok=True)
|
kkonganti@0
|
26
|
kkonganti@0
|
27
|
kkonganti@0
|
def print_error(error, context="Line", context_str=""):
    """Print a samplesheet error message and exit with status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending item (for example ``"Line"``).
        context_str: The offending item itself. The context line is only
            appended when both *context* and *context_str* are non-empty.

    Raises:
        SystemExit: Always, with exit code 1, after the message is printed.
    """
    message = f"ERROR: Please check samplesheet -> {error}"
    has_context = context != "" and context_str != ""
    if has_context:
        message += f"\n{context.strip()}: '{context_str.strip()}'"
    print(message)
    sys.exit(1)
|
kkonganti@0
|
34
|
kkonganti@0
|
35
|
kkonganti@0
|
def check_samplesheet(file_in, file_out):
    """
    Validate a samplesheet and write a reformatted copy of it.

    The input samplesheet must follow this structure:

    sample,fq1,fq2,strandedness
    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,forward
    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz,forward
    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq,,forward
    SAMPLE_SE,SAMPLE_SE_RUN1_2.fastq.gz,,forward

    For an example see:
    https://github.com/nf-core/test-datasets/blob/rnaseq/samplesheet/v3.1/samplesheet_test.csv

    The output has the columns ``sample,single_end,fq1,fq2,strandedness``.
    ``single_end`` is "0" when both fq1 and fq2 are given and "1" when only
    fq1 is given. Each run of a sample is written as ``<sample>_T<n>``,
    numbered from 1 in input order, with samples sorted by name.

    Args:
        file_in: Path of the samplesheet to validate (read as UTF-8, an
            optional byte-order mark is ignored).
        file_out: Path of the reformatted samplesheet; its parent directory
            is created when missing.

    Raises:
        SystemExit: With exit code 1, after an error message has been
            printed, as soon as any check fails.
    """
    MIN_COLS = 3
    HEADER = ["sample", "fq1", "fq2", "strandedness"]
    strandednesses = ["unstranded", "forward", "reverse"]

    sample_mapping_dict = {}
    with open(file_in, "r", encoding="utf-8-sig") as fin:

        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}"
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            if not line.strip():
                continue  ## Blank lines are ignored
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    f"Invalid number of columns (minimum = {len(HEADER)})!",
                    "Line",
                    line,
                )

            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    f"Invalid number of populated columns (minimum = {MIN_COLS})!",
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, fq1, fq2, strandedness = lspl[: len(HEADER)]
            if " " in sample:
                print(
                    f"WARNING: Spaces have been replaced by underscores for sample: {sample}"
                )
                sample = sample.replace(" ", "_")
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)

            ## Check FastQ file names (the file extension is deliberately not enforced)
            for fastq in [fq1, fq2]:
                if fastq and " " in fastq:
                    print_error("FastQ file contains spaces!", "Line", line)

            ## Check strandedness
            if strandedness:
                if strandedness not in strandednesses:
                    print_error(
                        f"Strandedness must be one of '{', '.join(strandednesses)}'!",
                        "Line",
                        line,
                    )
            else:
                print_error(
                    f"Strandedness has not been specified! Must be one of {', '.join(strandednesses)}.",
                    "Line",
                    line,
                )

            ## Auto-detect paired-end/single-end
            sample_info = []  ## [single_end, fq1, fq2, strandedness]
            if sample and fq1 and fq2:  ## Paired-end short reads
                sample_info = ["0", fq1, fq2, strandedness]
            elif sample and fq1 and not fq2:  ## Single-end short reads
                sample_info = ["1", fq1, fq2, strandedness]
            else:
                print_error("Invalid combination of columns provided!", "Line", line)

            ## Create sample mapping dictionary = {sample: [[ single_end, fq1, fq2, strandedness ]]}
            runs = sample_mapping_dict.setdefault(sample, [])
            if sample_info in runs:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            runs.append(sample_info)

    if not sample_mapping_dict:
        ## Label and value are passed separately so the file name is actually reported
        print_error("No entries to process!", "Samplesheet", file_in)

    ## Validate every sample before touching the output, so a failing
    ## samplesheet never leaves a partially written file behind
    sorted_samples = sorted(sample_mapping_dict)
    for sample in sorted_samples:
        runs = sample_mapping_dict[sample]

        ## Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
        if any(run[0] != runs[0][0] for run in runs):
            print_error(
                "Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end!",
                "Sample",
                sample,
            )

        ## Check that multiple runs of the same sample are of the same strandedness
        if any(run[-1] != runs[0][-1] for run in runs):
            print_error(
                "Multiple runs of a sample must have the same strandedness!",
                "Sample",
                sample,
            )

    ## Write validated samplesheet with appropriate columns
    make_dir(os.path.dirname(file_out))
    ## The input is read as UTF-8, so write UTF-8 regardless of the platform default
    with open(file_out, "w", encoding="utf-8") as fout:
        fout.write(
            ",".join(["sample", "single_end", "fq1", "fq2", "strandedness"]) + "\n"
        )
        for sample in sorted_samples:
            for idx, val in enumerate(sample_mapping_dict[sample], start=1):
                fout.write(",".join([f"{sample}_T{idx}"] + val) + "\n")
|
kkonganti@0
|
180
|
kkonganti@0
|
181
|
kkonganti@0
|
def main(args=None):
    """Parse the command line and run the samplesheet check.

    Args:
        args: Optional list of argument strings, forwarded to
            ``parse_args``.
    """
    options = parse_args(args)
    check_samplesheet(file_in=options.FILE_IN, file_out=options.FILE_OUT)
|
kkonganti@0
|
185
|
kkonganti@0
|
186
|
kkonganti@0
|
# Run the checker only when this file is executed as a script, and hand
# whatever main() returns to sys.exit() as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|