jpayne@68
|
1 #!/usr/bin/env python
|
jpayne@68
|
2 # -*- coding: utf-8 -*-
|
jpayne@68
|
3
|
jpayne@68
|
4 """
|
jpayne@68
|
5 stand alone read qc script based on jgi-rqc-pipeline/readqc/readqc.py v8.3.6
|
jpayne@68
|
6
|
jpayne@68
|
7 Command
|
jpayne@68
|
8 $ readqc.py -f FASTQ.GZ -o OUT_DIR --skip-blast -html
|
jpayne@68
|
9
|
jpayne@68
|
10 Outputs
|
jpayne@68
|
11 - normal QC outputs + index.html
|
jpayne@68
|
12
|
jpayne@68
|
13 Created: March 15, 2018
|
jpayne@68
|
14
|
jpayne@68
|
15 Shijie Yao (syao@lbl.gov)
|
jpayne@68
|
16
|
jpayne@68
|
17 Revision:
|
jpayne@68
|
18
|
jpayne@68
|
19 """
|
jpayne@68
|
20
|
jpayne@68
|
21 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
22 ## libraries to use
|
jpayne@68
|
23 import os
|
jpayne@68
|
24 import sys
|
jpayne@68
|
25 import argparse
|
jpayne@68
|
26 import datetime
|
jpayne@68
|
27 import shutil
|
jpayne@68
|
28
|
jpayne@68
|
29 SRC_ROOT = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
|
jpayne@68
|
30 sys.path.append(SRC_ROOT + "/lib") # common
|
jpayne@68
|
31
|
jpayne@68
|
32 from readqc_constants import RQCReadQcConfig, RQCReadQc, ReadqcStats
|
jpayne@68
|
33 from common import get_logger, get_status, append_rqc_stats, append_rqc_file, set_colors, get_subsample_rate,run_command
|
jpayne@68
|
34 from readqc_utils import *
|
jpayne@68
|
35 #from readqc_utils import checkpoint_step_wrapper, fast_subsample_fastq_sequences, write_unique_20_mers, illumina_read_gc
|
jpayne@68
|
36 from rqc_fastq import get_working_read_length, read_count
|
jpayne@68
|
37 from readqc_report import *
|
jpayne@68
|
38 from os_utility import make_dir
|
jpayne@68
|
39 from html_utility import html_tag, html_th, html_tr, html_link
|
jpayne@68
|
40 from rqc_utility import get_dict_obj, pipeline_val
|
jpayne@68
|
41
|
jpayne@68
|
42 VERSION = "1.0.0"
|
jpayne@68
|
43 LOG_LEVEL = "DEBUG"
|
jpayne@68
|
44 SCRIPT_NAME = __file__
|
jpayne@68
|
45
|
jpayne@68
|
46 PYDIR = os.path.abspath(os.path.dirname(__file__))
|
jpayne@68
|
47 BBDIR = os.path.join(PYDIR, os.path.pardir)
|
jpayne@68
|
48
|
jpayne@68
|
49 color = {}
|
jpayne@68
|
50 color = set_colors(color, True)
|
jpayne@68
|
51
|
jpayne@68
|
52 """
|
jpayne@68
|
53 STEP1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
54
|
jpayne@68
|
55 """
|
jpayne@68
|
56 def do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log):
|
jpayne@68
|
57 log.info("\n\n%sSTEP1 - Subsampling reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
58
|
jpayne@68
|
59 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
60
|
jpayne@68
|
61 status = "1_illumina_readqc_subsampling in progress"
|
jpayne@68
|
62 checkpoint_step_wrapper(status)
|
jpayne@68
|
63 log.info("1_illumina_readqc_subsampling in progress.")
|
jpayne@68
|
64
|
jpayne@68
|
65 inputReadNum = 0
|
jpayne@68
|
66 sampleRate = 0.0
|
jpayne@68
|
67
|
jpayne@68
|
68 if not skipSubsampling:
|
jpayne@68
|
69 # sampleRate = RQCReadQc.ILLUMINA_SAMPLE_PCTENTAGE ## 0.01
|
jpayne@68
|
70 inputReadNum = read_count(fastq) ## read count from the original fastq
|
jpayne@68
|
71 assert inputReadNum > 0, "ERROR: invalid input fastq"
|
jpayne@68
|
72 sampleRate = get_subsample_rate(inputReadNum)
|
jpayne@68
|
73 log.info("Subsampling rate = %s", sampleRate)
|
jpayne@68
|
74 else:
|
jpayne@68
|
75 log.info("1_illumina_readqc_subsampling: skip subsampling. Use all the reads.")
|
jpayne@68
|
76 sampleRate = 1.0
|
jpayne@68
|
77
|
jpayne@68
|
78 retCode = None
|
jpayne@68
|
79 totalReadNum = 0
|
jpayne@68
|
80 firstSubsampledFastqFileName = ""
|
jpayne@68
|
81
|
jpayne@68
|
82 sequnitFileName = os.path.basename(fastq)
|
jpayne@68
|
83 sequnitFileName = sequnitFileName.replace(".fastq", "").replace(".gz", "")
|
jpayne@68
|
84
|
jpayne@68
|
85 firstSubsampledFastqFileName = sequnitFileName + ".s" + str(sampleRate) + ".fastq"
|
jpayne@68
|
86
|
jpayne@68
|
87 retCode, firstSubsampledFastqFileName, totalBaseCount, totalReadNum, subsampledReadNum, bIsPaired, readLength = fast_subsample_fastq_sequences(fastq, firstSubsampledFastqFileName, sampleRate, True, log)
|
jpayne@68
|
88
|
jpayne@68
|
89 append_rqc_stats(statsFile, "SUBSAMPLE_RATE", sampleRate, log)
|
jpayne@68
|
90
|
jpayne@68
|
91 if retCode in (RQCExitCodes.JGI_FAILURE, -2):
|
jpayne@68
|
92 log.info("1_illumina_readqc_subsampling failed.")
|
jpayne@68
|
93 status = "1_illumina_readqc_subsampling failed"
|
jpayne@68
|
94 checkpoint_step_wrapper(status)
|
jpayne@68
|
95
|
jpayne@68
|
96 else:
|
jpayne@68
|
97 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT, totalBaseCount, log)
|
jpayne@68
|
98 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_COUNT, totalReadNum, log)
|
jpayne@68
|
99
|
jpayne@68
|
100 log.info("1_illumina_readqc_subsampling complete.")
|
jpayne@68
|
101 status = "1_illumina_readqc_subsampling complete"
|
jpayne@68
|
102 checkpoint_step_wrapper(status)
|
jpayne@68
|
103
|
jpayne@68
|
104
|
jpayne@68
|
105 return status, firstSubsampledFastqFileName, totalReadNum, subsampledReadNum, bIsPaired, readLength
|
jpayne@68
|
106
|
jpayne@68
|
107
|
jpayne@68
|
108 """
|
jpayne@68
|
109 STEP2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
110
|
jpayne@68
|
111 """
|
jpayne@68
|
112 def do_write_unique_20_mers(fastq, totalReadCount, log):
|
jpayne@68
|
113 log.info("\n\n%sSTEP2 - Sampling unique 25 mers <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
114
|
jpayne@68
|
115 if totalReadCount >= RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ * 2: ## 25000
|
jpayne@68
|
116 log.debug("read count total in step2 = %s", totalReadCount)
|
jpayne@68
|
117 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
118 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
119
|
jpayne@68
|
120 status = "2_unique_mers_sampling in progress"
|
jpayne@68
|
121 checkpoint_step_wrapper(status)
|
jpayne@68
|
122 log.info(status)
|
jpayne@68
|
123
|
jpayne@68
|
124 retCode, newDataFile, newPngPlotFile, newHtmlPlotFile = write_unique_20_mers(fastq, log)
|
jpayne@68
|
125
|
jpayne@68
|
126 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
127 status = "2_unique_mers_sampling failed"
|
jpayne@68
|
128 log.error(status)
|
jpayne@68
|
129 checkpoint_step_wrapper(status)
|
jpayne@68
|
130
|
jpayne@68
|
131 else:
|
jpayne@68
|
132 ## if no output files, skip this step.
|
jpayne@68
|
133 if newDataFile is not None:
|
jpayne@68
|
134 statsDict = {}
|
jpayne@68
|
135
|
jpayne@68
|
136 ## in readqc_report.py
|
jpayne@68
|
137 ## 2014.07.23 read_level_mer_sampling is updated to process new output file format from bbcountunique
|
jpayne@68
|
138 log.info("2_unique_mers_sampling: post-processing the bbcountunique output file.")
|
jpayne@68
|
139 read_level_mer_sampling(statsDict, newDataFile, log)
|
jpayne@68
|
140
|
jpayne@68
|
141 for k, v in statsDict.items():
|
jpayne@68
|
142 append_rqc_stats(statsFile, k, str(v), log)
|
jpayne@68
|
143
|
jpayne@68
|
144 ## outputs from bbcountunique
|
jpayne@68
|
145 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_TEXT, newDataFile, log)
|
jpayne@68
|
146 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_PLOT, newPngPlotFile, log)
|
jpayne@68
|
147 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_D3_HTML_PLOT, newHtmlPlotFile, log)
|
jpayne@68
|
148
|
jpayne@68
|
149 log.info("2_unique_mers_sampling complete.")
|
jpayne@68
|
150 status = "2_unique_mers_sampling complete"
|
jpayne@68
|
151 checkpoint_step_wrapper(status)
|
jpayne@68
|
152
|
jpayne@68
|
153 else:
|
jpayne@68
|
154 ## if num reads < RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ = 25000
|
jpayne@68
|
155 ## just proceed to the next step
|
jpayne@68
|
156 log.warning("2_unique_mers_sampling can't run it because the number of reads < %s.", RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ * 2)
|
jpayne@68
|
157 status = "2_unique_mers_sampling complete"
|
jpayne@68
|
158 checkpoint_step_wrapper(status)
|
jpayne@68
|
159
|
jpayne@68
|
160
|
jpayne@68
|
161 return status
|
jpayne@68
|
162
|
jpayne@68
|
163
|
jpayne@68
|
164 """
|
jpayne@68
|
165 STEP3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
166
|
jpayne@68
|
167 """
|
jpayne@68
|
168 def do_illumina_read_gc(fastq, log):
|
jpayne@68
|
169 log.info("\n\n%sSTEP3 - Making read GC histograms <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
170
|
jpayne@68
|
171 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
172 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
173
|
jpayne@68
|
174 status = "3_illumina_read_gc in progress"
|
jpayne@68
|
175 checkpoint_step_wrapper(status)
|
jpayne@68
|
176 log.info("3_illumina_read_gc in progress.")
|
jpayne@68
|
177
|
jpayne@68
|
178 reformat_gchist_file, png_file, htmlFile, mean_val, stdev_val, med_val, mode_val = illumina_read_gc(fastq, log)
|
jpayne@68
|
179
|
jpayne@68
|
180 if not reformat_gchist_file:
|
jpayne@68
|
181 log.error("3_illumina_read_gc failed.")
|
jpayne@68
|
182 status = "3_illumina_read_gc failed"
|
jpayne@68
|
183 checkpoint_step_wrapper(status)
|
jpayne@68
|
184
|
jpayne@68
|
185 else:
|
jpayne@68
|
186 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MEAN, mean_val, log)
|
jpayne@68
|
187 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_STD, stdev_val, log)
|
jpayne@68
|
188 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MED, med_val, log)
|
jpayne@68
|
189 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MODE, mode_val, log)
|
jpayne@68
|
190
|
jpayne@68
|
191 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_TEXT, reformat_gchist_file, log)
|
jpayne@68
|
192 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_PLOT, png_file, log)
|
jpayne@68
|
193 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_D3_HTML_PLOT, htmlFile, log)
|
jpayne@68
|
194
|
jpayne@68
|
195 log.info("3_illumina_read_gc complete.")
|
jpayne@68
|
196 status = "3_illumina_read_gc complete"
|
jpayne@68
|
197 checkpoint_step_wrapper(status)
|
jpayne@68
|
198
|
jpayne@68
|
199
|
jpayne@68
|
200 return status
|
jpayne@68
|
201
|
jpayne@68
|
202
|
jpayne@68
|
203 """
|
jpayne@68
|
204 STEP4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
205
|
jpayne@68
|
206 """
|
jpayne@68
|
207 def do_read_quality_stats(fastq, log):
|
jpayne@68
|
208 log.info("\n\n%sSTEP4 - Analyzing read quality <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
209
|
jpayne@68
|
210 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
211 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
212
|
jpayne@68
|
213 status = "4_illumina_read_quality_stats in progress"
|
jpayne@68
|
214 checkpoint_step_wrapper(status)
|
jpayne@68
|
215 log.info("4_illumina_read_quality_stats in progress.")
|
jpayne@68
|
216
|
jpayne@68
|
217 readLength = 0
|
jpayne@68
|
218 readLenR1 = 0
|
jpayne@68
|
219 readLenR2 = 0
|
jpayne@68
|
220 isPairedEnd = None
|
jpayne@68
|
221
|
jpayne@68
|
222 if not os.path.isfile(fastq):
|
jpayne@68
|
223 log.error("4_illumina_read_quality_stats failed. Cannot find the input fastq file")
|
jpayne@68
|
224 status = "4_illumina_read_quality_stats failed"
|
jpayne@68
|
225 checkpoint_step_wrapper(status)
|
jpayne@68
|
226 return status
|
jpayne@68
|
227
|
jpayne@68
|
228 ## First figure out if it's pair-ended or not
|
jpayne@68
|
229 ## NOTE: ssize=10000 is recommended!
|
jpayne@68
|
230 readLength, readLenR1, readLenR2, isPairedEnd = get_working_read_length(fastq, log)
|
jpayne@68
|
231 log.info("Read length = %s, and is_pair_ended = %s", readLength, isPairedEnd)
|
jpayne@68
|
232
|
jpayne@68
|
233 if readLength == 0:
|
jpayne@68
|
234 log.error("Failed to run get_working_read_length.")
|
jpayne@68
|
235 status = "4_illumina_read_quality_stats failed"
|
jpayne@68
|
236 checkpoint_step_wrapper(status)
|
jpayne@68
|
237
|
jpayne@68
|
238 else:
|
jpayne@68
|
239 ## Pair-ended
|
jpayne@68
|
240 r1_r2_baseposqual_png = None ## Average Base Position Quality Plot (*.qrpt.png)
|
jpayne@68
|
241 r1_r2_baseposqual_html = None ## Average Base Position Quality D3 Plot (*.qrpt.html)
|
jpayne@68
|
242 r1_r2_baseposqual_txt = None ## Read 1/2 Average Base Position Quality Text (*.qhist.txt)
|
jpayne@68
|
243
|
jpayne@68
|
244 r1_baseposqual_box_png = None ## Read 1 Average Base Position Quality Boxplot (*.r1.png)
|
jpayne@68
|
245 r2_baseposqual_box_png = None ## Read 2 Average Base Position Quality Boxplot (*.r2.png)
|
jpayne@68
|
246 r1_baseposqual_box_html = None ## Read 1 Average Base Position Quality D3 Boxplot (*.r1.html)
|
jpayne@68
|
247 r2_baseposqual_box_html = None ## Read 2 Average Base Position Quality D3 Boxplot (*.r2.html)
|
jpayne@68
|
248 r1_r2_baseposqual_box_txt = None ## Average Base Position Quality text
|
jpayne@68
|
249
|
jpayne@68
|
250 r1_cyclenbase_png = None ## Read 1 Percent N by Read Position (*.r1.fastq.base.stats.Npercent.png) --> Read 1 Cycle N Base Percent plot
|
jpayne@68
|
251 r2_cyclenbase_png = None ## Read 2 Percent N by Read Position (*.r2.fastq.base.stats.Npercent.png) --> Read 2 Cycle N Base Percent plot
|
jpayne@68
|
252
|
jpayne@68
|
253 r1_cyclenbase_txt = None ## Read 1 Percent N by Read Position Text (*.r1.fastq.base.stats) --> Read 1 Cycle N Base Percent text
|
jpayne@68
|
254 #r2_cyclenbase_txt = None ## Read 2 Percent N by Read Position Text (*.r2.fastq.base.stats) --> Read 2 Cycle N Base Percent text
|
jpayne@68
|
255 r1_r2_cyclenbase_txt = None ## Merged Percent N by Read Position Text (*.r2.fastq.base.stats) --> Merged Cycle N Base Percent text
|
jpayne@68
|
256
|
jpayne@68
|
257 r1_cyclenbase_html = None
|
jpayne@68
|
258 r2_cyclenbase_html = None
|
jpayne@68
|
259
|
jpayne@68
|
260 r1_nuclcompfreq_png = None ## Read 1 Nucleotide Composition Frequency Plot (*.r1.stats.png)
|
jpayne@68
|
261 r2_nuclcompfreq_png = None ## Read 2 Nucleotide Composition Frequency Plot (*.r2.stats.png)
|
jpayne@68
|
262 r1_nuclcompfreq_html = None
|
jpayne@68
|
263 r2_nuclcompfreq_html = None
|
jpayne@68
|
264
|
jpayne@68
|
265 ## Single-ended
|
jpayne@68
|
266 #se_baseposqual_txt = None ## Average Base Position Quality Text (*.qrpt)
|
jpayne@68
|
267 #se_nuclcompfreq_png = None ## Cycle Nucleotide Composition (*.stats.png)
|
jpayne@68
|
268
|
jpayne@68
|
269 if isPairedEnd:
|
jpayne@68
|
270 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_1, readLenR1, log)
|
jpayne@68
|
271 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_2, readLenR2, log)
|
jpayne@68
|
272
|
jpayne@68
|
273 else:
|
jpayne@68
|
274 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_1, readLength, log)
|
jpayne@68
|
275
|
jpayne@68
|
276 ## Average Base Position Quality Plot/Text using qhist.txt
|
jpayne@68
|
277 r1_r2_baseposqual_txt, r1_r2_baseposqual_png, r1_r2_baseposqual_html = gen_average_base_position_quality_plot(fastq, isPairedEnd, log) ## .reformat.qhist.txt
|
jpayne@68
|
278
|
jpayne@68
|
279 log.debug("Outputs: %s %s %s", r1_r2_baseposqual_png, r1_r2_baseposqual_html, r1_r2_baseposqual_txt)
|
jpayne@68
|
280
|
jpayne@68
|
281 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_PLOT_MERGED, r1_r2_baseposqual_png, log)
|
jpayne@68
|
282 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_PLOT_MERGED_D3_HTML_PLOT, r1_r2_baseposqual_html, log)
|
jpayne@68
|
283 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_1, r1_r2_baseposqual_txt, log) ## for backward compatibility
|
jpayne@68
|
284 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_2, r1_r2_baseposqual_txt, log) ## for backward compatibility
|
jpayne@68
|
285 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_MERGED, r1_r2_baseposqual_txt, log)
|
jpayne@68
|
286
|
jpayne@68
|
287 ## Average Base Position Quality Plot/Text using bqhist.txt
|
jpayne@68
|
288 r1_r2_baseposqual_box_txt, r1_baseposqual_box_png, r2_baseposqual_box_png, r1_baseposqual_box_html, r2_baseposqual_box_html = gen_average_base_position_quality_boxplot(fastq, log)
|
jpayne@68
|
289
|
jpayne@68
|
290 log.debug("Read qual outputs: %s %s %s %s %s", r1_r2_baseposqual_box_txt, r1_baseposqual_box_png, r1_baseposqual_box_html, r2_baseposqual_box_png, r2_baseposqual_box_html)
|
jpayne@68
|
291
|
jpayne@68
|
292 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_1, r1_baseposqual_box_png, log)
|
jpayne@68
|
293 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_D3_HTML_BOXPLOT_1, r1_baseposqual_box_html, log)
|
jpayne@68
|
294 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_2, r2_baseposqual_box_png, log)
|
jpayne@68
|
295 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_D3_HTML_BOXPLOT_2, r2_baseposqual_box_html, log)
|
jpayne@68
|
296 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_TEXT, r1_r2_baseposqual_box_txt, log)
|
jpayne@68
|
297
|
jpayne@68
|
298 ## ----------------------------------------------------------------------------------------------------
|
jpayne@68
|
299 ## compute Q20 of the two reads
|
jpayne@68
|
300 q20Read1 = None
|
jpayne@68
|
301 q20Read2 = None
|
jpayne@68
|
302
|
jpayne@68
|
303 ## using bqhist.txt
|
jpayne@68
|
304 if r1_r2_baseposqual_box_txt:
|
jpayne@68
|
305 q20Read1 = q20_score_new(r1_r2_baseposqual_box_txt, 1, log)
|
jpayne@68
|
306 if isPairedEnd:
|
jpayne@68
|
307 q20Read2 = q20_score_new(r1_r2_baseposqual_box_txt, 2, log)
|
jpayne@68
|
308
|
jpayne@68
|
309 log.debug("q20 for read 1 = %s", q20Read1)
|
jpayne@68
|
310 log.debug("q20 for read 2 = %s", q20Read2)
|
jpayne@68
|
311
|
jpayne@68
|
312 if q20Read1 is not None:
|
jpayne@68
|
313 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_Q20_READ1, q20Read1, log)
|
jpayne@68
|
314 else:
|
jpayne@68
|
315 log.error("Failed to get q20 read 1 from %s", r1_r2_baseposqual_box_txt)
|
jpayne@68
|
316 status = "4_illumina_read_quality_stats failed"
|
jpayne@68
|
317 checkpoint_step_wrapper(status)
|
jpayne@68
|
318 return status
|
jpayne@68
|
319
|
jpayne@68
|
320 if isPairedEnd:
|
jpayne@68
|
321 if q20Read2 is not None:
|
jpayne@68
|
322 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_Q20_READ2, q20Read2, log)
|
jpayne@68
|
323 else:
|
jpayne@68
|
324 log.error("Failed to get q20 read 2 from %s", r1_r2_baseposqual_box_txt)
|
jpayne@68
|
325 status = "4_illumina_read_quality_stats failed"
|
jpayne@68
|
326 checkpoint_step_wrapper(status)
|
jpayne@68
|
327 return status
|
jpayne@68
|
328
|
jpayne@68
|
329 r1_r2_cyclenbase_txt, r1_nuclcompfreq_png, r1_nuclcompfreq_html, r2_nuclcompfreq_png, r2_nuclcompfreq_html = gen_cycle_nucleotide_composition_plot(fastq, readLength, isPairedEnd, log)
|
jpayne@68
|
330
|
jpayne@68
|
331 log.debug("gen_cycle_nucleotide_composition_plot() ==> %s %s %s %s %s", r1_cyclenbase_txt, r1_nuclcompfreq_png, r1_nuclcompfreq_html, r2_nuclcompfreq_png, r2_nuclcompfreq_html)
|
jpayne@68
|
332
|
jpayne@68
|
333 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_TEXT_1, r1_r2_cyclenbase_txt, log)
|
jpayne@68
|
334 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_TEXT_2, r1_r2_cyclenbase_txt, log) # reformat.sh generates a single merged output file.
|
jpayne@68
|
335
|
jpayne@68
|
336 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_PLOT_1, r1_nuclcompfreq_png, log)
|
jpayne@68
|
337 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_1, r1_nuclcompfreq_html, log)
|
jpayne@68
|
338
|
jpayne@68
|
339 if r2_nuclcompfreq_png:
|
jpayne@68
|
340 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_PLOT_2, r2_nuclcompfreq_png, log)
|
jpayne@68
|
341 if r2_nuclcompfreq_html:
|
jpayne@68
|
342 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_2, r2_nuclcompfreq_html, log)
|
jpayne@68
|
343
|
jpayne@68
|
344 ### ---------------------------------------------------------------------------------------------------
|
jpayne@68
|
345 ## using bhist.txt
|
jpayne@68
|
346 r1_cyclenbase_txt, r1_cyclenbase_png, r1_cyclenbase_html, r2_cyclenbase_png, r2_cyclenbase_html = gen_cycle_n_base_percent_plot(fastq, readLength, isPairedEnd, log)
|
jpayne@68
|
347
|
jpayne@68
|
348 log.debug("Outputs: %s %s %s", r1_cyclenbase_txt, r1_cyclenbase_png, r1_cyclenbase_html)
|
jpayne@68
|
349
|
jpayne@68
|
350 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_TEXT_1, r1_cyclenbase_txt, log)
|
jpayne@68
|
351 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_TEXT_2, r1_cyclenbase_txt, log)
|
jpayne@68
|
352
|
jpayne@68
|
353 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_PLOT_1, r1_cyclenbase_png, log)
|
jpayne@68
|
354 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_D3_HTML_PLOT_1, r1_cyclenbase_html, log)
|
jpayne@68
|
355
|
jpayne@68
|
356 if r2_cyclenbase_png:
|
jpayne@68
|
357 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_PLOT_2, r2_cyclenbase_png, log)
|
jpayne@68
|
358 if r2_cyclenbase_html:
|
jpayne@68
|
359 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_D3_HTML_PLOT_2, r2_cyclenbase_html, log)
|
jpayne@68
|
360
|
jpayne@68
|
361 log.info("4_illumina_read_quality_stats complete.")
|
jpayne@68
|
362 status = "4_illumina_read_quality_stats complete"
|
jpayne@68
|
363 checkpoint_step_wrapper(status)
|
jpayne@68
|
364
|
jpayne@68
|
365
|
jpayne@68
|
366 return status
|
jpayne@68
|
367
|
jpayne@68
|
368
|
jpayne@68
|
369
|
jpayne@68
|
370 """
|
jpayne@68
|
371 STEP5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
372
|
jpayne@68
|
373 """
|
jpayne@68
|
374 def do_write_base_quality_stats(fastq, log):
|
jpayne@68
|
375 log.info("\n\n%sSTEP5 - Calculating base quality statistics for reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
376
|
jpayne@68
|
377 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
378 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
379
|
jpayne@68
|
380 status = "5_illumina_read_quality_stats in progress"
|
jpayne@68
|
381 checkpoint_step_wrapper(status)
|
jpayne@68
|
382 log.info("5_illumina_read_quality_stats in progress.")
|
jpayne@68
|
383
|
jpayne@68
|
384 reformatObqhistFile = write_avg_base_quality_stats(fastq, log) ## *.reformat.obqhist.txt
|
jpayne@68
|
385
|
jpayne@68
|
386 if not reformatObqhistFile:
|
jpayne@68
|
387 log.error("5_illumina_read_quality_stats failed.")
|
jpayne@68
|
388 status = "5_illumina_read_quality_stats failed"
|
jpayne@68
|
389 checkpoint_step_wrapper(status)
|
jpayne@68
|
390
|
jpayne@68
|
391 else:
|
jpayne@68
|
392 ## Generate qual scores and plots of read level QC
|
jpayne@68
|
393 statsDict = {}
|
jpayne@68
|
394
|
jpayne@68
|
395 retCode = base_level_qual_stats(statsDict, reformatObqhistFile, log)
|
jpayne@68
|
396
|
jpayne@68
|
397 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
398 log.error("5_illumina_read_quality_stats failed.")
|
jpayne@68
|
399 status = "5_illumina_read_quality_stats failed"
|
jpayne@68
|
400 checkpoint_step_wrapper(status)
|
jpayne@68
|
401
|
jpayne@68
|
402 else:
|
jpayne@68
|
403 for k, v in statsDict.items():
|
jpayne@68
|
404 append_rqc_stats(statsFile, k, str(v), log)
|
jpayne@68
|
405
|
jpayne@68
|
406 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_QUALITY_STATS, reformatObqhistFile, log)
|
jpayne@68
|
407
|
jpayne@68
|
408 log.info("5_illumina_read_quality_stats complete.")
|
jpayne@68
|
409 status = "5_illumina_read_quality_stats complete"
|
jpayne@68
|
410 checkpoint_step_wrapper(status)
|
jpayne@68
|
411
|
jpayne@68
|
412
|
jpayne@68
|
413 return status
|
jpayne@68
|
414
|
jpayne@68
|
415
|
jpayne@68
|
416
|
jpayne@68
|
417 """
|
jpayne@68
|
418 STEP6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
419
|
jpayne@68
|
420 """
|
jpayne@68
|
421 def do_illumina_count_q_score(fastq, log):
|
jpayne@68
|
422 log.info("\n\n%sSTEP6 - Generating quality score histogram <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
423
|
jpayne@68
|
424 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
425 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
426
|
jpayne@68
|
427 status = "6_illumina_count_q_score in progress"
|
jpayne@68
|
428 checkpoint_step_wrapper(status)
|
jpayne@68
|
429 log.info("6_illumina_count_q_score in progress.")
|
jpayne@68
|
430
|
jpayne@68
|
431 qhistTxtFile, qhistPngFile, qhistHtmlPlotFile = illumina_count_q_score(fastq, log) ## *.obqhist.txt
|
jpayne@68
|
432
|
jpayne@68
|
433 if not qhistTxtFile:
|
jpayne@68
|
434 log.error("6_illumina_count_q_score failed.")
|
jpayne@68
|
435 status = "6_illumina_count_q_score failed"
|
jpayne@68
|
436 checkpoint_step_wrapper(status)
|
jpayne@68
|
437
|
jpayne@68
|
438 else:
|
jpayne@68
|
439 ## save qscores in statsFile
|
jpayne@68
|
440 qscore = {}
|
jpayne@68
|
441 read_level_qual_stats(qscore, qhistTxtFile, log)
|
jpayne@68
|
442
|
jpayne@68
|
443 for k, v in qscore.items():
|
jpayne@68
|
444 append_rqc_stats(statsFile, k, str(v), log)
|
jpayne@68
|
445
|
jpayne@68
|
446 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_TEXT, qhistTxtFile, log)
|
jpayne@68
|
447 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_PLOT, qhistPngFile, log)
|
jpayne@68
|
448 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_D3_HTML_PLOT, qhistHtmlPlotFile, log)
|
jpayne@68
|
449
|
jpayne@68
|
450 log.info("6_illumina_count_q_score complete.")
|
jpayne@68
|
451 status = "6_illumina_count_q_score complete"
|
jpayne@68
|
452 checkpoint_step_wrapper(status)
|
jpayne@68
|
453
|
jpayne@68
|
454
|
jpayne@68
|
455 return status
|
jpayne@68
|
456
|
jpayne@68
|
457
|
jpayne@68
|
458
|
jpayne@68
|
459 """
|
jpayne@68
|
460 STEP7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
461
|
jpayne@68
|
462 """
|
jpayne@68
|
463 ## 20140903 removed
|
jpayne@68
|
464 ##def do_illumina_calculate_average_quality(fastq, log):
|
jpayne@68
|
465
|
jpayne@68
|
466
|
jpayne@68
|
467
|
jpayne@68
|
468 """
|
jpayne@68
|
469 STEP8 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
470
|
jpayne@68
|
471 """
|
jpayne@68
|
472 def do_illumina_find_common_motifs(fastq, log):
|
jpayne@68
|
473 log.info("\n\n%sSTEP8 - Locating N stutter motifs in sequence reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
474
|
jpayne@68
|
475 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
476 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
477
|
jpayne@68
|
478 status = "8_illumina_find_common_motifs in progress"
|
jpayne@68
|
479 checkpoint_step_wrapper(status)
|
jpayne@68
|
480 log.info(status)
|
jpayne@68
|
481
|
jpayne@68
|
482 retCode, statDataFile = illumina_find_common_motifs(fastq, log)
|
jpayne@68
|
483
|
jpayne@68
|
484 log.info("nstutter statDataFile name = %s", statDataFile)
|
jpayne@68
|
485
|
jpayne@68
|
486 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
487 log.error("8_illumina_find_common_motifs failed.")
|
jpayne@68
|
488 status = "8_illumina_find_common_motifs failed"
|
jpayne@68
|
489 checkpoint_step_wrapper(status)
|
jpayne@68
|
490
|
jpayne@68
|
491 else:
|
jpayne@68
|
492 ## read_level_stutter
|
jpayne@68
|
493 ## ex)
|
jpayne@68
|
494 ##688 N----------------------------------------------------------------------------------------------------------------------------
|
jpayne@68
|
495 ##-------------------------
|
jpayne@68
|
496 ##346 NNNNNNNNNNNNN----------------------------------------------------------------------------------------------------------------
|
jpayne@68
|
497 ##-------------------------
|
jpayne@68
|
498 ##53924 ------------N----------------------------------------------------------------------------------------------------------------
|
jpayne@68
|
499 ##-------------------------
|
jpayne@68
|
500 ##sum pct patterNs past 0.1 == 15.9245930330268 ( 54958 / 345114 * 100 )
|
jpayne@68
|
501
|
jpayne@68
|
502 with open(statDataFile, "r") as stutFH:
|
jpayne@68
|
503 lines = stutFH.readlines()
|
jpayne@68
|
504
|
jpayne@68
|
505 ## if no motifs are detected the file is empty
|
jpayne@68
|
506 if not lines:
|
jpayne@68
|
507 log.warning("The *.nstutter.stat file is not available in function read_level_stutter(). The function still returns JGI_SUCCESS.")
|
jpayne@68
|
508
|
jpayne@68
|
509 else:
|
jpayne@68
|
510 assert lines[-1].find("patterNs") != -1
|
jpayne@68
|
511 t = lines[-1].split()
|
jpayne@68
|
512 ## ["sum", "pct", "patterNs", "past", "0.1", "==", "15.9245930330268", "(", "54958", "/", "345114", "*", "100", ")"]
|
jpayne@68
|
513 percent = "%.2f" % float(t[6])
|
jpayne@68
|
514
|
jpayne@68
|
515 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_N_FREQUENCE, percent, log)
|
jpayne@68
|
516 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_N_PATTERN, str("".join(lines[:-1])), log)
|
jpayne@68
|
517
|
jpayne@68
|
518 ## NOTE: ???
|
jpayne@68
|
519 append_rqc_file(filesFile, "find_common_motifs.dataFile", statDataFile, log)
|
jpayne@68
|
520
|
jpayne@68
|
521 log.info("8_illumina_find_common_motifs complete.")
|
jpayne@68
|
522 status = "8_illumina_find_common_motifs complete"
|
jpayne@68
|
523 checkpoint_step_wrapper(status)
|
jpayne@68
|
524
|
jpayne@68
|
525
|
jpayne@68
|
526 return status
|
jpayne@68
|
527
|
jpayne@68
|
528
|
jpayne@68
|
529 """
|
jpayne@68
|
530 STEP11 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
531
|
jpayne@68
|
532 """
|
jpayne@68
|
533 def do_illumina_detect_read_contam(fastq, bpToCut, log):
|
jpayne@68
|
534 log.info("\n\n%sSTEP11 - Detect read contam <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
535
|
jpayne@68
|
536 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
537 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
538
|
jpayne@68
|
539 status = "11_illumina_detect_read_contam in progress"
|
jpayne@68
|
540 checkpoint_step_wrapper(status)
|
jpayne@68
|
541 log.info("11_illumina_detect_read_contam in progress.")
|
jpayne@68
|
542
|
jpayne@68
|
543
|
jpayne@68
|
544 #########
|
jpayne@68
|
545 ## seal
|
jpayne@68
|
546 #########
|
jpayne@68
|
547 retCode2, outFileDict2, ratioResultDict2, contamStatDict = illumina_detect_read_contam3(fastq, bpToCut, log) ## seal version
|
jpayne@68
|
548
|
jpayne@68
|
549 if retCode2 != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
550 log.error("11_illumina_detect_read_contam seal version failed.")
|
jpayne@68
|
551 status = "11_illumina_detect_read_contam failed"
|
jpayne@68
|
552 checkpoint_step_wrapper(status)
|
jpayne@68
|
553
|
jpayne@68
|
554 else:
|
jpayne@68
|
555 for k, v in outFileDict2.items():
|
jpayne@68
|
556 append_rqc_file(filesFile, k + " seal", str(v), log)
|
jpayne@68
|
557 append_rqc_file(filesFile, k, str(v), log)
|
jpayne@68
|
558
|
jpayne@68
|
559 for k, v in ratioResultDict2.items():
|
jpayne@68
|
560 append_rqc_stats(statsFile, k + " seal", str(v), log)
|
jpayne@68
|
561 append_rqc_stats(statsFile, k, str(v), log)
|
jpayne@68
|
562
|
jpayne@68
|
563 ## contamination stat
|
jpayne@68
|
564 for k, v in contamStatDict.items():
|
jpayne@68
|
565 append_rqc_stats(statsFile, k, str(v), log)
|
jpayne@68
|
566
|
jpayne@68
|
567 log.info("11_illumina_detect_read_contam seal version complete.")
|
jpayne@68
|
568 status = "11_illumina_detect_read_contam complete"
|
jpayne@68
|
569 checkpoint_step_wrapper(status)
|
jpayne@68
|
570
|
jpayne@68
|
571
|
jpayne@68
|
572 return status
|
jpayne@68
|
573
|
jpayne@68
|
574
|
jpayne@68
|
575
|
jpayne@68
|
576 """
|
jpayne@68
|
577 STEP13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
578
|
jpayne@68
|
579 """
|
jpayne@68
|
580 ## Removed!!
|
jpayne@68
|
581 ##def do_illumina_read_megablast(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log, blastDbPath=None):
|
jpayne@68
|
582
|
jpayne@68
|
583
|
jpayne@68
|
584
|
jpayne@68
|
585 """
|
jpayne@68
|
586 New STEP13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
587
|
jpayne@68
|
588 """
|
jpayne@68
|
589 def do_illumina_subsampling_read_blastn(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log):
|
jpayne@68
|
590 log.info("\n\n%sSTEP13 - Run subsampling for Blast search <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
591
|
jpayne@68
|
592 subsampeldFastqFile = None
|
jpayne@68
|
593 totalReadNum = 0
|
jpayne@68
|
594 readNumToReturn = 0
|
jpayne@68
|
595
|
jpayne@68
|
596 status = "13_illumina_subsampling_read_megablast in progress"
|
jpayne@68
|
597 checkpoint_step_wrapper(status)
|
jpayne@68
|
598 log.info("13_illumina_subsampling_read_megablast in progress.")
|
jpayne@68
|
599
|
jpayne@68
|
600 if subsampledReadNum == 0:
|
jpayne@68
|
601 cmd = " ".join(["grep", "-c", "'^+'", firstSubsampledFastqFileName])
|
jpayne@68
|
602
|
jpayne@68
|
603 stdOut, _, exitCode = run_command(cmd, True, log)
|
jpayne@68
|
604
|
jpayne@68
|
605 if exitCode != 0:
|
jpayne@68
|
606 log.error("Failed to run grep cmd")
|
jpayne@68
|
607 return RQCExitCodes.JGI_FAILURE, None, None, None
|
jpayne@68
|
608
|
jpayne@68
|
609 else:
|
jpayne@68
|
610 readNum = int(stdOut)
|
jpayne@68
|
611
|
jpayne@68
|
612 else:
|
jpayne@68
|
613 readNum = subsampledReadNum
|
jpayne@68
|
614
|
jpayne@68
|
615 log.info("Subsampled read number = %d", readNum)
|
jpayne@68
|
616
|
jpayne@68
|
617 sampl_per = RQCReadQc.ILLUMINA_SAMPLE_PCTENTAGE ## 0.01
|
jpayne@68
|
618 max_count = RQCReadQc.ILLUMINA_SAMPLE_COUNT ## 50000
|
jpayne@68
|
619
|
jpayne@68
|
620 ## TODO
|
jpayne@68
|
621 ## Use "samplereadstarget" option in reformat.sh
|
jpayne@68
|
622
|
jpayne@68
|
623 if skipSubsampling:
|
jpayne@68
|
624 log.debug("No subsampling for megablast. Use fastq file, %s (readnum = %s) as query for megablast.", firstSubsampledFastqFileName, readNum)
|
jpayne@68
|
625 subsampeldFastqFile = firstSubsampledFastqFileName
|
jpayne@68
|
626 readNumToReturn = readNum
|
jpayne@68
|
627
|
jpayne@68
|
628 else:
|
jpayne@68
|
629 if readNum > max_count:
|
jpayne@68
|
630 log.debug("Run the 2nd subsampling for running megablast.")
|
jpayne@68
|
631 secondSubsamplingRate = float(max_count) / readNum
|
jpayne@68
|
632 log.debug("SecondSubsamplingRate=%s, max_count=%s, readNum=%s", secondSubsamplingRate, max_count, readNum)
|
jpayne@68
|
633 log.info("Second subsampling of Reads after Percent Subsampling reads = %s with new percent subsampling %f.", readNum, secondSubsamplingRate)
|
jpayne@68
|
634
|
jpayne@68
|
635 secondSubsampledFastqFile = ""
|
jpayne@68
|
636 # dataFile = ""
|
jpayne@68
|
637
|
jpayne@68
|
638 sequnitFileName = os.path.basename(firstSubsampledFastqFileName)
|
jpayne@68
|
639 sequnitFileName = sequnitFileName.replace(".fastq", "").replace(".gz", "")
|
jpayne@68
|
640
|
jpayne@68
|
641 secondSubsampledFastqFile = sequnitFileName + ".s" + str(sampl_per) + ".s" + str(secondSubsamplingRate) + ".n" + str(max_count) + ".fastq"
|
jpayne@68
|
642
|
jpayne@68
|
643 ## ex) fq_sub_sample.pl -f .../7601.1.77813.CTTGTA.s0.01.fastq -o .../7601.1.77813.CTTGTA.s0.01.stats -r 0.0588142575171 > .../7601.1.77813.CTTGTA.s0.01.s0.01.s0.0588142575171.n50000.fastq
|
jpayne@68
|
644 ## ex) reformat.sh
|
jpayne@68
|
645 retCode, secondSubsampledFastqFile, totalBaseCount, totalReadNum, subsampledReadNum, _, _ = fast_subsample_fastq_sequences(firstSubsampledFastqFileName, secondSubsampledFastqFile, secondSubsamplingRate, False, log)
|
jpayne@68
|
646
|
jpayne@68
|
647 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
648 log.error("Second subsampling failed.")
|
jpayne@68
|
649 return RQCExitCodes.JGI_FAILURE, None, None, None
|
jpayne@68
|
650
|
jpayne@68
|
651 else:
|
jpayne@68
|
652 log.info("Second subsampling complete.")
|
jpayne@68
|
653 log.info("Second Subsampling Total Base Count = %s.", totalBaseCount)
|
jpayne@68
|
654 log.info("Second Subsampling Total Reads = %s.", totalReadNum)
|
jpayne@68
|
655 log.info("Second Subsampling Sampled Reads = %s.", subsampledReadNum)
|
jpayne@68
|
656
|
jpayne@68
|
657 if subsampledReadNum == 0:
|
jpayne@68
|
658 log.warning("Too small first subsampled fastq file. Skip the 2nd sampling.")
|
jpayne@68
|
659 secondSubsampledFastqFile = firstSubsampledFastqFileName
|
jpayne@68
|
660
|
jpayne@68
|
661 subsampeldFastqFile = secondSubsampledFastqFile
|
jpayne@68
|
662 readNumToReturn = subsampledReadNum
|
jpayne@68
|
663
|
jpayne@68
|
664 else:
|
jpayne@68
|
665 log.debug("The readNum is smaller than max_count=%s. The 2nd sampling skipped.", str(max_count))
|
jpayne@68
|
666 subsampeldFastqFile = firstSubsampledFastqFileName
|
jpayne@68
|
667 totalReadNum = readNum
|
jpayne@68
|
668 readNumToReturn = readNum
|
jpayne@68
|
669
|
jpayne@68
|
670 log.info("13_illumina_subsampling_read_megablast complete.")
|
jpayne@68
|
671 status = "13_illumina_subsampling_read_megablast complete"
|
jpayne@68
|
672 checkpoint_step_wrapper(status)
|
jpayne@68
|
673
|
jpayne@68
|
674
|
jpayne@68
|
675 return status, subsampeldFastqFile, totalReadNum, readNumToReturn
|
jpayne@68
|
676
|
jpayne@68
|
677
|
jpayne@68
|
678
|
jpayne@68
|
679 ##"""
|
jpayne@68
|
680 ##STEP14 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
681 ##
|
jpayne@68
|
682 ##"""
|
jpayne@68
|
683 ##def do_illumina_read_blastn_refseq_microbial(subsampeldFastqFile, subsampledReadNum, log, blastDbPath=None):
|
jpayne@68
|
684 ## 12212015 sulsj REMOVED!
|
jpayne@68
|
685
|
jpayne@68
|
686
|
jpayne@68
|
687 """
|
jpayne@68
|
688 New STEP14 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
689
|
jpayne@68
|
690 """
|
jpayne@68
|
691 def do_illumina_read_blastn_refseq_archaea(subsampeldFastqFile, subsampledReadNum, log):
|
jpayne@68
|
692 log.info("\n\n%sSTEP14 - Run read blastn against refseq archaea <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
693
|
jpayne@68
|
694 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
695 retCode = None
|
jpayne@68
|
696
|
jpayne@68
|
697 log.info("14_illumina_read_blastn_refseq_archaea in progress.")
|
jpayne@68
|
698 status = "14_illumina_read_blastn_refseq_archaea in progress"
|
jpayne@68
|
699 checkpoint_step_wrapper(status)
|
jpayne@68
|
700
|
jpayne@68
|
701 retCode = illumina_read_blastn_refseq_archaea(subsampeldFastqFile, log)
|
jpayne@68
|
702
|
jpayne@68
|
703 if retCode == RQCExitCodes.JGI_FAILURE:
|
jpayne@68
|
704 log.error("14_illumina_read_blastn_refseq_archaea failed.")
|
jpayne@68
|
705 status = "14_illumina_read_blastn_refseq_archaea failed"
|
jpayne@68
|
706
|
jpayne@68
|
707 elif retCode == -143: ## timeout
|
jpayne@68
|
708 log.warning("14_illumina_read_blastn_refseq_archaea timeout.")
|
jpayne@68
|
709 status = "14_illumina_read_blastn_refseq_archaea complete"
|
jpayne@68
|
710
|
jpayne@68
|
711 else:
|
jpayne@68
|
712 ## read number used in blast search?
|
jpayne@68
|
713 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READS_NUMBER, subsampledReadNum, log)
|
jpayne@68
|
714
|
jpayne@68
|
715 ret2 = read_megablast_hits("refseq.archaea", log)
|
jpayne@68
|
716
|
jpayne@68
|
717 if ret2 != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
718 log.error("Errors in read_megablast_hits() of refseq.microbial")
|
jpayne@68
|
719 log.error("14_illumina_read_blastn_refseq_archaea reporting failed.")
|
jpayne@68
|
720 status = "14_illumina_read_blastn_refseq_archaea failed"
|
jpayne@68
|
721
|
jpayne@68
|
722 else:
|
jpayne@68
|
723 log.info("14_illumina_read_blastn_refseq_archaea complete.")
|
jpayne@68
|
724 status = "14_illumina_read_blastn_refseq_archaea complete"
|
jpayne@68
|
725
|
jpayne@68
|
726
|
jpayne@68
|
727 return status
|
jpayne@68
|
728
|
jpayne@68
|
729
|
jpayne@68
|
730 """
|
jpayne@68
|
731 STEP15 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
732
|
jpayne@68
|
733 """
|
jpayne@68
|
734 def do_illumina_read_blastn_refseq_bacteria(subsampeldFastqFile, log):
|
jpayne@68
|
735 log.info("\n\n%sSTEP15 - Run read blastn against refseq bacteria <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
736
|
jpayne@68
|
737 #statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
738 #retCode = None
|
jpayne@68
|
739
|
jpayne@68
|
740 log.info("15_illumina_read_blastn_refseq_bacteria in progress.")
|
jpayne@68
|
741 status = "15_illumina_read_blastn_refseq_bacteria in progress"
|
jpayne@68
|
742 checkpoint_step_wrapper(status)
|
jpayne@68
|
743
|
jpayne@68
|
744 retCode = illumina_read_blastn_refseq_bacteria(subsampeldFastqFile, log)
|
jpayne@68
|
745
|
jpayne@68
|
746 if retCode == RQCExitCodes.JGI_FAILURE:
|
jpayne@68
|
747 log.error("15_illumina_read_blastn_refseq_bacteria failed.")
|
jpayne@68
|
748 status = "15_illumina_read_blastn_refseq_bacteria failed"
|
jpayne@68
|
749
|
jpayne@68
|
750 elif retCode == -143: ## timeout
|
jpayne@68
|
751 log.warning("15_illumina_read_blastn_refseq_bacteria timeout.")
|
jpayne@68
|
752 status = "15_illumina_read_blastn_refseq_bacteria complete"
|
jpayne@68
|
753
|
jpayne@68
|
754 else:
|
jpayne@68
|
755 ret2 = read_megablast_hits("refseq.bacteria", log)
|
jpayne@68
|
756
|
jpayne@68
|
757 if ret2 != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
758 log.error("Errors in read_megablast_hits() of refseq.microbial")
|
jpayne@68
|
759 log.error("15_illumina_read_blastn_refseq_bacteria reporting failed.")
|
jpayne@68
|
760 status = "15_illumina_read_blastn_refseq_bacteria failed"
|
jpayne@68
|
761
|
jpayne@68
|
762 else:
|
jpayne@68
|
763 log.info("15_illumina_read_blastn_refseq_bacteria complete.")
|
jpayne@68
|
764 status = "15_illumina_read_blastn_refseq_bacteria complete"
|
jpayne@68
|
765
|
jpayne@68
|
766
|
jpayne@68
|
767 return status
|
jpayne@68
|
768
|
jpayne@68
|
769
|
jpayne@68
|
770
|
jpayne@68
|
771 """
|
jpayne@68
|
772 STEP16 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
773
|
jpayne@68
|
774 """
|
jpayne@68
|
775 def do_illumina_read_blastn_nt(subsampeldFastqFile, log):
|
jpayne@68
|
776 log.info("\n\n%sSTEP16 - Run read blastn against nt <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
777
|
jpayne@68
|
778 retCode = None
|
jpayne@68
|
779
|
jpayne@68
|
780 log.info("16_illumina_read_blastn_nt in progress.")
|
jpayne@68
|
781 status = "16_illumina_read_blastn_nt in progress"
|
jpayne@68
|
782 checkpoint_step_wrapper(status)
|
jpayne@68
|
783
|
jpayne@68
|
784 retCode = illumina_read_blastn_nt(subsampeldFastqFile, log)
|
jpayne@68
|
785
|
jpayne@68
|
786 if retCode == RQCExitCodes.JGI_FAILURE:
|
jpayne@68
|
787 log.error("16_illumina_read_blastn_nt failed.")
|
jpayne@68
|
788 status = "16_illumina_read_blastn_nt failed"
|
jpayne@68
|
789
|
jpayne@68
|
790 elif retCode == -143: ## timeout
|
jpayne@68
|
791 log.warning("16_illumina_read_blastn_nt timeout.")
|
jpayne@68
|
792 status = "16_illumina_read_blastn_nt complete"
|
jpayne@68
|
793
|
jpayne@68
|
794 else:
|
jpayne@68
|
795 ret2 = read_megablast_hits("nt", log)
|
jpayne@68
|
796
|
jpayne@68
|
797 if ret2 != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
798 log.error("Errors in read_megablast_hits() of nt")
|
jpayne@68
|
799 log.error("16_illumina_read_blastn_nt reporting failed.")
|
jpayne@68
|
800 status = "16_illumina_read_blastn_nt failed"
|
jpayne@68
|
801
|
jpayne@68
|
802 else:
|
jpayne@68
|
803 log.info("16_illumina_read_blastn_nt complete.")
|
jpayne@68
|
804 status = "16_illumina_read_blastn_nt complete"
|
jpayne@68
|
805
|
jpayne@68
|
806
|
jpayne@68
|
807 return status
|
jpayne@68
|
808
|
jpayne@68
|
809
|
jpayne@68
|
810
|
jpayne@68
|
811 """
|
jpayne@68
|
812 STEP17 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
813
|
jpayne@68
|
814 do_illumina_multiplex_statistics
|
jpayne@68
|
815
|
jpayne@68
|
816 Demultiplexing analysis for pooled lib
|
jpayne@68
|
817
|
jpayne@68
|
818 """
|
jpayne@68
|
819 def do_illumina_multiplex_statistics(fastq, log, isMultiplexed=None):
|
jpayne@68
|
820 log.info("\n\n%sSTEP17 - Run Multiplex statistics analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
821
|
jpayne@68
|
822 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
823
|
jpayne@68
|
824 log.info("17_multiplex_statistics in progress.")
|
jpayne@68
|
825 status = "17_multiplex_statistics in progress"
|
jpayne@68
|
826 checkpoint_step_wrapper(status)
|
jpayne@68
|
827
|
jpayne@68
|
828 log.debug("fastq file: %s", fastq)
|
jpayne@68
|
829
|
jpayne@68
|
830 retCode, demultiplexStatsFile, detectionPngPlotFile, detectionHtmlPlotFile = illumina_generate_index_sequence_detection_plot(fastq, log, isMultiplexed=isMultiplexed)
|
jpayne@68
|
831
|
jpayne@68
|
832 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
833 log.error("17_multiplex_statistics failed.")
|
jpayne@68
|
834 status = "17_multiplex_statistics failed"
|
jpayne@68
|
835 checkpoint_step_wrapper(status)
|
jpayne@68
|
836
|
jpayne@68
|
837 else:
|
jpayne@68
|
838 log.info("17_multiplex_statistics complete.")
|
jpayne@68
|
839 status = "17_multiplex_statistics complete"
|
jpayne@68
|
840 checkpoint_step_wrapper(status)
|
jpayne@68
|
841
|
jpayne@68
|
842 if detectionPngPlotFile is not None:
|
jpayne@68
|
843 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS_PLOT, detectionPngPlotFile, log)
|
jpayne@68
|
844
|
jpayne@68
|
845 if detectionHtmlPlotFile is not None:
|
jpayne@68
|
846 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS_D3_HTML_PLOT, detectionHtmlPlotFile, log)
|
jpayne@68
|
847
|
jpayne@68
|
848 if demultiplexStatsFile is not None:
|
jpayne@68
|
849 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS, demultiplexStatsFile, log)
|
jpayne@68
|
850
|
jpayne@68
|
851
|
jpayne@68
|
852 return status
|
jpayne@68
|
853
|
jpayne@68
|
854
|
jpayne@68
|
855
|
jpayne@68
|
856 """
|
jpayne@68
|
857 STEP18 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
858
|
jpayne@68
|
859 do_end_of_read_illumina_adapter_check
|
jpayne@68
|
860
|
jpayne@68
|
861 """
|
jpayne@68
|
862 def do_end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log):
|
jpayne@68
|
863 log.info("\n\n%sSTEP18 - Run end_of_read_illumina_adapter_check analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
864
|
jpayne@68
|
865 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
866 plotFile = None
|
jpayne@68
|
867 dataFile = None
|
jpayne@68
|
868
|
jpayne@68
|
869 log.info("18_end_of_read_illumina_adapter_check in progress.")
|
jpayne@68
|
870 status = "18_end_of_read_illumina_adapter_check in progress"
|
jpayne@68
|
871 checkpoint_step_wrapper(status)
|
jpayne@68
|
872
|
jpayne@68
|
873 log.debug("sampled fastq file: %s", firstSubsampledFastqFileName)
|
jpayne@68
|
874
|
jpayne@68
|
875 retCode, dataFile, plotFile, htmlFile = end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log)
|
jpayne@68
|
876
|
jpayne@68
|
877 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
878 log.error("18_end_of_read_illumina_adapter_check failed.")
|
jpayne@68
|
879 status = "18_end_of_read_illumina_adapter_check failed"
|
jpayne@68
|
880 checkpoint_step_wrapper(status)
|
jpayne@68
|
881
|
jpayne@68
|
882 else:
|
jpayne@68
|
883 log.info("18_end_of_read_illumina_adapter_check complete.")
|
jpayne@68
|
884 status = "18_end_of_read_illumina_adapter_check complete"
|
jpayne@68
|
885 checkpoint_step_wrapper(status)
|
jpayne@68
|
886
|
jpayne@68
|
887 if plotFile is not None:
|
jpayne@68
|
888 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_PLOT, plotFile, log)
|
jpayne@68
|
889 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_D3_HTML_PLOT, htmlFile, log)
|
jpayne@68
|
890
|
jpayne@68
|
891 if dataFile is not None:
|
jpayne@68
|
892 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_DATA, dataFile, log)
|
jpayne@68
|
893
|
jpayne@68
|
894
|
jpayne@68
|
895 return status
|
jpayne@68
|
896
|
jpayne@68
|
897
|
jpayne@68
|
898
|
jpayne@68
|
899 """
|
jpayne@68
|
900 STEP19 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
901
|
jpayne@68
|
902 do_insert_size_analysis
|
jpayne@68
|
903
|
jpayne@68
|
904 """
|
jpayne@68
|
905 def do_insert_size_analysis(fastq, log):
|
jpayne@68
|
906 log.info("\n\n%sSTEP19 - Run insert size analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
907
|
jpayne@68
|
908 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
909 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
910
|
jpayne@68
|
911 plotFile = None
|
jpayne@68
|
912 dataFile = None
|
jpayne@68
|
913
|
jpayne@68
|
914 log.info("19_insert_size_analysis in progress.")
|
jpayne@68
|
915 status = "19_insert_size_analysis in progress"
|
jpayne@68
|
916 checkpoint_step_wrapper(status)
|
jpayne@68
|
917
|
jpayne@68
|
918 log.debug("fastq file used: %s", fastq)
|
jpayne@68
|
919
|
jpayne@68
|
920 retCode, dataFile, plotFile, htmlFile, statsDict = insert_size_analysis(fastq, log) ## by bbmerge.sh
|
jpayne@68
|
921
|
jpayne@68
|
922 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
923 log.error("19_insert_size_analysis failed.")
|
jpayne@68
|
924 status = "19_insert_size_analysis failed"
|
jpayne@68
|
925 checkpoint_step_wrapper(status)
|
jpayne@68
|
926
|
jpayne@68
|
927 else:
|
jpayne@68
|
928 if plotFile is not None:
|
jpayne@68
|
929 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_PLOT, plotFile, log)
|
jpayne@68
|
930
|
jpayne@68
|
931 if dataFile is not None:
|
jpayne@68
|
932 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_DATA, dataFile, log)
|
jpayne@68
|
933
|
jpayne@68
|
934 if htmlFile is not None:
|
jpayne@68
|
935 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_D3_HTML_PLOT, htmlFile, log)
|
jpayne@68
|
936
|
jpayne@68
|
937 if statsDict:
|
jpayne@68
|
938 try:
|
jpayne@68
|
939 ## --------------------------------------------------------------------------------------------------------
|
jpayne@68
|
940 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AVG_INSERT, statsDict["avg_insert"], log)
|
jpayne@68
|
941 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_STD_INSERT, statsDict["std_insert"], log)
|
jpayne@68
|
942 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_MODE_INSERT, statsDict["mode_insert"], log)
|
jpayne@68
|
943 ## --------------------------------------------------------------------------------------------------------
|
jpayne@68
|
944
|
jpayne@68
|
945 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOTAL_TIME, statsDict["total_time"], log)
|
jpayne@68
|
946 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NUM_READS, statsDict["num_reads"], log)
|
jpayne@68
|
947 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_JOINED_NUM, statsDict["joined_num"], log)
|
jpayne@68
|
948 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_JOINED_PERC, statsDict["joined_perc"], log)
|
jpayne@68
|
949
|
jpayne@68
|
950 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AMBIGUOUS_NUM, statsDict["ambiguous_num"], log)
|
jpayne@68
|
951 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AMBIGUOUS_PERC, statsDict["ambiguous_perc"], log)
|
jpayne@68
|
952 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NO_SOLUTION_NUM, statsDict["no_solution_num"], log)
|
jpayne@68
|
953 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NO_SOLUTION_PERC, statsDict["no_solution_perc"], log)
|
jpayne@68
|
954 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOO_SHORT_NUM, statsDict["too_short_num"], log)
|
jpayne@68
|
955 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOO_SHORT_PERC, statsDict["too_short_perc"], log)
|
jpayne@68
|
956
|
jpayne@68
|
957 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_INSERT_RANGE_START, statsDict["insert_range_start"], log)
|
jpayne@68
|
958 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_INSERT_RANGE_END, statsDict["insert_range_end"], log)
|
jpayne@68
|
959
|
jpayne@68
|
960 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_90TH_PERC, statsDict["perc_90th"], log)
|
jpayne@68
|
961 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_50TH_PERC, statsDict["perc_50th"], log)
|
jpayne@68
|
962 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_10TH_PERC, statsDict["perc_10th"], log)
|
jpayne@68
|
963 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_75TH_PERC, statsDict["perc_75th"], log)
|
jpayne@68
|
964 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_25TH_PERC, statsDict["perc_25th"], log)
|
jpayne@68
|
965
|
jpayne@68
|
966 except KeyError:
|
jpayne@68
|
967 log.error("19_insert_size_analysis failed (KeyError).")
|
jpayne@68
|
968 status = "19_insert_size_analysis failed"
|
jpayne@68
|
969 checkpoint_step_wrapper(status)
|
jpayne@68
|
970 return status
|
jpayne@68
|
971
|
jpayne@68
|
972 log.info("19_insert_size_analysis complete.")
|
jpayne@68
|
973 status = "19_insert_size_analysis complete"
|
jpayne@68
|
974 checkpoint_step_wrapper(status)
|
jpayne@68
|
975
|
jpayne@68
|
976
|
jpayne@68
|
977 return status
|
jpayne@68
|
978
|
jpayne@68
|
979
|
jpayne@68
|
980
|
jpayne@68
|
981
|
jpayne@68
|
982 """
|
jpayne@68
|
983 STEP21 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
984
|
jpayne@68
|
985 do_sketch_vs_nt_refseq_silva
|
jpayne@68
|
986
|
jpayne@68
|
987 """
|
jpayne@68
|
988 def do_sketch_vs_nt_refseq_silva(fasta, log):
|
jpayne@68
|
989 log.info("\n\n%sSTEP21 - Run sketch vs nt, refseq, silva <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
990
|
jpayne@68
|
991 status = "21_sketch_vs_nt_refseq_silva in progress"
|
jpayne@68
|
992 checkpoint_step_wrapper(status)
|
jpayne@68
|
993
|
jpayne@68
|
994 sketchOutDir = "sketch"
|
jpayne@68
|
995 sketchOutPath = os.path.join(outputPath, sketchOutDir)
|
jpayne@68
|
996 make_dir(sketchOutPath)
|
jpayne@68
|
997 change_mod(sketchOutPath, "0755")
|
jpayne@68
|
998
|
jpayne@68
|
999 seqUnitName = os.path.basename(fasta)
|
jpayne@68
|
1000 seqUnitName = file_name_trim(seqUnitName)
|
jpayne@68
|
1001
|
jpayne@68
|
1002 filesFile = RQCReadQcConfig.CFG["files_file"]
|
jpayne@68
|
1003
|
jpayne@68
|
1004 comOptions = "ow=t colors=f printtaxa=t depth depth2 unique2 merge"
|
jpayne@68
|
1005
|
jpayne@68
|
1006 ## NT ##########################
|
jpayne@68
|
1007 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_nt.txt")
|
jpayne@68
|
1008 sendSketchShCmd = os.path.join(BBDIR, 'sendsketch.sh')
|
jpayne@68
|
1009 cmd = "%s in=%s out=%s %s nt" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
|
jpayne@68
|
1010 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
|
jpayne@68
|
1011 if exitCode != 0:
|
jpayne@68
|
1012 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
|
jpayne@68
|
1013 status = "21_sketch_vs_nt_refseq_silva failed"
|
jpayne@68
|
1014 checkpoint_step_wrapper(status)
|
jpayne@68
|
1015 return status
|
jpayne@68
|
1016
|
jpayne@68
|
1017 append_rqc_file(filesFile, "sketch_vs_nt_output", sketchOutFile, log)
|
jpayne@68
|
1018
|
jpayne@68
|
1019 ## Refseq ##########################
|
jpayne@68
|
1020 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_refseq.txt")
|
jpayne@68
|
1021 cmd = "%s in=%s out=%s %s refseq" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
|
jpayne@68
|
1022 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
|
jpayne@68
|
1023 if exitCode != 0:
|
jpayne@68
|
1024 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
|
jpayne@68
|
1025 status = "21_sketch_vs_refseq failed"
|
jpayne@68
|
1026 checkpoint_step_wrapper(status)
|
jpayne@68
|
1027 return status
|
jpayne@68
|
1028
|
jpayne@68
|
1029 append_rqc_file(filesFile, "sketch_vs_refseq_output", sketchOutFile, log)
|
jpayne@68
|
1030
|
jpayne@68
|
1031 ## Silva ##########################
|
jpayne@68
|
1032 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_silva.txt")
|
jpayne@68
|
1033 cmd = "%s in=%s out=%s %s silva" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
|
jpayne@68
|
1034 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
|
jpayne@68
|
1035 if exitCode != 0:
|
jpayne@68
|
1036 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
|
jpayne@68
|
1037 status = "21_sketch_vs_silva failed"
|
jpayne@68
|
1038 checkpoint_step_wrapper(status)
|
jpayne@68
|
1039 return status
|
jpayne@68
|
1040
|
jpayne@68
|
1041 append_rqc_file(filesFile, "sketch_vs_silva_output", sketchOutFile, log)
|
jpayne@68
|
1042
|
jpayne@68
|
1043 log.info("21_sketch_vs_nt_refseq_silva complete.")
|
jpayne@68
|
1044 status = "21_sketch_vs_nt_refseq_silva complete"
|
jpayne@68
|
1045 checkpoint_step_wrapper(status)
|
jpayne@68
|
1046
|
jpayne@68
|
1047 return status
|
jpayne@68
|
1048
|
jpayne@68
|
1049
|
jpayne@68
|
1050
|
jpayne@68
|
1051 """
|
jpayne@68
|
1052 STEP22 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1053
|
jpayne@68
|
1054 do_illumina_read_level_report_postprocessing
|
jpayne@68
|
1055
|
jpayne@68
|
1056 """
|
jpayne@68
|
1057 ## Removed!!
|
jpayne@68
|
1058 #def do_illumina_read_level_report(fastq, firstSubsampledFastqFileName, log):
|
jpayne@68
|
1059
|
jpayne@68
|
1060
|
jpayne@68
|
1061 """
|
jpayne@68
|
1062 STEP23 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1063
|
jpayne@68
|
1064 do_cleanup_readqc
|
jpayne@68
|
1065
|
jpayne@68
|
1066 """
|
jpayne@68
|
1067 def do_cleanup_readqc(log):
|
jpayne@68
|
1068 log.info("\n\n%sSTEP23 - Cleanup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1069
|
jpayne@68
|
1070 status = "23_cleanup_readqc in progress"
|
jpayne@68
|
1071 checkpoint_step_wrapper(status)
|
jpayne@68
|
1072
|
jpayne@68
|
1073 ## Get option
|
jpayne@68
|
1074 skipCleanup = RQCReadQcConfig.CFG["skip_cleanup"]
|
jpayne@68
|
1075
|
jpayne@68
|
1076 if not skipCleanup:
|
jpayne@68
|
1077 retCode = cleanup_readqc(log)
|
jpayne@68
|
1078 else:
|
jpayne@68
|
1079 log.warning("File cleaning is skipped.")
|
jpayne@68
|
1080 retCode = RQCExitCodes.JGI_SUCCESS
|
jpayne@68
|
1081
|
jpayne@68
|
1082 if retCode != RQCExitCodes.JGI_SUCCESS:
|
jpayne@68
|
1083 log.error("23_cleanup_readqc failed.")
|
jpayne@68
|
1084 status = "23_cleanup_readqc failed"
|
jpayne@68
|
1085 checkpoint_step_wrapper(status)
|
jpayne@68
|
1086
|
jpayne@68
|
1087 else:
|
jpayne@68
|
1088 log.info("23_cleanup_readqc complete.")
|
jpayne@68
|
1089 status = "23_cleanup_readqc complete"
|
jpayne@68
|
1090 checkpoint_step_wrapper(status)
|
jpayne@68
|
1091
|
jpayne@68
|
1092
|
jpayne@68
|
1093 return status
|
jpayne@68
|
1094
|
jpayne@68
|
1095
|
jpayne@68
|
1096 def stetch_section_note(table_header):
|
jpayne@68
|
1097 overview = [
|
jpayne@68
|
1098 'BBTools sketch.sh uses a technique called MinHash to rapidly compare large sequences. ',
|
jpayne@68
|
1099 'The result is similar to BLAST, a list of hits from a query sequence to various reference sequences, ',
|
jpayne@68
|
1100 'sorted by similarity but the mechanisms are very different. ',
|
jpayne@68
|
1101 'For more information, see <a href="http://bbtools.jgi.doe.gov/" target="_blank">http://bbtools.jgi.doe.gov</a>'
|
jpayne@68
|
1102 ]
|
jpayne@68
|
1103
|
jpayne@68
|
1104 legend = {
|
jpayne@68
|
1105 'WKID' : 'Weighted Kmer IDentity, which is the kmer identity compensating for differences in size. So, comparing human chr1 to the full human genome would yield 100% WKID but approximately 10% KID.',
|
jpayne@68
|
1106 'KID' : 'Kmer IDentity, equal to matches/length; this is the fraction of shared kmers.',
|
jpayne@68
|
1107 'ANI' : 'Average Nucleotide Identity, derived from WKID and kmer length.',
|
jpayne@68
|
1108 'Complt' : 'Genome completeness (percent of the reference represented in the query). Derived from WKID and KID.',
|
jpayne@68
|
1109 'Contam' : 'Contamination (percent of the query that does not match this reference, but matches some other reference).',
|
jpayne@68
|
1110 'Depth' : 'Per-kmer depth in sketches, indicating how many times that kmer was seen in the sequence.',
|
jpayne@68
|
1111 'Depth2' : 'Repeat-compensated depth',
|
jpayne@68
|
1112 'Matches' : 'The number of shared kmers between query and ref.',
|
jpayne@68
|
1113 'Unique' : 'The number of shared kmers between query and ref, and no other ref.',
|
jpayne@68
|
1114 'Unique2' : '??',
|
jpayne@68
|
1115 'Depth2' : 'Repeat-compensated depth',
|
jpayne@68
|
1116 'noHit' : 'Number of kmers that did not hit any reference sequence. Though constant for a query, it will be reported differently for different references based on the relative size of the reference and query (if the reference is bigger than the query, it will report all of them).',
|
jpayne@68
|
1117 'TaxID' : 'NCBI taxonomic id, when available.',
|
jpayne@68
|
1118 'gSize' : 'Estimate of genomic size (number of unique kmers in the genome). This is based on the smallest hash value in the list. This is affected by blacklists or whitelists, and by using an assembly versus raw reads.',
|
jpayne@68
|
1119 'gSeqs' : 'Number of sequences used in the sketch.',
|
jpayne@68
|
1120 'taxName' : 'NCBI\'s name for that taxID. If there is no taxID, the sequence name will be used.',
|
jpayne@68
|
1121 }
|
jpayne@68
|
1122
|
jpayne@68
|
1123 html = ''
|
jpayne@68
|
1124 for name in table_header:
|
jpayne@68
|
1125 if name != 'taxonomy':
|
jpayne@68
|
1126 html += html_tag('li', '%s: %s' % (html_tag('span', name, {'class': 'notice'}), html_tag('span', legend.get(name), {'class': 'small-italic'})))
|
jpayne@68
|
1127 html = html_tag('span', ''.join(overview), {'class': 'notice'}) + '<br /><br />Column Legend<br />' + html_tag('ul', html)
|
jpayne@68
|
1128
|
jpayne@68
|
1129 return html
|
jpayne@68
|
1130
|
jpayne@68
|
1131
|
jpayne@68
|
1132 def sketch_table(fname):
|
jpayne@68
|
1133 # print('DEBUG : %s' % fname)
|
jpayne@68
|
1134
|
jpayne@68
|
1135 html = ''
|
jpayne@68
|
1136
|
jpayne@68
|
1137 if os.path.isfile(fname):
|
jpayne@68
|
1138 title = None
|
jpayne@68
|
1139 header = None
|
jpayne@68
|
1140 data = []
|
jpayne@68
|
1141
|
jpayne@68
|
1142 with open(fname, 'r') as fh:
|
jpayne@68
|
1143 for line in fh:
|
jpayne@68
|
1144 line = line.strip()
|
jpayne@68
|
1145 if line == '':
|
jpayne@68
|
1146 continue
|
jpayne@68
|
1147 if line.startswith('Query:'):
|
jpayne@68
|
1148 title = line
|
jpayne@68
|
1149 elif line.startswith('WKID'):
|
jpayne@68
|
1150 header = line.split('\t')
|
jpayne@68
|
1151 else:
|
jpayne@68
|
1152 data.append(line)
|
jpayne@68
|
1153 if title and header and data:
|
jpayne@68
|
1154 html += html_th(header)
|
jpayne@68
|
1155 for line in data:
|
jpayne@68
|
1156 row = line.split('\t')
|
jpayne@68
|
1157 html += html_tr(row)
|
jpayne@68
|
1158
|
jpayne@68
|
1159 # html = html_tag('p', title) + html_tag('table', html, attrs={'class': 'data'})
|
jpayne@68
|
1160 html = html_tag('table', html, attrs={'class': 'data'})
|
jpayne@68
|
1161 return html, header
|
jpayne@68
|
1162
|
jpayne@68
|
1163 def do_html_contam_art_first_n_pb_tr(stats, files, odir, filepath_prefix):
|
jpayne@68
|
1164 temp = os.path.join(PYDIR, 'template/readqc_artifacts.html')
|
jpayne@68
|
1165 html = ''
|
jpayne@68
|
1166 with open(temp, 'r') as fh:
|
jpayne@68
|
1167 html = fh.read()
|
jpayne@68
|
1168
|
jpayne@68
|
1169 ## the optional artifact contam
|
jpayne@68
|
1170 artifact_tr = ''
|
jpayne@68
|
1171 artifact_type = '50'
|
jpayne@68
|
1172 artifact_val = pipeline_val('illumina read percent contamination artifact 50bp seal', {'type': 'raw'}, stats, files, filepath_prefix)
|
jpayne@68
|
1173 artifact_file = pipeline_val('artifact_50bp.seal.stats seal', {'type': 'file'}, stats, files, filepath_prefix)
|
jpayne@68
|
1174 if not artifact_file:
|
jpayne@68
|
1175 artifact_file = pipeline_val('artifact_20bp.seal.stats seal', {'type': 'file'}, stats, files, filepath_prefix)
|
jpayne@68
|
1176 artifact_type = '20'
|
jpayne@68
|
1177 artifact_val = pipeline_val('illumina read percent contamination artifact 20bp seal', {'type': 'raw'}, stats, files, filepath_prefix)
|
jpayne@68
|
1178 if artifact_file:
|
jpayne@68
|
1179 html = html.replace('[_CONTAM-ART-FIRST-BP_]', artifact_type)
|
jpayne@68
|
1180 html = html.replace('[_CONTAM-ART-FIRST-BP-SEAL_]', artifact_file)
|
jpayne@68
|
1181 html = html.replace('[_CONTAM-ART-FIRST-BP-SEAL-PCT_]', artifact_val)
|
jpayne@68
|
1182
|
jpayne@68
|
1183 return html
|
jpayne@68
|
1184
|
jpayne@68
|
1185
|
jpayne@68
|
1186 def do_html_body(odir, filepath_prefix):
|
jpayne@68
|
1187 print('do_html_body - %s' % filepath_prefix)
|
jpayne@68
|
1188 temp = os.path.join(PYDIR, 'template/readqc_body_template.html')
|
jpayne@68
|
1189 statsf = os.path.join(odir, 'readqc_stats.txt')
|
jpayne@68
|
1190 if not os.path.isfile(statsf):
|
jpayne@68
|
1191 print('ERROR : file not found: %s' % statsf)
|
jpayne@68
|
1192 stats = get_dict_obj(statsf)
|
jpayne@68
|
1193
|
jpayne@68
|
1194 filesf = os.path.join(odir, 'readqc_files.txt')
|
jpayne@68
|
1195 if not os.path.isfile(statsf):
|
jpayne@68
|
1196 print('ERROR : file not found: %s' % statsf)
|
jpayne@68
|
1197 files = get_dict_obj(os.path.join(odir, 'readqc_files.txt'))
|
jpayne@68
|
1198
|
jpayne@68
|
1199 ## key (key name in readqc_stats or readqc_files) : {token (space holder in html template), type (value format)}
|
jpayne@68
|
1200 tok_map = {
|
jpayne@68
|
1201 ## Average Base Quality section
|
jpayne@68
|
1202 'overall bases Q score mean' : {'token' : '[_BASE-QUALITY-SCORE_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1203 'overall bases Q score std' : {'token' : '[_BASE-QUALITY-SCORE-STD_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1204 'Q30 bases Q score mean' : {'token' : '[_Q30-BASE-QUALITY-SCORE_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1205 'Q30 bases Q score std' : {'token' : '[_Q30-BASE-QUALITY-SCORE-STD_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1206 'base C30' : {'token' : '[_COUNT-OF-BAESE-Q30_]', 'type': 'bigint'},
|
jpayne@68
|
1207 'base C25' : {'token' : '[_COUNT-OF-BAESE-Q25_]', 'type': 'bigint'},
|
jpayne@68
|
1208 'base C20' : {'token' : '[_COUNT-OF-BAESE-Q20_]', 'type': 'bigint'},
|
jpayne@68
|
1209 'base C15' : {'token' : '[_COUNT-OF-BAESE-Q15_]', 'type': 'bigint'},
|
jpayne@68
|
1210 'base C10' : {'token' : '[_COUNT-OF-BAESE-Q10_]', 'type': 'bigint'},
|
jpayne@68
|
1211 'base C5' : {'token' : '[_COUNT-OF-BAESE-Q5_]', 'type': 'bigint'},
|
jpayne@68
|
1212 'base Q30' : {'token' : '[_PCT-OF-BAESE-Q30_]', 'type': 'raw'},
|
jpayne@68
|
1213 'base Q25' : {'token' : '[_PCT-OF-BAESE-Q25_]', 'type': 'raw'},
|
jpayne@68
|
1214 'base Q20' : {'token' : '[_PCT-OF-BAESE-Q20_]', 'type': 'raw'},
|
jpayne@68
|
1215 'base Q15' : {'token' : '[_PCT-OF-BAESE-Q15_]', 'type': 'raw'},
|
jpayne@68
|
1216 'base Q10' : {'token' : '[_PCT-OF-BAESE-Q10_]', 'type': 'raw'},
|
jpayne@68
|
1217 'base Q5' : {'token' : '[_PCT-OF-BAESE-Q5_]', 'type': 'raw'},
|
jpayne@68
|
1218
|
jpayne@68
|
1219 ## Average Read Quality section
|
jpayne@68
|
1220 'read Q30' : {'token' : '[_PCT-OF-READS-Q30_]', 'type': 'raw'},
|
jpayne@68
|
1221 'read Q25' : {'token' : '[_PCT-OF-READS-Q25_]', 'type': 'raw'},
|
jpayne@68
|
1222 'read Q20' : {'token' : '[_PCT-OF-READS-Q20_]', 'type': 'raw'},
|
jpayne@68
|
1223 'read Q15' : {'token' : '[_PCT-OF-READS-Q15_]', 'type': 'raw'},
|
jpayne@68
|
1224 'read Q10' : {'token' : '[_PCT-OF-READS-Q10_]', 'type': 'raw'},
|
jpayne@68
|
1225 'read Q5' : {'token' : '[_PCT-OF-READS-Q5_]', 'type': 'raw'},
|
jpayne@68
|
1226
|
jpayne@68
|
1227 'SUBSAMPLE_RATE' : {'token' : '[_SUBSAMPLE-RATE_]', 'type': 'pct', 'filter': 1},
|
jpayne@68
|
1228 'read base quality stats' : {'token' : '[_AVG-READ-QUAL-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
|
jpayne@68
|
1229 'ILLUMINA_READ_QHIST_D3_HTML_PLOT' : {'token' : '[_AVG-READ-QUAL-HOSTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
|
jpayne@68
|
1230 'read qhist plot' : {'token' : '[_AVG-READ-QUALITY-HISTOGRAM_]', 'type': 'file'},
|
jpayne@68
|
1231
|
jpayne@68
|
1232 ## Average Base Position Quality section
|
jpayne@68
|
1233 'read q20 read1' : {'token' : '[_READ_Q20_READ1_]', 'type': 'raw'},
|
jpayne@68
|
1234 'read q20 read2' : {'token' : '[_READ_Q20_READ2_]', 'type': 'raw'},
|
jpayne@68
|
1235 'read qual pos qrpt 1' : {'token' : '[_AVG-BASE-POS-QUAL-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
|
jpayne@68
|
1236 'ILLUMINA_READ_QUAL_POS_PLOT_MERGED_D3_HTML_PLOT' : {'token' : '[_AVG-BASE-POS-QUAL-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
|
jpayne@68
|
1237 'read qual pos plot merged' : {'token' : '[_AVG-BASE-POSITION-QUALITY_]', 'type': 'file'},
|
jpayne@68
|
1238
|
jpayne@68
|
1239 ## Insert Size
|
jpayne@68
|
1240 'ILLUMINA_READ_INSERT_SIZE_JOINED_PERC' : {'token' : '[_PCT-READS-JOINED_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1241 'ILLUMINA_READ_INSERT_SIZE_AVG_INSERT' : {'token' : '[_PCT-READS-JOINED-AVG_]', 'type': 'raw'},
|
jpayne@68
|
1242 'ILLUMINA_READ_INSERT_SIZE_STD_INSERT' : {'token' : '[_PCT-READS-JOINED-STDDEV_]', 'type': 'raw'},
|
jpayne@68
|
1243 'ILLUMINA_READ_INSERT_SIZE_MODE_INSERT' : {'token' : '[_PCT-READS-JOINED-MODE_]', 'type': 'raw'},
|
jpayne@68
|
1244
|
jpayne@68
|
1245 'ILLUMINA_READ_INSERT_SIZE_HISTO_DATA' : {'token' : '[_INSERT-SIZE-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
|
jpayne@68
|
1246 'ILLUMINA_READ_INSERT_SIZE_HISTO_D3_HTML_PLOT' : {'token' : '[_INSERT-SIZE-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
|
jpayne@68
|
1247 'ILLUMINA_READ_INSERT_SIZE_HISTO_PLOT' : {'token' : '[_INSERT-SIZE-HISTOGRAM_]', 'type': 'file'},
|
jpayne@68
|
1248
|
jpayne@68
|
1249 ## Read GC
|
jpayne@68
|
1250 'read GC mean' : {'token' : '[_READ-GC-AVG_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1251 'read GC std' : {'token' : '[_READ-GC-STDDEV_]', 'type': 'float', 'filter': 1},
|
jpayne@68
|
1252 'read GC text hist' : {'token' : '[_READ-QC-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
|
jpayne@68
|
1253 'ILLUMINA_READ_GC_D3_HTML_PLOT' : {'token' : '[_READ-QC-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
|
jpayne@68
|
1254 'read GC plot' : {'token' : '[_READ-GC-HIST_]', 'type': 'file'},
|
jpayne@68
|
1255
|
jpayne@68
|
1256 ## Cycle Nucleotide Composition
|
jpayne@68
|
1257 'read base count text 1' : {'token' : '[_NUC-COMP-FREQ-R1-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
|
jpayne@68
|
1258 'read base count text 2' : {'token' : '[_NUC-COMP-FREQ-R2-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
|
jpayne@68
|
1259 'ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_1' : {'token' : '[_NUC-COMP-FREQ-R1-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
|
jpayne@68
|
1260 'ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_2' : {'token' : '[_NUC-COMP-FREQ-R2-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
|
jpayne@68
|
1261 'read base count plot 1' : {'token' : '[_CYCLE-NUCL-COMPOSITION-READ1_]', 'type': 'file'},
|
jpayne@68
|
1262 'read base count plot 2' : {'token' : '[_CYCLE-NUCL-COMPOSITION-READ2_]', 'type': 'file'},
|
jpayne@68
|
1263
|
jpayne@68
|
1264 ## Percentage of Common Contaminants
|
jpayne@68
|
1265 'illumina read percent contamination artifact seal' : {'token' : '[_CONTAM-ART-SEA-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1266 'artifact.seal.stats seal' : {'token' : '[_CONTAM-ART-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1267
|
jpayne@68
|
1268 'illumina read percent contamination DNA spikein seal' : {'token' : '[_DNA-SPIKEIN-SEAL_PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1269 'DNA_spikein.seal.stats seal' : {'token' : '[_DNA-SPIKEIN-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1270
|
jpayne@68
|
1271 'illumina read percent contamination RNA spikein seal' : {'token' : '[_RNA-SPIKEIN-SEAL_PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1272 'RNA_spikein.seal.stats seal' : {'token' : '[_RNA-SPIKEIN-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1273
|
jpayne@68
|
1274 'illumina read percent contamination fosmid seal' : {'token' : '[_CONTAM-FOSMID-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1275 'fosmid.seal.stats seal' : {'token' : '[_CONTAM-FOSMID-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1276
|
jpayne@68
|
1277 'illumina read percent contamination fosmid seal' : {'token' : '[_CONTAM-FOSMID-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1278 'fosmid.seal.stats seal' : {'token' : '[_CONTAM-FOSMID-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1279
|
jpayne@68
|
1280 'illumina read percent contamination mitochondrion seal' : {'token' : '[_CONTAM-MITO-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1281 'mitochondrion.seal.stats seal' : {'token' : '[_CONTAM-MITO-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1282
|
jpayne@68
|
1283 'illumina read percent contamination plastid seal' : {'token' : '[_CONTAM-CHLO-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1284 'plastid.seal.stats seal' : {'token' : '[_CONTAM-CHLO-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1285
|
jpayne@68
|
1286 'illumina read percent contamination phix seal' : {'token' : '[_CONTAM-PHIX-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1287 'phix.seal.stats seal' : {'token' : '[_CONTAM-PHIX-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1288
|
jpayne@68
|
1289 'illumina read percent contamination rrna seal' : {'token' : '[_CONTAM-RRNA-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1290 'rrna.seal.stats seal' : {'token' : '[_CONTAM-RRNA-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1291
|
jpayne@68
|
1292 'illumina read percent contamination microbes seal' : {'token' : '[_CONTAM-NON-SYN-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1293 'microbes.seal.stats seal' : {'token' : '[_CONTAM-NON-SYN-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1294
|
jpayne@68
|
1295 # 'illumina read percent contamination synthetic seal' : {'token' : '[_CONTAM-SYN-SEAL-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1296 'synthetic.seal.stats seal' : {'token' : '[_CONTAM-SYN-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1297
|
jpayne@68
|
1298 'illumina read percent contamination adapters seal' : {'token' : '[_CONTAM-ADAPTER-PCT_]', 'type': 'floatstr'},
|
jpayne@68
|
1299 'adapters.seal.stats seal' : {'token' : '[_CONTAM-ADAPTER-SEAL_]', 'type': 'file'},
|
jpayne@68
|
1300
|
jpayne@68
|
1301 ## Sketch vs NT
|
jpayne@68
|
1302 'sketch_vs_nt_output' : {'token' : '[_SKETCH-VS-NT_]', 'type': 'file'},
|
jpayne@68
|
1303 # 'sketch_vs_nt_output' : {'token' : '[_SKETCH-VS-NT-BASE_]', 'type': 'file', 'filter': 'base'},
|
jpayne@68
|
1304 }
|
jpayne@68
|
1305
|
jpayne@68
|
1306 html = ''
|
jpayne@68
|
1307 with open(temp, 'r') as fh:
|
jpayne@68
|
1308 html = fh.read()
|
jpayne@68
|
1309
|
jpayne@68
|
1310 for key in tok_map:
|
jpayne@68
|
1311 dat = tok_map[key]
|
jpayne@68
|
1312 val = pipeline_val(key, dat, stats, files, filepath_prefix)
|
jpayne@68
|
1313 # print('key=%s; %s; ====type=%s' % (key, val, type(val)))
|
jpayne@68
|
1314 html = html.replace(dat['token'], val)
|
jpayne@68
|
1315
|
jpayne@68
|
1316 artifact_tr = do_html_contam_art_first_n_pb_tr(stats, files, odir, filepath_prefix)
|
jpayne@68
|
1317 html = html.replace('[_CONTAN-ART-SEAL-FIRST-BP_]', artifact_tr)
|
jpayne@68
|
1318
|
jpayne@68
|
1319 ## synthetic contam
|
jpayne@68
|
1320 contam_syn_file = pipeline_val('synthetic.seal.stats seal', {'type': 'file', 'filter': 'full'}, stats, files, filepath_prefix)
|
jpayne@68
|
1321 if contam_syn_file and os.path.isfile(contam_syn_file):
|
jpayne@68
|
1322 contam_syn_pct_key = 'illumina read percent contamination synthetic seal'
|
jpayne@68
|
1323 cmd = 'grep "#Matched" %s' % contam_syn_file
|
jpayne@68
|
1324 stdOut, stdErr, exitCode = run_sh_command(cmd, True)
|
jpayne@68
|
1325 if exitCode == 0:
|
jpayne@68
|
1326 toks = stdOut.strip().split()
|
jpayne@68
|
1327 if len(toks) == 3:
|
jpayne@68
|
1328 html = html.replace('[_CONTAM-SYN-SEAL-PCT_]', toks[2][:-1])
|
jpayne@68
|
1329
|
jpayne@68
|
1330 ###--- Sketch
|
jpayne@68
|
1331 sketch_html = ''
|
jpayne@68
|
1332 ## add Sketch vs NT if file exists:
|
jpayne@68
|
1333 sketch_nt_file = pipeline_val('sketch_vs_nt_output', {'type': 'file', 'filter': 'full'}, stats, files, filepath_prefix)
|
jpayne@68
|
1334
|
jpayne@68
|
1335 if os.path.isfile(sketch_nt_file):
|
jpayne@68
|
1336 basename = os.path.basename(sketch_nt_file)
|
jpayne@68
|
1337 # href = pipeline_val('sketch_vs_nt_output', {'type': 'file'}, stats, files, filepath_prefix)
|
jpayne@68
|
1338 sketch_html = html_tag('h4', 'Sketch vs. NT') #+ '<br />' + 'Raw file:%s' % html_link(href, basename)
|
jpayne@68
|
1339 sketch_tab, sketch_table_header = sketch_table(sketch_nt_file)
|
jpayne@68
|
1340 sketch_html += sketch_tab
|
jpayne@68
|
1341
|
jpayne@68
|
1342
|
jpayne@68
|
1343 ## add Sketch vs refseq if file exists?
|
jpayne@68
|
1344 if sketch_html:
|
jpayne@68
|
1345 sketch_html = html_tag('h2', 'Read Sketch', attrs={'class': 'section-title'}) \
|
jpayne@68
|
1346 + stetch_section_note(sketch_table_header) \
|
jpayne@68
|
1347 + html_tag('div', sketch_html, {'class': 'section'})
|
jpayne@68
|
1348
|
jpayne@68
|
1349 html = html.replace('[_SKETCH-TABLE_]', sketch_html)
|
jpayne@68
|
1350
|
jpayne@68
|
1351 # copy the image file
|
jpayne@68
|
1352 imgdir = os.path.join(PYDIR, 'images')
|
jpayne@68
|
1353 todir = os.path.join(odir, 'images')
|
jpayne@68
|
1354 if os.path.isdir(todir):
|
jpayne@68
|
1355 shutil.rmtree(todir)
|
jpayne@68
|
1356 shutil.copytree(imgdir, todir, False, None)
|
jpayne@68
|
1357
|
jpayne@68
|
1358 return html
|
jpayne@68
|
1359
|
jpayne@68
|
1360
|
jpayne@68
|
1361 def do_html(odir, infile):
|
jpayne@68
|
1362 # odir = os.path.abspath(odir) ## DO NOT convert!! The relative file path in html need this original odir string matching
|
jpayne@68
|
1363
|
jpayne@68
|
1364 screen('Output dir - %s' % odir)
|
jpayne@68
|
1365 fname = os.path.basename(infile)
|
jpayne@68
|
1366 screen('Create HTML page in %s for %s ..' % (odir, fname))
|
jpayne@68
|
1367 temp = os.path.join(PYDIR, 'template/template.html')
|
jpayne@68
|
1368
|
jpayne@68
|
1369 with open(temp, 'r') as fh:
|
jpayne@68
|
1370 html = fh.read()
|
jpayne@68
|
1371 html = html.replace('[_PAGE-TITLE_]', 'Read QC Report')
|
jpayne@68
|
1372 html = html.replace('[_REPORT-TITLE_]', 'BBTools Read QC Report')
|
jpayne@68
|
1373 html = html.replace('[_INPUT-FILE-NAME_]', fname)
|
jpayne@68
|
1374 html = html.replace('[_REPORT-DATE_]', '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
|
jpayne@68
|
1375
|
jpayne@68
|
1376 hbody =do_html_body(odir, odir)
|
jpayne@68
|
1377 html = html.replace('[_REPORT-BODY_]', hbody)
|
jpayne@68
|
1378
|
jpayne@68
|
1379 ## write the html to file
|
jpayne@68
|
1380 idxfile = os.path.join(odir, 'index.html')
|
jpayne@68
|
1381 with open(idxfile, 'w') as fh2:
|
jpayne@68
|
1382 fh2.write(html)
|
jpayne@68
|
1383 screen('HTML index file written to %s' % idxfile)
|
jpayne@68
|
1384
|
jpayne@68
|
1385 # copy the css file
|
jpayne@68
|
1386 cssdir = os.path.join(PYDIR, 'css')
|
jpayne@68
|
1387 todir = os.path.join(odir, 'css')
|
jpayne@68
|
1388 if os.path.isdir(todir):
|
jpayne@68
|
1389 shutil.rmtree(todir)
|
jpayne@68
|
1390 shutil.copytree(cssdir, todir, False, None)
|
jpayne@68
|
1391
|
jpayne@68
|
1392 def screen(txt):
|
jpayne@68
|
1393 print('.. %s' % txt)
|
jpayne@68
|
1394 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1395 ## Main Program
|
jpayne@68
|
1396 if __name__ == "__main__":
|
jpayne@68
|
1397 desc = "RQC ReadQC Pipeline"
|
jpayne@68
|
1398 parser = argparse.ArgumentParser(description=desc)
|
jpayne@68
|
1399 parser.add_argument("-f", "--fastq", dest="fastq", help="Set input fastq file (full path to fastq)", required=True)
|
jpayne@68
|
1400 parser.add_argument("-o", "--output-path", dest="outputPath", help="Set output path to write to")
|
jpayne@68
|
1401 parser.add_argument("-x", "--cut", dest="cutLenOfFirstBases", help="Set Read cut length (bp) for read contamination detection")
|
jpayne@68
|
1402 parser.add_argument("-v", "--version", action="version", version=VERSION)
|
jpayne@68
|
1403
|
jpayne@68
|
1404 ## For running on Crays
|
jpayne@68
|
1405 parser.add_argument("-l", "--lib-name", dest="libName", help="Set library name", required=False)
|
jpayne@68
|
1406 parser.add_argument("-m", "--is-multiplexed", dest="isMultiplexed", help="Set multiplexed data", required=False)
|
jpayne@68
|
1407 parser.add_argument("-r", "--is-rna", dest="isRna", help="Set RNA data", required=False)
|
jpayne@68
|
1408
|
jpayne@68
|
1409 ## produce html when processing is done
|
jpayne@68
|
1410 parser.add_argument("-html", "--html", action="store_true", help="Create html file", dest="html", default=False, required=False)
|
jpayne@68
|
1411
|
jpayne@68
|
1412 ## Toggle options
|
jpayne@68
|
1413 parser.add_argument("-b", "--skip-blast", action="store_true", help="Skip the blast run", dest="skipBlast", default=False, required=False)
|
jpayne@68
|
1414
|
jpayne@68
|
1415 parser.add_argument("-bn", "--skip-blast-nt", action="store_true", help="Skip Blast run against nt", dest="skipBlastNt", default=False, required=False)
|
jpayne@68
|
1416 parser.add_argument("-br", "--skip-blast-refseq", action="store_true", help="Skip Blast run against refseq.archaea and refseq.bateria", dest="skipBlastRefseq", default=False, required=False)
|
jpayne@68
|
1417 parser.add_argument("-c", "--skip-cleanup", action="store_true", help="Skip temporary file cleanup", dest="skipCleanup", default=False, required=False)
|
jpayne@68
|
1418 parser.add_argument("-p", "--pooled-analysis", action="store_true", help="Enable pooled analysis (demultiplexing)", dest="pooledAnalysis", default=False, required=False)
|
jpayne@68
|
1419 parser.add_argument("-s", "--skip-subsample", action="store_true", help="Skip subsampling.", dest="skipSubsample", default=False, required=False)
|
jpayne@68
|
1420 parser.add_argument("-z", "--skip-localization", action="store_true", help="Skip database localization", dest="skipDbLocalization", default=False, required=False)
|
jpayne@68
|
1421
|
jpayne@68
|
1422 options = parser.parse_args()
|
jpayne@68
|
1423
|
jpayne@68
|
1424 outputPath = None # output path, defaults to current working directory
|
jpayne@68
|
1425 fastq = None # full path to input fastq.gz
|
jpayne@68
|
1426
|
jpayne@68
|
1427 status = "start"
|
jpayne@68
|
1428 nextStepToDo = 0
|
jpayne@68
|
1429
|
jpayne@68
|
1430 if options.outputPath:
|
jpayne@68
|
1431 outputPath = options.outputPath
|
jpayne@68
|
1432 else:
|
jpayne@68
|
1433 outputPath = os.getcwd()
|
jpayne@68
|
1434
|
jpayne@68
|
1435 if options.fastq:
|
jpayne@68
|
1436 fastq = options.fastq
|
jpayne@68
|
1437
|
jpayne@68
|
1438 ## create output_directory if it doesn't exist
|
jpayne@68
|
1439 if not os.path.isdir(outputPath):
|
jpayne@68
|
1440 os.makedirs(outputPath)
|
jpayne@68
|
1441
|
jpayne@68
|
1442 libName = None ## for illumina_sciclone_analysis()
|
jpayne@68
|
1443 isRna = None ## for illumina_sciclone_analysis()
|
jpayne@68
|
1444 isMultiplexed = None ## for illumina_generate_index_sequence_detection_plot()
|
jpayne@68
|
1445
|
jpayne@68
|
1446 bSkipBlast = False
|
jpayne@68
|
1447 bSkipBlastRefseq = False ## refseq.archaea and refseq.bacteria
|
jpayne@68
|
1448 bSkipBlastNt = False
|
jpayne@68
|
1449 bPooledAnalysis = False
|
jpayne@68
|
1450
|
jpayne@68
|
1451 ## RQC-743 Need to specify the first cutting bp for read contam detection (eps. for smRNA )
|
jpayne@68
|
1452 firstBptoCut = 50
|
jpayne@68
|
1453 if options.cutLenOfFirstBases:
|
jpayne@68
|
1454 firstBptoCut = options.cutLenOfFirstBases
|
jpayne@68
|
1455
|
jpayne@68
|
1456 if options.libName:
|
jpayne@68
|
1457 libName = options.libName
|
jpayne@68
|
1458
|
jpayne@68
|
1459
|
jpayne@68
|
1460 ## switches
|
jpayne@68
|
1461 if options.isRna:
|
jpayne@68
|
1462 isRna = options.isRna
|
jpayne@68
|
1463 if options.isMultiplexed:
|
jpayne@68
|
1464 isMultiplexed = options.isMultiplexed
|
jpayne@68
|
1465 if options.skipBlast:
|
jpayne@68
|
1466 bSkipBlast = options.skipBlast
|
jpayne@68
|
1467 if options.skipBlastRefseq:
|
jpayne@68
|
1468 bSkipBlastRefseq = options.skipBlastRefseq
|
jpayne@68
|
1469 if options.skipBlastNt:
|
jpayne@68
|
1470 bSkipBlastNt = options.skipBlastNt
|
jpayne@68
|
1471 if options.pooledAnalysis:
|
jpayne@68
|
1472 bPooledAnalysis = options.pooledAnalysis
|
jpayne@68
|
1473
|
jpayne@68
|
1474
|
jpayne@68
|
1475 skipSubsampling = options.skipSubsample
|
jpayne@68
|
1476
|
jpayne@68
|
1477 ## Set readqc config
|
jpayne@68
|
1478 RQCReadQcConfig.CFG["status_file"] = os.path.join(outputPath, "readqc_status.log")
|
jpayne@68
|
1479 RQCReadQcConfig.CFG["files_file"] = os.path.join(outputPath, "readqc_files.tmp")
|
jpayne@68
|
1480 RQCReadQcConfig.CFG["stats_file"] = os.path.join(outputPath, "readqc_stats.tmp")
|
jpayne@68
|
1481 RQCReadQcConfig.CFG["output_path"] = outputPath
|
jpayne@68
|
1482 RQCReadQcConfig.CFG["skip_cleanup"] = options.skipCleanup
|
jpayne@68
|
1483 RQCReadQcConfig.CFG["skip_localization"] = options.skipDbLocalization
|
jpayne@68
|
1484 RQCReadQcConfig.CFG["log_file"] = os.path.join(outputPath, "readqc.log")
|
jpayne@68
|
1485
|
jpayne@68
|
1486 screen("Started readqc pipeline, writing log to: %s" % (RQCReadQcConfig.CFG["log_file"]))
|
jpayne@68
|
1487
|
jpayne@68
|
1488 log = get_logger("readqc", RQCReadQcConfig.CFG["log_file"], LOG_LEVEL, False, True)
|
jpayne@68
|
1489 log.info("=================================================================")
|
jpayne@68
|
1490 log.info(" Read Qc Analysis (version %s)", VERSION)
|
jpayne@68
|
1491 log.info("=================================================================")
|
jpayne@68
|
1492
|
jpayne@68
|
1493 log.info("Starting %s with %s", SCRIPT_NAME, fastq)
|
jpayne@68
|
1494
|
jpayne@68
|
1495 if os.path.isfile(RQCReadQcConfig.CFG["status_file"]):
|
jpayne@68
|
1496 status = get_status(RQCReadQcConfig.CFG["status_file"], log)
|
jpayne@68
|
1497 else:
|
jpayne@68
|
1498 checkpoint_step_wrapper(status)
|
jpayne@68
|
1499
|
jpayne@68
|
1500
|
jpayne@68
|
1501 if not os.path.isfile(fastq):
|
jpayne@68
|
1502 log.error("%s not found, aborting!", fastq)
|
jpayne@68
|
1503 exit(2)
|
jpayne@68
|
1504 elif status != "complete":
|
jpayne@68
|
1505 ## check for fastq file
|
jpayne@68
|
1506 # if os.path.isfile(fastq):
|
jpayne@68
|
1507 log.info("Found %s, starting processing.", fastq)
|
jpayne@68
|
1508 log.info("Latest status = %s", status)
|
jpayne@68
|
1509 if status != 'start':
|
jpayne@68
|
1510 nextStepToDo = int(status.split("_")[0])
|
jpayne@68
|
1511 if status.find("complete") != -1:
|
jpayne@68
|
1512 nextStepToDo += 1
|
jpayne@68
|
1513 log.info("Next step to do = %s", nextStepToDo)
|
jpayne@68
|
1514
|
jpayne@68
|
1515 if status != 'complete':
|
jpayne@68
|
1516 bDone = False
|
jpayne@68
|
1517 cycle = 0
|
jpayne@68
|
1518 totalReadNum = 0
|
jpayne@68
|
1519 firstSubsampledFastqFileName = ""
|
jpayne@68
|
1520 secondSubsampledFastqFile = ""
|
jpayne@68
|
1521 totalReadCount = 0
|
jpayne@68
|
1522 subsampledReadNum = 0
|
jpayne@68
|
1523 bIsPaired = False
|
jpayne@68
|
1524 readLength = 0
|
jpayne@68
|
1525 firstSubsampledLogFile = os.path.join(outputPath, "subsample", "first_subsampled.txt")
|
jpayne@68
|
1526 secondSubsampledLogFile = os.path.join(outputPath, "subsample", "second_subsampled.txt")
|
jpayne@68
|
1527
|
jpayne@68
|
1528 while not bDone:
|
jpayne@68
|
1529
|
jpayne@68
|
1530 cycle += 1
|
jpayne@68
|
1531
|
jpayne@68
|
1532 if bPooledAnalysis:
|
jpayne@68
|
1533 nextStepToDo = 17
|
jpayne@68
|
1534 status = "16_illumina_read_blastn_nt complete"
|
jpayne@68
|
1535
|
jpayne@68
|
1536 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1537 ## 1. fast_subsample_fastq_sequences
|
jpayne@68
|
1538 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1539 if nextStepToDo == 1 or status == "start":
|
jpayne@68
|
1540 status, firstSubsampledFastqFileName, totalReadCount, subsampledReadNum, bIsPaired, readLength = do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log)
|
jpayne@68
|
1541
|
jpayne@68
|
1542 if status.endswith("failed"):
|
jpayne@68
|
1543 log.error("Subsampling failed.")
|
jpayne@68
|
1544 sys.exit(-1)
|
jpayne@68
|
1545
|
jpayne@68
|
1546 ## if not skipSubsampling and subsampledReadNum == 0: ## too small input file
|
jpayne@68
|
1547 if not skipSubsampling and subsampledReadNum < RQCReadQc.ILLUMINA_MIN_NUM_READS: ## min=1000
|
jpayne@68
|
1548 log.info("Too small input fastq file. Skip subsampling: total number of reads = %s, sampled number of reads = %s", totalReadCount, subsampledReadNum)
|
jpayne@68
|
1549 skipSubsampling = True
|
jpayne@68
|
1550 status, firstSubsampledFastqFileName, totalReadCount, subsampledReadNum, bIsPaired, readLength = do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log)
|
jpayne@68
|
1551
|
jpayne@68
|
1552 if status.endswith("failed"):
|
jpayne@68
|
1553 log.error("Subsampling failed.")
|
jpayne@68
|
1554 sys.exit(-1)
|
jpayne@68
|
1555
|
jpayne@68
|
1556 subsampledReadNum = totalReadCount
|
jpayne@68
|
1557 if subsampledReadNum >= RQCReadQc.ILLUMINA_MIN_NUM_READS:
|
jpayne@68
|
1558 status = "1_illumina_readqc_subsampling complete"
|
jpayne@68
|
1559
|
jpayne@68
|
1560 ## Still too low number of reads -> record ILLUMINA_TOO_SMALL_NUM_READS and quit
|
jpayne@68
|
1561 ##
|
jpayne@68
|
1562 if subsampledReadNum == 0 or subsampledReadNum < RQCReadQc.ILLUMINA_MIN_NUM_READS: ## min=1000
|
jpayne@68
|
1563 log.info("Too small number of reads (< %s). Stop processing.", RQCReadQc.ILLUMINA_MIN_NUM_READS)
|
jpayne@68
|
1564 print("WARNING : Too small number of reads (< %s). Stop processing." % RQCReadQc.ILLUMINA_MIN_NUM_READS)
|
jpayne@68
|
1565 log.info("Completed %s: %s", SCRIPT_NAME, fastq)
|
jpayne@68
|
1566
|
jpayne@68
|
1567 ## Add ILLUMINA_TOO_SMALL_NUM_READS=1 to stats to be used in reporting
|
jpayne@68
|
1568 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
1569 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_TOO_SMALL_NUM_READS, "1", log)
|
jpayne@68
|
1570
|
jpayne@68
|
1571 ## move rqc-files.tmp to rqc-files.txt
|
jpayne@68
|
1572 newFilesFile = os.path.join(outputPath, "readqc_files.txt")
|
jpayne@68
|
1573 newStatsFile = os.path.join(outputPath, "readqc_stats.txt")
|
jpayne@68
|
1574
|
jpayne@68
|
1575 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["files_file"], newFilesFile)
|
jpayne@68
|
1576 log.info("mv cmd: %s", cmd)
|
jpayne@68
|
1577 run_sh_command(cmd, True, log)
|
jpayne@68
|
1578
|
jpayne@68
|
1579 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["stats_file"], newStatsFile)
|
jpayne@68
|
1580 log.info("mv cmd: %s", cmd)
|
jpayne@68
|
1581 run_sh_command(cmd, True, log)
|
jpayne@68
|
1582
|
jpayne@68
|
1583 exit(0)
|
jpayne@68
|
1584
|
jpayne@68
|
1585 if status == "1_illumina_readqc_subsampling failed":
|
jpayne@68
|
1586 bDone = True
|
jpayne@68
|
1587
|
jpayne@68
|
1588
|
jpayne@68
|
1589 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1590 ## 2. write_unique_20_mers
|
jpayne@68
|
1591 ## NOTE: fastq = orig input fastq,
|
jpayne@68
|
1592 ## totalReadCount = total reads in the orig input fastq
|
jpayne@68
|
1593 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1594 if nextStepToDo == 2 or status == "1_illumina_readqc_subsampling complete":
|
jpayne@68
|
1595 ## Cope with restarting
|
jpayne@68
|
1596 ## save subsamples file in "first_subsampled.txt" so that the file
|
jpayne@68
|
1597 ## name can be read when started
|
jpayne@68
|
1598 if not os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1599 make_dir(os.path.join(outputPath, "subsample"))
|
jpayne@68
|
1600 with open(firstSubsampledLogFile, "w") as SAMP_FH:
|
jpayne@68
|
1601 SAMP_FH.write(firstSubsampledFastqFileName + " " + str(totalReadCount) + " " + str(subsampledReadNum) + "\n")
|
jpayne@68
|
1602
|
jpayne@68
|
1603 if firstSubsampledFastqFileName == "" and options.skipSubsample:
|
jpayne@68
|
1604 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1605 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1606 l = SAMP_FH.readline().strip()
|
jpayne@68
|
1607 t = l.split()
|
jpayne@68
|
1608 assert len(t) == 2 or len(t) == 3
|
jpayne@68
|
1609 firstSubsampledFastqFileName = t[0]
|
jpayne@68
|
1610 totalReadCount = t[1]
|
jpayne@68
|
1611 subsampledReadNum = t[2]
|
jpayne@68
|
1612 log.debug("firstSubsampledFastqFileName=%s, totalReadCount=%s, subsampledReadNum=%s", firstSubsampledFastqFileName, totalReadCount, subsampledReadNum)
|
jpayne@68
|
1613
|
jpayne@68
|
1614 else:
|
jpayne@68
|
1615 nextStepToDo = 1
|
jpayne@68
|
1616 continue
|
jpayne@68
|
1617
|
jpayne@68
|
1618 ## TODO: move getting totalReadNum in the func ???
|
jpayne@68
|
1619 ##
|
jpayne@68
|
1620 log.debug("firstSubsampledFastqFileName=%s, totalReadCount=%s, subsampledReadNum=%s", firstSubsampledFastqFileName, totalReadCount, subsampledReadNum)
|
jpayne@68
|
1621
|
jpayne@68
|
1622 if totalReadCount is None or totalReadCount == 0:
|
jpayne@68
|
1623 nextStepToDo = 1
|
jpayne@68
|
1624 continue
|
jpayne@68
|
1625 else:
|
jpayne@68
|
1626 status = do_write_unique_20_mers(fastq, totalReadCount, log)
|
jpayne@68
|
1627
|
jpayne@68
|
1628 if status == "2_unique_mers_sampling failed":
|
jpayne@68
|
1629 bDone = True
|
jpayne@68
|
1630
|
jpayne@68
|
1631 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1632 ## 3. generate read GC histograms: Illumina_read_gc
|
jpayne@68
|
1633 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1634 if nextStepToDo == 3 or status == "2_unique_mers_sampling complete":
|
jpayne@68
|
1635 ## Read the subsampled file name
|
jpayne@68
|
1636 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1637 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1638 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1639 status = do_illumina_read_gc(firstSubsampledFastqFileName, log)
|
jpayne@68
|
1640
|
jpayne@68
|
1641 if status == "3_illumina_read_gc failed":
|
jpayne@68
|
1642 bDone = True
|
jpayne@68
|
1643
|
jpayne@68
|
1644 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1645 ## 4. read_quality_stats
|
jpayne@68
|
1646 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1647 if nextStepToDo == 4 or status == "3_illumina_read_gc complete":
|
jpayne@68
|
1648 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1649 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1650 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1651 status = do_read_quality_stats(firstSubsampledFastqFileName, log)
|
jpayne@68
|
1652
|
jpayne@68
|
1653 if status == "4_illumina_read_quality_stats failed":
|
jpayne@68
|
1654 bDone = True
|
jpayne@68
|
1655
|
jpayne@68
|
1656 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1657 ## 5. write_base_quality_stats
|
jpayne@68
|
1658 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1659 if nextStepToDo == 5 or status == "4_illumina_read_quality_stats complete":
|
jpayne@68
|
1660 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1661 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1662 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1663 status = do_write_base_quality_stats(firstSubsampledFastqFileName, log)
|
jpayne@68
|
1664
|
jpayne@68
|
1665 if status == "5_illumina_read_quality_stats failed":
|
jpayne@68
|
1666 bDone = True
|
jpayne@68
|
1667
|
jpayne@68
|
1668 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1669 ## 6. illumina_count_q_score
|
jpayne@68
|
1670 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1671 if nextStepToDo == 6 or status == "5_illumina_read_quality_stats complete":
|
jpayne@68
|
1672 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1673 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1674 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1675 status = do_illumina_count_q_score(firstSubsampledFastqFileName, log)
|
jpayne@68
|
1676
|
jpayne@68
|
1677 if status == "6_illumina_count_q_score failed":
|
jpayne@68
|
1678 bDone = True
|
jpayne@68
|
1679
|
jpayne@68
|
1680 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1681 ## 7. illumina_calculate_average_quality
|
jpayne@68
|
1682 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1683 if nextStepToDo == 7 or status == "6_illumina_count_q_score complete":
|
jpayne@68
|
1684 ## Let's skip this step. (20140902)
|
jpayne@68
|
1685 log.info("\n\n%sSTEP7 - Skipping 21mer analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1686
|
jpayne@68
|
1687 status = "7_illumina_calculate_average_quality in progress"
|
jpayne@68
|
1688 checkpoint_step_wrapper(status)
|
jpayne@68
|
1689
|
jpayne@68
|
1690 status = "7_illumina_calculate_average_quality complete"
|
jpayne@68
|
1691 checkpoint_step_wrapper(status)
|
jpayne@68
|
1692
|
jpayne@68
|
1693 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1694 ## 8. illumina_find_common_motifs
|
jpayne@68
|
1695 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1696 if nextStepToDo == 8 or status == "7_illumina_calculate_average_quality complete":
|
jpayne@68
|
1697 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1698 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1699 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1700 status = do_illumina_find_common_motifs(firstSubsampledFastqFileName, log)
|
jpayne@68
|
1701
|
jpayne@68
|
1702 if status == "8_illumina_find_common_motifs failed":
|
jpayne@68
|
1703 bDone = True
|
jpayne@68
|
1704
|
jpayne@68
|
1705 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1706 ## 10. illumina_run_tagdust
|
jpayne@68
|
1707 ##
|
jpayne@68
|
1708 ## NOTE: This step will be skipped. No need to run.
|
jpayne@68
|
1709 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1710 # if nextStepToDo == 10 or status == "9_illumina_run_dedupe complete":
|
jpayne@68
|
1711 if nextStepToDo == 9 or status == "8_illumina_find_common_motifs complete":
|
jpayne@68
|
1712 ## 20131023 skip this step
|
jpayne@68
|
1713 log.info("\n\n%sSTEP10 - Skipping tag dust <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1714 status = "10_illumina_run_tagdust in progress"
|
jpayne@68
|
1715 checkpoint_step_wrapper(status)
|
jpayne@68
|
1716
|
jpayne@68
|
1717 status = "10_illumina_run_tagdust complete"
|
jpayne@68
|
1718 checkpoint_step_wrapper(status)
|
jpayne@68
|
1719
|
jpayne@68
|
1720 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1721 ## 11. illumina_detect_read_contam
|
jpayne@68
|
1722 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1723 if nextStepToDo == 11 or status == "10_illumina_run_tagdust complete":
|
jpayne@68
|
1724 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1725 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1726 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1727 status = do_illumina_detect_read_contam(firstSubsampledFastqFileName, firstBptoCut, log)
|
jpayne@68
|
1728
|
jpayne@68
|
1729 if status == "11_illumina_detect_read_contam failed":
|
jpayne@68
|
1730 bDone = True
|
jpayne@68
|
1731
|
jpayne@68
|
1732
|
jpayne@68
|
1733 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1734 ## 13. illumina_subsampling_read_megablast
|
jpayne@68
|
1735 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1736 if nextStepToDo == 12 or status == "11_illumina_detect_read_contam complete":
|
jpayne@68
|
1737 if not bSkipBlast:
|
jpayne@68
|
1738 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1739 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1740 l = SAMP_FH.readline().strip()
|
jpayne@68
|
1741 firstSubsampledFastqFileName = l.split()[0]
|
jpayne@68
|
1742 totalReadCount = int(l.split()[1])
|
jpayne@68
|
1743 subsampledReadNum = int(l.split()[2])
|
jpayne@68
|
1744
|
jpayne@68
|
1745 status, secondSubsampledFastqFile, second_read_cnt_total, second_read_cnt_sampled = do_illumina_subsampling_read_blastn(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log)
|
jpayne@68
|
1746 log.debug("status=%s, secondSubsampledFastqFile=%s, second_read_cnt_total=%s, second_read_cnt_sampled=%s", status, secondSubsampledFastqFile, second_read_cnt_total, second_read_cnt_sampled)
|
jpayne@68
|
1747
|
jpayne@68
|
1748 else:
|
jpayne@68
|
1749 statsFile = RQCReadQcConfig.CFG["stats_file"]
|
jpayne@68
|
1750 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_SKIP_BLAST, "1", log)
|
jpayne@68
|
1751 log.info("\n\n%sSTEP13 - Run subsampling for Blast search <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1752 log.info("13_illumina_subsampling_read_megablast skipped.\n")
|
jpayne@68
|
1753 status = "13_illumina_subsampling_read_megablast complete"
|
jpayne@68
|
1754 checkpoint_step_wrapper(status)
|
jpayne@68
|
1755
|
jpayne@68
|
1756 if status == "13_illumina_subsampling_read_megablast failed":
|
jpayne@68
|
1757 bDone = True
|
jpayne@68
|
1758
|
jpayne@68
|
1759 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1760 # 14. illumina_read_blastn_refseq_archaea
|
jpayne@68
|
1761 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1762 if nextStepToDo == 14 or status == "13_illumina_subsampling_read_megablast complete":
|
jpayne@68
|
1763 if not bSkipBlast and not bSkipBlastRefseq:
|
jpayne@68
|
1764 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
|
jpayne@68
|
1765 nextStepToDo = 13
|
jpayne@68
|
1766 continue
|
jpayne@68
|
1767
|
jpayne@68
|
1768 if secondSubsampledFastqFile != "" and not os.path.isfile(secondSubsampledLogFile):
|
jpayne@68
|
1769 make_dir(os.path.join(outputPath, "subsample"))
|
jpayne@68
|
1770 with open(secondSubsampledLogFile, "w") as SAMP_FH:
|
jpayne@68
|
1771 SAMP_FH.write(secondSubsampledFastqFile + " " + str(second_read_cnt_total) + " " + str(second_read_cnt_sampled) + "\n")
|
jpayne@68
|
1772 else:
|
jpayne@68
|
1773 with open(secondSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1774 l = SAMP_FH.readline().strip()
|
jpayne@68
|
1775 secondSubsampledFastqFile = l.split()[0]
|
jpayne@68
|
1776 second_read_cnt_total = int(l.split()[1])
|
jpayne@68
|
1777 second_read_cnt_sampled = int(l.split()[2])
|
jpayne@68
|
1778
|
jpayne@68
|
1779 status = do_illumina_read_blastn_refseq_archaea(secondSubsampledFastqFile, second_read_cnt_sampled, log)
|
jpayne@68
|
1780 checkpoint_step_wrapper(status)
|
jpayne@68
|
1781
|
jpayne@68
|
1782 else:
|
jpayne@68
|
1783 log.info("\n\n%sSTEP14 - Run illumina_read_blastn_refseq_archaea analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1784 log.info("14_illumina_read_blastn_refseq_archaea skipped.\n")
|
jpayne@68
|
1785 status = "14_illumina_read_blastn_refseq_archaea in progress"
|
jpayne@68
|
1786 checkpoint_step_wrapper(status)
|
jpayne@68
|
1787 status = "14_illumina_read_blastn_refseq_archaea complete"
|
jpayne@68
|
1788 checkpoint_step_wrapper(status)
|
jpayne@68
|
1789
|
jpayne@68
|
1790 if status == "14_illumina_read_blastn_refseq_archaea failed":
|
jpayne@68
|
1791 bDone = True
|
jpayne@68
|
1792
|
jpayne@68
|
1793 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1794 # 15. illumina_read_blastn_refseq_bacteria
|
jpayne@68
|
1795 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1796 if nextStepToDo == 15 or status == "14_illumina_read_blastn_refseq_archaea complete":
|
jpayne@68
|
1797 if not bSkipBlast and not bSkipBlastRefseq:
|
jpayne@68
|
1798 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
|
jpayne@68
|
1799 nextStepToDo = 13
|
jpayne@68
|
1800 continue
|
jpayne@68
|
1801
|
jpayne@68
|
1802 if secondSubsampledFastqFile != "" and not os.path.isfile(secondSubsampledLogFile):
|
jpayne@68
|
1803 make_dir(os.path.join(outputPath, "subsample"))
|
jpayne@68
|
1804 with open(secondSubsampledLogFile, "w") as SAMP_FH:
|
jpayne@68
|
1805 SAMP_FH.write(secondSubsampledFastqFile + " " + str(second_read_cnt_total) + " " + str(second_read_cnt_sampled) + "\n")
|
jpayne@68
|
1806 else:
|
jpayne@68
|
1807 with open(secondSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1808 l = SAMP_FH.readline().strip()
|
jpayne@68
|
1809 secondSubsampledFastqFile = l.split()[0]
|
jpayne@68
|
1810 second_read_cnt_total = int(l.split()[1])
|
jpayne@68
|
1811 second_read_cnt_sampled = int(l.split()[2])
|
jpayne@68
|
1812
|
jpayne@68
|
1813 status = do_illumina_read_blastn_refseq_bacteria(secondSubsampledFastqFile, log)
|
jpayne@68
|
1814 checkpoint_step_wrapper(status)
|
jpayne@68
|
1815
|
jpayne@68
|
1816 else:
|
jpayne@68
|
1817 log.info("\n\n%sSTEP15 - Run illumina_read_blastn_refseq_bacteria analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1818 log.info("15_illumina_read_blastn_refseq_bacteria skipped.\n")
|
jpayne@68
|
1819 status = "15_illumina_read_blastn_refseq_bacteria in progress"
|
jpayne@68
|
1820 checkpoint_step_wrapper(status)
|
jpayne@68
|
1821 status = "15_illumina_read_blastn_refseq_bacteria complete"
|
jpayne@68
|
1822 checkpoint_step_wrapper(status)
|
jpayne@68
|
1823
|
jpayne@68
|
1824 if status == "15_illumina_read_blastn_refseq_bacteria failed":
|
jpayne@68
|
1825 bDone = True
|
jpayne@68
|
1826
|
jpayne@68
|
1827 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1828 ## 16. illumina_read_blastn_nt
|
jpayne@68
|
1829 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1830 if nextStepToDo == 16 or status == "15_illumina_read_blastn_refseq_bacteria complete":
|
jpayne@68
|
1831 if not bSkipBlast and not bSkipBlastNt:
|
jpayne@68
|
1832 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
|
jpayne@68
|
1833 nextStepToDo = 13
|
jpayne@68
|
1834 continue
|
jpayne@68
|
1835
|
jpayne@68
|
1836 if secondSubsampledFastqFile == "" and os.path.isfile(secondSubsampledLogFile):
|
jpayne@68
|
1837 with open(secondSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1838 l = SAMP_FH.readline().strip()
|
jpayne@68
|
1839 secondSubsampledFastqFile = l.split()[0]
|
jpayne@68
|
1840 second_read_cnt_total = int(l.split()[1])
|
jpayne@68
|
1841 second_read_cnt_sampled = int(l.split()[2])
|
jpayne@68
|
1842
|
jpayne@68
|
1843 status = do_illumina_read_blastn_nt(secondSubsampledFastqFile, log)
|
jpayne@68
|
1844 checkpoint_step_wrapper(status)
|
jpayne@68
|
1845
|
jpayne@68
|
1846 else:
|
jpayne@68
|
1847 log.info("\n\n%sSTEP16 - Run illumina_read_blastn_nt analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1848 log.info("16_illumina_read_blastn_nt skipped.\n")
|
jpayne@68
|
1849 status = "16_illumina_read_blastn_nt in progress"
|
jpayne@68
|
1850 checkpoint_step_wrapper(status)
|
jpayne@68
|
1851 status = "16_illumina_read_blastn_nt complete"
|
jpayne@68
|
1852 checkpoint_step_wrapper(status)
|
jpayne@68
|
1853
|
jpayne@68
|
1854 if status == "16_illumina_read_blastn_nt failed":
|
jpayne@68
|
1855 bDone = True
|
jpayne@68
|
1856
|
jpayne@68
|
1857 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1858 ## 17. multiplex_statistics
|
jpayne@68
|
1859 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1860 if nextStepToDo == 17 or status == "16_illumina_read_blastn_nt complete":
|
jpayne@68
|
1861 status = do_illumina_multiplex_statistics(fastq, log, isMultiplexed=isMultiplexed)
|
jpayne@68
|
1862
|
jpayne@68
|
1863 if status == "17_multiplex_statistics failed":
|
jpayne@68
|
1864 bDone = True
|
jpayne@68
|
1865
|
jpayne@68
|
1866 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1867 ## 18. end_of_read_illumina_adapter_check
|
jpayne@68
|
1868 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1869 if nextStepToDo == 18 or status == "17_multiplex_statistics complete":
|
jpayne@68
|
1870 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
|
jpayne@68
|
1871 with open(firstSubsampledLogFile, "r") as SAMP_FH:
|
jpayne@68
|
1872 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
|
jpayne@68
|
1873 status = do_end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log)
|
jpayne@68
|
1874
|
jpayne@68
|
1875 if status == "18_end_of_read_illumina_adapter_check failed":
|
jpayne@68
|
1876 bDone = True
|
jpayne@68
|
1877
|
jpayne@68
|
1878 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1879 ## 19. insert size analysis (bbmerge.sh)
|
jpayne@68
|
1880 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1881 if nextStepToDo == 19 or status == "18_end_of_read_illumina_adapter_check complete":
|
jpayne@68
|
1882 status = do_insert_size_analysis(fastq, log)
|
jpayne@68
|
1883
|
jpayne@68
|
1884 if status == "19_insert_size_analysis failed":
|
jpayne@68
|
1885 bDone = True
|
jpayne@68
|
1886
|
jpayne@68
|
1887
|
jpayne@68
|
1888 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1889 ## 21. sketch vs nt, refseq, silva
|
jpayne@68
|
1890 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1891 # if nextStepToDo == 21 or status == "20_gc_divergence_analysis complete":
|
jpayne@68
|
1892 if nextStepToDo == 20 or status == "19_insert_size_analysis complete":
|
jpayne@68
|
1893 status = do_sketch_vs_nt_refseq_silva(fastq, log)
|
jpayne@68
|
1894
|
jpayne@68
|
1895 if status == "21_sketch_vs_nt_refseq_silva failed":
|
jpayne@68
|
1896 bDone = True
|
jpayne@68
|
1897
|
jpayne@68
|
1898 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1899 ## 22. postprocessing & reporting
|
jpayne@68
|
1900 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1901 if nextStepToDo == 22 or status == "21_sketch_vs_nt_refseq_silva complete":
|
jpayne@68
|
1902 ## 20131023 skip this step
|
jpayne@68
|
1903 log.info("\n\n%sSTEP22 - Run illumina_readqc_report_postprocess: mv rqc-*.tmp to rqc-*.txt <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
|
jpayne@68
|
1904 status = "22_illumina_readqc_report_postprocess in progress"
|
jpayne@68
|
1905 checkpoint_step_wrapper(status)
|
jpayne@68
|
1906
|
jpayne@68
|
1907 ## move rqc-files.tmp to rqc-files.txt
|
jpayne@68
|
1908 newFilesFile = os.path.join(outputPath, "readqc_files.txt")
|
jpayne@68
|
1909 newStatsFile = os.path.join(outputPath, "readqc_stats.txt")
|
jpayne@68
|
1910
|
jpayne@68
|
1911 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["files_file"], newFilesFile)
|
jpayne@68
|
1912 log.info("mv cmd: %s", cmd)
|
jpayne@68
|
1913 run_sh_command(cmd, True, log)
|
jpayne@68
|
1914
|
jpayne@68
|
1915 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["stats_file"], newStatsFile)
|
jpayne@68
|
1916 log.info("mv cmd: %s", cmd)
|
jpayne@68
|
1917 run_sh_command(cmd, True, log)
|
jpayne@68
|
1918
|
jpayne@68
|
1919 log.info("22_illumina_readqc_report_postprocess complete.")
|
jpayne@68
|
1920 status = "22_illumina_readqc_report_postprocess complete"
|
jpayne@68
|
1921 checkpoint_step_wrapper(status)
|
jpayne@68
|
1922
|
jpayne@68
|
1923 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1924 ## 23. Cleanup
|
jpayne@68
|
1925 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
jpayne@68
|
1926 if nextStepToDo == 23 or status == "22_illumina_readqc_report_postprocess complete":
|
jpayne@68
|
1927 status = do_cleanup_readqc(log)
|
jpayne@68
|
1928
|
jpayne@68
|
1929 if status == "23_cleanup_readqc failed":
|
jpayne@68
|
1930 bDone = True
|
jpayne@68
|
1931
|
jpayne@68
|
1932 if status == "23_cleanup_readqc complete":
|
jpayne@68
|
1933 status = "complete" ## FINAL COMPLETE!
|
jpayne@68
|
1934 bDone = True
|
jpayne@68
|
1935
|
jpayne@68
|
1936 ## don't cycle more than 10 times ...
|
jpayne@68
|
1937 if cycle > 10:
|
jpayne@68
|
1938 bDone = True
|
jpayne@68
|
1939
|
jpayne@68
|
1940 if status != "complete":
|
jpayne@68
|
1941 log.info("Status %s", status)
|
jpayne@68
|
1942 else:
|
jpayne@68
|
1943 log.info("\n\nCompleted %s: %s", SCRIPT_NAME, fastq)
|
jpayne@68
|
1944 checkpoint_step_wrapper("complete")
|
jpayne@68
|
1945 log.info("Pipeline processing Done.")
|
jpayne@68
|
1946 print("Pipeline processing Done.")
|
jpayne@68
|
1947
|
jpayne@68
|
1948 else:
|
jpayne@68
|
1949 log.info("Pipeline processing is already complete, skip.")
|
jpayne@68
|
1950 print("Pipeline processing is already complete, skip.")
|
jpayne@68
|
1951
|
jpayne@68
|
1952 if status == 'complete' and options.html:
|
jpayne@68
|
1953 do_html(outputPath, fastq)
|
jpayne@68
|
1954
|
jpayne@68
|
1955
|
jpayne@68
|
1956 exit(0)
|
jpayne@68
|
1957
|
jpayne@68
|
1958
|
jpayne@68
|
1959 ## EOF
|