comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pytools/readqc.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 """
5 stand alone read qc script based on jgi-rqc-pipeline/readqc/readqc.py v8.3.6
6
7 Command
8 $ readqc.py -f FASTQ.GZ -o OUT_DIR --skip-blast -html
9
10 Outputs
11 - normal QC outputs + index.html
12
13 Created: March 15, 2018
14
15 Shijie Yao (syao@lbl.gov)
16
17 Revision:
18
19 """
20
21 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
22 ## libraries to use
23 import os
24 import sys
25 import argparse
26 import datetime
27 import shutil
28
29 SRC_ROOT = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
30 sys.path.append(SRC_ROOT + "/lib") # common
31
32 from readqc_constants import RQCReadQcConfig, RQCReadQc, ReadqcStats
33 from common import get_logger, get_status, append_rqc_stats, append_rqc_file, set_colors, get_subsample_rate,run_command
34 from readqc_utils import *
35 #from readqc_utils import checkpoint_step_wrapper, fast_subsample_fastq_sequences, write_unique_20_mers, illumina_read_gc
36 from rqc_fastq import get_working_read_length, read_count
37 from readqc_report import *
38 from os_utility import make_dir
39 from html_utility import html_tag, html_th, html_tr, html_link
40 from rqc_utility import get_dict_obj, pipeline_val
41
42 VERSION = "1.0.0"
43 LOG_LEVEL = "DEBUG"
44 SCRIPT_NAME = __file__
45
46 PYDIR = os.path.abspath(os.path.dirname(__file__))
47 BBDIR = os.path.join(PYDIR, os.path.pardir)
48
49 color = {}
50 color = set_colors(color, True)
51
52 """
53 STEP1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
54
55 """
56 def do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log):
57 log.info("\n\n%sSTEP1 - Subsampling reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
58
59 statsFile = RQCReadQcConfig.CFG["stats_file"]
60
61 status = "1_illumina_readqc_subsampling in progress"
62 checkpoint_step_wrapper(status)
63 log.info("1_illumina_readqc_subsampling in progress.")
64
65 inputReadNum = 0
66 sampleRate = 0.0
67
68 if not skipSubsampling:
69 # sampleRate = RQCReadQc.ILLUMINA_SAMPLE_PCTENTAGE ## 0.01
70 inputReadNum = read_count(fastq) ## read count from the original fastq
71 assert inputReadNum > 0, "ERROR: invalid input fastq"
72 sampleRate = get_subsample_rate(inputReadNum)
73 log.info("Subsampling rate = %s", sampleRate)
74 else:
75 log.info("1_illumina_readqc_subsampling: skip subsampling. Use all the reads.")
76 sampleRate = 1.0
77
78 retCode = None
79 totalReadNum = 0
80 firstSubsampledFastqFileName = ""
81
82 sequnitFileName = os.path.basename(fastq)
83 sequnitFileName = sequnitFileName.replace(".fastq", "").replace(".gz", "")
84
85 firstSubsampledFastqFileName = sequnitFileName + ".s" + str(sampleRate) + ".fastq"
86
87 retCode, firstSubsampledFastqFileName, totalBaseCount, totalReadNum, subsampledReadNum, bIsPaired, readLength = fast_subsample_fastq_sequences(fastq, firstSubsampledFastqFileName, sampleRate, True, log)
88
89 append_rqc_stats(statsFile, "SUBSAMPLE_RATE", sampleRate, log)
90
91 if retCode in (RQCExitCodes.JGI_FAILURE, -2):
92 log.info("1_illumina_readqc_subsampling failed.")
93 status = "1_illumina_readqc_subsampling failed"
94 checkpoint_step_wrapper(status)
95
96 else:
97 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT, totalBaseCount, log)
98 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_COUNT, totalReadNum, log)
99
100 log.info("1_illumina_readqc_subsampling complete.")
101 status = "1_illumina_readqc_subsampling complete"
102 checkpoint_step_wrapper(status)
103
104
105 return status, firstSubsampledFastqFileName, totalReadNum, subsampledReadNum, bIsPaired, readLength
106
107
108 """
109 STEP2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
110
111 """
112 def do_write_unique_20_mers(fastq, totalReadCount, log):
113 log.info("\n\n%sSTEP2 - Sampling unique 25 mers <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
114
115 if totalReadCount >= RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ * 2: ## 25000
116 log.debug("read count total in step2 = %s", totalReadCount)
117 filesFile = RQCReadQcConfig.CFG["files_file"]
118 statsFile = RQCReadQcConfig.CFG["stats_file"]
119
120 status = "2_unique_mers_sampling in progress"
121 checkpoint_step_wrapper(status)
122 log.info(status)
123
124 retCode, newDataFile, newPngPlotFile, newHtmlPlotFile = write_unique_20_mers(fastq, log)
125
126 if retCode != RQCExitCodes.JGI_SUCCESS:
127 status = "2_unique_mers_sampling failed"
128 log.error(status)
129 checkpoint_step_wrapper(status)
130
131 else:
132 ## if no output files, skip this step.
133 if newDataFile is not None:
134 statsDict = {}
135
136 ## in readqc_report.py
137 ## 2014.07.23 read_level_mer_sampling is updated to process new output file format from bbcountunique
138 log.info("2_unique_mers_sampling: post-processing the bbcountunique output file.")
139 read_level_mer_sampling(statsDict, newDataFile, log)
140
141 for k, v in statsDict.items():
142 append_rqc_stats(statsFile, k, str(v), log)
143
144 ## outputs from bbcountunique
145 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_TEXT, newDataFile, log)
146 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_PLOT, newPngPlotFile, log)
147 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_D3_HTML_PLOT, newHtmlPlotFile, log)
148
149 log.info("2_unique_mers_sampling complete.")
150 status = "2_unique_mers_sampling complete"
151 checkpoint_step_wrapper(status)
152
153 else:
154 ## if num reads < RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ = 25000
155 ## just proceed to the next step
156 log.warning("2_unique_mers_sampling can't run it because the number of reads < %s.", RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ * 2)
157 status = "2_unique_mers_sampling complete"
158 checkpoint_step_wrapper(status)
159
160
161 return status
162
163
164 """
165 STEP3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
166
167 """
168 def do_illumina_read_gc(fastq, log):
169 log.info("\n\n%sSTEP3 - Making read GC histograms <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
170
171 filesFile = RQCReadQcConfig.CFG["files_file"]
172 statsFile = RQCReadQcConfig.CFG["stats_file"]
173
174 status = "3_illumina_read_gc in progress"
175 checkpoint_step_wrapper(status)
176 log.info("3_illumina_read_gc in progress.")
177
178 reformat_gchist_file, png_file, htmlFile, mean_val, stdev_val, med_val, mode_val = illumina_read_gc(fastq, log)
179
180 if not reformat_gchist_file:
181 log.error("3_illumina_read_gc failed.")
182 status = "3_illumina_read_gc failed"
183 checkpoint_step_wrapper(status)
184
185 else:
186 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MEAN, mean_val, log)
187 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_STD, stdev_val, log)
188 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MED, med_val, log)
189 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MODE, mode_val, log)
190
191 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_TEXT, reformat_gchist_file, log)
192 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_PLOT, png_file, log)
193 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_D3_HTML_PLOT, htmlFile, log)
194
195 log.info("3_illumina_read_gc complete.")
196 status = "3_illumina_read_gc complete"
197 checkpoint_step_wrapper(status)
198
199
200 return status
201
202
203 """
204 STEP4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
205
206 """
207 def do_read_quality_stats(fastq, log):
208 log.info("\n\n%sSTEP4 - Analyzing read quality <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
209
210 filesFile = RQCReadQcConfig.CFG["files_file"]
211 statsFile = RQCReadQcConfig.CFG["stats_file"]
212
213 status = "4_illumina_read_quality_stats in progress"
214 checkpoint_step_wrapper(status)
215 log.info("4_illumina_read_quality_stats in progress.")
216
217 readLength = 0
218 readLenR1 = 0
219 readLenR2 = 0
220 isPairedEnd = None
221
222 if not os.path.isfile(fastq):
223 log.error("4_illumina_read_quality_stats failed. Cannot find the input fastq file")
224 status = "4_illumina_read_quality_stats failed"
225 checkpoint_step_wrapper(status)
226 return status
227
228 ## First figure out if it's pair-ended or not
229 ## NOTE: ssize=10000 is recommended!
230 readLength, readLenR1, readLenR2, isPairedEnd = get_working_read_length(fastq, log)
231 log.info("Read length = %s, and is_pair_ended = %s", readLength, isPairedEnd)
232
233 if readLength == 0:
234 log.error("Failed to run get_working_read_length.")
235 status = "4_illumina_read_quality_stats failed"
236 checkpoint_step_wrapper(status)
237
238 else:
239 ## Pair-ended
240 r1_r2_baseposqual_png = None ## Average Base Position Quality Plot (*.qrpt.png)
241 r1_r2_baseposqual_html = None ## Average Base Position Quality D3 Plot (*.qrpt.html)
242 r1_r2_baseposqual_txt = None ## Read 1/2 Average Base Position Quality Text (*.qhist.txt)
243
244 r1_baseposqual_box_png = None ## Read 1 Average Base Position Quality Boxplot (*.r1.png)
245 r2_baseposqual_box_png = None ## Read 2 Average Base Position Quality Boxplot (*.r2.png)
246 r1_baseposqual_box_html = None ## Read 1 Average Base Position Quality D3 Boxplot (*.r1.html)
247 r2_baseposqual_box_html = None ## Read 2 Average Base Position Quality D3 Boxplot (*.r2.html)
248 r1_r2_baseposqual_box_txt = None ## Average Base Position Quality text
249
250 r1_cyclenbase_png = None ## Read 1 Percent N by Read Position (*.r1.fastq.base.stats.Npercent.png) --> Read 1 Cycle N Base Percent plot
251 r2_cyclenbase_png = None ## Read 2 Percent N by Read Position (*.r2.fastq.base.stats.Npercent.png) --> Read 2 Cycle N Base Percent plot
252
253 r1_cyclenbase_txt = None ## Read 1 Percent N by Read Position Text (*.r1.fastq.base.stats) --> Read 1 Cycle N Base Percent text
254 #r2_cyclenbase_txt = None ## Read 2 Percent N by Read Position Text (*.r2.fastq.base.stats) --> Read 2 Cycle N Base Percent text
255 r1_r2_cyclenbase_txt = None ## Merged Percent N by Read Position Text (*.r2.fastq.base.stats) --> Merged Cycle N Base Percent text
256
257 r1_cyclenbase_html = None
258 r2_cyclenbase_html = None
259
260 r1_nuclcompfreq_png = None ## Read 1 Nucleotide Composition Frequency Plot (*.r1.stats.png)
261 r2_nuclcompfreq_png = None ## Read 2 Nucleotide Composition Frequency Plot (*.r2.stats.png)
262 r1_nuclcompfreq_html = None
263 r2_nuclcompfreq_html = None
264
265 ## Single-ended
266 #se_baseposqual_txt = None ## Average Base Position Quality Text (*.qrpt)
267 #se_nuclcompfreq_png = None ## Cycle Nucleotide Composition (*.stats.png)
268
269 if isPairedEnd:
270 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_1, readLenR1, log)
271 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_2, readLenR2, log)
272
273 else:
274 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_1, readLength, log)
275
276 ## Average Base Position Quality Plot/Text using qhist.txt
277 r1_r2_baseposqual_txt, r1_r2_baseposqual_png, r1_r2_baseposqual_html = gen_average_base_position_quality_plot(fastq, isPairedEnd, log) ## .reformat.qhist.txt
278
279 log.debug("Outputs: %s %s %s", r1_r2_baseposqual_png, r1_r2_baseposqual_html, r1_r2_baseposqual_txt)
280
281 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_PLOT_MERGED, r1_r2_baseposqual_png, log)
282 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_PLOT_MERGED_D3_HTML_PLOT, r1_r2_baseposqual_html, log)
283 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_1, r1_r2_baseposqual_txt, log) ## for backward compatibility
284 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_2, r1_r2_baseposqual_txt, log) ## for backward compatibility
285 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_MERGED, r1_r2_baseposqual_txt, log)
286
287 ## Average Base Position Quality Plot/Text using bqhist.txt
288 r1_r2_baseposqual_box_txt, r1_baseposqual_box_png, r2_baseposqual_box_png, r1_baseposqual_box_html, r2_baseposqual_box_html = gen_average_base_position_quality_boxplot(fastq, log)
289
290 log.debug("Read qual outputs: %s %s %s %s %s", r1_r2_baseposqual_box_txt, r1_baseposqual_box_png, r1_baseposqual_box_html, r2_baseposqual_box_png, r2_baseposqual_box_html)
291
292 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_1, r1_baseposqual_box_png, log)
293 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_D3_HTML_BOXPLOT_1, r1_baseposqual_box_html, log)
294 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_2, r2_baseposqual_box_png, log)
295 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_D3_HTML_BOXPLOT_2, r2_baseposqual_box_html, log)
296 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_TEXT, r1_r2_baseposqual_box_txt, log)
297
298 ## ----------------------------------------------------------------------------------------------------
299 ## compute Q20 of the two reads
300 q20Read1 = None
301 q20Read2 = None
302
303 ## using bqhist.txt
304 if r1_r2_baseposqual_box_txt:
305 q20Read1 = q20_score_new(r1_r2_baseposqual_box_txt, 1, log)
306 if isPairedEnd:
307 q20Read2 = q20_score_new(r1_r2_baseposqual_box_txt, 2, log)
308
309 log.debug("q20 for read 1 = %s", q20Read1)
310 log.debug("q20 for read 2 = %s", q20Read2)
311
312 if q20Read1 is not None:
313 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_Q20_READ1, q20Read1, log)
314 else:
315 log.error("Failed to get q20 read 1 from %s", r1_r2_baseposqual_box_txt)
316 status = "4_illumina_read_quality_stats failed"
317 checkpoint_step_wrapper(status)
318 return status
319
320 if isPairedEnd:
321 if q20Read2 is not None:
322 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_Q20_READ2, q20Read2, log)
323 else:
324 log.error("Failed to get q20 read 2 from %s", r1_r2_baseposqual_box_txt)
325 status = "4_illumina_read_quality_stats failed"
326 checkpoint_step_wrapper(status)
327 return status
328
329 r1_r2_cyclenbase_txt, r1_nuclcompfreq_png, r1_nuclcompfreq_html, r2_nuclcompfreq_png, r2_nuclcompfreq_html = gen_cycle_nucleotide_composition_plot(fastq, readLength, isPairedEnd, log)
330
331 log.debug("gen_cycle_nucleotide_composition_plot() ==> %s %s %s %s %s", r1_cyclenbase_txt, r1_nuclcompfreq_png, r1_nuclcompfreq_html, r2_nuclcompfreq_png, r2_nuclcompfreq_html)
332
333 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_TEXT_1, r1_r2_cyclenbase_txt, log)
334 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_TEXT_2, r1_r2_cyclenbase_txt, log) # reformat.sh generates a single merged output file.
335
336 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_PLOT_1, r1_nuclcompfreq_png, log)
337 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_1, r1_nuclcompfreq_html, log)
338
339 if r2_nuclcompfreq_png:
340 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_PLOT_2, r2_nuclcompfreq_png, log)
341 if r2_nuclcompfreq_html:
342 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_2, r2_nuclcompfreq_html, log)
343
344 ### ---------------------------------------------------------------------------------------------------
345 ## using bhist.txt
346 r1_cyclenbase_txt, r1_cyclenbase_png, r1_cyclenbase_html, r2_cyclenbase_png, r2_cyclenbase_html = gen_cycle_n_base_percent_plot(fastq, readLength, isPairedEnd, log)
347
348 log.debug("Outputs: %s %s %s", r1_cyclenbase_txt, r1_cyclenbase_png, r1_cyclenbase_html)
349
350 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_TEXT_1, r1_cyclenbase_txt, log)
351 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_TEXT_2, r1_cyclenbase_txt, log)
352
353 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_PLOT_1, r1_cyclenbase_png, log)
354 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_D3_HTML_PLOT_1, r1_cyclenbase_html, log)
355
356 if r2_cyclenbase_png:
357 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_PLOT_2, r2_cyclenbase_png, log)
358 if r2_cyclenbase_html:
359 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_D3_HTML_PLOT_2, r2_cyclenbase_html, log)
360
361 log.info("4_illumina_read_quality_stats complete.")
362 status = "4_illumina_read_quality_stats complete"
363 checkpoint_step_wrapper(status)
364
365
366 return status
367
368
369
370 """
371 STEP5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
372
373 """
374 def do_write_base_quality_stats(fastq, log):
375 log.info("\n\n%sSTEP5 - Calculating base quality statistics for reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
376
377 filesFile = RQCReadQcConfig.CFG["files_file"]
378 statsFile = RQCReadQcConfig.CFG["stats_file"]
379
380 status = "5_illumina_read_quality_stats in progress"
381 checkpoint_step_wrapper(status)
382 log.info("5_illumina_read_quality_stats in progress.")
383
384 reformatObqhistFile = write_avg_base_quality_stats(fastq, log) ## *.reformat.obqhist.txt
385
386 if not reformatObqhistFile:
387 log.error("5_illumina_read_quality_stats failed.")
388 status = "5_illumina_read_quality_stats failed"
389 checkpoint_step_wrapper(status)
390
391 else:
392 ## Generate qual scores and plots of read level QC
393 statsDict = {}
394
395 retCode = base_level_qual_stats(statsDict, reformatObqhistFile, log)
396
397 if retCode != RQCExitCodes.JGI_SUCCESS:
398 log.error("5_illumina_read_quality_stats failed.")
399 status = "5_illumina_read_quality_stats failed"
400 checkpoint_step_wrapper(status)
401
402 else:
403 for k, v in statsDict.items():
404 append_rqc_stats(statsFile, k, str(v), log)
405
406 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_QUALITY_STATS, reformatObqhistFile, log)
407
408 log.info("5_illumina_read_quality_stats complete.")
409 status = "5_illumina_read_quality_stats complete"
410 checkpoint_step_wrapper(status)
411
412
413 return status
414
415
416
417 """
418 STEP6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
419
420 """
421 def do_illumina_count_q_score(fastq, log):
422 log.info("\n\n%sSTEP6 - Generating quality score histogram <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
423
424 filesFile = RQCReadQcConfig.CFG["files_file"]
425 statsFile = RQCReadQcConfig.CFG["stats_file"]
426
427 status = "6_illumina_count_q_score in progress"
428 checkpoint_step_wrapper(status)
429 log.info("6_illumina_count_q_score in progress.")
430
431 qhistTxtFile, qhistPngFile, qhistHtmlPlotFile = illumina_count_q_score(fastq, log) ## *.obqhist.txt
432
433 if not qhistTxtFile:
434 log.error("6_illumina_count_q_score failed.")
435 status = "6_illumina_count_q_score failed"
436 checkpoint_step_wrapper(status)
437
438 else:
439 ## save qscores in statsFile
440 qscore = {}
441 read_level_qual_stats(qscore, qhistTxtFile, log)
442
443 for k, v in qscore.items():
444 append_rqc_stats(statsFile, k, str(v), log)
445
446 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_TEXT, qhistTxtFile, log)
447 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_PLOT, qhistPngFile, log)
448 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_D3_HTML_PLOT, qhistHtmlPlotFile, log)
449
450 log.info("6_illumina_count_q_score complete.")
451 status = "6_illumina_count_q_score complete"
452 checkpoint_step_wrapper(status)
453
454
455 return status
456
457
458
459 """
460 STEP7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
461
462 """
463 ## 20140903 removed
464 ##def do_illumina_calculate_average_quality(fastq, log):
465
466
467
468 """
469 STEP8 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
470
471 """
472 def do_illumina_find_common_motifs(fastq, log):
473 log.info("\n\n%sSTEP8 - Locating N stutter motifs in sequence reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
474
475 filesFile = RQCReadQcConfig.CFG["files_file"]
476 statsFile = RQCReadQcConfig.CFG["stats_file"]
477
478 status = "8_illumina_find_common_motifs in progress"
479 checkpoint_step_wrapper(status)
480 log.info(status)
481
482 retCode, statDataFile = illumina_find_common_motifs(fastq, log)
483
484 log.info("nstutter statDataFile name = %s", statDataFile)
485
486 if retCode != RQCExitCodes.JGI_SUCCESS:
487 log.error("8_illumina_find_common_motifs failed.")
488 status = "8_illumina_find_common_motifs failed"
489 checkpoint_step_wrapper(status)
490
491 else:
492 ## read_level_stutter
493 ## ex)
494 ##688 N----------------------------------------------------------------------------------------------------------------------------
495 ##-------------------------
496 ##346 NNNNNNNNNNNNN----------------------------------------------------------------------------------------------------------------
497 ##-------------------------
498 ##53924 ------------N----------------------------------------------------------------------------------------------------------------
499 ##-------------------------
500 ##sum pct patterNs past 0.1 == 15.9245930330268 ( 54958 / 345114 * 100 )
501
502 with open(statDataFile, "r") as stutFH:
503 lines = stutFH.readlines()
504
505 ## if no motifs are detected the file is empty
506 if not lines:
507 log.warning("The *.nstutter.stat file is not available in function read_level_stutter(). The function still returns JGI_SUCCESS.")
508
509 else:
510 assert lines[-1].find("patterNs") != -1
511 t = lines[-1].split()
512 ## ["sum", "pct", "patterNs", "past", "0.1", "==", "15.9245930330268", "(", "54958", "/", "345114", "*", "100", ")"]
513 percent = "%.2f" % float(t[6])
514
515 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_N_FREQUENCE, percent, log)
516 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_N_PATTERN, str("".join(lines[:-1])), log)
517
518 ## NOTE: ???
519 append_rqc_file(filesFile, "find_common_motifs.dataFile", statDataFile, log)
520
521 log.info("8_illumina_find_common_motifs complete.")
522 status = "8_illumina_find_common_motifs complete"
523 checkpoint_step_wrapper(status)
524
525
526 return status
527
528
529 """
530 STEP11 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
531
532 """
533 def do_illumina_detect_read_contam(fastq, bpToCut, log):
534 log.info("\n\n%sSTEP11 - Detect read contam <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
535
536 filesFile = RQCReadQcConfig.CFG["files_file"]
537 statsFile = RQCReadQcConfig.CFG["stats_file"]
538
539 status = "11_illumina_detect_read_contam in progress"
540 checkpoint_step_wrapper(status)
541 log.info("11_illumina_detect_read_contam in progress.")
542
543
544 #########
545 ## seal
546 #########
547 retCode2, outFileDict2, ratioResultDict2, contamStatDict = illumina_detect_read_contam3(fastq, bpToCut, log) ## seal version
548
549 if retCode2 != RQCExitCodes.JGI_SUCCESS:
550 log.error("11_illumina_detect_read_contam seal version failed.")
551 status = "11_illumina_detect_read_contam failed"
552 checkpoint_step_wrapper(status)
553
554 else:
555 for k, v in outFileDict2.items():
556 append_rqc_file(filesFile, k + " seal", str(v), log)
557 append_rqc_file(filesFile, k, str(v), log)
558
559 for k, v in ratioResultDict2.items():
560 append_rqc_stats(statsFile, k + " seal", str(v), log)
561 append_rqc_stats(statsFile, k, str(v), log)
562
563 ## contamination stat
564 for k, v in contamStatDict.items():
565 append_rqc_stats(statsFile, k, str(v), log)
566
567 log.info("11_illumina_detect_read_contam seal version complete.")
568 status = "11_illumina_detect_read_contam complete"
569 checkpoint_step_wrapper(status)
570
571
572 return status
573
574
575
576 """
577 STEP13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
578
579 """
580 ## Removed!!
581 ##def do_illumina_read_megablast(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log, blastDbPath=None):
582
583
584
585 """
586 New STEP13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
587
588 """
589 def do_illumina_subsampling_read_blastn(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log):
590 log.info("\n\n%sSTEP13 - Run subsampling for Blast search <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
591
592 subsampeldFastqFile = None
593 totalReadNum = 0
594 readNumToReturn = 0
595
596 status = "13_illumina_subsampling_read_megablast in progress"
597 checkpoint_step_wrapper(status)
598 log.info("13_illumina_subsampling_read_megablast in progress.")
599
600 if subsampledReadNum == 0:
601 cmd = " ".join(["grep", "-c", "'^+'", firstSubsampledFastqFileName])
602
603 stdOut, _, exitCode = run_command(cmd, True, log)
604
605 if exitCode != 0:
606 log.error("Failed to run grep cmd")
607 return RQCExitCodes.JGI_FAILURE, None, None, None
608
609 else:
610 readNum = int(stdOut)
611
612 else:
613 readNum = subsampledReadNum
614
615 log.info("Subsampled read number = %d", readNum)
616
617 sampl_per = RQCReadQc.ILLUMINA_SAMPLE_PCTENTAGE ## 0.01
618 max_count = RQCReadQc.ILLUMINA_SAMPLE_COUNT ## 50000
619
620 ## TODO
621 ## Use "samplereadstarget" option in reformat.sh
622
623 if skipSubsampling:
624 log.debug("No subsampling for megablast. Use fastq file, %s (readnum = %s) as query for megablast.", firstSubsampledFastqFileName, readNum)
625 subsampeldFastqFile = firstSubsampledFastqFileName
626 readNumToReturn = readNum
627
628 else:
629 if readNum > max_count:
630 log.debug("Run the 2nd subsampling for running megablast.")
631 secondSubsamplingRate = float(max_count) / readNum
632 log.debug("SecondSubsamplingRate=%s, max_count=%s, readNum=%s", secondSubsamplingRate, max_count, readNum)
633 log.info("Second subsampling of Reads after Percent Subsampling reads = %s with new percent subsampling %f.", readNum, secondSubsamplingRate)
634
635 secondSubsampledFastqFile = ""
636 # dataFile = ""
637
638 sequnitFileName = os.path.basename(firstSubsampledFastqFileName)
639 sequnitFileName = sequnitFileName.replace(".fastq", "").replace(".gz", "")
640
641 secondSubsampledFastqFile = sequnitFileName + ".s" + str(sampl_per) + ".s" + str(secondSubsamplingRate) + ".n" + str(max_count) + ".fastq"
642
643 ## ex) fq_sub_sample.pl -f .../7601.1.77813.CTTGTA.s0.01.fastq -o .../7601.1.77813.CTTGTA.s0.01.stats -r 0.0588142575171 > .../7601.1.77813.CTTGTA.s0.01.s0.01.s0.0588142575171.n50000.fastq
644 ## ex) reformat.sh
645 retCode, secondSubsampledFastqFile, totalBaseCount, totalReadNum, subsampledReadNum, _, _ = fast_subsample_fastq_sequences(firstSubsampledFastqFileName, secondSubsampledFastqFile, secondSubsamplingRate, False, log)
646
647 if retCode != RQCExitCodes.JGI_SUCCESS:
648 log.error("Second subsampling failed.")
649 return RQCExitCodes.JGI_FAILURE, None, None, None
650
651 else:
652 log.info("Second subsampling complete.")
653 log.info("Second Subsampling Total Base Count = %s.", totalBaseCount)
654 log.info("Second Subsampling Total Reads = %s.", totalReadNum)
655 log.info("Second Subsampling Sampled Reads = %s.", subsampledReadNum)
656
657 if subsampledReadNum == 0:
658 log.warning("Too small first subsampled fastq file. Skip the 2nd sampling.")
659 secondSubsampledFastqFile = firstSubsampledFastqFileName
660
661 subsampeldFastqFile = secondSubsampledFastqFile
662 readNumToReturn = subsampledReadNum
663
664 else:
665 log.debug("The readNum is smaller than max_count=%s. The 2nd sampling skipped.", str(max_count))
666 subsampeldFastqFile = firstSubsampledFastqFileName
667 totalReadNum = readNum
668 readNumToReturn = readNum
669
670 log.info("13_illumina_subsampling_read_megablast complete.")
671 status = "13_illumina_subsampling_read_megablast complete"
672 checkpoint_step_wrapper(status)
673
674
675 return status, subsampeldFastqFile, totalReadNum, readNumToReturn
676
677
678
679 ##"""
680 ##STEP14 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
681 ##
682 ##"""
683 ##def do_illumina_read_blastn_refseq_microbial(subsampeldFastqFile, subsampledReadNum, log, blastDbPath=None):
684 ## 12212015 sulsj REMOVED!
685
686
687 """
688 New STEP14 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
689
690 """
691 def do_illumina_read_blastn_refseq_archaea(subsampeldFastqFile, subsampledReadNum, log):
692 log.info("\n\n%sSTEP14 - Run read blastn against refseq archaea <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
693
694 statsFile = RQCReadQcConfig.CFG["stats_file"]
695 retCode = None
696
697 log.info("14_illumina_read_blastn_refseq_archaea in progress.")
698 status = "14_illumina_read_blastn_refseq_archaea in progress"
699 checkpoint_step_wrapper(status)
700
701 retCode = illumina_read_blastn_refseq_archaea(subsampeldFastqFile, log)
702
703 if retCode == RQCExitCodes.JGI_FAILURE:
704 log.error("14_illumina_read_blastn_refseq_archaea failed.")
705 status = "14_illumina_read_blastn_refseq_archaea failed"
706
707 elif retCode == -143: ## timeout
708 log.warning("14_illumina_read_blastn_refseq_archaea timeout.")
709 status = "14_illumina_read_blastn_refseq_archaea complete"
710
711 else:
712 ## read number used in blast search?
713 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READS_NUMBER, subsampledReadNum, log)
714
715 ret2 = read_megablast_hits("refseq.archaea", log)
716
717 if ret2 != RQCExitCodes.JGI_SUCCESS:
718 log.error("Errors in read_megablast_hits() of refseq.microbial")
719 log.error("14_illumina_read_blastn_refseq_archaea reporting failed.")
720 status = "14_illumina_read_blastn_refseq_archaea failed"
721
722 else:
723 log.info("14_illumina_read_blastn_refseq_archaea complete.")
724 status = "14_illumina_read_blastn_refseq_archaea complete"
725
726
727 return status
728
729
730 """
731 STEP15 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
732
733 """
734 def do_illumina_read_blastn_refseq_bacteria(subsampeldFastqFile, log):
735 log.info("\n\n%sSTEP15 - Run read blastn against refseq bacteria <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
736
737 #statsFile = RQCReadQcConfig.CFG["stats_file"]
738 #retCode = None
739
740 log.info("15_illumina_read_blastn_refseq_bacteria in progress.")
741 status = "15_illumina_read_blastn_refseq_bacteria in progress"
742 checkpoint_step_wrapper(status)
743
744 retCode = illumina_read_blastn_refseq_bacteria(subsampeldFastqFile, log)
745
746 if retCode == RQCExitCodes.JGI_FAILURE:
747 log.error("15_illumina_read_blastn_refseq_bacteria failed.")
748 status = "15_illumina_read_blastn_refseq_bacteria failed"
749
750 elif retCode == -143: ## timeout
751 log.warning("15_illumina_read_blastn_refseq_bacteria timeout.")
752 status = "15_illumina_read_blastn_refseq_bacteria complete"
753
754 else:
755 ret2 = read_megablast_hits("refseq.bacteria", log)
756
757 if ret2 != RQCExitCodes.JGI_SUCCESS:
758 log.error("Errors in read_megablast_hits() of refseq.microbial")
759 log.error("15_illumina_read_blastn_refseq_bacteria reporting failed.")
760 status = "15_illumina_read_blastn_refseq_bacteria failed"
761
762 else:
763 log.info("15_illumina_read_blastn_refseq_bacteria complete.")
764 status = "15_illumina_read_blastn_refseq_bacteria complete"
765
766
767 return status
768
769
770
771 """
772 STEP16 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
773
774 """
775 def do_illumina_read_blastn_nt(subsampeldFastqFile, log):
776 log.info("\n\n%sSTEP16 - Run read blastn against nt <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
777
778 retCode = None
779
780 log.info("16_illumina_read_blastn_nt in progress.")
781 status = "16_illumina_read_blastn_nt in progress"
782 checkpoint_step_wrapper(status)
783
784 retCode = illumina_read_blastn_nt(subsampeldFastqFile, log)
785
786 if retCode == RQCExitCodes.JGI_FAILURE:
787 log.error("16_illumina_read_blastn_nt failed.")
788 status = "16_illumina_read_blastn_nt failed"
789
790 elif retCode == -143: ## timeout
791 log.warning("16_illumina_read_blastn_nt timeout.")
792 status = "16_illumina_read_blastn_nt complete"
793
794 else:
795 ret2 = read_megablast_hits("nt", log)
796
797 if ret2 != RQCExitCodes.JGI_SUCCESS:
798 log.error("Errors in read_megablast_hits() of nt")
799 log.error("16_illumina_read_blastn_nt reporting failed.")
800 status = "16_illumina_read_blastn_nt failed"
801
802 else:
803 log.info("16_illumina_read_blastn_nt complete.")
804 status = "16_illumina_read_blastn_nt complete"
805
806
807 return status
808
809
810
811 """
812 STEP17 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
813
814 do_illumina_multiplex_statistics
815
816 Demultiplexing analysis for pooled lib
817
818 """
819 def do_illumina_multiplex_statistics(fastq, log, isMultiplexed=None):
820 log.info("\n\n%sSTEP17 - Run Multiplex statistics analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
821
822 filesFile = RQCReadQcConfig.CFG["files_file"]
823
824 log.info("17_multiplex_statistics in progress.")
825 status = "17_multiplex_statistics in progress"
826 checkpoint_step_wrapper(status)
827
828 log.debug("fastq file: %s", fastq)
829
830 retCode, demultiplexStatsFile, detectionPngPlotFile, detectionHtmlPlotFile = illumina_generate_index_sequence_detection_plot(fastq, log, isMultiplexed=isMultiplexed)
831
832 if retCode != RQCExitCodes.JGI_SUCCESS:
833 log.error("17_multiplex_statistics failed.")
834 status = "17_multiplex_statistics failed"
835 checkpoint_step_wrapper(status)
836
837 else:
838 log.info("17_multiplex_statistics complete.")
839 status = "17_multiplex_statistics complete"
840 checkpoint_step_wrapper(status)
841
842 if detectionPngPlotFile is not None:
843 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS_PLOT, detectionPngPlotFile, log)
844
845 if detectionHtmlPlotFile is not None:
846 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS_D3_HTML_PLOT, detectionHtmlPlotFile, log)
847
848 if demultiplexStatsFile is not None:
849 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS, demultiplexStatsFile, log)
850
851
852 return status
853
854
855
856 """
857 STEP18 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
858
859 do_end_of_read_illumina_adapter_check
860
861 """
862 def do_end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log):
863 log.info("\n\n%sSTEP18 - Run end_of_read_illumina_adapter_check analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
864
865 filesFile = RQCReadQcConfig.CFG["files_file"]
866 plotFile = None
867 dataFile = None
868
869 log.info("18_end_of_read_illumina_adapter_check in progress.")
870 status = "18_end_of_read_illumina_adapter_check in progress"
871 checkpoint_step_wrapper(status)
872
873 log.debug("sampled fastq file: %s", firstSubsampledFastqFileName)
874
875 retCode, dataFile, plotFile, htmlFile = end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log)
876
877 if retCode != RQCExitCodes.JGI_SUCCESS:
878 log.error("18_end_of_read_illumina_adapter_check failed.")
879 status = "18_end_of_read_illumina_adapter_check failed"
880 checkpoint_step_wrapper(status)
881
882 else:
883 log.info("18_end_of_read_illumina_adapter_check complete.")
884 status = "18_end_of_read_illumina_adapter_check complete"
885 checkpoint_step_wrapper(status)
886
887 if plotFile is not None:
888 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_PLOT, plotFile, log)
889 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_D3_HTML_PLOT, htmlFile, log)
890
891 if dataFile is not None:
892 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_DATA, dataFile, log)
893
894
895 return status
896
897
898
899 """
900 STEP19 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
901
902 do_insert_size_analysis
903
904 """
905 def do_insert_size_analysis(fastq, log):
906 log.info("\n\n%sSTEP19 - Run insert size analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
907
908 filesFile = RQCReadQcConfig.CFG["files_file"]
909 statsFile = RQCReadQcConfig.CFG["stats_file"]
910
911 plotFile = None
912 dataFile = None
913
914 log.info("19_insert_size_analysis in progress.")
915 status = "19_insert_size_analysis in progress"
916 checkpoint_step_wrapper(status)
917
918 log.debug("fastq file used: %s", fastq)
919
920 retCode, dataFile, plotFile, htmlFile, statsDict = insert_size_analysis(fastq, log) ## by bbmerge.sh
921
922 if retCode != RQCExitCodes.JGI_SUCCESS:
923 log.error("19_insert_size_analysis failed.")
924 status = "19_insert_size_analysis failed"
925 checkpoint_step_wrapper(status)
926
927 else:
928 if plotFile is not None:
929 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_PLOT, plotFile, log)
930
931 if dataFile is not None:
932 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_DATA, dataFile, log)
933
934 if htmlFile is not None:
935 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_D3_HTML_PLOT, htmlFile, log)
936
937 if statsDict:
938 try:
939 ## --------------------------------------------------------------------------------------------------------
940 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AVG_INSERT, statsDict["avg_insert"], log)
941 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_STD_INSERT, statsDict["std_insert"], log)
942 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_MODE_INSERT, statsDict["mode_insert"], log)
943 ## --------------------------------------------------------------------------------------------------------
944
945 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOTAL_TIME, statsDict["total_time"], log)
946 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NUM_READS, statsDict["num_reads"], log)
947 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_JOINED_NUM, statsDict["joined_num"], log)
948 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_JOINED_PERC, statsDict["joined_perc"], log)
949
950 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AMBIGUOUS_NUM, statsDict["ambiguous_num"], log)
951 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AMBIGUOUS_PERC, statsDict["ambiguous_perc"], log)
952 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NO_SOLUTION_NUM, statsDict["no_solution_num"], log)
953 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NO_SOLUTION_PERC, statsDict["no_solution_perc"], log)
954 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOO_SHORT_NUM, statsDict["too_short_num"], log)
955 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOO_SHORT_PERC, statsDict["too_short_perc"], log)
956
957 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_INSERT_RANGE_START, statsDict["insert_range_start"], log)
958 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_INSERT_RANGE_END, statsDict["insert_range_end"], log)
959
960 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_90TH_PERC, statsDict["perc_90th"], log)
961 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_50TH_PERC, statsDict["perc_50th"], log)
962 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_10TH_PERC, statsDict["perc_10th"], log)
963 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_75TH_PERC, statsDict["perc_75th"], log)
964 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_25TH_PERC, statsDict["perc_25th"], log)
965
966 except KeyError:
967 log.error("19_insert_size_analysis failed (KeyError).")
968 status = "19_insert_size_analysis failed"
969 checkpoint_step_wrapper(status)
970 return status
971
972 log.info("19_insert_size_analysis complete.")
973 status = "19_insert_size_analysis complete"
974 checkpoint_step_wrapper(status)
975
976
977 return status
978
979
980
981
982 """
983 STEP21 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
984
985 do_sketch_vs_nt_refseq_silva
986
987 """
988 def do_sketch_vs_nt_refseq_silva(fasta, log):
989 log.info("\n\n%sSTEP21 - Run sketch vs nt, refseq, silva <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
990
991 status = "21_sketch_vs_nt_refseq_silva in progress"
992 checkpoint_step_wrapper(status)
993
994 sketchOutDir = "sketch"
995 sketchOutPath = os.path.join(outputPath, sketchOutDir)
996 make_dir(sketchOutPath)
997 change_mod(sketchOutPath, "0755")
998
999 seqUnitName = os.path.basename(fasta)
1000 seqUnitName = file_name_trim(seqUnitName)
1001
1002 filesFile = RQCReadQcConfig.CFG["files_file"]
1003
1004 comOptions = "ow=t colors=f printtaxa=t depth depth2 unique2 merge"
1005
1006 ## NT ##########################
1007 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_nt.txt")
1008 sendSketchShCmd = os.path.join(BBDIR, 'sendsketch.sh')
1009 cmd = "%s in=%s out=%s %s nt" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
1010 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
1011 if exitCode != 0:
1012 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
1013 status = "21_sketch_vs_nt_refseq_silva failed"
1014 checkpoint_step_wrapper(status)
1015 return status
1016
1017 append_rqc_file(filesFile, "sketch_vs_nt_output", sketchOutFile, log)
1018
1019 ## Refseq ##########################
1020 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_refseq.txt")
1021 cmd = "%s in=%s out=%s %s refseq" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
1022 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
1023 if exitCode != 0:
1024 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
1025 status = "21_sketch_vs_refseq failed"
1026 checkpoint_step_wrapper(status)
1027 return status
1028
1029 append_rqc_file(filesFile, "sketch_vs_refseq_output", sketchOutFile, log)
1030
1031 ## Silva ##########################
1032 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_silva.txt")
1033 cmd = "%s in=%s out=%s %s silva" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
1034 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
1035 if exitCode != 0:
1036 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
1037 status = "21_sketch_vs_silva failed"
1038 checkpoint_step_wrapper(status)
1039 return status
1040
1041 append_rqc_file(filesFile, "sketch_vs_silva_output", sketchOutFile, log)
1042
1043 log.info("21_sketch_vs_nt_refseq_silva complete.")
1044 status = "21_sketch_vs_nt_refseq_silva complete"
1045 checkpoint_step_wrapper(status)
1046
1047 return status
1048
1049
1050
1051 """
1052 STEP22 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1053
1054 do_illumina_read_level_report_postprocessing
1055
1056 """
1057 ## Removed!!
1058 #def do_illumina_read_level_report(fastq, firstSubsampledFastqFileName, log):
1059
1060
1061 """
1062 STEP23 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1063
1064 do_cleanup_readqc
1065
1066 """
1067 def do_cleanup_readqc(log):
1068 log.info("\n\n%sSTEP23 - Cleanup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1069
1070 status = "23_cleanup_readqc in progress"
1071 checkpoint_step_wrapper(status)
1072
1073 ## Get option
1074 skipCleanup = RQCReadQcConfig.CFG["skip_cleanup"]
1075
1076 if not skipCleanup:
1077 retCode = cleanup_readqc(log)
1078 else:
1079 log.warning("File cleaning is skipped.")
1080 retCode = RQCExitCodes.JGI_SUCCESS
1081
1082 if retCode != RQCExitCodes.JGI_SUCCESS:
1083 log.error("23_cleanup_readqc failed.")
1084 status = "23_cleanup_readqc failed"
1085 checkpoint_step_wrapper(status)
1086
1087 else:
1088 log.info("23_cleanup_readqc complete.")
1089 status = "23_cleanup_readqc complete"
1090 checkpoint_step_wrapper(status)
1091
1092
1093 return status
1094
1095
1096 def stetch_section_note(table_header):
1097 overview = [
1098 'BBTools sketch.sh uses a technique called MinHash to rapidly compare large sequences. ',
1099 'The result is similar to BLAST, a list of hits from a query sequence to various reference sequences, ',
1100 'sorted by similarity but the mechanisms are very different. ',
1101 'For more information, see <a href="http://bbtools.jgi.doe.gov/" target="_blank">http://bbtools.jgi.doe.gov</a>'
1102 ]
1103
1104 legend = {
1105 'WKID' : 'Weighted Kmer IDentity, which is the kmer identity compensating for differences in size. So, comparing human chr1 to the full human genome would yield 100% WKID but approximately 10% KID.',
1106 'KID' : 'Kmer IDentity, equal to matches/length; this is the fraction of shared kmers.',
1107 'ANI' : 'Average Nucleotide Identity, derived from WKID and kmer length.',
1108 'Complt' : 'Genome completeness (percent of the reference represented in the query). Derived from WKID and KID.',
1109 'Contam' : 'Contamination (percent of the query that does not match this reference, but matches some other reference).',
1110 'Depth' : 'Per-kmer depth in sketches, indicating how many times that kmer was seen in the sequence.',
1111 'Depth2' : 'Repeat-compensated depth',
1112 'Matches' : 'The number of shared kmers between query and ref.',
1113 'Unique' : 'The number of shared kmers between query and ref, and no other ref.',
1114 'Unique2' : '??',
1115 'Depth2' : 'Repeat-compensated depth',
1116 'noHit' : 'Number of kmers that did not hit any reference sequence. Though constant for a query, it will be reported differently for different references based on the relative size of the reference and query (if the reference is bigger than the query, it will report all of them).',
1117 'TaxID' : 'NCBI taxonomic id, when available.',
1118 'gSize' : 'Estimate of genomic size (number of unique kmers in the genome). This is based on the smallest hash value in the list. This is affected by blacklists or whitelists, and by using an assembly versus raw reads.',
1119 'gSeqs' : 'Number of sequences used in the sketch.',
1120 'taxName' : 'NCBI\'s name for that taxID. If there is no taxID, the sequence name will be used.',
1121 }
1122
1123 html = ''
1124 for name in table_header:
1125 if name != 'taxonomy':
1126 html += html_tag('li', '%s: %s' % (html_tag('span', name, {'class': 'notice'}), html_tag('span', legend.get(name), {'class': 'small-italic'})))
1127 html = html_tag('span', ''.join(overview), {'class': 'notice'}) + '<br /><br />Column Legend<br />' + html_tag('ul', html)
1128
1129 return html
1130
1131
1132 def sketch_table(fname):
1133 # print('DEBUG : %s' % fname)
1134
1135 html = ''
1136
1137 if os.path.isfile(fname):
1138 title = None
1139 header = None
1140 data = []
1141
1142 with open(fname, 'r') as fh:
1143 for line in fh:
1144 line = line.strip()
1145 if line == '':
1146 continue
1147 if line.startswith('Query:'):
1148 title = line
1149 elif line.startswith('WKID'):
1150 header = line.split('\t')
1151 else:
1152 data.append(line)
1153 if title and header and data:
1154 html += html_th(header)
1155 for line in data:
1156 row = line.split('\t')
1157 html += html_tr(row)
1158
1159 # html = html_tag('p', title) + html_tag('table', html, attrs={'class': 'data'})
1160 html = html_tag('table', html, attrs={'class': 'data'})
1161 return html, header
1162
1163 def do_html_contam_art_first_n_pb_tr(stats, files, odir, filepath_prefix):
1164 temp = os.path.join(PYDIR, 'template/readqc_artifacts.html')
1165 html = ''
1166 with open(temp, 'r') as fh:
1167 html = fh.read()
1168
1169 ## the optional artifact contam
1170 artifact_tr = ''
1171 artifact_type = '50'
1172 artifact_val = pipeline_val('illumina read percent contamination artifact 50bp seal', {'type': 'raw'}, stats, files, filepath_prefix)
1173 artifact_file = pipeline_val('artifact_50bp.seal.stats seal', {'type': 'file'}, stats, files, filepath_prefix)
1174 if not artifact_file:
1175 artifact_file = pipeline_val('artifact_20bp.seal.stats seal', {'type': 'file'}, stats, files, filepath_prefix)
1176 artifact_type = '20'
1177 artifact_val = pipeline_val('illumina read percent contamination artifact 20bp seal', {'type': 'raw'}, stats, files, filepath_prefix)
1178 if artifact_file:
1179 html = html.replace('[_CONTAM-ART-FIRST-BP_]', artifact_type)
1180 html = html.replace('[_CONTAM-ART-FIRST-BP-SEAL_]', artifact_file)
1181 html = html.replace('[_CONTAM-ART-FIRST-BP-SEAL-PCT_]', artifact_val)
1182
1183 return html
1184
1185
1186 def do_html_body(odir, filepath_prefix):
1187 print('do_html_body - %s' % filepath_prefix)
1188 temp = os.path.join(PYDIR, 'template/readqc_body_template.html')
1189 statsf = os.path.join(odir, 'readqc_stats.txt')
1190 if not os.path.isfile(statsf):
1191 print('ERROR : file not found: %s' % statsf)
1192 stats = get_dict_obj(statsf)
1193
1194 filesf = os.path.join(odir, 'readqc_files.txt')
1195 if not os.path.isfile(statsf):
1196 print('ERROR : file not found: %s' % statsf)
1197 files = get_dict_obj(os.path.join(odir, 'readqc_files.txt'))
1198
1199 ## key (key name in readqc_stats or readqc_files) : {token (space holder in html template), type (value format)}
1200 tok_map = {
1201 ## Average Base Quality section
1202 'overall bases Q score mean' : {'token' : '[_BASE-QUALITY-SCORE_]', 'type': 'float', 'filter': 1},
1203 'overall bases Q score std' : {'token' : '[_BASE-QUALITY-SCORE-STD_]', 'type': 'float', 'filter': 1},
1204 'Q30 bases Q score mean' : {'token' : '[_Q30-BASE-QUALITY-SCORE_]', 'type': 'float', 'filter': 1},
1205 'Q30 bases Q score std' : {'token' : '[_Q30-BASE-QUALITY-SCORE-STD_]', 'type': 'float', 'filter': 1},
1206 'base C30' : {'token' : '[_COUNT-OF-BAESE-Q30_]', 'type': 'bigint'},
1207 'base C25' : {'token' : '[_COUNT-OF-BAESE-Q25_]', 'type': 'bigint'},
1208 'base C20' : {'token' : '[_COUNT-OF-BAESE-Q20_]', 'type': 'bigint'},
1209 'base C15' : {'token' : '[_COUNT-OF-BAESE-Q15_]', 'type': 'bigint'},
1210 'base C10' : {'token' : '[_COUNT-OF-BAESE-Q10_]', 'type': 'bigint'},
1211 'base C5' : {'token' : '[_COUNT-OF-BAESE-Q5_]', 'type': 'bigint'},
1212 'base Q30' : {'token' : '[_PCT-OF-BAESE-Q30_]', 'type': 'raw'},
1213 'base Q25' : {'token' : '[_PCT-OF-BAESE-Q25_]', 'type': 'raw'},
1214 'base Q20' : {'token' : '[_PCT-OF-BAESE-Q20_]', 'type': 'raw'},
1215 'base Q15' : {'token' : '[_PCT-OF-BAESE-Q15_]', 'type': 'raw'},
1216 'base Q10' : {'token' : '[_PCT-OF-BAESE-Q10_]', 'type': 'raw'},
1217 'base Q5' : {'token' : '[_PCT-OF-BAESE-Q5_]', 'type': 'raw'},
1218
1219 ## Average Read Quality section
1220 'read Q30' : {'token' : '[_PCT-OF-READS-Q30_]', 'type': 'raw'},
1221 'read Q25' : {'token' : '[_PCT-OF-READS-Q25_]', 'type': 'raw'},
1222 'read Q20' : {'token' : '[_PCT-OF-READS-Q20_]', 'type': 'raw'},
1223 'read Q15' : {'token' : '[_PCT-OF-READS-Q15_]', 'type': 'raw'},
1224 'read Q10' : {'token' : '[_PCT-OF-READS-Q10_]', 'type': 'raw'},
1225 'read Q5' : {'token' : '[_PCT-OF-READS-Q5_]', 'type': 'raw'},
1226
1227 'SUBSAMPLE_RATE' : {'token' : '[_SUBSAMPLE-RATE_]', 'type': 'pct', 'filter': 1},
1228 'read base quality stats' : {'token' : '[_AVG-READ-QUAL-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
1229 'ILLUMINA_READ_QHIST_D3_HTML_PLOT' : {'token' : '[_AVG-READ-QUAL-HOSTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
1230 'read qhist plot' : {'token' : '[_AVG-READ-QUALITY-HISTOGRAM_]', 'type': 'file'},
1231
1232 ## Average Base Position Quality section
1233 'read q20 read1' : {'token' : '[_READ_Q20_READ1_]', 'type': 'raw'},
1234 'read q20 read2' : {'token' : '[_READ_Q20_READ2_]', 'type': 'raw'},
1235 'read qual pos qrpt 1' : {'token' : '[_AVG-BASE-POS-QUAL-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
1236 'ILLUMINA_READ_QUAL_POS_PLOT_MERGED_D3_HTML_PLOT' : {'token' : '[_AVG-BASE-POS-QUAL-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
1237 'read qual pos plot merged' : {'token' : '[_AVG-BASE-POSITION-QUALITY_]', 'type': 'file'},
1238
1239 ## Insert Size
1240 'ILLUMINA_READ_INSERT_SIZE_JOINED_PERC' : {'token' : '[_PCT-READS-JOINED_]', 'type': 'float', 'filter': 1},
1241 'ILLUMINA_READ_INSERT_SIZE_AVG_INSERT' : {'token' : '[_PCT-READS-JOINED-AVG_]', 'type': 'raw'},
1242 'ILLUMINA_READ_INSERT_SIZE_STD_INSERT' : {'token' : '[_PCT-READS-JOINED-STDDEV_]', 'type': 'raw'},
1243 'ILLUMINA_READ_INSERT_SIZE_MODE_INSERT' : {'token' : '[_PCT-READS-JOINED-MODE_]', 'type': 'raw'},
1244
1245 'ILLUMINA_READ_INSERT_SIZE_HISTO_DATA' : {'token' : '[_INSERT-SIZE-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
1246 'ILLUMINA_READ_INSERT_SIZE_HISTO_D3_HTML_PLOT' : {'token' : '[_INSERT-SIZE-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
1247 'ILLUMINA_READ_INSERT_SIZE_HISTO_PLOT' : {'token' : '[_INSERT-SIZE-HISTOGRAM_]', 'type': 'file'},
1248
1249 ## Read GC
1250 'read GC mean' : {'token' : '[_READ-GC-AVG_]', 'type': 'float', 'filter': 1},
1251 'read GC std' : {'token' : '[_READ-GC-STDDEV_]', 'type': 'float', 'filter': 1},
1252 'read GC text hist' : {'token' : '[_READ-QC-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
1253 'ILLUMINA_READ_GC_D3_HTML_PLOT' : {'token' : '[_READ-QC-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
1254 'read GC plot' : {'token' : '[_READ-GC-HIST_]', 'type': 'file'},
1255
1256 ## Cycle Nucleotide Composition
1257 'read base count text 1' : {'token' : '[_NUC-COMP-FREQ-R1-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
1258 'read base count text 2' : {'token' : '[_NUC-COMP-FREQ-R2-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
1259 'ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_1' : {'token' : '[_NUC-COMP-FREQ-R1-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
1260 'ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_2' : {'token' : '[_NUC-COMP-FREQ-R2-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
1261 'read base count plot 1' : {'token' : '[_CYCLE-NUCL-COMPOSITION-READ1_]', 'type': 'file'},
1262 'read base count plot 2' : {'token' : '[_CYCLE-NUCL-COMPOSITION-READ2_]', 'type': 'file'},
1263
1264 ## Percentage of Common Contaminants
1265 'illumina read percent contamination artifact seal' : {'token' : '[_CONTAM-ART-SEA-PCT_]', 'type': 'floatstr'},
1266 'artifact.seal.stats seal' : {'token' : '[_CONTAM-ART-SEAL_]', 'type': 'file'},
1267
1268 'illumina read percent contamination DNA spikein seal' : {'token' : '[_DNA-SPIKEIN-SEAL_PCT_]', 'type': 'floatstr'},
1269 'DNA_spikein.seal.stats seal' : {'token' : '[_DNA-SPIKEIN-SEAL_]', 'type': 'file'},
1270
1271 'illumina read percent contamination RNA spikein seal' : {'token' : '[_RNA-SPIKEIN-SEAL_PCT_]', 'type': 'floatstr'},
1272 'RNA_spikein.seal.stats seal' : {'token' : '[_RNA-SPIKEIN-SEAL_]', 'type': 'file'},
1273
1274 'illumina read percent contamination fosmid seal' : {'token' : '[_CONTAM-FOSMID-SEAL-PCT_]', 'type': 'floatstr'},
1275 'fosmid.seal.stats seal' : {'token' : '[_CONTAM-FOSMID-SEAL_]', 'type': 'file'},
1276
1277 'illumina read percent contamination fosmid seal' : {'token' : '[_CONTAM-FOSMID-SEAL-PCT_]', 'type': 'floatstr'},
1278 'fosmid.seal.stats seal' : {'token' : '[_CONTAM-FOSMID-SEAL_]', 'type': 'file'},
1279
1280 'illumina read percent contamination mitochondrion seal' : {'token' : '[_CONTAM-MITO-SEAL-PCT_]', 'type': 'floatstr'},
1281 'mitochondrion.seal.stats seal' : {'token' : '[_CONTAM-MITO-SEAL_]', 'type': 'file'},
1282
1283 'illumina read percent contamination plastid seal' : {'token' : '[_CONTAM-CHLO-SEAL-PCT_]', 'type': 'floatstr'},
1284 'plastid.seal.stats seal' : {'token' : '[_CONTAM-CHLO-SEAL_]', 'type': 'file'},
1285
1286 'illumina read percent contamination phix seal' : {'token' : '[_CONTAM-PHIX-SEAL-PCT_]', 'type': 'floatstr'},
1287 'phix.seal.stats seal' : {'token' : '[_CONTAM-PHIX-SEAL_]', 'type': 'file'},
1288
1289 'illumina read percent contamination rrna seal' : {'token' : '[_CONTAM-RRNA-SEAL-PCT_]', 'type': 'floatstr'},
1290 'rrna.seal.stats seal' : {'token' : '[_CONTAM-RRNA-SEAL_]', 'type': 'file'},
1291
1292 'illumina read percent contamination microbes seal' : {'token' : '[_CONTAM-NON-SYN-SEAL-PCT_]', 'type': 'floatstr'},
1293 'microbes.seal.stats seal' : {'token' : '[_CONTAM-NON-SYN-SEAL_]', 'type': 'file'},
1294
1295 # 'illumina read percent contamination synthetic seal' : {'token' : '[_CONTAM-SYN-SEAL-PCT_]', 'type': 'floatstr'},
1296 'synthetic.seal.stats seal' : {'token' : '[_CONTAM-SYN-SEAL_]', 'type': 'file'},
1297
1298 'illumina read percent contamination adapters seal' : {'token' : '[_CONTAM-ADAPTER-PCT_]', 'type': 'floatstr'},
1299 'adapters.seal.stats seal' : {'token' : '[_CONTAM-ADAPTER-SEAL_]', 'type': 'file'},
1300
1301 ## Sketch vs NT
1302 'sketch_vs_nt_output' : {'token' : '[_SKETCH-VS-NT_]', 'type': 'file'},
1303 # 'sketch_vs_nt_output' : {'token' : '[_SKETCH-VS-NT-BASE_]', 'type': 'file', 'filter': 'base'},
1304 }
1305
1306 html = ''
1307 with open(temp, 'r') as fh:
1308 html = fh.read()
1309
1310 for key in tok_map:
1311 dat = tok_map[key]
1312 val = pipeline_val(key, dat, stats, files, filepath_prefix)
1313 # print('key=%s; %s; ====type=%s' % (key, val, type(val)))
1314 html = html.replace(dat['token'], val)
1315
1316 artifact_tr = do_html_contam_art_first_n_pb_tr(stats, files, odir, filepath_prefix)
1317 html = html.replace('[_CONTAN-ART-SEAL-FIRST-BP_]', artifact_tr)
1318
1319 ## synthetic contam
1320 contam_syn_file = pipeline_val('synthetic.seal.stats seal', {'type': 'file', 'filter': 'full'}, stats, files, filepath_prefix)
1321 if contam_syn_file and os.path.isfile(contam_syn_file):
1322 contam_syn_pct_key = 'illumina read percent contamination synthetic seal'
1323 cmd = 'grep "#Matched" %s' % contam_syn_file
1324 stdOut, stdErr, exitCode = run_sh_command(cmd, True)
1325 if exitCode == 0:
1326 toks = stdOut.strip().split()
1327 if len(toks) == 3:
1328 html = html.replace('[_CONTAM-SYN-SEAL-PCT_]', toks[2][:-1])
1329
1330 ###--- Sketch
1331 sketch_html = ''
1332 ## add Sketch vs NT if file exists:
1333 sketch_nt_file = pipeline_val('sketch_vs_nt_output', {'type': 'file', 'filter': 'full'}, stats, files, filepath_prefix)
1334
1335 if os.path.isfile(sketch_nt_file):
1336 basename = os.path.basename(sketch_nt_file)
1337 # href = pipeline_val('sketch_vs_nt_output', {'type': 'file'}, stats, files, filepath_prefix)
1338 sketch_html = html_tag('h4', 'Sketch vs. NT') #+ '<br />' + 'Raw file:%s' % html_link(href, basename)
1339 sketch_tab, sketch_table_header = sketch_table(sketch_nt_file)
1340 sketch_html += sketch_tab
1341
1342
1343 ## add Sketch vs refseq if file exists?
1344 if sketch_html:
1345 sketch_html = html_tag('h2', 'Read Sketch', attrs={'class': 'section-title'}) \
1346 + stetch_section_note(sketch_table_header) \
1347 + html_tag('div', sketch_html, {'class': 'section'})
1348
1349 html = html.replace('[_SKETCH-TABLE_]', sketch_html)
1350
1351 # copy the image file
1352 imgdir = os.path.join(PYDIR, 'images')
1353 todir = os.path.join(odir, 'images')
1354 if os.path.isdir(todir):
1355 shutil.rmtree(todir)
1356 shutil.copytree(imgdir, todir, False, None)
1357
1358 return html
1359
1360
1361 def do_html(odir, infile):
1362 # odir = os.path.abspath(odir) ## DO NOT convert!! The relative file path in html need this original odir string matching
1363
1364 screen('Output dir - %s' % odir)
1365 fname = os.path.basename(infile)
1366 screen('Create HTML page in %s for %s ..' % (odir, fname))
1367 temp = os.path.join(PYDIR, 'template/template.html')
1368
1369 with open(temp, 'r') as fh:
1370 html = fh.read()
1371 html = html.replace('[_PAGE-TITLE_]', 'Read QC Report')
1372 html = html.replace('[_REPORT-TITLE_]', 'BBTools Read QC Report')
1373 html = html.replace('[_INPUT-FILE-NAME_]', fname)
1374 html = html.replace('[_REPORT-DATE_]', '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
1375
1376 hbody =do_html_body(odir, odir)
1377 html = html.replace('[_REPORT-BODY_]', hbody)
1378
1379 ## write the html to file
1380 idxfile = os.path.join(odir, 'index.html')
1381 with open(idxfile, 'w') as fh2:
1382 fh2.write(html)
1383 screen('HTML index file written to %s' % idxfile)
1384
1385 # copy the css file
1386 cssdir = os.path.join(PYDIR, 'css')
1387 todir = os.path.join(odir, 'css')
1388 if os.path.isdir(todir):
1389 shutil.rmtree(todir)
1390 shutil.copytree(cssdir, todir, False, None)
1391
1392 def screen(txt):
1393 print('.. %s' % txt)
1394 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1395 ## Main Program
1396 if __name__ == "__main__":
1397 desc = "RQC ReadQC Pipeline"
1398 parser = argparse.ArgumentParser(description=desc)
1399 parser.add_argument("-f", "--fastq", dest="fastq", help="Set input fastq file (full path to fastq)", required=True)
1400 parser.add_argument("-o", "--output-path", dest="outputPath", help="Set output path to write to")
1401 parser.add_argument("-x", "--cut", dest="cutLenOfFirstBases", help="Set Read cut length (bp) for read contamination detection")
1402 parser.add_argument("-v", "--version", action="version", version=VERSION)
1403
1404 ## For running on Crays
1405 parser.add_argument("-l", "--lib-name", dest="libName", help="Set library name", required=False)
1406 parser.add_argument("-m", "--is-multiplexed", dest="isMultiplexed", help="Set multiplexed data", required=False)
1407 parser.add_argument("-r", "--is-rna", dest="isRna", help="Set RNA data", required=False)
1408
1409 ## produce html when processing is done
1410 parser.add_argument("-html", "--html", action="store_true", help="Create html file", dest="html", default=False, required=False)
1411
1412 ## Toggle options
1413 parser.add_argument("-b", "--skip-blast", action="store_true", help="Skip the blast run", dest="skipBlast", default=False, required=False)
1414
1415 parser.add_argument("-bn", "--skip-blast-nt", action="store_true", help="Skip Blast run against nt", dest="skipBlastNt", default=False, required=False)
1416 parser.add_argument("-br", "--skip-blast-refseq", action="store_true", help="Skip Blast run against refseq.archaea and refseq.bateria", dest="skipBlastRefseq", default=False, required=False)
1417 parser.add_argument("-c", "--skip-cleanup", action="store_true", help="Skip temporary file cleanup", dest="skipCleanup", default=False, required=False)
1418 parser.add_argument("-p", "--pooled-analysis", action="store_true", help="Enable pooled analysis (demultiplexing)", dest="pooledAnalysis", default=False, required=False)
1419 parser.add_argument("-s", "--skip-subsample", action="store_true", help="Skip subsampling.", dest="skipSubsample", default=False, required=False)
1420 parser.add_argument("-z", "--skip-localization", action="store_true", help="Skip database localization", dest="skipDbLocalization", default=False, required=False)
1421
1422 options = parser.parse_args()
1423
1424 outputPath = None # output path, defaults to current working directory
1425 fastq = None # full path to input fastq.gz
1426
1427 status = "start"
1428 nextStepToDo = 0
1429
1430 if options.outputPath:
1431 outputPath = options.outputPath
1432 else:
1433 outputPath = os.getcwd()
1434
1435 if options.fastq:
1436 fastq = options.fastq
1437
1438 ## create output_directory if it doesn't exist
1439 if not os.path.isdir(outputPath):
1440 os.makedirs(outputPath)
1441
1442 libName = None ## for illumina_sciclone_analysis()
1443 isRna = None ## for illumina_sciclone_analysis()
1444 isMultiplexed = None ## for illumina_generate_index_sequence_detection_plot()
1445
1446 bSkipBlast = False
1447 bSkipBlastRefseq = False ## refseq.archaea and refseq.bacteria
1448 bSkipBlastNt = False
1449 bPooledAnalysis = False
1450
1451 ## RQC-743 Need to specify the first cutting bp for read contam detection (eps. for smRNA )
1452 firstBptoCut = 50
1453 if options.cutLenOfFirstBases:
1454 firstBptoCut = options.cutLenOfFirstBases
1455
1456 if options.libName:
1457 libName = options.libName
1458
1459
1460 ## switches
1461 if options.isRna:
1462 isRna = options.isRna
1463 if options.isMultiplexed:
1464 isMultiplexed = options.isMultiplexed
1465 if options.skipBlast:
1466 bSkipBlast = options.skipBlast
1467 if options.skipBlastRefseq:
1468 bSkipBlastRefseq = options.skipBlastRefseq
1469 if options.skipBlastNt:
1470 bSkipBlastNt = options.skipBlastNt
1471 if options.pooledAnalysis:
1472 bPooledAnalysis = options.pooledAnalysis
1473
1474
1475 skipSubsampling = options.skipSubsample
1476
1477 ## Set readqc config
1478 RQCReadQcConfig.CFG["status_file"] = os.path.join(outputPath, "readqc_status.log")
1479 RQCReadQcConfig.CFG["files_file"] = os.path.join(outputPath, "readqc_files.tmp")
1480 RQCReadQcConfig.CFG["stats_file"] = os.path.join(outputPath, "readqc_stats.tmp")
1481 RQCReadQcConfig.CFG["output_path"] = outputPath
1482 RQCReadQcConfig.CFG["skip_cleanup"] = options.skipCleanup
1483 RQCReadQcConfig.CFG["skip_localization"] = options.skipDbLocalization
1484 RQCReadQcConfig.CFG["log_file"] = os.path.join(outputPath, "readqc.log")
1485
1486 screen("Started readqc pipeline, writing log to: %s" % (RQCReadQcConfig.CFG["log_file"]))
1487
1488 log = get_logger("readqc", RQCReadQcConfig.CFG["log_file"], LOG_LEVEL, False, True)
1489 log.info("=================================================================")
1490 log.info(" Read Qc Analysis (version %s)", VERSION)
1491 log.info("=================================================================")
1492
1493 log.info("Starting %s with %s", SCRIPT_NAME, fastq)
1494
1495 if os.path.isfile(RQCReadQcConfig.CFG["status_file"]):
1496 status = get_status(RQCReadQcConfig.CFG["status_file"], log)
1497 else:
1498 checkpoint_step_wrapper(status)
1499
1500
1501 if not os.path.isfile(fastq):
1502 log.error("%s not found, aborting!", fastq)
1503 exit(2)
1504 elif status != "complete":
1505 ## check for fastq file
1506 # if os.path.isfile(fastq):
1507 log.info("Found %s, starting processing.", fastq)
1508 log.info("Latest status = %s", status)
1509 if status != 'start':
1510 nextStepToDo = int(status.split("_")[0])
1511 if status.find("complete") != -1:
1512 nextStepToDo += 1
1513 log.info("Next step to do = %s", nextStepToDo)
1514
1515 if status != 'complete':
1516 bDone = False
1517 cycle = 0
1518 totalReadNum = 0
1519 firstSubsampledFastqFileName = ""
1520 secondSubsampledFastqFile = ""
1521 totalReadCount = 0
1522 subsampledReadNum = 0
1523 bIsPaired = False
1524 readLength = 0
1525 firstSubsampledLogFile = os.path.join(outputPath, "subsample", "first_subsampled.txt")
1526 secondSubsampledLogFile = os.path.join(outputPath, "subsample", "second_subsampled.txt")
1527
1528 while not bDone:
1529
1530 cycle += 1
1531
1532 if bPooledAnalysis:
1533 nextStepToDo = 17
1534 status = "16_illumina_read_blastn_nt complete"
1535
1536 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1537 ## 1. fast_subsample_fastq_sequences
1538 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1539 if nextStepToDo == 1 or status == "start":
1540 status, firstSubsampledFastqFileName, totalReadCount, subsampledReadNum, bIsPaired, readLength = do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log)
1541
1542 if status.endswith("failed"):
1543 log.error("Subsampling failed.")
1544 sys.exit(-1)
1545
1546 ## if not skipSubsampling and subsampledReadNum == 0: ## too small input file
1547 if not skipSubsampling and subsampledReadNum < RQCReadQc.ILLUMINA_MIN_NUM_READS: ## min=1000
1548 log.info("Too small input fastq file. Skip subsampling: total number of reads = %s, sampled number of reads = %s", totalReadCount, subsampledReadNum)
1549 skipSubsampling = True
1550 status, firstSubsampledFastqFileName, totalReadCount, subsampledReadNum, bIsPaired, readLength = do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log)
1551
1552 if status.endswith("failed"):
1553 log.error("Subsampling failed.")
1554 sys.exit(-1)
1555
1556 subsampledReadNum = totalReadCount
1557 if subsampledReadNum >= RQCReadQc.ILLUMINA_MIN_NUM_READS:
1558 status = "1_illumina_readqc_subsampling complete"
1559
1560 ## Still too low number of reads -> record ILLUMINA_TOO_SMALL_NUM_READS and quit
1561 ##
1562 if subsampledReadNum == 0 or subsampledReadNum < RQCReadQc.ILLUMINA_MIN_NUM_READS: ## min=1000
1563 log.info("Too small number of reads (< %s). Stop processing.", RQCReadQc.ILLUMINA_MIN_NUM_READS)
1564 print("WARNING : Too small number of reads (< %s). Stop processing." % RQCReadQc.ILLUMINA_MIN_NUM_READS)
1565 log.info("Completed %s: %s", SCRIPT_NAME, fastq)
1566
1567 ## Add ILLUMINA_TOO_SMALL_NUM_READS=1 to stats to be used in reporting
1568 statsFile = RQCReadQcConfig.CFG["stats_file"]
1569 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_TOO_SMALL_NUM_READS, "1", log)
1570
1571 ## move rqc-files.tmp to rqc-files.txt
1572 newFilesFile = os.path.join(outputPath, "readqc_files.txt")
1573 newStatsFile = os.path.join(outputPath, "readqc_stats.txt")
1574
1575 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["files_file"], newFilesFile)
1576 log.info("mv cmd: %s", cmd)
1577 run_sh_command(cmd, True, log)
1578
1579 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["stats_file"], newStatsFile)
1580 log.info("mv cmd: %s", cmd)
1581 run_sh_command(cmd, True, log)
1582
1583 exit(0)
1584
1585 if status == "1_illumina_readqc_subsampling failed":
1586 bDone = True
1587
1588
1589 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1590 ## 2. write_unique_20_mers
1591 ## NOTE: fastq = orig input fastq,
1592 ## totalReadCount = total reads in the orig input fastq
1593 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1594 if nextStepToDo == 2 or status == "1_illumina_readqc_subsampling complete":
1595 ## Cope with restarting
1596 ## save subsamples file in "first_subsampled.txt" so that the file
1597 ## name can be read when started
1598 if not os.path.isfile(firstSubsampledLogFile):
1599 make_dir(os.path.join(outputPath, "subsample"))
1600 with open(firstSubsampledLogFile, "w") as SAMP_FH:
1601 SAMP_FH.write(firstSubsampledFastqFileName + " " + str(totalReadCount) + " " + str(subsampledReadNum) + "\n")
1602
1603 if firstSubsampledFastqFileName == "" and options.skipSubsample:
1604 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1605 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1606 l = SAMP_FH.readline().strip()
1607 t = l.split()
1608 assert len(t) == 2 or len(t) == 3
1609 firstSubsampledFastqFileName = t[0]
1610 totalReadCount = t[1]
1611 subsampledReadNum = t[2]
1612 log.debug("firstSubsampledFastqFileName=%s, totalReadCount=%s, subsampledReadNum=%s", firstSubsampledFastqFileName, totalReadCount, subsampledReadNum)
1613
1614 else:
1615 nextStepToDo = 1
1616 continue
1617
1618 ## TODO: move getting totalReadNum in the func ???
1619 ##
1620 log.debug("firstSubsampledFastqFileName=%s, totalReadCount=%s, subsampledReadNum=%s", firstSubsampledFastqFileName, totalReadCount, subsampledReadNum)
1621
1622 if totalReadCount is None or totalReadCount == 0:
1623 nextStepToDo = 1
1624 continue
1625 else:
1626 status = do_write_unique_20_mers(fastq, totalReadCount, log)
1627
1628 if status == "2_unique_mers_sampling failed":
1629 bDone = True
1630
1631 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1632 ## 3. generate read GC histograms: Illumina_read_gc
1633 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1634 if nextStepToDo == 3 or status == "2_unique_mers_sampling complete":
1635 ## Read the subsampled file name
1636 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1637 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1638 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1639 status = do_illumina_read_gc(firstSubsampledFastqFileName, log)
1640
1641 if status == "3_illumina_read_gc failed":
1642 bDone = True
1643
1644 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1645 ## 4. read_quality_stats
1646 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1647 if nextStepToDo == 4 or status == "3_illumina_read_gc complete":
1648 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1649 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1650 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1651 status = do_read_quality_stats(firstSubsampledFastqFileName, log)
1652
1653 if status == "4_illumina_read_quality_stats failed":
1654 bDone = True
1655
1656 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1657 ## 5. write_base_quality_stats
1658 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1659 if nextStepToDo == 5 or status == "4_illumina_read_quality_stats complete":
1660 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1661 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1662 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1663 status = do_write_base_quality_stats(firstSubsampledFastqFileName, log)
1664
1665 if status == "5_illumina_read_quality_stats failed":
1666 bDone = True
1667
1668 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1669 ## 6. illumina_count_q_score
1670 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1671 if nextStepToDo == 6 or status == "5_illumina_read_quality_stats complete":
1672 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1673 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1674 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1675 status = do_illumina_count_q_score(firstSubsampledFastqFileName, log)
1676
1677 if status == "6_illumina_count_q_score failed":
1678 bDone = True
1679
1680 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1681 ## 7. illumina_calculate_average_quality
1682 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1683 if nextStepToDo == 7 or status == "6_illumina_count_q_score complete":
1684 ## Let's skip this step. (20140902)
1685 log.info("\n\n%sSTEP7 - Skipping 21mer analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1686
1687 status = "7_illumina_calculate_average_quality in progress"
1688 checkpoint_step_wrapper(status)
1689
1690 status = "7_illumina_calculate_average_quality complete"
1691 checkpoint_step_wrapper(status)
1692
1693 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1694 ## 8. illumina_find_common_motifs
1695 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1696 if nextStepToDo == 8 or status == "7_illumina_calculate_average_quality complete":
1697 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1698 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1699 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1700 status = do_illumina_find_common_motifs(firstSubsampledFastqFileName, log)
1701
1702 if status == "8_illumina_find_common_motifs failed":
1703 bDone = True
1704
1705 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1706 ## 10. illumina_run_tagdust
1707 ##
1708 ## NOTE: This step will be skipped. No need to run.
1709 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1710 # if nextStepToDo == 10 or status == "9_illumina_run_dedupe complete":
1711 if nextStepToDo == 9 or status == "8_illumina_find_common_motifs complete":
1712 ## 20131023 skip this step
1713 log.info("\n\n%sSTEP10 - Skipping tag dust <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1714 status = "10_illumina_run_tagdust in progress"
1715 checkpoint_step_wrapper(status)
1716
1717 status = "10_illumina_run_tagdust complete"
1718 checkpoint_step_wrapper(status)
1719
1720 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1721 ## 11. illumina_detect_read_contam
1722 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1723 if nextStepToDo == 11 or status == "10_illumina_run_tagdust complete":
1724 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1725 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1726 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1727 status = do_illumina_detect_read_contam(firstSubsampledFastqFileName, firstBptoCut, log)
1728
1729 if status == "11_illumina_detect_read_contam failed":
1730 bDone = True
1731
1732
1733 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1734 ## 13. illumina_subsampling_read_megablast
1735 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1736 if nextStepToDo == 12 or status == "11_illumina_detect_read_contam complete":
1737 if not bSkipBlast:
1738 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1739 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1740 l = SAMP_FH.readline().strip()
1741 firstSubsampledFastqFileName = l.split()[0]
1742 totalReadCount = int(l.split()[1])
1743 subsampledReadNum = int(l.split()[2])
1744
1745 status, secondSubsampledFastqFile, second_read_cnt_total, second_read_cnt_sampled = do_illumina_subsampling_read_blastn(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log)
1746 log.debug("status=%s, secondSubsampledFastqFile=%s, second_read_cnt_total=%s, second_read_cnt_sampled=%s", status, secondSubsampledFastqFile, second_read_cnt_total, second_read_cnt_sampled)
1747
1748 else:
1749 statsFile = RQCReadQcConfig.CFG["stats_file"]
1750 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_SKIP_BLAST, "1", log)
1751 log.info("\n\n%sSTEP13 - Run subsampling for Blast search <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1752 log.info("13_illumina_subsampling_read_megablast skipped.\n")
1753 status = "13_illumina_subsampling_read_megablast complete"
1754 checkpoint_step_wrapper(status)
1755
1756 if status == "13_illumina_subsampling_read_megablast failed":
1757 bDone = True
1758
1759 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1760 # 14. illumina_read_blastn_refseq_archaea
1761 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1762 if nextStepToDo == 14 or status == "13_illumina_subsampling_read_megablast complete":
1763 if not bSkipBlast and not bSkipBlastRefseq:
1764 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
1765 nextStepToDo = 13
1766 continue
1767
1768 if secondSubsampledFastqFile != "" and not os.path.isfile(secondSubsampledLogFile):
1769 make_dir(os.path.join(outputPath, "subsample"))
1770 with open(secondSubsampledLogFile, "w") as SAMP_FH:
1771 SAMP_FH.write(secondSubsampledFastqFile + " " + str(second_read_cnt_total) + " " + str(second_read_cnt_sampled) + "\n")
1772 else:
1773 with open(secondSubsampledLogFile, "r") as SAMP_FH:
1774 l = SAMP_FH.readline().strip()
1775 secondSubsampledFastqFile = l.split()[0]
1776 second_read_cnt_total = int(l.split()[1])
1777 second_read_cnt_sampled = int(l.split()[2])
1778
1779 status = do_illumina_read_blastn_refseq_archaea(secondSubsampledFastqFile, second_read_cnt_sampled, log)
1780 checkpoint_step_wrapper(status)
1781
1782 else:
1783 log.info("\n\n%sSTEP14 - Run illumina_read_blastn_refseq_archaea analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1784 log.info("14_illumina_read_blastn_refseq_archaea skipped.\n")
1785 status = "14_illumina_read_blastn_refseq_archaea in progress"
1786 checkpoint_step_wrapper(status)
1787 status = "14_illumina_read_blastn_refseq_archaea complete"
1788 checkpoint_step_wrapper(status)
1789
1790 if status == "14_illumina_read_blastn_refseq_archaea failed":
1791 bDone = True
1792
1793 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1794 # 15. illumina_read_blastn_refseq_bacteria
1795 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1796 if nextStepToDo == 15 or status == "14_illumina_read_blastn_refseq_archaea complete":
1797 if not bSkipBlast and not bSkipBlastRefseq:
1798 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
1799 nextStepToDo = 13
1800 continue
1801
1802 if secondSubsampledFastqFile != "" and not os.path.isfile(secondSubsampledLogFile):
1803 make_dir(os.path.join(outputPath, "subsample"))
1804 with open(secondSubsampledLogFile, "w") as SAMP_FH:
1805 SAMP_FH.write(secondSubsampledFastqFile + " " + str(second_read_cnt_total) + " " + str(second_read_cnt_sampled) + "\n")
1806 else:
1807 with open(secondSubsampledLogFile, "r") as SAMP_FH:
1808 l = SAMP_FH.readline().strip()
1809 secondSubsampledFastqFile = l.split()[0]
1810 second_read_cnt_total = int(l.split()[1])
1811 second_read_cnt_sampled = int(l.split()[2])
1812
1813 status = do_illumina_read_blastn_refseq_bacteria(secondSubsampledFastqFile, log)
1814 checkpoint_step_wrapper(status)
1815
1816 else:
1817 log.info("\n\n%sSTEP15 - Run illumina_read_blastn_refseq_bacteria analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1818 log.info("15_illumina_read_blastn_refseq_bacteria skipped.\n")
1819 status = "15_illumina_read_blastn_refseq_bacteria in progress"
1820 checkpoint_step_wrapper(status)
1821 status = "15_illumina_read_blastn_refseq_bacteria complete"
1822 checkpoint_step_wrapper(status)
1823
1824 if status == "15_illumina_read_blastn_refseq_bacteria failed":
1825 bDone = True
1826
1827 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1828 ## 16. illumina_read_blastn_nt
1829 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1830 if nextStepToDo == 16 or status == "15_illumina_read_blastn_refseq_bacteria complete":
1831 if not bSkipBlast and not bSkipBlastNt:
1832 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
1833 nextStepToDo = 13
1834 continue
1835
1836 if secondSubsampledFastqFile == "" and os.path.isfile(secondSubsampledLogFile):
1837 with open(secondSubsampledLogFile, "r") as SAMP_FH:
1838 l = SAMP_FH.readline().strip()
1839 secondSubsampledFastqFile = l.split()[0]
1840 second_read_cnt_total = int(l.split()[1])
1841 second_read_cnt_sampled = int(l.split()[2])
1842
1843 status = do_illumina_read_blastn_nt(secondSubsampledFastqFile, log)
1844 checkpoint_step_wrapper(status)
1845
1846 else:
1847 log.info("\n\n%sSTEP16 - Run illumina_read_blastn_nt analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1848 log.info("16_illumina_read_blastn_nt skipped.\n")
1849 status = "16_illumina_read_blastn_nt in progress"
1850 checkpoint_step_wrapper(status)
1851 status = "16_illumina_read_blastn_nt complete"
1852 checkpoint_step_wrapper(status)
1853
1854 if status == "16_illumina_read_blastn_nt failed":
1855 bDone = True
1856
1857 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1858 ## 17. multiplex_statistics
1859 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1860 if nextStepToDo == 17 or status == "16_illumina_read_blastn_nt complete":
1861 status = do_illumina_multiplex_statistics(fastq, log, isMultiplexed=isMultiplexed)
1862
1863 if status == "17_multiplex_statistics failed":
1864 bDone = True
1865
1866 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1867 ## 18. end_of_read_illumina_adapter_check
1868 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1869 if nextStepToDo == 18 or status == "17_multiplex_statistics complete":
1870 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
1871 with open(firstSubsampledLogFile, "r") as SAMP_FH:
1872 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
1873 status = do_end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log)
1874
1875 if status == "18_end_of_read_illumina_adapter_check failed":
1876 bDone = True
1877
1878 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1879 ## 19. insert size analysis (bbmerge.sh)
1880 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1881 if nextStepToDo == 19 or status == "18_end_of_read_illumina_adapter_check complete":
1882 status = do_insert_size_analysis(fastq, log)
1883
1884 if status == "19_insert_size_analysis failed":
1885 bDone = True
1886
1887
1888 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1889 ## 21. sketch vs nt, refseq, silva
1890 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1891 # if nextStepToDo == 21 or status == "20_gc_divergence_analysis complete":
1892 if nextStepToDo == 20 or status == "19_insert_size_analysis complete":
1893 status = do_sketch_vs_nt_refseq_silva(fastq, log)
1894
1895 if status == "21_sketch_vs_nt_refseq_silva failed":
1896 bDone = True
1897
1898 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1899 ## 22. postprocessing & reporting
1900 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1901 if nextStepToDo == 22 or status == "21_sketch_vs_nt_refseq_silva complete":
1902 ## 20131023 skip this step
1903 log.info("\n\n%sSTEP22 - Run illumina_readqc_report_postprocess: mv rqc-*.tmp to rqc-*.txt <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
1904 status = "22_illumina_readqc_report_postprocess in progress"
1905 checkpoint_step_wrapper(status)
1906
1907 ## move rqc-files.tmp to rqc-files.txt
1908 newFilesFile = os.path.join(outputPath, "readqc_files.txt")
1909 newStatsFile = os.path.join(outputPath, "readqc_stats.txt")
1910
1911 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["files_file"], newFilesFile)
1912 log.info("mv cmd: %s", cmd)
1913 run_sh_command(cmd, True, log)
1914
1915 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["stats_file"], newStatsFile)
1916 log.info("mv cmd: %s", cmd)
1917 run_sh_command(cmd, True, log)
1918
1919 log.info("22_illumina_readqc_report_postprocess complete.")
1920 status = "22_illumina_readqc_report_postprocess complete"
1921 checkpoint_step_wrapper(status)
1922
1923 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1924 ## 23. Cleanup
1925 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1926 if nextStepToDo == 23 or status == "22_illumina_readqc_report_postprocess complete":
1927 status = do_cleanup_readqc(log)
1928
1929 if status == "23_cleanup_readqc failed":
1930 bDone = True
1931
1932 if status == "23_cleanup_readqc complete":
1933 status = "complete" ## FINAL COMPLETE!
1934 bDone = True
1935
1936 ## don't cycle more than 10 times ...
1937 if cycle > 10:
1938 bDone = True
1939
1940 if status != "complete":
1941 log.info("Status %s", status)
1942 else:
1943 log.info("\n\nCompleted %s: %s", SCRIPT_NAME, fastq)
1944 checkpoint_step_wrapper("complete")
1945 log.info("Pipeline processing Done.")
1946 print("Pipeline processing Done.")
1947
1948 else:
1949 log.info("Pipeline processing is already complete, skip.")
1950 print("Pipeline processing is already complete, skip.")
1951
1952 if status == 'complete' and options.html:
1953 do_html(outputPath, fastq)
1954
1955
1956 exit(0)
1957
1958
1959 ## EOF