annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pytools/readqc.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 #!/usr/bin/env python
jpayne@68 2 # -*- coding: utf-8 -*-
jpayne@68 3
jpayne@68 4 """
jpayne@68 5 stand alone read qc script based on jgi-rqc-pipeline/readqc/readqc.py v8.3.6
jpayne@68 6
jpayne@68 7 Command
jpayne@68 8 $ readqc.py -f FASTQ.GZ -o OUT_DIR --skip-blast -html
jpayne@68 9
jpayne@68 10 Outputs
jpayne@68 11 - normal QC outputs + index.html
jpayne@68 12
jpayne@68 13 Created: March 15, 2018
jpayne@68 14
jpayne@68 15 Shijie Yao (syao@lbl.gov)
jpayne@68 16
jpayne@68 17 Revision:
jpayne@68 18
jpayne@68 19 """
jpayne@68 20
jpayne@68 21 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 22 ## libraries to use
jpayne@68 23 import os
jpayne@68 24 import sys
jpayne@68 25 import argparse
jpayne@68 26 import datetime
jpayne@68 27 import shutil
jpayne@68 28
jpayne@68 29 SRC_ROOT = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
jpayne@68 30 sys.path.append(SRC_ROOT + "/lib") # common
jpayne@68 31
jpayne@68 32 from readqc_constants import RQCReadQcConfig, RQCReadQc, ReadqcStats
jpayne@68 33 from common import get_logger, get_status, append_rqc_stats, append_rqc_file, set_colors, get_subsample_rate,run_command
jpayne@68 34 from readqc_utils import *
jpayne@68 35 #from readqc_utils import checkpoint_step_wrapper, fast_subsample_fastq_sequences, write_unique_20_mers, illumina_read_gc
jpayne@68 36 from rqc_fastq import get_working_read_length, read_count
jpayne@68 37 from readqc_report import *
jpayne@68 38 from os_utility import make_dir
jpayne@68 39 from html_utility import html_tag, html_th, html_tr, html_link
jpayne@68 40 from rqc_utility import get_dict_obj, pipeline_val
jpayne@68 41
jpayne@68 42 VERSION = "1.0.0"
jpayne@68 43 LOG_LEVEL = "DEBUG"
jpayne@68 44 SCRIPT_NAME = __file__
jpayne@68 45
jpayne@68 46 PYDIR = os.path.abspath(os.path.dirname(__file__))
jpayne@68 47 BBDIR = os.path.join(PYDIR, os.path.pardir)
jpayne@68 48
jpayne@68 49 color = {}
jpayne@68 50 color = set_colors(color, True)
jpayne@68 51
jpayne@68 52 """
jpayne@68 53 STEP1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 54
jpayne@68 55 """
jpayne@68 56 def do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log):
jpayne@68 57 log.info("\n\n%sSTEP1 - Subsampling reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 58
jpayne@68 59 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 60
jpayne@68 61 status = "1_illumina_readqc_subsampling in progress"
jpayne@68 62 checkpoint_step_wrapper(status)
jpayne@68 63 log.info("1_illumina_readqc_subsampling in progress.")
jpayne@68 64
jpayne@68 65 inputReadNum = 0
jpayne@68 66 sampleRate = 0.0
jpayne@68 67
jpayne@68 68 if not skipSubsampling:
jpayne@68 69 # sampleRate = RQCReadQc.ILLUMINA_SAMPLE_PCTENTAGE ## 0.01
jpayne@68 70 inputReadNum = read_count(fastq) ## read count from the original fastq
jpayne@68 71 assert inputReadNum > 0, "ERROR: invalid input fastq"
jpayne@68 72 sampleRate = get_subsample_rate(inputReadNum)
jpayne@68 73 log.info("Subsampling rate = %s", sampleRate)
jpayne@68 74 else:
jpayne@68 75 log.info("1_illumina_readqc_subsampling: skip subsampling. Use all the reads.")
jpayne@68 76 sampleRate = 1.0
jpayne@68 77
jpayne@68 78 retCode = None
jpayne@68 79 totalReadNum = 0
jpayne@68 80 firstSubsampledFastqFileName = ""
jpayne@68 81
jpayne@68 82 sequnitFileName = os.path.basename(fastq)
jpayne@68 83 sequnitFileName = sequnitFileName.replace(".fastq", "").replace(".gz", "")
jpayne@68 84
jpayne@68 85 firstSubsampledFastqFileName = sequnitFileName + ".s" + str(sampleRate) + ".fastq"
jpayne@68 86
jpayne@68 87 retCode, firstSubsampledFastqFileName, totalBaseCount, totalReadNum, subsampledReadNum, bIsPaired, readLength = fast_subsample_fastq_sequences(fastq, firstSubsampledFastqFileName, sampleRate, True, log)
jpayne@68 88
jpayne@68 89 append_rqc_stats(statsFile, "SUBSAMPLE_RATE", sampleRate, log)
jpayne@68 90
jpayne@68 91 if retCode in (RQCExitCodes.JGI_FAILURE, -2):
jpayne@68 92 log.info("1_illumina_readqc_subsampling failed.")
jpayne@68 93 status = "1_illumina_readqc_subsampling failed"
jpayne@68 94 checkpoint_step_wrapper(status)
jpayne@68 95
jpayne@68 96 else:
jpayne@68 97 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT, totalBaseCount, log)
jpayne@68 98 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_COUNT, totalReadNum, log)
jpayne@68 99
jpayne@68 100 log.info("1_illumina_readqc_subsampling complete.")
jpayne@68 101 status = "1_illumina_readqc_subsampling complete"
jpayne@68 102 checkpoint_step_wrapper(status)
jpayne@68 103
jpayne@68 104
jpayne@68 105 return status, firstSubsampledFastqFileName, totalReadNum, subsampledReadNum, bIsPaired, readLength
jpayne@68 106
jpayne@68 107
jpayne@68 108 """
jpayne@68 109 STEP2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 110
jpayne@68 111 """
jpayne@68 112 def do_write_unique_20_mers(fastq, totalReadCount, log):
jpayne@68 113 log.info("\n\n%sSTEP2 - Sampling unique 25 mers <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 114
jpayne@68 115 if totalReadCount >= RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ * 2: ## 25000
jpayne@68 116 log.debug("read count total in step2 = %s", totalReadCount)
jpayne@68 117 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 118 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 119
jpayne@68 120 status = "2_unique_mers_sampling in progress"
jpayne@68 121 checkpoint_step_wrapper(status)
jpayne@68 122 log.info(status)
jpayne@68 123
jpayne@68 124 retCode, newDataFile, newPngPlotFile, newHtmlPlotFile = write_unique_20_mers(fastq, log)
jpayne@68 125
jpayne@68 126 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 127 status = "2_unique_mers_sampling failed"
jpayne@68 128 log.error(status)
jpayne@68 129 checkpoint_step_wrapper(status)
jpayne@68 130
jpayne@68 131 else:
jpayne@68 132 ## if no output files, skip this step.
jpayne@68 133 if newDataFile is not None:
jpayne@68 134 statsDict = {}
jpayne@68 135
jpayne@68 136 ## in readqc_report.py
jpayne@68 137 ## 2014.07.23 read_level_mer_sampling is updated to process new output file format from bbcountunique
jpayne@68 138 log.info("2_unique_mers_sampling: post-processing the bbcountunique output file.")
jpayne@68 139 read_level_mer_sampling(statsDict, newDataFile, log)
jpayne@68 140
jpayne@68 141 for k, v in statsDict.items():
jpayne@68 142 append_rqc_stats(statsFile, k, str(v), log)
jpayne@68 143
jpayne@68 144 ## outputs from bbcountunique
jpayne@68 145 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_TEXT, newDataFile, log)
jpayne@68 146 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_PLOT, newPngPlotFile, log)
jpayne@68 147 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_20MER_UNIQUENESS_D3_HTML_PLOT, newHtmlPlotFile, log)
jpayne@68 148
jpayne@68 149 log.info("2_unique_mers_sampling complete.")
jpayne@68 150 status = "2_unique_mers_sampling complete"
jpayne@68 151 checkpoint_step_wrapper(status)
jpayne@68 152
jpayne@68 153 else:
jpayne@68 154 ## if num reads < RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ = 25000
jpayne@68 155 ## just proceed to the next step
jpayne@68 156 log.warning("2_unique_mers_sampling can't run it because the number of reads < %s.", RQCReadQc.ILLUMINA_MER_SAMPLE_REPORT_FRQ * 2)
jpayne@68 157 status = "2_unique_mers_sampling complete"
jpayne@68 158 checkpoint_step_wrapper(status)
jpayne@68 159
jpayne@68 160
jpayne@68 161 return status
jpayne@68 162
jpayne@68 163
jpayne@68 164 """
jpayne@68 165 STEP3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 166
jpayne@68 167 """
jpayne@68 168 def do_illumina_read_gc(fastq, log):
jpayne@68 169 log.info("\n\n%sSTEP3 - Making read GC histograms <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 170
jpayne@68 171 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 172 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 173
jpayne@68 174 status = "3_illumina_read_gc in progress"
jpayne@68 175 checkpoint_step_wrapper(status)
jpayne@68 176 log.info("3_illumina_read_gc in progress.")
jpayne@68 177
jpayne@68 178 reformat_gchist_file, png_file, htmlFile, mean_val, stdev_val, med_val, mode_val = illumina_read_gc(fastq, log)
jpayne@68 179
jpayne@68 180 if not reformat_gchist_file:
jpayne@68 181 log.error("3_illumina_read_gc failed.")
jpayne@68 182 status = "3_illumina_read_gc failed"
jpayne@68 183 checkpoint_step_wrapper(status)
jpayne@68 184
jpayne@68 185 else:
jpayne@68 186 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MEAN, mean_val, log)
jpayne@68 187 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_STD, stdev_val, log)
jpayne@68 188 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MED, med_val, log)
jpayne@68 189 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_GC_MODE, mode_val, log)
jpayne@68 190
jpayne@68 191 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_TEXT, reformat_gchist_file, log)
jpayne@68 192 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_PLOT, png_file, log)
jpayne@68 193 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_GC_D3_HTML_PLOT, htmlFile, log)
jpayne@68 194
jpayne@68 195 log.info("3_illumina_read_gc complete.")
jpayne@68 196 status = "3_illumina_read_gc complete"
jpayne@68 197 checkpoint_step_wrapper(status)
jpayne@68 198
jpayne@68 199
jpayne@68 200 return status
jpayne@68 201
jpayne@68 202
jpayne@68 203 """
jpayne@68 204 STEP4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 205
jpayne@68 206 """
jpayne@68 207 def do_read_quality_stats(fastq, log):
jpayne@68 208 log.info("\n\n%sSTEP4 - Analyzing read quality <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 209
jpayne@68 210 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 211 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 212
jpayne@68 213 status = "4_illumina_read_quality_stats in progress"
jpayne@68 214 checkpoint_step_wrapper(status)
jpayne@68 215 log.info("4_illumina_read_quality_stats in progress.")
jpayne@68 216
jpayne@68 217 readLength = 0
jpayne@68 218 readLenR1 = 0
jpayne@68 219 readLenR2 = 0
jpayne@68 220 isPairedEnd = None
jpayne@68 221
jpayne@68 222 if not os.path.isfile(fastq):
jpayne@68 223 log.error("4_illumina_read_quality_stats failed. Cannot find the input fastq file")
jpayne@68 224 status = "4_illumina_read_quality_stats failed"
jpayne@68 225 checkpoint_step_wrapper(status)
jpayne@68 226 return status
jpayne@68 227
jpayne@68 228 ## First figure out if it's pair-ended or not
jpayne@68 229 ## NOTE: ssize=10000 is recommended!
jpayne@68 230 readLength, readLenR1, readLenR2, isPairedEnd = get_working_read_length(fastq, log)
jpayne@68 231 log.info("Read length = %s, and is_pair_ended = %s", readLength, isPairedEnd)
jpayne@68 232
jpayne@68 233 if readLength == 0:
jpayne@68 234 log.error("Failed to run get_working_read_length.")
jpayne@68 235 status = "4_illumina_read_quality_stats failed"
jpayne@68 236 checkpoint_step_wrapper(status)
jpayne@68 237
jpayne@68 238 else:
jpayne@68 239 ## Pair-ended
jpayne@68 240 r1_r2_baseposqual_png = None ## Average Base Position Quality Plot (*.qrpt.png)
jpayne@68 241 r1_r2_baseposqual_html = None ## Average Base Position Quality D3 Plot (*.qrpt.html)
jpayne@68 242 r1_r2_baseposqual_txt = None ## Read 1/2 Average Base Position Quality Text (*.qhist.txt)
jpayne@68 243
jpayne@68 244 r1_baseposqual_box_png = None ## Read 1 Average Base Position Quality Boxplot (*.r1.png)
jpayne@68 245 r2_baseposqual_box_png = None ## Read 2 Average Base Position Quality Boxplot (*.r2.png)
jpayne@68 246 r1_baseposqual_box_html = None ## Read 1 Average Base Position Quality D3 Boxplot (*.r1.html)
jpayne@68 247 r2_baseposqual_box_html = None ## Read 2 Average Base Position Quality D3 Boxplot (*.r2.html)
jpayne@68 248 r1_r2_baseposqual_box_txt = None ## Average Base Position Quality text
jpayne@68 249
jpayne@68 250 r1_cyclenbase_png = None ## Read 1 Percent N by Read Position (*.r1.fastq.base.stats.Npercent.png) --> Read 1 Cycle N Base Percent plot
jpayne@68 251 r2_cyclenbase_png = None ## Read 2 Percent N by Read Position (*.r2.fastq.base.stats.Npercent.png) --> Read 2 Cycle N Base Percent plot
jpayne@68 252
jpayne@68 253 r1_cyclenbase_txt = None ## Read 1 Percent N by Read Position Text (*.r1.fastq.base.stats) --> Read 1 Cycle N Base Percent text
jpayne@68 254 #r2_cyclenbase_txt = None ## Read 2 Percent N by Read Position Text (*.r2.fastq.base.stats) --> Read 2 Cycle N Base Percent text
jpayne@68 255 r1_r2_cyclenbase_txt = None ## Merged Percent N by Read Position Text (*.r2.fastq.base.stats) --> Merged Cycle N Base Percent text
jpayne@68 256
jpayne@68 257 r1_cyclenbase_html = None
jpayne@68 258 r2_cyclenbase_html = None
jpayne@68 259
jpayne@68 260 r1_nuclcompfreq_png = None ## Read 1 Nucleotide Composition Frequency Plot (*.r1.stats.png)
jpayne@68 261 r2_nuclcompfreq_png = None ## Read 2 Nucleotide Composition Frequency Plot (*.r2.stats.png)
jpayne@68 262 r1_nuclcompfreq_html = None
jpayne@68 263 r2_nuclcompfreq_html = None
jpayne@68 264
jpayne@68 265 ## Single-ended
jpayne@68 266 #se_baseposqual_txt = None ## Average Base Position Quality Text (*.qrpt)
jpayne@68 267 #se_nuclcompfreq_png = None ## Cycle Nucleotide Composition (*.stats.png)
jpayne@68 268
jpayne@68 269 if isPairedEnd:
jpayne@68 270 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_1, readLenR1, log)
jpayne@68 271 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_2, readLenR2, log)
jpayne@68 272
jpayne@68 273 else:
jpayne@68 274 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_LENGTH_1, readLength, log)
jpayne@68 275
jpayne@68 276 ## Average Base Position Quality Plot/Text using qhist.txt
jpayne@68 277 r1_r2_baseposqual_txt, r1_r2_baseposqual_png, r1_r2_baseposqual_html = gen_average_base_position_quality_plot(fastq, isPairedEnd, log) ## .reformat.qhist.txt
jpayne@68 278
jpayne@68 279 log.debug("Outputs: %s %s %s", r1_r2_baseposqual_png, r1_r2_baseposqual_html, r1_r2_baseposqual_txt)
jpayne@68 280
jpayne@68 281 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_PLOT_MERGED, r1_r2_baseposqual_png, log)
jpayne@68 282 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_PLOT_MERGED_D3_HTML_PLOT, r1_r2_baseposqual_html, log)
jpayne@68 283 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_1, r1_r2_baseposqual_txt, log) ## for backward compatibility
jpayne@68 284 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_2, r1_r2_baseposqual_txt, log) ## for backward compatibility
jpayne@68 285 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_MERGED, r1_r2_baseposqual_txt, log)
jpayne@68 286
jpayne@68 287 ## Average Base Position Quality Plot/Text using bqhist.txt
jpayne@68 288 r1_r2_baseposqual_box_txt, r1_baseposqual_box_png, r2_baseposqual_box_png, r1_baseposqual_box_html, r2_baseposqual_box_html = gen_average_base_position_quality_boxplot(fastq, log)
jpayne@68 289
jpayne@68 290 log.debug("Read qual outputs: %s %s %s %s %s", r1_r2_baseposqual_box_txt, r1_baseposqual_box_png, r1_baseposqual_box_html, r2_baseposqual_box_png, r2_baseposqual_box_html)
jpayne@68 291
jpayne@68 292 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_1, r1_baseposqual_box_png, log)
jpayne@68 293 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_D3_HTML_BOXPLOT_1, r1_baseposqual_box_html, log)
jpayne@68 294 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_2, r2_baseposqual_box_png, log)
jpayne@68 295 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_D3_HTML_BOXPLOT_2, r2_baseposqual_box_html, log)
jpayne@68 296 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QUAL_POS_QRPT_BOXPLOT_TEXT, r1_r2_baseposqual_box_txt, log)
jpayne@68 297
jpayne@68 298 ## ----------------------------------------------------------------------------------------------------
jpayne@68 299 ## compute Q20 of the two reads
jpayne@68 300 q20Read1 = None
jpayne@68 301 q20Read2 = None
jpayne@68 302
jpayne@68 303 ## using bqhist.txt
jpayne@68 304 if r1_r2_baseposqual_box_txt:
jpayne@68 305 q20Read1 = q20_score_new(r1_r2_baseposqual_box_txt, 1, log)
jpayne@68 306 if isPairedEnd:
jpayne@68 307 q20Read2 = q20_score_new(r1_r2_baseposqual_box_txt, 2, log)
jpayne@68 308
jpayne@68 309 log.debug("q20 for read 1 = %s", q20Read1)
jpayne@68 310 log.debug("q20 for read 2 = %s", q20Read2)
jpayne@68 311
jpayne@68 312 if q20Read1 is not None:
jpayne@68 313 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_Q20_READ1, q20Read1, log)
jpayne@68 314 else:
jpayne@68 315 log.error("Failed to get q20 read 1 from %s", r1_r2_baseposqual_box_txt)
jpayne@68 316 status = "4_illumina_read_quality_stats failed"
jpayne@68 317 checkpoint_step_wrapper(status)
jpayne@68 318 return status
jpayne@68 319
jpayne@68 320 if isPairedEnd:
jpayne@68 321 if q20Read2 is not None:
jpayne@68 322 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_Q20_READ2, q20Read2, log)
jpayne@68 323 else:
jpayne@68 324 log.error("Failed to get q20 read 2 from %s", r1_r2_baseposqual_box_txt)
jpayne@68 325 status = "4_illumina_read_quality_stats failed"
jpayne@68 326 checkpoint_step_wrapper(status)
jpayne@68 327 return status
jpayne@68 328
jpayne@68 329 r1_r2_cyclenbase_txt, r1_nuclcompfreq_png, r1_nuclcompfreq_html, r2_nuclcompfreq_png, r2_nuclcompfreq_html = gen_cycle_nucleotide_composition_plot(fastq, readLength, isPairedEnd, log)
jpayne@68 330
jpayne@68 331 log.debug("gen_cycle_nucleotide_composition_plot() ==> %s %s %s %s %s", r1_cyclenbase_txt, r1_nuclcompfreq_png, r1_nuclcompfreq_html, r2_nuclcompfreq_png, r2_nuclcompfreq_html)
jpayne@68 332
jpayne@68 333 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_TEXT_1, r1_r2_cyclenbase_txt, log)
jpayne@68 334 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_TEXT_2, r1_r2_cyclenbase_txt, log) # reformat.sh generates a single merged output file.
jpayne@68 335
jpayne@68 336 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_PLOT_1, r1_nuclcompfreq_png, log)
jpayne@68 337 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_1, r1_nuclcompfreq_html, log)
jpayne@68 338
jpayne@68 339 if r2_nuclcompfreq_png:
jpayne@68 340 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_PLOT_2, r2_nuclcompfreq_png, log)
jpayne@68 341 if r2_nuclcompfreq_html:
jpayne@68 342 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_2, r2_nuclcompfreq_html, log)
jpayne@68 343
jpayne@68 344 ### ---------------------------------------------------------------------------------------------------
jpayne@68 345 ## using bhist.txt
jpayne@68 346 r1_cyclenbase_txt, r1_cyclenbase_png, r1_cyclenbase_html, r2_cyclenbase_png, r2_cyclenbase_html = gen_cycle_n_base_percent_plot(fastq, readLength, isPairedEnd, log)
jpayne@68 347
jpayne@68 348 log.debug("Outputs: %s %s %s", r1_cyclenbase_txt, r1_cyclenbase_png, r1_cyclenbase_html)
jpayne@68 349
jpayne@68 350 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_TEXT_1, r1_cyclenbase_txt, log)
jpayne@68 351 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_TEXT_2, r1_cyclenbase_txt, log)
jpayne@68 352
jpayne@68 353 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_PLOT_1, r1_cyclenbase_png, log)
jpayne@68 354 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_D3_HTML_PLOT_1, r1_cyclenbase_html, log)
jpayne@68 355
jpayne@68 356 if r2_cyclenbase_png:
jpayne@68 357 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_PLOT_2, r2_cyclenbase_png, log)
jpayne@68 358 if r2_cyclenbase_html:
jpayne@68 359 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_PERCENTAGE_D3_HTML_PLOT_2, r2_cyclenbase_html, log)
jpayne@68 360
jpayne@68 361 log.info("4_illumina_read_quality_stats complete.")
jpayne@68 362 status = "4_illumina_read_quality_stats complete"
jpayne@68 363 checkpoint_step_wrapper(status)
jpayne@68 364
jpayne@68 365
jpayne@68 366 return status
jpayne@68 367
jpayne@68 368
jpayne@68 369
jpayne@68 370 """
jpayne@68 371 STEP5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 372
jpayne@68 373 """
jpayne@68 374 def do_write_base_quality_stats(fastq, log):
jpayne@68 375 log.info("\n\n%sSTEP5 - Calculating base quality statistics for reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 376
jpayne@68 377 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 378 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 379
jpayne@68 380 status = "5_illumina_read_quality_stats in progress"
jpayne@68 381 checkpoint_step_wrapper(status)
jpayne@68 382 log.info("5_illumina_read_quality_stats in progress.")
jpayne@68 383
jpayne@68 384 reformatObqhistFile = write_avg_base_quality_stats(fastq, log) ## *.reformat.obqhist.txt
jpayne@68 385
jpayne@68 386 if not reformatObqhistFile:
jpayne@68 387 log.error("5_illumina_read_quality_stats failed.")
jpayne@68 388 status = "5_illumina_read_quality_stats failed"
jpayne@68 389 checkpoint_step_wrapper(status)
jpayne@68 390
jpayne@68 391 else:
jpayne@68 392 ## Generate qual scores and plots of read level QC
jpayne@68 393 statsDict = {}
jpayne@68 394
jpayne@68 395 retCode = base_level_qual_stats(statsDict, reformatObqhistFile, log)
jpayne@68 396
jpayne@68 397 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 398 log.error("5_illumina_read_quality_stats failed.")
jpayne@68 399 status = "5_illumina_read_quality_stats failed"
jpayne@68 400 checkpoint_step_wrapper(status)
jpayne@68 401
jpayne@68 402 else:
jpayne@68 403 for k, v in statsDict.items():
jpayne@68 404 append_rqc_stats(statsFile, k, str(v), log)
jpayne@68 405
jpayne@68 406 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_BASE_QUALITY_STATS, reformatObqhistFile, log)
jpayne@68 407
jpayne@68 408 log.info("5_illumina_read_quality_stats complete.")
jpayne@68 409 status = "5_illumina_read_quality_stats complete"
jpayne@68 410 checkpoint_step_wrapper(status)
jpayne@68 411
jpayne@68 412
jpayne@68 413 return status
jpayne@68 414
jpayne@68 415
jpayne@68 416
jpayne@68 417 """
jpayne@68 418 STEP6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 419
jpayne@68 420 """
jpayne@68 421 def do_illumina_count_q_score(fastq, log):
jpayne@68 422 log.info("\n\n%sSTEP6 - Generating quality score histogram <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 423
jpayne@68 424 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 425 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 426
jpayne@68 427 status = "6_illumina_count_q_score in progress"
jpayne@68 428 checkpoint_step_wrapper(status)
jpayne@68 429 log.info("6_illumina_count_q_score in progress.")
jpayne@68 430
jpayne@68 431 qhistTxtFile, qhistPngFile, qhistHtmlPlotFile = illumina_count_q_score(fastq, log) ## *.obqhist.txt
jpayne@68 432
jpayne@68 433 if not qhistTxtFile:
jpayne@68 434 log.error("6_illumina_count_q_score failed.")
jpayne@68 435 status = "6_illumina_count_q_score failed"
jpayne@68 436 checkpoint_step_wrapper(status)
jpayne@68 437
jpayne@68 438 else:
jpayne@68 439 ## save qscores in statsFile
jpayne@68 440 qscore = {}
jpayne@68 441 read_level_qual_stats(qscore, qhistTxtFile, log)
jpayne@68 442
jpayne@68 443 for k, v in qscore.items():
jpayne@68 444 append_rqc_stats(statsFile, k, str(v), log)
jpayne@68 445
jpayne@68 446 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_TEXT, qhistTxtFile, log)
jpayne@68 447 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_PLOT, qhistPngFile, log)
jpayne@68 448 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_QHIST_D3_HTML_PLOT, qhistHtmlPlotFile, log)
jpayne@68 449
jpayne@68 450 log.info("6_illumina_count_q_score complete.")
jpayne@68 451 status = "6_illumina_count_q_score complete"
jpayne@68 452 checkpoint_step_wrapper(status)
jpayne@68 453
jpayne@68 454
jpayne@68 455 return status
jpayne@68 456
jpayne@68 457
jpayne@68 458
jpayne@68 459 """
jpayne@68 460 STEP7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 461
jpayne@68 462 """
jpayne@68 463 ## 20140903 removed
jpayne@68 464 ##def do_illumina_calculate_average_quality(fastq, log):
jpayne@68 465
jpayne@68 466
jpayne@68 467
jpayne@68 468 """
jpayne@68 469 STEP8 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 470
jpayne@68 471 """
jpayne@68 472 def do_illumina_find_common_motifs(fastq, log):
jpayne@68 473 log.info("\n\n%sSTEP8 - Locating N stutter motifs in sequence reads <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 474
jpayne@68 475 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 476 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 477
jpayne@68 478 status = "8_illumina_find_common_motifs in progress"
jpayne@68 479 checkpoint_step_wrapper(status)
jpayne@68 480 log.info(status)
jpayne@68 481
jpayne@68 482 retCode, statDataFile = illumina_find_common_motifs(fastq, log)
jpayne@68 483
jpayne@68 484 log.info("nstutter statDataFile name = %s", statDataFile)
jpayne@68 485
jpayne@68 486 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 487 log.error("8_illumina_find_common_motifs failed.")
jpayne@68 488 status = "8_illumina_find_common_motifs failed"
jpayne@68 489 checkpoint_step_wrapper(status)
jpayne@68 490
jpayne@68 491 else:
jpayne@68 492 ## read_level_stutter
jpayne@68 493 ## ex)
jpayne@68 494 ##688 N----------------------------------------------------------------------------------------------------------------------------
jpayne@68 495 ##-------------------------
jpayne@68 496 ##346 NNNNNNNNNNNNN----------------------------------------------------------------------------------------------------------------
jpayne@68 497 ##-------------------------
jpayne@68 498 ##53924 ------------N----------------------------------------------------------------------------------------------------------------
jpayne@68 499 ##-------------------------
jpayne@68 500 ##sum pct patterNs past 0.1 == 15.9245930330268 ( 54958 / 345114 * 100 )
jpayne@68 501
jpayne@68 502 with open(statDataFile, "r") as stutFH:
jpayne@68 503 lines = stutFH.readlines()
jpayne@68 504
jpayne@68 505 ## if no motifs are detected the file is empty
jpayne@68 506 if not lines:
jpayne@68 507 log.warning("The *.nstutter.stat file is not available in function read_level_stutter(). The function still returns JGI_SUCCESS.")
jpayne@68 508
jpayne@68 509 else:
jpayne@68 510 assert lines[-1].find("patterNs") != -1
jpayne@68 511 t = lines[-1].split()
jpayne@68 512 ## ["sum", "pct", "patterNs", "past", "0.1", "==", "15.9245930330268", "(", "54958", "/", "345114", "*", "100", ")"]
jpayne@68 513 percent = "%.2f" % float(t[6])
jpayne@68 514
jpayne@68 515 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_N_FREQUENCE, percent, log)
jpayne@68 516 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_N_PATTERN, str("".join(lines[:-1])), log)
jpayne@68 517
jpayne@68 518 ## NOTE: ???
jpayne@68 519 append_rqc_file(filesFile, "find_common_motifs.dataFile", statDataFile, log)
jpayne@68 520
jpayne@68 521 log.info("8_illumina_find_common_motifs complete.")
jpayne@68 522 status = "8_illumina_find_common_motifs complete"
jpayne@68 523 checkpoint_step_wrapper(status)
jpayne@68 524
jpayne@68 525
jpayne@68 526 return status
jpayne@68 527
jpayne@68 528
jpayne@68 529 """
jpayne@68 530 STEP11 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 531
jpayne@68 532 """
jpayne@68 533 def do_illumina_detect_read_contam(fastq, bpToCut, log):
jpayne@68 534 log.info("\n\n%sSTEP11 - Detect read contam <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 535
jpayne@68 536 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 537 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 538
jpayne@68 539 status = "11_illumina_detect_read_contam in progress"
jpayne@68 540 checkpoint_step_wrapper(status)
jpayne@68 541 log.info("11_illumina_detect_read_contam in progress.")
jpayne@68 542
jpayne@68 543
jpayne@68 544 #########
jpayne@68 545 ## seal
jpayne@68 546 #########
jpayne@68 547 retCode2, outFileDict2, ratioResultDict2, contamStatDict = illumina_detect_read_contam3(fastq, bpToCut, log) ## seal version
jpayne@68 548
jpayne@68 549 if retCode2 != RQCExitCodes.JGI_SUCCESS:
jpayne@68 550 log.error("11_illumina_detect_read_contam seal version failed.")
jpayne@68 551 status = "11_illumina_detect_read_contam failed"
jpayne@68 552 checkpoint_step_wrapper(status)
jpayne@68 553
jpayne@68 554 else:
jpayne@68 555 for k, v in outFileDict2.items():
jpayne@68 556 append_rqc_file(filesFile, k + " seal", str(v), log)
jpayne@68 557 append_rqc_file(filesFile, k, str(v), log)
jpayne@68 558
jpayne@68 559 for k, v in ratioResultDict2.items():
jpayne@68 560 append_rqc_stats(statsFile, k + " seal", str(v), log)
jpayne@68 561 append_rqc_stats(statsFile, k, str(v), log)
jpayne@68 562
jpayne@68 563 ## contamination stat
jpayne@68 564 for k, v in contamStatDict.items():
jpayne@68 565 append_rqc_stats(statsFile, k, str(v), log)
jpayne@68 566
jpayne@68 567 log.info("11_illumina_detect_read_contam seal version complete.")
jpayne@68 568 status = "11_illumina_detect_read_contam complete"
jpayne@68 569 checkpoint_step_wrapper(status)
jpayne@68 570
jpayne@68 571
jpayne@68 572 return status
jpayne@68 573
jpayne@68 574
jpayne@68 575
jpayne@68 576 """
jpayne@68 577 STEP13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 578
jpayne@68 579 """
jpayne@68 580 ## Removed!!
jpayne@68 581 ##def do_illumina_read_megablast(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log, blastDbPath=None):
jpayne@68 582
jpayne@68 583
jpayne@68 584
jpayne@68 585 """
jpayne@68 586 New STEP13 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 587
jpayne@68 588 """
jpayne@68 589 def do_illumina_subsampling_read_blastn(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log):
jpayne@68 590 log.info("\n\n%sSTEP13 - Run subsampling for Blast search <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 591
jpayne@68 592 subsampeldFastqFile = None
jpayne@68 593 totalReadNum = 0
jpayne@68 594 readNumToReturn = 0
jpayne@68 595
jpayne@68 596 status = "13_illumina_subsampling_read_megablast in progress"
jpayne@68 597 checkpoint_step_wrapper(status)
jpayne@68 598 log.info("13_illumina_subsampling_read_megablast in progress.")
jpayne@68 599
jpayne@68 600 if subsampledReadNum == 0:
jpayne@68 601 cmd = " ".join(["grep", "-c", "'^+'", firstSubsampledFastqFileName])
jpayne@68 602
jpayne@68 603 stdOut, _, exitCode = run_command(cmd, True, log)
jpayne@68 604
jpayne@68 605 if exitCode != 0:
jpayne@68 606 log.error("Failed to run grep cmd")
jpayne@68 607 return RQCExitCodes.JGI_FAILURE, None, None, None
jpayne@68 608
jpayne@68 609 else:
jpayne@68 610 readNum = int(stdOut)
jpayne@68 611
jpayne@68 612 else:
jpayne@68 613 readNum = subsampledReadNum
jpayne@68 614
jpayne@68 615 log.info("Subsampled read number = %d", readNum)
jpayne@68 616
jpayne@68 617 sampl_per = RQCReadQc.ILLUMINA_SAMPLE_PCTENTAGE ## 0.01
jpayne@68 618 max_count = RQCReadQc.ILLUMINA_SAMPLE_COUNT ## 50000
jpayne@68 619
jpayne@68 620 ## TODO
jpayne@68 621 ## Use "samplereadstarget" option in reformat.sh
jpayne@68 622
jpayne@68 623 if skipSubsampling:
jpayne@68 624 log.debug("No subsampling for megablast. Use fastq file, %s (readnum = %s) as query for megablast.", firstSubsampledFastqFileName, readNum)
jpayne@68 625 subsampeldFastqFile = firstSubsampledFastqFileName
jpayne@68 626 readNumToReturn = readNum
jpayne@68 627
jpayne@68 628 else:
jpayne@68 629 if readNum > max_count:
jpayne@68 630 log.debug("Run the 2nd subsampling for running megablast.")
jpayne@68 631 secondSubsamplingRate = float(max_count) / readNum
jpayne@68 632 log.debug("SecondSubsamplingRate=%s, max_count=%s, readNum=%s", secondSubsamplingRate, max_count, readNum)
jpayne@68 633 log.info("Second subsampling of Reads after Percent Subsampling reads = %s with new percent subsampling %f.", readNum, secondSubsamplingRate)
jpayne@68 634
jpayne@68 635 secondSubsampledFastqFile = ""
jpayne@68 636 # dataFile = ""
jpayne@68 637
jpayne@68 638 sequnitFileName = os.path.basename(firstSubsampledFastqFileName)
jpayne@68 639 sequnitFileName = sequnitFileName.replace(".fastq", "").replace(".gz", "")
jpayne@68 640
jpayne@68 641 secondSubsampledFastqFile = sequnitFileName + ".s" + str(sampl_per) + ".s" + str(secondSubsamplingRate) + ".n" + str(max_count) + ".fastq"
jpayne@68 642
jpayne@68 643 ## ex) fq_sub_sample.pl -f .../7601.1.77813.CTTGTA.s0.01.fastq -o .../7601.1.77813.CTTGTA.s0.01.stats -r 0.0588142575171 > .../7601.1.77813.CTTGTA.s0.01.s0.01.s0.0588142575171.n50000.fastq
jpayne@68 644 ## ex) reformat.sh
jpayne@68 645 retCode, secondSubsampledFastqFile, totalBaseCount, totalReadNum, subsampledReadNum, _, _ = fast_subsample_fastq_sequences(firstSubsampledFastqFileName, secondSubsampledFastqFile, secondSubsamplingRate, False, log)
jpayne@68 646
jpayne@68 647 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 648 log.error("Second subsampling failed.")
jpayne@68 649 return RQCExitCodes.JGI_FAILURE, None, None, None
jpayne@68 650
jpayne@68 651 else:
jpayne@68 652 log.info("Second subsampling complete.")
jpayne@68 653 log.info("Second Subsampling Total Base Count = %s.", totalBaseCount)
jpayne@68 654 log.info("Second Subsampling Total Reads = %s.", totalReadNum)
jpayne@68 655 log.info("Second Subsampling Sampled Reads = %s.", subsampledReadNum)
jpayne@68 656
jpayne@68 657 if subsampledReadNum == 0:
jpayne@68 658 log.warning("Too small first subsampled fastq file. Skip the 2nd sampling.")
jpayne@68 659 secondSubsampledFastqFile = firstSubsampledFastqFileName
jpayne@68 660
jpayne@68 661 subsampeldFastqFile = secondSubsampledFastqFile
jpayne@68 662 readNumToReturn = subsampledReadNum
jpayne@68 663
jpayne@68 664 else:
jpayne@68 665 log.debug("The readNum is smaller than max_count=%s. The 2nd sampling skipped.", str(max_count))
jpayne@68 666 subsampeldFastqFile = firstSubsampledFastqFileName
jpayne@68 667 totalReadNum = readNum
jpayne@68 668 readNumToReturn = readNum
jpayne@68 669
jpayne@68 670 log.info("13_illumina_subsampling_read_megablast complete.")
jpayne@68 671 status = "13_illumina_subsampling_read_megablast complete"
jpayne@68 672 checkpoint_step_wrapper(status)
jpayne@68 673
jpayne@68 674
jpayne@68 675 return status, subsampeldFastqFile, totalReadNum, readNumToReturn
jpayne@68 676
jpayne@68 677
jpayne@68 678
jpayne@68 679 ##"""
jpayne@68 680 ##STEP14 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 681 ##
jpayne@68 682 ##"""
jpayne@68 683 ##def do_illumina_read_blastn_refseq_microbial(subsampeldFastqFile, subsampledReadNum, log, blastDbPath=None):
jpayne@68 684 ## 12212015 sulsj REMOVED!
jpayne@68 685
jpayne@68 686
jpayne@68 687 """
jpayne@68 688 New STEP14 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 689
jpayne@68 690 """
jpayne@68 691 def do_illumina_read_blastn_refseq_archaea(subsampeldFastqFile, subsampledReadNum, log):
jpayne@68 692 log.info("\n\n%sSTEP14 - Run read blastn against refseq archaea <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 693
jpayne@68 694 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 695 retCode = None
jpayne@68 696
jpayne@68 697 log.info("14_illumina_read_blastn_refseq_archaea in progress.")
jpayne@68 698 status = "14_illumina_read_blastn_refseq_archaea in progress"
jpayne@68 699 checkpoint_step_wrapper(status)
jpayne@68 700
jpayne@68 701 retCode = illumina_read_blastn_refseq_archaea(subsampeldFastqFile, log)
jpayne@68 702
jpayne@68 703 if retCode == RQCExitCodes.JGI_FAILURE:
jpayne@68 704 log.error("14_illumina_read_blastn_refseq_archaea failed.")
jpayne@68 705 status = "14_illumina_read_blastn_refseq_archaea failed"
jpayne@68 706
jpayne@68 707 elif retCode == -143: ## timeout
jpayne@68 708 log.warning("14_illumina_read_blastn_refseq_archaea timeout.")
jpayne@68 709 status = "14_illumina_read_blastn_refseq_archaea complete"
jpayne@68 710
jpayne@68 711 else:
jpayne@68 712 ## read number used in blast search?
jpayne@68 713 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READS_NUMBER, subsampledReadNum, log)
jpayne@68 714
jpayne@68 715 ret2 = read_megablast_hits("refseq.archaea", log)
jpayne@68 716
jpayne@68 717 if ret2 != RQCExitCodes.JGI_SUCCESS:
jpayne@68 718 log.error("Errors in read_megablast_hits() of refseq.microbial")
jpayne@68 719 log.error("14_illumina_read_blastn_refseq_archaea reporting failed.")
jpayne@68 720 status = "14_illumina_read_blastn_refseq_archaea failed"
jpayne@68 721
jpayne@68 722 else:
jpayne@68 723 log.info("14_illumina_read_blastn_refseq_archaea complete.")
jpayne@68 724 status = "14_illumina_read_blastn_refseq_archaea complete"
jpayne@68 725
jpayne@68 726
jpayne@68 727 return status
jpayne@68 728
jpayne@68 729
jpayne@68 730 """
jpayne@68 731 STEP15 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 732
jpayne@68 733 """
jpayne@68 734 def do_illumina_read_blastn_refseq_bacteria(subsampeldFastqFile, log):
jpayne@68 735 log.info("\n\n%sSTEP15 - Run read blastn against refseq bacteria <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 736
jpayne@68 737 #statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 738 #retCode = None
jpayne@68 739
jpayne@68 740 log.info("15_illumina_read_blastn_refseq_bacteria in progress.")
jpayne@68 741 status = "15_illumina_read_blastn_refseq_bacteria in progress"
jpayne@68 742 checkpoint_step_wrapper(status)
jpayne@68 743
jpayne@68 744 retCode = illumina_read_blastn_refseq_bacteria(subsampeldFastqFile, log)
jpayne@68 745
jpayne@68 746 if retCode == RQCExitCodes.JGI_FAILURE:
jpayne@68 747 log.error("15_illumina_read_blastn_refseq_bacteria failed.")
jpayne@68 748 status = "15_illumina_read_blastn_refseq_bacteria failed"
jpayne@68 749
jpayne@68 750 elif retCode == -143: ## timeout
jpayne@68 751 log.warning("15_illumina_read_blastn_refseq_bacteria timeout.")
jpayne@68 752 status = "15_illumina_read_blastn_refseq_bacteria complete"
jpayne@68 753
jpayne@68 754 else:
jpayne@68 755 ret2 = read_megablast_hits("refseq.bacteria", log)
jpayne@68 756
jpayne@68 757 if ret2 != RQCExitCodes.JGI_SUCCESS:
jpayne@68 758 log.error("Errors in read_megablast_hits() of refseq.microbial")
jpayne@68 759 log.error("15_illumina_read_blastn_refseq_bacteria reporting failed.")
jpayne@68 760 status = "15_illumina_read_blastn_refseq_bacteria failed"
jpayne@68 761
jpayne@68 762 else:
jpayne@68 763 log.info("15_illumina_read_blastn_refseq_bacteria complete.")
jpayne@68 764 status = "15_illumina_read_blastn_refseq_bacteria complete"
jpayne@68 765
jpayne@68 766
jpayne@68 767 return status
jpayne@68 768
jpayne@68 769
jpayne@68 770
jpayne@68 771 """
jpayne@68 772 STEP16 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 773
jpayne@68 774 """
jpayne@68 775 def do_illumina_read_blastn_nt(subsampeldFastqFile, log):
jpayne@68 776 log.info("\n\n%sSTEP16 - Run read blastn against nt <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 777
jpayne@68 778 retCode = None
jpayne@68 779
jpayne@68 780 log.info("16_illumina_read_blastn_nt in progress.")
jpayne@68 781 status = "16_illumina_read_blastn_nt in progress"
jpayne@68 782 checkpoint_step_wrapper(status)
jpayne@68 783
jpayne@68 784 retCode = illumina_read_blastn_nt(subsampeldFastqFile, log)
jpayne@68 785
jpayne@68 786 if retCode == RQCExitCodes.JGI_FAILURE:
jpayne@68 787 log.error("16_illumina_read_blastn_nt failed.")
jpayne@68 788 status = "16_illumina_read_blastn_nt failed"
jpayne@68 789
jpayne@68 790 elif retCode == -143: ## timeout
jpayne@68 791 log.warning("16_illumina_read_blastn_nt timeout.")
jpayne@68 792 status = "16_illumina_read_blastn_nt complete"
jpayne@68 793
jpayne@68 794 else:
jpayne@68 795 ret2 = read_megablast_hits("nt", log)
jpayne@68 796
jpayne@68 797 if ret2 != RQCExitCodes.JGI_SUCCESS:
jpayne@68 798 log.error("Errors in read_megablast_hits() of nt")
jpayne@68 799 log.error("16_illumina_read_blastn_nt reporting failed.")
jpayne@68 800 status = "16_illumina_read_blastn_nt failed"
jpayne@68 801
jpayne@68 802 else:
jpayne@68 803 log.info("16_illumina_read_blastn_nt complete.")
jpayne@68 804 status = "16_illumina_read_blastn_nt complete"
jpayne@68 805
jpayne@68 806
jpayne@68 807 return status
jpayne@68 808
jpayne@68 809
jpayne@68 810
jpayne@68 811 """
jpayne@68 812 STEP17 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 813
jpayne@68 814 do_illumina_multiplex_statistics
jpayne@68 815
jpayne@68 816 Demultiplexing analysis for pooled lib
jpayne@68 817
jpayne@68 818 """
jpayne@68 819 def do_illumina_multiplex_statistics(fastq, log, isMultiplexed=None):
jpayne@68 820 log.info("\n\n%sSTEP17 - Run Multiplex statistics analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 821
jpayne@68 822 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 823
jpayne@68 824 log.info("17_multiplex_statistics in progress.")
jpayne@68 825 status = "17_multiplex_statistics in progress"
jpayne@68 826 checkpoint_step_wrapper(status)
jpayne@68 827
jpayne@68 828 log.debug("fastq file: %s", fastq)
jpayne@68 829
jpayne@68 830 retCode, demultiplexStatsFile, detectionPngPlotFile, detectionHtmlPlotFile = illumina_generate_index_sequence_detection_plot(fastq, log, isMultiplexed=isMultiplexed)
jpayne@68 831
jpayne@68 832 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 833 log.error("17_multiplex_statistics failed.")
jpayne@68 834 status = "17_multiplex_statistics failed"
jpayne@68 835 checkpoint_step_wrapper(status)
jpayne@68 836
jpayne@68 837 else:
jpayne@68 838 log.info("17_multiplex_statistics complete.")
jpayne@68 839 status = "17_multiplex_statistics complete"
jpayne@68 840 checkpoint_step_wrapper(status)
jpayne@68 841
jpayne@68 842 if detectionPngPlotFile is not None:
jpayne@68 843 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS_PLOT, detectionPngPlotFile, log)
jpayne@68 844
jpayne@68 845 if detectionHtmlPlotFile is not None:
jpayne@68 846 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS_D3_HTML_PLOT, detectionHtmlPlotFile, log)
jpayne@68 847
jpayne@68 848 if demultiplexStatsFile is not None:
jpayne@68 849 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_DEMULTIPLEX_STATS, demultiplexStatsFile, log)
jpayne@68 850
jpayne@68 851
jpayne@68 852 return status
jpayne@68 853
jpayne@68 854
jpayne@68 855
jpayne@68 856 """
jpayne@68 857 STEP18 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 858
jpayne@68 859 do_end_of_read_illumina_adapter_check
jpayne@68 860
jpayne@68 861 """
jpayne@68 862 def do_end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log):
jpayne@68 863 log.info("\n\n%sSTEP18 - Run end_of_read_illumina_adapter_check analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 864
jpayne@68 865 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 866 plotFile = None
jpayne@68 867 dataFile = None
jpayne@68 868
jpayne@68 869 log.info("18_end_of_read_illumina_adapter_check in progress.")
jpayne@68 870 status = "18_end_of_read_illumina_adapter_check in progress"
jpayne@68 871 checkpoint_step_wrapper(status)
jpayne@68 872
jpayne@68 873 log.debug("sampled fastq file: %s", firstSubsampledFastqFileName)
jpayne@68 874
jpayne@68 875 retCode, dataFile, plotFile, htmlFile = end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log)
jpayne@68 876
jpayne@68 877 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 878 log.error("18_end_of_read_illumina_adapter_check failed.")
jpayne@68 879 status = "18_end_of_read_illumina_adapter_check failed"
jpayne@68 880 checkpoint_step_wrapper(status)
jpayne@68 881
jpayne@68 882 else:
jpayne@68 883 log.info("18_end_of_read_illumina_adapter_check complete.")
jpayne@68 884 status = "18_end_of_read_illumina_adapter_check complete"
jpayne@68 885 checkpoint_step_wrapper(status)
jpayne@68 886
jpayne@68 887 if plotFile is not None:
jpayne@68 888 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_PLOT, plotFile, log)
jpayne@68 889 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_D3_HTML_PLOT, htmlFile, log)
jpayne@68 890
jpayne@68 891 if dataFile is not None:
jpayne@68 892 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_END_OF_READ_ADAPTER_CHECK_DATA, dataFile, log)
jpayne@68 893
jpayne@68 894
jpayne@68 895 return status
jpayne@68 896
jpayne@68 897
jpayne@68 898
jpayne@68 899 """
jpayne@68 900 STEP19 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 901
jpayne@68 902 do_insert_size_analysis
jpayne@68 903
jpayne@68 904 """
jpayne@68 905 def do_insert_size_analysis(fastq, log):
jpayne@68 906 log.info("\n\n%sSTEP19 - Run insert size analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 907
jpayne@68 908 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 909 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 910
jpayne@68 911 plotFile = None
jpayne@68 912 dataFile = None
jpayne@68 913
jpayne@68 914 log.info("19_insert_size_analysis in progress.")
jpayne@68 915 status = "19_insert_size_analysis in progress"
jpayne@68 916 checkpoint_step_wrapper(status)
jpayne@68 917
jpayne@68 918 log.debug("fastq file used: %s", fastq)
jpayne@68 919
jpayne@68 920 retCode, dataFile, plotFile, htmlFile, statsDict = insert_size_analysis(fastq, log) ## by bbmerge.sh
jpayne@68 921
jpayne@68 922 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 923 log.error("19_insert_size_analysis failed.")
jpayne@68 924 status = "19_insert_size_analysis failed"
jpayne@68 925 checkpoint_step_wrapper(status)
jpayne@68 926
jpayne@68 927 else:
jpayne@68 928 if plotFile is not None:
jpayne@68 929 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_PLOT, plotFile, log)
jpayne@68 930
jpayne@68 931 if dataFile is not None:
jpayne@68 932 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_DATA, dataFile, log)
jpayne@68 933
jpayne@68 934 if htmlFile is not None:
jpayne@68 935 append_rqc_file(filesFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_HISTO_D3_HTML_PLOT, htmlFile, log)
jpayne@68 936
jpayne@68 937 if statsDict:
jpayne@68 938 try:
jpayne@68 939 ## --------------------------------------------------------------------------------------------------------
jpayne@68 940 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AVG_INSERT, statsDict["avg_insert"], log)
jpayne@68 941 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_STD_INSERT, statsDict["std_insert"], log)
jpayne@68 942 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_MODE_INSERT, statsDict["mode_insert"], log)
jpayne@68 943 ## --------------------------------------------------------------------------------------------------------
jpayne@68 944
jpayne@68 945 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOTAL_TIME, statsDict["total_time"], log)
jpayne@68 946 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NUM_READS, statsDict["num_reads"], log)
jpayne@68 947 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_JOINED_NUM, statsDict["joined_num"], log)
jpayne@68 948 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_JOINED_PERC, statsDict["joined_perc"], log)
jpayne@68 949
jpayne@68 950 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AMBIGUOUS_NUM, statsDict["ambiguous_num"], log)
jpayne@68 951 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_AMBIGUOUS_PERC, statsDict["ambiguous_perc"], log)
jpayne@68 952 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NO_SOLUTION_NUM, statsDict["no_solution_num"], log)
jpayne@68 953 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_NO_SOLUTION_PERC, statsDict["no_solution_perc"], log)
jpayne@68 954 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOO_SHORT_NUM, statsDict["too_short_num"], log)
jpayne@68 955 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_TOO_SHORT_PERC, statsDict["too_short_perc"], log)
jpayne@68 956
jpayne@68 957 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_INSERT_RANGE_START, statsDict["insert_range_start"], log)
jpayne@68 958 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_INSERT_RANGE_END, statsDict["insert_range_end"], log)
jpayne@68 959
jpayne@68 960 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_90TH_PERC, statsDict["perc_90th"], log)
jpayne@68 961 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_50TH_PERC, statsDict["perc_50th"], log)
jpayne@68 962 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_10TH_PERC, statsDict["perc_10th"], log)
jpayne@68 963 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_75TH_PERC, statsDict["perc_75th"], log)
jpayne@68 964 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_READ_INSERT_SIZE_25TH_PERC, statsDict["perc_25th"], log)
jpayne@68 965
jpayne@68 966 except KeyError:
jpayne@68 967 log.error("19_insert_size_analysis failed (KeyError).")
jpayne@68 968 status = "19_insert_size_analysis failed"
jpayne@68 969 checkpoint_step_wrapper(status)
jpayne@68 970 return status
jpayne@68 971
jpayne@68 972 log.info("19_insert_size_analysis complete.")
jpayne@68 973 status = "19_insert_size_analysis complete"
jpayne@68 974 checkpoint_step_wrapper(status)
jpayne@68 975
jpayne@68 976
jpayne@68 977 return status
jpayne@68 978
jpayne@68 979
jpayne@68 980
jpayne@68 981
jpayne@68 982 """
jpayne@68 983 STEP21 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 984
jpayne@68 985 do_sketch_vs_nt_refseq_silva
jpayne@68 986
jpayne@68 987 """
jpayne@68 988 def do_sketch_vs_nt_refseq_silva(fasta, log):
jpayne@68 989 log.info("\n\n%sSTEP21 - Run sketch vs nt, refseq, silva <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 990
jpayne@68 991 status = "21_sketch_vs_nt_refseq_silva in progress"
jpayne@68 992 checkpoint_step_wrapper(status)
jpayne@68 993
jpayne@68 994 sketchOutDir = "sketch"
jpayne@68 995 sketchOutPath = os.path.join(outputPath, sketchOutDir)
jpayne@68 996 make_dir(sketchOutPath)
jpayne@68 997 change_mod(sketchOutPath, "0755")
jpayne@68 998
jpayne@68 999 seqUnitName = os.path.basename(fasta)
jpayne@68 1000 seqUnitName = file_name_trim(seqUnitName)
jpayne@68 1001
jpayne@68 1002 filesFile = RQCReadQcConfig.CFG["files_file"]
jpayne@68 1003
jpayne@68 1004 comOptions = "ow=t colors=f printtaxa=t depth depth2 unique2 merge"
jpayne@68 1005
jpayne@68 1006 ## NT ##########################
jpayne@68 1007 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_nt.txt")
jpayne@68 1008 sendSketchShCmd = os.path.join(BBDIR, 'sendsketch.sh')
jpayne@68 1009 cmd = "%s in=%s out=%s %s nt" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
jpayne@68 1010 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
jpayne@68 1011 if exitCode != 0:
jpayne@68 1012 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
jpayne@68 1013 status = "21_sketch_vs_nt_refseq_silva failed"
jpayne@68 1014 checkpoint_step_wrapper(status)
jpayne@68 1015 return status
jpayne@68 1016
jpayne@68 1017 append_rqc_file(filesFile, "sketch_vs_nt_output", sketchOutFile, log)
jpayne@68 1018
jpayne@68 1019 ## Refseq ##########################
jpayne@68 1020 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_refseq.txt")
jpayne@68 1021 cmd = "%s in=%s out=%s %s refseq" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
jpayne@68 1022 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
jpayne@68 1023 if exitCode != 0:
jpayne@68 1024 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
jpayne@68 1025 status = "21_sketch_vs_refseq failed"
jpayne@68 1026 checkpoint_step_wrapper(status)
jpayne@68 1027 return status
jpayne@68 1028
jpayne@68 1029 append_rqc_file(filesFile, "sketch_vs_refseq_output", sketchOutFile, log)
jpayne@68 1030
jpayne@68 1031 ## Silva ##########################
jpayne@68 1032 sketchOutFile = os.path.join(sketchOutPath, seqUnitName + ".sketch_vs_silva.txt")
jpayne@68 1033 cmd = "%s in=%s out=%s %s silva" % (sendSketchShCmd, fasta, sketchOutFile, comOptions)
jpayne@68 1034 stdOut, stdErr, exitCode = run_sh_command(cmd, True, log, True)
jpayne@68 1035 if exitCode != 0:
jpayne@68 1036 log.error("Failed to run : %s, stdout : %s, stderr: %s", cmd, stdOut, stdErr)
jpayne@68 1037 status = "21_sketch_vs_silva failed"
jpayne@68 1038 checkpoint_step_wrapper(status)
jpayne@68 1039 return status
jpayne@68 1040
jpayne@68 1041 append_rqc_file(filesFile, "sketch_vs_silva_output", sketchOutFile, log)
jpayne@68 1042
jpayne@68 1043 log.info("21_sketch_vs_nt_refseq_silva complete.")
jpayne@68 1044 status = "21_sketch_vs_nt_refseq_silva complete"
jpayne@68 1045 checkpoint_step_wrapper(status)
jpayne@68 1046
jpayne@68 1047 return status
jpayne@68 1048
jpayne@68 1049
jpayne@68 1050
jpayne@68 1051 """
jpayne@68 1052 STEP22 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1053
jpayne@68 1054 do_illumina_read_level_report_postprocessing
jpayne@68 1055
jpayne@68 1056 """
jpayne@68 1057 ## Removed!!
jpayne@68 1058 #def do_illumina_read_level_report(fastq, firstSubsampledFastqFileName, log):
jpayne@68 1059
jpayne@68 1060
jpayne@68 1061 """
jpayne@68 1062 STEP23 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1063
jpayne@68 1064 do_cleanup_readqc
jpayne@68 1065
jpayne@68 1066 """
jpayne@68 1067 def do_cleanup_readqc(log):
jpayne@68 1068 log.info("\n\n%sSTEP23 - Cleanup <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1069
jpayne@68 1070 status = "23_cleanup_readqc in progress"
jpayne@68 1071 checkpoint_step_wrapper(status)
jpayne@68 1072
jpayne@68 1073 ## Get option
jpayne@68 1074 skipCleanup = RQCReadQcConfig.CFG["skip_cleanup"]
jpayne@68 1075
jpayne@68 1076 if not skipCleanup:
jpayne@68 1077 retCode = cleanup_readqc(log)
jpayne@68 1078 else:
jpayne@68 1079 log.warning("File cleaning is skipped.")
jpayne@68 1080 retCode = RQCExitCodes.JGI_SUCCESS
jpayne@68 1081
jpayne@68 1082 if retCode != RQCExitCodes.JGI_SUCCESS:
jpayne@68 1083 log.error("23_cleanup_readqc failed.")
jpayne@68 1084 status = "23_cleanup_readqc failed"
jpayne@68 1085 checkpoint_step_wrapper(status)
jpayne@68 1086
jpayne@68 1087 else:
jpayne@68 1088 log.info("23_cleanup_readqc complete.")
jpayne@68 1089 status = "23_cleanup_readqc complete"
jpayne@68 1090 checkpoint_step_wrapper(status)
jpayne@68 1091
jpayne@68 1092
jpayne@68 1093 return status
jpayne@68 1094
jpayne@68 1095
jpayne@68 1096 def stetch_section_note(table_header):
jpayne@68 1097 overview = [
jpayne@68 1098 'BBTools sketch.sh uses a technique called MinHash to rapidly compare large sequences. ',
jpayne@68 1099 'The result is similar to BLAST, a list of hits from a query sequence to various reference sequences, ',
jpayne@68 1100 'sorted by similarity but the mechanisms are very different. ',
jpayne@68 1101 'For more information, see <a href="http://bbtools.jgi.doe.gov/" target="_blank">http://bbtools.jgi.doe.gov</a>'
jpayne@68 1102 ]
jpayne@68 1103
jpayne@68 1104 legend = {
jpayne@68 1105 'WKID' : 'Weighted Kmer IDentity, which is the kmer identity compensating for differences in size. So, comparing human chr1 to the full human genome would yield 100% WKID but approximately 10% KID.',
jpayne@68 1106 'KID' : 'Kmer IDentity, equal to matches/length; this is the fraction of shared kmers.',
jpayne@68 1107 'ANI' : 'Average Nucleotide Identity, derived from WKID and kmer length.',
jpayne@68 1108 'Complt' : 'Genome completeness (percent of the reference represented in the query). Derived from WKID and KID.',
jpayne@68 1109 'Contam' : 'Contamination (percent of the query that does not match this reference, but matches some other reference).',
jpayne@68 1110 'Depth' : 'Per-kmer depth in sketches, indicating how many times that kmer was seen in the sequence.',
jpayne@68 1111 'Depth2' : 'Repeat-compensated depth',
jpayne@68 1112 'Matches' : 'The number of shared kmers between query and ref.',
jpayne@68 1113 'Unique' : 'The number of shared kmers between query and ref, and no other ref.',
jpayne@68 1114 'Unique2' : '??',
jpayne@68 1115 'Depth2' : 'Repeat-compensated depth',
jpayne@68 1116 'noHit' : 'Number of kmers that did not hit any reference sequence. Though constant for a query, it will be reported differently for different references based on the relative size of the reference and query (if the reference is bigger than the query, it will report all of them).',
jpayne@68 1117 'TaxID' : 'NCBI taxonomic id, when available.',
jpayne@68 1118 'gSize' : 'Estimate of genomic size (number of unique kmers in the genome). This is based on the smallest hash value in the list. This is affected by blacklists or whitelists, and by using an assembly versus raw reads.',
jpayne@68 1119 'gSeqs' : 'Number of sequences used in the sketch.',
jpayne@68 1120 'taxName' : 'NCBI\'s name for that taxID. If there is no taxID, the sequence name will be used.',
jpayne@68 1121 }
jpayne@68 1122
jpayne@68 1123 html = ''
jpayne@68 1124 for name in table_header:
jpayne@68 1125 if name != 'taxonomy':
jpayne@68 1126 html += html_tag('li', '%s: %s' % (html_tag('span', name, {'class': 'notice'}), html_tag('span', legend.get(name), {'class': 'small-italic'})))
jpayne@68 1127 html = html_tag('span', ''.join(overview), {'class': 'notice'}) + '<br /><br />Column Legend<br />' + html_tag('ul', html)
jpayne@68 1128
jpayne@68 1129 return html
jpayne@68 1130
jpayne@68 1131
jpayne@68 1132 def sketch_table(fname):
jpayne@68 1133 # print('DEBUG : %s' % fname)
jpayne@68 1134
jpayne@68 1135 html = ''
jpayne@68 1136
jpayne@68 1137 if os.path.isfile(fname):
jpayne@68 1138 title = None
jpayne@68 1139 header = None
jpayne@68 1140 data = []
jpayne@68 1141
jpayne@68 1142 with open(fname, 'r') as fh:
jpayne@68 1143 for line in fh:
jpayne@68 1144 line = line.strip()
jpayne@68 1145 if line == '':
jpayne@68 1146 continue
jpayne@68 1147 if line.startswith('Query:'):
jpayne@68 1148 title = line
jpayne@68 1149 elif line.startswith('WKID'):
jpayne@68 1150 header = line.split('\t')
jpayne@68 1151 else:
jpayne@68 1152 data.append(line)
jpayne@68 1153 if title and header and data:
jpayne@68 1154 html += html_th(header)
jpayne@68 1155 for line in data:
jpayne@68 1156 row = line.split('\t')
jpayne@68 1157 html += html_tr(row)
jpayne@68 1158
jpayne@68 1159 # html = html_tag('p', title) + html_tag('table', html, attrs={'class': 'data'})
jpayne@68 1160 html = html_tag('table', html, attrs={'class': 'data'})
jpayne@68 1161 return html, header
jpayne@68 1162
jpayne@68 1163 def do_html_contam_art_first_n_pb_tr(stats, files, odir, filepath_prefix):
jpayne@68 1164 temp = os.path.join(PYDIR, 'template/readqc_artifacts.html')
jpayne@68 1165 html = ''
jpayne@68 1166 with open(temp, 'r') as fh:
jpayne@68 1167 html = fh.read()
jpayne@68 1168
jpayne@68 1169 ## the optional artifact contam
jpayne@68 1170 artifact_tr = ''
jpayne@68 1171 artifact_type = '50'
jpayne@68 1172 artifact_val = pipeline_val('illumina read percent contamination artifact 50bp seal', {'type': 'raw'}, stats, files, filepath_prefix)
jpayne@68 1173 artifact_file = pipeline_val('artifact_50bp.seal.stats seal', {'type': 'file'}, stats, files, filepath_prefix)
jpayne@68 1174 if not artifact_file:
jpayne@68 1175 artifact_file = pipeline_val('artifact_20bp.seal.stats seal', {'type': 'file'}, stats, files, filepath_prefix)
jpayne@68 1176 artifact_type = '20'
jpayne@68 1177 artifact_val = pipeline_val('illumina read percent contamination artifact 20bp seal', {'type': 'raw'}, stats, files, filepath_prefix)
jpayne@68 1178 if artifact_file:
jpayne@68 1179 html = html.replace('[_CONTAM-ART-FIRST-BP_]', artifact_type)
jpayne@68 1180 html = html.replace('[_CONTAM-ART-FIRST-BP-SEAL_]', artifact_file)
jpayne@68 1181 html = html.replace('[_CONTAM-ART-FIRST-BP-SEAL-PCT_]', artifact_val)
jpayne@68 1182
jpayne@68 1183 return html
jpayne@68 1184
jpayne@68 1185
jpayne@68 1186 def do_html_body(odir, filepath_prefix):
jpayne@68 1187 print('do_html_body - %s' % filepath_prefix)
jpayne@68 1188 temp = os.path.join(PYDIR, 'template/readqc_body_template.html')
jpayne@68 1189 statsf = os.path.join(odir, 'readqc_stats.txt')
jpayne@68 1190 if not os.path.isfile(statsf):
jpayne@68 1191 print('ERROR : file not found: %s' % statsf)
jpayne@68 1192 stats = get_dict_obj(statsf)
jpayne@68 1193
jpayne@68 1194 filesf = os.path.join(odir, 'readqc_files.txt')
jpayne@68 1195 if not os.path.isfile(statsf):
jpayne@68 1196 print('ERROR : file not found: %s' % statsf)
jpayne@68 1197 files = get_dict_obj(os.path.join(odir, 'readqc_files.txt'))
jpayne@68 1198
jpayne@68 1199 ## key (key name in readqc_stats or readqc_files) : {token (space holder in html template), type (value format)}
jpayne@68 1200 tok_map = {
jpayne@68 1201 ## Average Base Quality section
jpayne@68 1202 'overall bases Q score mean' : {'token' : '[_BASE-QUALITY-SCORE_]', 'type': 'float', 'filter': 1},
jpayne@68 1203 'overall bases Q score std' : {'token' : '[_BASE-QUALITY-SCORE-STD_]', 'type': 'float', 'filter': 1},
jpayne@68 1204 'Q30 bases Q score mean' : {'token' : '[_Q30-BASE-QUALITY-SCORE_]', 'type': 'float', 'filter': 1},
jpayne@68 1205 'Q30 bases Q score std' : {'token' : '[_Q30-BASE-QUALITY-SCORE-STD_]', 'type': 'float', 'filter': 1},
jpayne@68 1206 'base C30' : {'token' : '[_COUNT-OF-BAESE-Q30_]', 'type': 'bigint'},
jpayne@68 1207 'base C25' : {'token' : '[_COUNT-OF-BAESE-Q25_]', 'type': 'bigint'},
jpayne@68 1208 'base C20' : {'token' : '[_COUNT-OF-BAESE-Q20_]', 'type': 'bigint'},
jpayne@68 1209 'base C15' : {'token' : '[_COUNT-OF-BAESE-Q15_]', 'type': 'bigint'},
jpayne@68 1210 'base C10' : {'token' : '[_COUNT-OF-BAESE-Q10_]', 'type': 'bigint'},
jpayne@68 1211 'base C5' : {'token' : '[_COUNT-OF-BAESE-Q5_]', 'type': 'bigint'},
jpayne@68 1212 'base Q30' : {'token' : '[_PCT-OF-BAESE-Q30_]', 'type': 'raw'},
jpayne@68 1213 'base Q25' : {'token' : '[_PCT-OF-BAESE-Q25_]', 'type': 'raw'},
jpayne@68 1214 'base Q20' : {'token' : '[_PCT-OF-BAESE-Q20_]', 'type': 'raw'},
jpayne@68 1215 'base Q15' : {'token' : '[_PCT-OF-BAESE-Q15_]', 'type': 'raw'},
jpayne@68 1216 'base Q10' : {'token' : '[_PCT-OF-BAESE-Q10_]', 'type': 'raw'},
jpayne@68 1217 'base Q5' : {'token' : '[_PCT-OF-BAESE-Q5_]', 'type': 'raw'},
jpayne@68 1218
jpayne@68 1219 ## Average Read Quality section
jpayne@68 1220 'read Q30' : {'token' : '[_PCT-OF-READS-Q30_]', 'type': 'raw'},
jpayne@68 1221 'read Q25' : {'token' : '[_PCT-OF-READS-Q25_]', 'type': 'raw'},
jpayne@68 1222 'read Q20' : {'token' : '[_PCT-OF-READS-Q20_]', 'type': 'raw'},
jpayne@68 1223 'read Q15' : {'token' : '[_PCT-OF-READS-Q15_]', 'type': 'raw'},
jpayne@68 1224 'read Q10' : {'token' : '[_PCT-OF-READS-Q10_]', 'type': 'raw'},
jpayne@68 1225 'read Q5' : {'token' : '[_PCT-OF-READS-Q5_]', 'type': 'raw'},
jpayne@68 1226
jpayne@68 1227 'SUBSAMPLE_RATE' : {'token' : '[_SUBSAMPLE-RATE_]', 'type': 'pct', 'filter': 1},
jpayne@68 1228 'read base quality stats' : {'token' : '[_AVG-READ-QUAL-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
jpayne@68 1229 'ILLUMINA_READ_QHIST_D3_HTML_PLOT' : {'token' : '[_AVG-READ-QUAL-HOSTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
jpayne@68 1230 'read qhist plot' : {'token' : '[_AVG-READ-QUALITY-HISTOGRAM_]', 'type': 'file'},
jpayne@68 1231
jpayne@68 1232 ## Average Base Position Quality section
jpayne@68 1233 'read q20 read1' : {'token' : '[_READ_Q20_READ1_]', 'type': 'raw'},
jpayne@68 1234 'read q20 read2' : {'token' : '[_READ_Q20_READ2_]', 'type': 'raw'},
jpayne@68 1235 'read qual pos qrpt 1' : {'token' : '[_AVG-BASE-POS-QUAL-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
jpayne@68 1236 'ILLUMINA_READ_QUAL_POS_PLOT_MERGED_D3_HTML_PLOT' : {'token' : '[_AVG-BASE-POS-QUAL-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
jpayne@68 1237 'read qual pos plot merged' : {'token' : '[_AVG-BASE-POSITION-QUALITY_]', 'type': 'file'},
jpayne@68 1238
jpayne@68 1239 ## Insert Size
jpayne@68 1240 'ILLUMINA_READ_INSERT_SIZE_JOINED_PERC' : {'token' : '[_PCT-READS-JOINED_]', 'type': 'float', 'filter': 1},
jpayne@68 1241 'ILLUMINA_READ_INSERT_SIZE_AVG_INSERT' : {'token' : '[_PCT-READS-JOINED-AVG_]', 'type': 'raw'},
jpayne@68 1242 'ILLUMINA_READ_INSERT_SIZE_STD_INSERT' : {'token' : '[_PCT-READS-JOINED-STDDEV_]', 'type': 'raw'},
jpayne@68 1243 'ILLUMINA_READ_INSERT_SIZE_MODE_INSERT' : {'token' : '[_PCT-READS-JOINED-MODE_]', 'type': 'raw'},
jpayne@68 1244
jpayne@68 1245 'ILLUMINA_READ_INSERT_SIZE_HISTO_DATA' : {'token' : '[_INSERT-SIZE-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
jpayne@68 1246 'ILLUMINA_READ_INSERT_SIZE_HISTO_D3_HTML_PLOT' : {'token' : '[_INSERT-SIZE-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
jpayne@68 1247 'ILLUMINA_READ_INSERT_SIZE_HISTO_PLOT' : {'token' : '[_INSERT-SIZE-HISTOGRAM_]', 'type': 'file'},
jpayne@68 1248
jpayne@68 1249 ## Read GC
jpayne@68 1250 'read GC mean' : {'token' : '[_READ-GC-AVG_]', 'type': 'float', 'filter': 1},
jpayne@68 1251 'read GC std' : {'token' : '[_READ-GC-STDDEV_]', 'type': 'float', 'filter': 1},
jpayne@68 1252 'read GC text hist' : {'token' : '[_READ-QC-HISTO-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
jpayne@68 1253 'ILLUMINA_READ_GC_D3_HTML_PLOT' : {'token' : '[_READ-QC-HISTO-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
jpayne@68 1254 'read GC plot' : {'token' : '[_READ-GC-HIST_]', 'type': 'file'},
jpayne@68 1255
jpayne@68 1256 ## Cycle Nucleotide Composition
jpayne@68 1257 'read base count text 1' : {'token' : '[_NUC-COMP-FREQ-R1-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
jpayne@68 1258 'read base count text 2' : {'token' : '[_NUC-COMP-FREQ-R2-DATA_]', 'type': 'file', 'filter': 'link', 'label':'data file'},
jpayne@68 1259 'ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_1' : {'token' : '[_NUC-COMP-FREQ-R1-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
jpayne@68 1260 'ILLUMINA_READ_BASE_COUNT_D3_HTML_PLOT_2' : {'token' : '[_NUC-COMP-FREQ-R2-D3_]', 'type': 'file', 'filter': 'link', 'label':'interactive plot'},
jpayne@68 1261 'read base count plot 1' : {'token' : '[_CYCLE-NUCL-COMPOSITION-READ1_]', 'type': 'file'},
jpayne@68 1262 'read base count plot 2' : {'token' : '[_CYCLE-NUCL-COMPOSITION-READ2_]', 'type': 'file'},
jpayne@68 1263
jpayne@68 1264 ## Percentage of Common Contaminants
jpayne@68 1265 'illumina read percent contamination artifact seal' : {'token' : '[_CONTAM-ART-SEA-PCT_]', 'type': 'floatstr'},
jpayne@68 1266 'artifact.seal.stats seal' : {'token' : '[_CONTAM-ART-SEAL_]', 'type': 'file'},
jpayne@68 1267
jpayne@68 1268 'illumina read percent contamination DNA spikein seal' : {'token' : '[_DNA-SPIKEIN-SEAL_PCT_]', 'type': 'floatstr'},
jpayne@68 1269 'DNA_spikein.seal.stats seal' : {'token' : '[_DNA-SPIKEIN-SEAL_]', 'type': 'file'},
jpayne@68 1270
jpayne@68 1271 'illumina read percent contamination RNA spikein seal' : {'token' : '[_RNA-SPIKEIN-SEAL_PCT_]', 'type': 'floatstr'},
jpayne@68 1272 'RNA_spikein.seal.stats seal' : {'token' : '[_RNA-SPIKEIN-SEAL_]', 'type': 'file'},
jpayne@68 1273
jpayne@68 1274 'illumina read percent contamination fosmid seal' : {'token' : '[_CONTAM-FOSMID-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1275 'fosmid.seal.stats seal' : {'token' : '[_CONTAM-FOSMID-SEAL_]', 'type': 'file'},
jpayne@68 1276
jpayne@68 1277 'illumina read percent contamination fosmid seal' : {'token' : '[_CONTAM-FOSMID-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1278 'fosmid.seal.stats seal' : {'token' : '[_CONTAM-FOSMID-SEAL_]', 'type': 'file'},
jpayne@68 1279
jpayne@68 1280 'illumina read percent contamination mitochondrion seal' : {'token' : '[_CONTAM-MITO-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1281 'mitochondrion.seal.stats seal' : {'token' : '[_CONTAM-MITO-SEAL_]', 'type': 'file'},
jpayne@68 1282
jpayne@68 1283 'illumina read percent contamination plastid seal' : {'token' : '[_CONTAM-CHLO-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1284 'plastid.seal.stats seal' : {'token' : '[_CONTAM-CHLO-SEAL_]', 'type': 'file'},
jpayne@68 1285
jpayne@68 1286 'illumina read percent contamination phix seal' : {'token' : '[_CONTAM-PHIX-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1287 'phix.seal.stats seal' : {'token' : '[_CONTAM-PHIX-SEAL_]', 'type': 'file'},
jpayne@68 1288
jpayne@68 1289 'illumina read percent contamination rrna seal' : {'token' : '[_CONTAM-RRNA-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1290 'rrna.seal.stats seal' : {'token' : '[_CONTAM-RRNA-SEAL_]', 'type': 'file'},
jpayne@68 1291
jpayne@68 1292 'illumina read percent contamination microbes seal' : {'token' : '[_CONTAM-NON-SYN-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1293 'microbes.seal.stats seal' : {'token' : '[_CONTAM-NON-SYN-SEAL_]', 'type': 'file'},
jpayne@68 1294
jpayne@68 1295 # 'illumina read percent contamination synthetic seal' : {'token' : '[_CONTAM-SYN-SEAL-PCT_]', 'type': 'floatstr'},
jpayne@68 1296 'synthetic.seal.stats seal' : {'token' : '[_CONTAM-SYN-SEAL_]', 'type': 'file'},
jpayne@68 1297
jpayne@68 1298 'illumina read percent contamination adapters seal' : {'token' : '[_CONTAM-ADAPTER-PCT_]', 'type': 'floatstr'},
jpayne@68 1299 'adapters.seal.stats seal' : {'token' : '[_CONTAM-ADAPTER-SEAL_]', 'type': 'file'},
jpayne@68 1300
jpayne@68 1301 ## Sketch vs NT
jpayne@68 1302 'sketch_vs_nt_output' : {'token' : '[_SKETCH-VS-NT_]', 'type': 'file'},
jpayne@68 1303 # 'sketch_vs_nt_output' : {'token' : '[_SKETCH-VS-NT-BASE_]', 'type': 'file', 'filter': 'base'},
jpayne@68 1304 }
jpayne@68 1305
jpayne@68 1306 html = ''
jpayne@68 1307 with open(temp, 'r') as fh:
jpayne@68 1308 html = fh.read()
jpayne@68 1309
jpayne@68 1310 for key in tok_map:
jpayne@68 1311 dat = tok_map[key]
jpayne@68 1312 val = pipeline_val(key, dat, stats, files, filepath_prefix)
jpayne@68 1313 # print('key=%s; %s; ====type=%s' % (key, val, type(val)))
jpayne@68 1314 html = html.replace(dat['token'], val)
jpayne@68 1315
jpayne@68 1316 artifact_tr = do_html_contam_art_first_n_pb_tr(stats, files, odir, filepath_prefix)
jpayne@68 1317 html = html.replace('[_CONTAN-ART-SEAL-FIRST-BP_]', artifact_tr)
jpayne@68 1318
jpayne@68 1319 ## synthetic contam
jpayne@68 1320 contam_syn_file = pipeline_val('synthetic.seal.stats seal', {'type': 'file', 'filter': 'full'}, stats, files, filepath_prefix)
jpayne@68 1321 if contam_syn_file and os.path.isfile(contam_syn_file):
jpayne@68 1322 contam_syn_pct_key = 'illumina read percent contamination synthetic seal'
jpayne@68 1323 cmd = 'grep "#Matched" %s' % contam_syn_file
jpayne@68 1324 stdOut, stdErr, exitCode = run_sh_command(cmd, True)
jpayne@68 1325 if exitCode == 0:
jpayne@68 1326 toks = stdOut.strip().split()
jpayne@68 1327 if len(toks) == 3:
jpayne@68 1328 html = html.replace('[_CONTAM-SYN-SEAL-PCT_]', toks[2][:-1])
jpayne@68 1329
jpayne@68 1330 ###--- Sketch
jpayne@68 1331 sketch_html = ''
jpayne@68 1332 ## add Sketch vs NT if file exists:
jpayne@68 1333 sketch_nt_file = pipeline_val('sketch_vs_nt_output', {'type': 'file', 'filter': 'full'}, stats, files, filepath_prefix)
jpayne@68 1334
jpayne@68 1335 if os.path.isfile(sketch_nt_file):
jpayne@68 1336 basename = os.path.basename(sketch_nt_file)
jpayne@68 1337 # href = pipeline_val('sketch_vs_nt_output', {'type': 'file'}, stats, files, filepath_prefix)
jpayne@68 1338 sketch_html = html_tag('h4', 'Sketch vs. NT') #+ '<br />' + 'Raw file:%s' % html_link(href, basename)
jpayne@68 1339 sketch_tab, sketch_table_header = sketch_table(sketch_nt_file)
jpayne@68 1340 sketch_html += sketch_tab
jpayne@68 1341
jpayne@68 1342
jpayne@68 1343 ## add Sketch vs refseq if file exists?
jpayne@68 1344 if sketch_html:
jpayne@68 1345 sketch_html = html_tag('h2', 'Read Sketch', attrs={'class': 'section-title'}) \
jpayne@68 1346 + stetch_section_note(sketch_table_header) \
jpayne@68 1347 + html_tag('div', sketch_html, {'class': 'section'})
jpayne@68 1348
jpayne@68 1349 html = html.replace('[_SKETCH-TABLE_]', sketch_html)
jpayne@68 1350
jpayne@68 1351 # copy the image file
jpayne@68 1352 imgdir = os.path.join(PYDIR, 'images')
jpayne@68 1353 todir = os.path.join(odir, 'images')
jpayne@68 1354 if os.path.isdir(todir):
jpayne@68 1355 shutil.rmtree(todir)
jpayne@68 1356 shutil.copytree(imgdir, todir, False, None)
jpayne@68 1357
jpayne@68 1358 return html
jpayne@68 1359
jpayne@68 1360
jpayne@68 1361 def do_html(odir, infile):
jpayne@68 1362 # odir = os.path.abspath(odir) ## DO NOT convert!! The relative file path in html need this original odir string matching
jpayne@68 1363
jpayne@68 1364 screen('Output dir - %s' % odir)
jpayne@68 1365 fname = os.path.basename(infile)
jpayne@68 1366 screen('Create HTML page in %s for %s ..' % (odir, fname))
jpayne@68 1367 temp = os.path.join(PYDIR, 'template/template.html')
jpayne@68 1368
jpayne@68 1369 with open(temp, 'r') as fh:
jpayne@68 1370 html = fh.read()
jpayne@68 1371 html = html.replace('[_PAGE-TITLE_]', 'Read QC Report')
jpayne@68 1372 html = html.replace('[_REPORT-TITLE_]', 'BBTools Read QC Report')
jpayne@68 1373 html = html.replace('[_INPUT-FILE-NAME_]', fname)
jpayne@68 1374 html = html.replace('[_REPORT-DATE_]', '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))
jpayne@68 1375
jpayne@68 1376 hbody =do_html_body(odir, odir)
jpayne@68 1377 html = html.replace('[_REPORT-BODY_]', hbody)
jpayne@68 1378
jpayne@68 1379 ## write the html to file
jpayne@68 1380 idxfile = os.path.join(odir, 'index.html')
jpayne@68 1381 with open(idxfile, 'w') as fh2:
jpayne@68 1382 fh2.write(html)
jpayne@68 1383 screen('HTML index file written to %s' % idxfile)
jpayne@68 1384
jpayne@68 1385 # copy the css file
jpayne@68 1386 cssdir = os.path.join(PYDIR, 'css')
jpayne@68 1387 todir = os.path.join(odir, 'css')
jpayne@68 1388 if os.path.isdir(todir):
jpayne@68 1389 shutil.rmtree(todir)
jpayne@68 1390 shutil.copytree(cssdir, todir, False, None)
jpayne@68 1391
jpayne@68 1392 def screen(txt):
jpayne@68 1393 print('.. %s' % txt)
jpayne@68 1394 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1395 ## Main Program
jpayne@68 1396 if __name__ == "__main__":
jpayne@68 1397 desc = "RQC ReadQC Pipeline"
jpayne@68 1398 parser = argparse.ArgumentParser(description=desc)
jpayne@68 1399 parser.add_argument("-f", "--fastq", dest="fastq", help="Set input fastq file (full path to fastq)", required=True)
jpayne@68 1400 parser.add_argument("-o", "--output-path", dest="outputPath", help="Set output path to write to")
jpayne@68 1401 parser.add_argument("-x", "--cut", dest="cutLenOfFirstBases", help="Set Read cut length (bp) for read contamination detection")
jpayne@68 1402 parser.add_argument("-v", "--version", action="version", version=VERSION)
jpayne@68 1403
jpayne@68 1404 ## For running on Crays
jpayne@68 1405 parser.add_argument("-l", "--lib-name", dest="libName", help="Set library name", required=False)
jpayne@68 1406 parser.add_argument("-m", "--is-multiplexed", dest="isMultiplexed", help="Set multiplexed data", required=False)
jpayne@68 1407 parser.add_argument("-r", "--is-rna", dest="isRna", help="Set RNA data", required=False)
jpayne@68 1408
jpayne@68 1409 ## produce html when processing is done
jpayne@68 1410 parser.add_argument("-html", "--html", action="store_true", help="Create html file", dest="html", default=False, required=False)
jpayne@68 1411
jpayne@68 1412 ## Toggle options
jpayne@68 1413 parser.add_argument("-b", "--skip-blast", action="store_true", help="Skip the blast run", dest="skipBlast", default=False, required=False)
jpayne@68 1414
jpayne@68 1415 parser.add_argument("-bn", "--skip-blast-nt", action="store_true", help="Skip Blast run against nt", dest="skipBlastNt", default=False, required=False)
jpayne@68 1416 parser.add_argument("-br", "--skip-blast-refseq", action="store_true", help="Skip Blast run against refseq.archaea and refseq.bateria", dest="skipBlastRefseq", default=False, required=False)
jpayne@68 1417 parser.add_argument("-c", "--skip-cleanup", action="store_true", help="Skip temporary file cleanup", dest="skipCleanup", default=False, required=False)
jpayne@68 1418 parser.add_argument("-p", "--pooled-analysis", action="store_true", help="Enable pooled analysis (demultiplexing)", dest="pooledAnalysis", default=False, required=False)
jpayne@68 1419 parser.add_argument("-s", "--skip-subsample", action="store_true", help="Skip subsampling.", dest="skipSubsample", default=False, required=False)
jpayne@68 1420 parser.add_argument("-z", "--skip-localization", action="store_true", help="Skip database localization", dest="skipDbLocalization", default=False, required=False)
jpayne@68 1421
jpayne@68 1422 options = parser.parse_args()
jpayne@68 1423
jpayne@68 1424 outputPath = None # output path, defaults to current working directory
jpayne@68 1425 fastq = None # full path to input fastq.gz
jpayne@68 1426
jpayne@68 1427 status = "start"
jpayne@68 1428 nextStepToDo = 0
jpayne@68 1429
jpayne@68 1430 if options.outputPath:
jpayne@68 1431 outputPath = options.outputPath
jpayne@68 1432 else:
jpayne@68 1433 outputPath = os.getcwd()
jpayne@68 1434
jpayne@68 1435 if options.fastq:
jpayne@68 1436 fastq = options.fastq
jpayne@68 1437
jpayne@68 1438 ## create output_directory if it doesn't exist
jpayne@68 1439 if not os.path.isdir(outputPath):
jpayne@68 1440 os.makedirs(outputPath)
jpayne@68 1441
jpayne@68 1442 libName = None ## for illumina_sciclone_analysis()
jpayne@68 1443 isRna = None ## for illumina_sciclone_analysis()
jpayne@68 1444 isMultiplexed = None ## for illumina_generate_index_sequence_detection_plot()
jpayne@68 1445
jpayne@68 1446 bSkipBlast = False
jpayne@68 1447 bSkipBlastRefseq = False ## refseq.archaea and refseq.bacteria
jpayne@68 1448 bSkipBlastNt = False
jpayne@68 1449 bPooledAnalysis = False
jpayne@68 1450
jpayne@68 1451 ## RQC-743 Need to specify the first cutting bp for read contam detection (eps. for smRNA )
jpayne@68 1452 firstBptoCut = 50
jpayne@68 1453 if options.cutLenOfFirstBases:
jpayne@68 1454 firstBptoCut = options.cutLenOfFirstBases
jpayne@68 1455
jpayne@68 1456 if options.libName:
jpayne@68 1457 libName = options.libName
jpayne@68 1458
jpayne@68 1459
jpayne@68 1460 ## switches
jpayne@68 1461 if options.isRna:
jpayne@68 1462 isRna = options.isRna
jpayne@68 1463 if options.isMultiplexed:
jpayne@68 1464 isMultiplexed = options.isMultiplexed
jpayne@68 1465 if options.skipBlast:
jpayne@68 1466 bSkipBlast = options.skipBlast
jpayne@68 1467 if options.skipBlastRefseq:
jpayne@68 1468 bSkipBlastRefseq = options.skipBlastRefseq
jpayne@68 1469 if options.skipBlastNt:
jpayne@68 1470 bSkipBlastNt = options.skipBlastNt
jpayne@68 1471 if options.pooledAnalysis:
jpayne@68 1472 bPooledAnalysis = options.pooledAnalysis
jpayne@68 1473
jpayne@68 1474
jpayne@68 1475 skipSubsampling = options.skipSubsample
jpayne@68 1476
jpayne@68 1477 ## Set readqc config
jpayne@68 1478 RQCReadQcConfig.CFG["status_file"] = os.path.join(outputPath, "readqc_status.log")
jpayne@68 1479 RQCReadQcConfig.CFG["files_file"] = os.path.join(outputPath, "readqc_files.tmp")
jpayne@68 1480 RQCReadQcConfig.CFG["stats_file"] = os.path.join(outputPath, "readqc_stats.tmp")
jpayne@68 1481 RQCReadQcConfig.CFG["output_path"] = outputPath
jpayne@68 1482 RQCReadQcConfig.CFG["skip_cleanup"] = options.skipCleanup
jpayne@68 1483 RQCReadQcConfig.CFG["skip_localization"] = options.skipDbLocalization
jpayne@68 1484 RQCReadQcConfig.CFG["log_file"] = os.path.join(outputPath, "readqc.log")
jpayne@68 1485
jpayne@68 1486 screen("Started readqc pipeline, writing log to: %s" % (RQCReadQcConfig.CFG["log_file"]))
jpayne@68 1487
jpayne@68 1488 log = get_logger("readqc", RQCReadQcConfig.CFG["log_file"], LOG_LEVEL, False, True)
jpayne@68 1489 log.info("=================================================================")
jpayne@68 1490 log.info(" Read Qc Analysis (version %s)", VERSION)
jpayne@68 1491 log.info("=================================================================")
jpayne@68 1492
jpayne@68 1493 log.info("Starting %s with %s", SCRIPT_NAME, fastq)
jpayne@68 1494
jpayne@68 1495 if os.path.isfile(RQCReadQcConfig.CFG["status_file"]):
jpayne@68 1496 status = get_status(RQCReadQcConfig.CFG["status_file"], log)
jpayne@68 1497 else:
jpayne@68 1498 checkpoint_step_wrapper(status)
jpayne@68 1499
jpayne@68 1500
jpayne@68 1501 if not os.path.isfile(fastq):
jpayne@68 1502 log.error("%s not found, aborting!", fastq)
jpayne@68 1503 exit(2)
jpayne@68 1504 elif status != "complete":
jpayne@68 1505 ## check for fastq file
jpayne@68 1506 # if os.path.isfile(fastq):
jpayne@68 1507 log.info("Found %s, starting processing.", fastq)
jpayne@68 1508 log.info("Latest status = %s", status)
jpayne@68 1509 if status != 'start':
jpayne@68 1510 nextStepToDo = int(status.split("_")[0])
jpayne@68 1511 if status.find("complete") != -1:
jpayne@68 1512 nextStepToDo += 1
jpayne@68 1513 log.info("Next step to do = %s", nextStepToDo)
jpayne@68 1514
jpayne@68 1515 if status != 'complete':
jpayne@68 1516 bDone = False
jpayne@68 1517 cycle = 0
jpayne@68 1518 totalReadNum = 0
jpayne@68 1519 firstSubsampledFastqFileName = ""
jpayne@68 1520 secondSubsampledFastqFile = ""
jpayne@68 1521 totalReadCount = 0
jpayne@68 1522 subsampledReadNum = 0
jpayne@68 1523 bIsPaired = False
jpayne@68 1524 readLength = 0
jpayne@68 1525 firstSubsampledLogFile = os.path.join(outputPath, "subsample", "first_subsampled.txt")
jpayne@68 1526 secondSubsampledLogFile = os.path.join(outputPath, "subsample", "second_subsampled.txt")
jpayne@68 1527
jpayne@68 1528 while not bDone:
jpayne@68 1529
jpayne@68 1530 cycle += 1
jpayne@68 1531
jpayne@68 1532 if bPooledAnalysis:
jpayne@68 1533 nextStepToDo = 17
jpayne@68 1534 status = "16_illumina_read_blastn_nt complete"
jpayne@68 1535
jpayne@68 1536 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1537 ## 1. fast_subsample_fastq_sequences
jpayne@68 1538 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1539 if nextStepToDo == 1 or status == "start":
jpayne@68 1540 status, firstSubsampledFastqFileName, totalReadCount, subsampledReadNum, bIsPaired, readLength = do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log)
jpayne@68 1541
jpayne@68 1542 if status.endswith("failed"):
jpayne@68 1543 log.error("Subsampling failed.")
jpayne@68 1544 sys.exit(-1)
jpayne@68 1545
jpayne@68 1546 ## if not skipSubsampling and subsampledReadNum == 0: ## too small input file
jpayne@68 1547 if not skipSubsampling and subsampledReadNum < RQCReadQc.ILLUMINA_MIN_NUM_READS: ## min=1000
jpayne@68 1548 log.info("Too small input fastq file. Skip subsampling: total number of reads = %s, sampled number of reads = %s", totalReadCount, subsampledReadNum)
jpayne@68 1549 skipSubsampling = True
jpayne@68 1550 status, firstSubsampledFastqFileName, totalReadCount, subsampledReadNum, bIsPaired, readLength = do_fast_subsample_fastq_sequences(fastq, skipSubsampling, log)
jpayne@68 1551
jpayne@68 1552 if status.endswith("failed"):
jpayne@68 1553 log.error("Subsampling failed.")
jpayne@68 1554 sys.exit(-1)
jpayne@68 1555
jpayne@68 1556 subsampledReadNum = totalReadCount
jpayne@68 1557 if subsampledReadNum >= RQCReadQc.ILLUMINA_MIN_NUM_READS:
jpayne@68 1558 status = "1_illumina_readqc_subsampling complete"
jpayne@68 1559
jpayne@68 1560 ## Still too low number of reads -> record ILLUMINA_TOO_SMALL_NUM_READS and quit
jpayne@68 1561 ##
jpayne@68 1562 if subsampledReadNum == 0 or subsampledReadNum < RQCReadQc.ILLUMINA_MIN_NUM_READS: ## min=1000
jpayne@68 1563 log.info("Too small number of reads (< %s). Stop processing.", RQCReadQc.ILLUMINA_MIN_NUM_READS)
jpayne@68 1564 print("WARNING : Too small number of reads (< %s). Stop processing." % RQCReadQc.ILLUMINA_MIN_NUM_READS)
jpayne@68 1565 log.info("Completed %s: %s", SCRIPT_NAME, fastq)
jpayne@68 1566
jpayne@68 1567 ## Add ILLUMINA_TOO_SMALL_NUM_READS=1 to stats to be used in reporting
jpayne@68 1568 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 1569 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_TOO_SMALL_NUM_READS, "1", log)
jpayne@68 1570
jpayne@68 1571 ## move rqc-files.tmp to rqc-files.txt
jpayne@68 1572 newFilesFile = os.path.join(outputPath, "readqc_files.txt")
jpayne@68 1573 newStatsFile = os.path.join(outputPath, "readqc_stats.txt")
jpayne@68 1574
jpayne@68 1575 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["files_file"], newFilesFile)
jpayne@68 1576 log.info("mv cmd: %s", cmd)
jpayne@68 1577 run_sh_command(cmd, True, log)
jpayne@68 1578
jpayne@68 1579 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["stats_file"], newStatsFile)
jpayne@68 1580 log.info("mv cmd: %s", cmd)
jpayne@68 1581 run_sh_command(cmd, True, log)
jpayne@68 1582
jpayne@68 1583 exit(0)
jpayne@68 1584
jpayne@68 1585 if status == "1_illumina_readqc_subsampling failed":
jpayne@68 1586 bDone = True
jpayne@68 1587
jpayne@68 1588
jpayne@68 1589 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1590 ## 2. write_unique_20_mers
jpayne@68 1591 ## NOTE: fastq = orig input fastq,
jpayne@68 1592 ## totalReadCount = total reads in the orig input fastq
jpayne@68 1593 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1594 if nextStepToDo == 2 or status == "1_illumina_readqc_subsampling complete":
jpayne@68 1595 ## Cope with restarting
jpayne@68 1596 ## save subsamples file in "first_subsampled.txt" so that the file
jpayne@68 1597 ## name can be read when started
jpayne@68 1598 if not os.path.isfile(firstSubsampledLogFile):
jpayne@68 1599 make_dir(os.path.join(outputPath, "subsample"))
jpayne@68 1600 with open(firstSubsampledLogFile, "w") as SAMP_FH:
jpayne@68 1601 SAMP_FH.write(firstSubsampledFastqFileName + " " + str(totalReadCount) + " " + str(subsampledReadNum) + "\n")
jpayne@68 1602
jpayne@68 1603 if firstSubsampledFastqFileName == "" and options.skipSubsample:
jpayne@68 1604 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1605 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1606 l = SAMP_FH.readline().strip()
jpayne@68 1607 t = l.split()
jpayne@68 1608 assert len(t) == 2 or len(t) == 3
jpayne@68 1609 firstSubsampledFastqFileName = t[0]
jpayne@68 1610 totalReadCount = t[1]
jpayne@68 1611 subsampledReadNum = t[2]
jpayne@68 1612 log.debug("firstSubsampledFastqFileName=%s, totalReadCount=%s, subsampledReadNum=%s", firstSubsampledFastqFileName, totalReadCount, subsampledReadNum)
jpayne@68 1613
jpayne@68 1614 else:
jpayne@68 1615 nextStepToDo = 1
jpayne@68 1616 continue
jpayne@68 1617
jpayne@68 1618 ## TODO: move getting totalReadNum in the func ???
jpayne@68 1619 ##
jpayne@68 1620 log.debug("firstSubsampledFastqFileName=%s, totalReadCount=%s, subsampledReadNum=%s", firstSubsampledFastqFileName, totalReadCount, subsampledReadNum)
jpayne@68 1621
jpayne@68 1622 if totalReadCount is None or totalReadCount == 0:
jpayne@68 1623 nextStepToDo = 1
jpayne@68 1624 continue
jpayne@68 1625 else:
jpayne@68 1626 status = do_write_unique_20_mers(fastq, totalReadCount, log)
jpayne@68 1627
jpayne@68 1628 if status == "2_unique_mers_sampling failed":
jpayne@68 1629 bDone = True
jpayne@68 1630
jpayne@68 1631 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1632 ## 3. generate read GC histograms: Illumina_read_gc
jpayne@68 1633 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1634 if nextStepToDo == 3 or status == "2_unique_mers_sampling complete":
jpayne@68 1635 ## Read the subsampled file name
jpayne@68 1636 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1637 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1638 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1639 status = do_illumina_read_gc(firstSubsampledFastqFileName, log)
jpayne@68 1640
jpayne@68 1641 if status == "3_illumina_read_gc failed":
jpayne@68 1642 bDone = True
jpayne@68 1643
jpayne@68 1644 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1645 ## 4. read_quality_stats
jpayne@68 1646 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1647 if nextStepToDo == 4 or status == "3_illumina_read_gc complete":
jpayne@68 1648 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1649 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1650 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1651 status = do_read_quality_stats(firstSubsampledFastqFileName, log)
jpayne@68 1652
jpayne@68 1653 if status == "4_illumina_read_quality_stats failed":
jpayne@68 1654 bDone = True
jpayne@68 1655
jpayne@68 1656 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1657 ## 5. write_base_quality_stats
jpayne@68 1658 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1659 if nextStepToDo == 5 or status == "4_illumina_read_quality_stats complete":
jpayne@68 1660 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1661 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1662 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1663 status = do_write_base_quality_stats(firstSubsampledFastqFileName, log)
jpayne@68 1664
jpayne@68 1665 if status == "5_illumina_read_quality_stats failed":
jpayne@68 1666 bDone = True
jpayne@68 1667
jpayne@68 1668 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1669 ## 6. illumina_count_q_score
jpayne@68 1670 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1671 if nextStepToDo == 6 or status == "5_illumina_read_quality_stats complete":
jpayne@68 1672 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1673 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1674 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1675 status = do_illumina_count_q_score(firstSubsampledFastqFileName, log)
jpayne@68 1676
jpayne@68 1677 if status == "6_illumina_count_q_score failed":
jpayne@68 1678 bDone = True
jpayne@68 1679
jpayne@68 1680 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1681 ## 7. illumina_calculate_average_quality
jpayne@68 1682 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1683 if nextStepToDo == 7 or status == "6_illumina_count_q_score complete":
jpayne@68 1684 ## Let's skip this step. (20140902)
jpayne@68 1685 log.info("\n\n%sSTEP7 - Skipping 21mer analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1686
jpayne@68 1687 status = "7_illumina_calculate_average_quality in progress"
jpayne@68 1688 checkpoint_step_wrapper(status)
jpayne@68 1689
jpayne@68 1690 status = "7_illumina_calculate_average_quality complete"
jpayne@68 1691 checkpoint_step_wrapper(status)
jpayne@68 1692
jpayne@68 1693 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1694 ## 8. illumina_find_common_motifs
jpayne@68 1695 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1696 if nextStepToDo == 8 or status == "7_illumina_calculate_average_quality complete":
jpayne@68 1697 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1698 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1699 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1700 status = do_illumina_find_common_motifs(firstSubsampledFastqFileName, log)
jpayne@68 1701
jpayne@68 1702 if status == "8_illumina_find_common_motifs failed":
jpayne@68 1703 bDone = True
jpayne@68 1704
jpayne@68 1705 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1706 ## 10. illumina_run_tagdust
jpayne@68 1707 ##
jpayne@68 1708 ## NOTE: This step will be skipped. No need to run.
jpayne@68 1709 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1710 # if nextStepToDo == 10 or status == "9_illumina_run_dedupe complete":
jpayne@68 1711 if nextStepToDo == 9 or status == "8_illumina_find_common_motifs complete":
jpayne@68 1712 ## 20131023 skip this step
jpayne@68 1713 log.info("\n\n%sSTEP10 - Skipping tag dust <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1714 status = "10_illumina_run_tagdust in progress"
jpayne@68 1715 checkpoint_step_wrapper(status)
jpayne@68 1716
jpayne@68 1717 status = "10_illumina_run_tagdust complete"
jpayne@68 1718 checkpoint_step_wrapper(status)
jpayne@68 1719
jpayne@68 1720 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1721 ## 11. illumina_detect_read_contam
jpayne@68 1722 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1723 if nextStepToDo == 11 or status == "10_illumina_run_tagdust complete":
jpayne@68 1724 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1725 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1726 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1727 status = do_illumina_detect_read_contam(firstSubsampledFastqFileName, firstBptoCut, log)
jpayne@68 1728
jpayne@68 1729 if status == "11_illumina_detect_read_contam failed":
jpayne@68 1730 bDone = True
jpayne@68 1731
jpayne@68 1732
jpayne@68 1733 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1734 ## 13. illumina_subsampling_read_megablast
jpayne@68 1735 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1736 if nextStepToDo == 12 or status == "11_illumina_detect_read_contam complete":
jpayne@68 1737 if not bSkipBlast:
jpayne@68 1738 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1739 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1740 l = SAMP_FH.readline().strip()
jpayne@68 1741 firstSubsampledFastqFileName = l.split()[0]
jpayne@68 1742 totalReadCount = int(l.split()[1])
jpayne@68 1743 subsampledReadNum = int(l.split()[2])
jpayne@68 1744
jpayne@68 1745 status, secondSubsampledFastqFile, second_read_cnt_total, second_read_cnt_sampled = do_illumina_subsampling_read_blastn(firstSubsampledFastqFileName, skipSubsampling, subsampledReadNum, log)
jpayne@68 1746 log.debug("status=%s, secondSubsampledFastqFile=%s, second_read_cnt_total=%s, second_read_cnt_sampled=%s", status, secondSubsampledFastqFile, second_read_cnt_total, second_read_cnt_sampled)
jpayne@68 1747
jpayne@68 1748 else:
jpayne@68 1749 statsFile = RQCReadQcConfig.CFG["stats_file"]
jpayne@68 1750 append_rqc_stats(statsFile, ReadqcStats.ILLUMINA_SKIP_BLAST, "1", log)
jpayne@68 1751 log.info("\n\n%sSTEP13 - Run subsampling for Blast search <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1752 log.info("13_illumina_subsampling_read_megablast skipped.\n")
jpayne@68 1753 status = "13_illumina_subsampling_read_megablast complete"
jpayne@68 1754 checkpoint_step_wrapper(status)
jpayne@68 1755
jpayne@68 1756 if status == "13_illumina_subsampling_read_megablast failed":
jpayne@68 1757 bDone = True
jpayne@68 1758
jpayne@68 1759 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1760 # 14. illumina_read_blastn_refseq_archaea
jpayne@68 1761 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1762 if nextStepToDo == 14 or status == "13_illumina_subsampling_read_megablast complete":
jpayne@68 1763 if not bSkipBlast and not bSkipBlastRefseq:
jpayne@68 1764 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
jpayne@68 1765 nextStepToDo = 13
jpayne@68 1766 continue
jpayne@68 1767
jpayne@68 1768 if secondSubsampledFastqFile != "" and not os.path.isfile(secondSubsampledLogFile):
jpayne@68 1769 make_dir(os.path.join(outputPath, "subsample"))
jpayne@68 1770 with open(secondSubsampledLogFile, "w") as SAMP_FH:
jpayne@68 1771 SAMP_FH.write(secondSubsampledFastqFile + " " + str(second_read_cnt_total) + " " + str(second_read_cnt_sampled) + "\n")
jpayne@68 1772 else:
jpayne@68 1773 with open(secondSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1774 l = SAMP_FH.readline().strip()
jpayne@68 1775 secondSubsampledFastqFile = l.split()[0]
jpayne@68 1776 second_read_cnt_total = int(l.split()[1])
jpayne@68 1777 second_read_cnt_sampled = int(l.split()[2])
jpayne@68 1778
jpayne@68 1779 status = do_illumina_read_blastn_refseq_archaea(secondSubsampledFastqFile, second_read_cnt_sampled, log)
jpayne@68 1780 checkpoint_step_wrapper(status)
jpayne@68 1781
jpayne@68 1782 else:
jpayne@68 1783 log.info("\n\n%sSTEP14 - Run illumina_read_blastn_refseq_archaea analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1784 log.info("14_illumina_read_blastn_refseq_archaea skipped.\n")
jpayne@68 1785 status = "14_illumina_read_blastn_refseq_archaea in progress"
jpayne@68 1786 checkpoint_step_wrapper(status)
jpayne@68 1787 status = "14_illumina_read_blastn_refseq_archaea complete"
jpayne@68 1788 checkpoint_step_wrapper(status)
jpayne@68 1789
jpayne@68 1790 if status == "14_illumina_read_blastn_refseq_archaea failed":
jpayne@68 1791 bDone = True
jpayne@68 1792
jpayne@68 1793 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1794 # 15. illumina_read_blastn_refseq_bacteria
jpayne@68 1795 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1796 if nextStepToDo == 15 or status == "14_illumina_read_blastn_refseq_archaea complete":
jpayne@68 1797 if not bSkipBlast and not bSkipBlastRefseq:
jpayne@68 1798 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
jpayne@68 1799 nextStepToDo = 13
jpayne@68 1800 continue
jpayne@68 1801
jpayne@68 1802 if secondSubsampledFastqFile != "" and not os.path.isfile(secondSubsampledLogFile):
jpayne@68 1803 make_dir(os.path.join(outputPath, "subsample"))
jpayne@68 1804 with open(secondSubsampledLogFile, "w") as SAMP_FH:
jpayne@68 1805 SAMP_FH.write(secondSubsampledFastqFile + " " + str(second_read_cnt_total) + " " + str(second_read_cnt_sampled) + "\n")
jpayne@68 1806 else:
jpayne@68 1807 with open(secondSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1808 l = SAMP_FH.readline().strip()
jpayne@68 1809 secondSubsampledFastqFile = l.split()[0]
jpayne@68 1810 second_read_cnt_total = int(l.split()[1])
jpayne@68 1811 second_read_cnt_sampled = int(l.split()[2])
jpayne@68 1812
jpayne@68 1813 status = do_illumina_read_blastn_refseq_bacteria(secondSubsampledFastqFile, log)
jpayne@68 1814 checkpoint_step_wrapper(status)
jpayne@68 1815
jpayne@68 1816 else:
jpayne@68 1817 log.info("\n\n%sSTEP15 - Run illumina_read_blastn_refseq_bacteria analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1818 log.info("15_illumina_read_blastn_refseq_bacteria skipped.\n")
jpayne@68 1819 status = "15_illumina_read_blastn_refseq_bacteria in progress"
jpayne@68 1820 checkpoint_step_wrapper(status)
jpayne@68 1821 status = "15_illumina_read_blastn_refseq_bacteria complete"
jpayne@68 1822 checkpoint_step_wrapper(status)
jpayne@68 1823
jpayne@68 1824 if status == "15_illumina_read_blastn_refseq_bacteria failed":
jpayne@68 1825 bDone = True
jpayne@68 1826
jpayne@68 1827 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1828 ## 16. illumina_read_blastn_nt
jpayne@68 1829 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1830 if nextStepToDo == 16 or status == "15_illumina_read_blastn_refseq_bacteria complete":
jpayne@68 1831 if not bSkipBlast and not bSkipBlastNt:
jpayne@68 1832 if secondSubsampledFastqFile == "" and not os.path.isfile(secondSubsampledLogFile):
jpayne@68 1833 nextStepToDo = 13
jpayne@68 1834 continue
jpayne@68 1835
jpayne@68 1836 if secondSubsampledFastqFile == "" and os.path.isfile(secondSubsampledLogFile):
jpayne@68 1837 with open(secondSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1838 l = SAMP_FH.readline().strip()
jpayne@68 1839 secondSubsampledFastqFile = l.split()[0]
jpayne@68 1840 second_read_cnt_total = int(l.split()[1])
jpayne@68 1841 second_read_cnt_sampled = int(l.split()[2])
jpayne@68 1842
jpayne@68 1843 status = do_illumina_read_blastn_nt(secondSubsampledFastqFile, log)
jpayne@68 1844 checkpoint_step_wrapper(status)
jpayne@68 1845
jpayne@68 1846 else:
jpayne@68 1847 log.info("\n\n%sSTEP16 - Run illumina_read_blastn_nt analysis <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1848 log.info("16_illumina_read_blastn_nt skipped.\n")
jpayne@68 1849 status = "16_illumina_read_blastn_nt in progress"
jpayne@68 1850 checkpoint_step_wrapper(status)
jpayne@68 1851 status = "16_illumina_read_blastn_nt complete"
jpayne@68 1852 checkpoint_step_wrapper(status)
jpayne@68 1853
jpayne@68 1854 if status == "16_illumina_read_blastn_nt failed":
jpayne@68 1855 bDone = True
jpayne@68 1856
jpayne@68 1857 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1858 ## 17. multiplex_statistics
jpayne@68 1859 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1860 if nextStepToDo == 17 or status == "16_illumina_read_blastn_nt complete":
jpayne@68 1861 status = do_illumina_multiplex_statistics(fastq, log, isMultiplexed=isMultiplexed)
jpayne@68 1862
jpayne@68 1863 if status == "17_multiplex_statistics failed":
jpayne@68 1864 bDone = True
jpayne@68 1865
jpayne@68 1866 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1867 ## 18. end_of_read_illumina_adapter_check
jpayne@68 1868 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1869 if nextStepToDo == 18 or status == "17_multiplex_statistics complete":
jpayne@68 1870 if firstSubsampledFastqFileName == "" and os.path.isfile(firstSubsampledLogFile):
jpayne@68 1871 with open(firstSubsampledLogFile, "r") as SAMP_FH:
jpayne@68 1872 firstSubsampledFastqFileName = SAMP_FH.readline().strip().split()[0]
jpayne@68 1873 status = do_end_of_read_illumina_adapter_check(firstSubsampledFastqFileName, log)
jpayne@68 1874
jpayne@68 1875 if status == "18_end_of_read_illumina_adapter_check failed":
jpayne@68 1876 bDone = True
jpayne@68 1877
jpayne@68 1878 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1879 ## 19. insert size analysis (bbmerge.sh)
jpayne@68 1880 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1881 if nextStepToDo == 19 or status == "18_end_of_read_illumina_adapter_check complete":
jpayne@68 1882 status = do_insert_size_analysis(fastq, log)
jpayne@68 1883
jpayne@68 1884 if status == "19_insert_size_analysis failed":
jpayne@68 1885 bDone = True
jpayne@68 1886
jpayne@68 1887
jpayne@68 1888 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1889 ## 21. sketch vs nt, refseq, silva
jpayne@68 1890 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1891 # if nextStepToDo == 21 or status == "20_gc_divergence_analysis complete":
jpayne@68 1892 if nextStepToDo == 20 or status == "19_insert_size_analysis complete":
jpayne@68 1893 status = do_sketch_vs_nt_refseq_silva(fastq, log)
jpayne@68 1894
jpayne@68 1895 if status == "21_sketch_vs_nt_refseq_silva failed":
jpayne@68 1896 bDone = True
jpayne@68 1897
jpayne@68 1898 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1899 ## 22. postprocessing & reporting
jpayne@68 1900 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1901 if nextStepToDo == 22 or status == "21_sketch_vs_nt_refseq_silva complete":
jpayne@68 1902 ## 20131023 skip this step
jpayne@68 1903 log.info("\n\n%sSTEP22 - Run illumina_readqc_report_postprocess: mv rqc-*.tmp to rqc-*.txt <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<%s\n", color['pink'], color[''])
jpayne@68 1904 status = "22_illumina_readqc_report_postprocess in progress"
jpayne@68 1905 checkpoint_step_wrapper(status)
jpayne@68 1906
jpayne@68 1907 ## move rqc-files.tmp to rqc-files.txt
jpayne@68 1908 newFilesFile = os.path.join(outputPath, "readqc_files.txt")
jpayne@68 1909 newStatsFile = os.path.join(outputPath, "readqc_stats.txt")
jpayne@68 1910
jpayne@68 1911 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["files_file"], newFilesFile)
jpayne@68 1912 log.info("mv cmd: %s", cmd)
jpayne@68 1913 run_sh_command(cmd, True, log)
jpayne@68 1914
jpayne@68 1915 cmd = "mv %s %s " % (RQCReadQcConfig.CFG["stats_file"], newStatsFile)
jpayne@68 1916 log.info("mv cmd: %s", cmd)
jpayne@68 1917 run_sh_command(cmd, True, log)
jpayne@68 1918
jpayne@68 1919 log.info("22_illumina_readqc_report_postprocess complete.")
jpayne@68 1920 status = "22_illumina_readqc_report_postprocess complete"
jpayne@68 1921 checkpoint_step_wrapper(status)
jpayne@68 1922
jpayne@68 1923 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1924 ## 23. Cleanup
jpayne@68 1925 ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 1926 if nextStepToDo == 23 or status == "22_illumina_readqc_report_postprocess complete":
jpayne@68 1927 status = do_cleanup_readqc(log)
jpayne@68 1928
jpayne@68 1929 if status == "23_cleanup_readqc failed":
jpayne@68 1930 bDone = True
jpayne@68 1931
jpayne@68 1932 if status == "23_cleanup_readqc complete":
jpayne@68 1933 status = "complete" ## FINAL COMPLETE!
jpayne@68 1934 bDone = True
jpayne@68 1935
jpayne@68 1936 ## don't cycle more than 10 times ...
jpayne@68 1937 if cycle > 10:
jpayne@68 1938 bDone = True
jpayne@68 1939
jpayne@68 1940 if status != "complete":
jpayne@68 1941 log.info("Status %s", status)
jpayne@68 1942 else:
jpayne@68 1943 log.info("\n\nCompleted %s: %s", SCRIPT_NAME, fastq)
jpayne@68 1944 checkpoint_step_wrapper("complete")
jpayne@68 1945 log.info("Pipeline processing Done.")
jpayne@68 1946 print("Pipeline processing Done.")
jpayne@68 1947
jpayne@68 1948 else:
jpayne@68 1949 log.info("Pipeline processing is already complete, skip.")
jpayne@68 1950 print("Pipeline processing is already complete, skip.")
jpayne@68 1951
jpayne@68 1952 if status == 'complete' and options.html:
jpayne@68 1953 do_html(outputPath, fastq)
jpayne@68 1954
jpayne@68 1955
jpayne@68 1956 exit(0)
jpayne@68 1957
jpayne@68 1958
jpayne@68 1959 ## EOF