annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pytools/lib/common.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 #! /usr/bin/env python
jpayne@68 2 # -*- coding: utf-8 -*-
jpayne@68 3 """
jpayne@68 4 Function definitions common to all programs.
jpayne@68 5
jpayne@68 6
jpayne@68 7 """
jpayne@68 8
jpayne@68 9 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 10 ## libraries to use
jpayne@68 11
jpayne@68 12 #import re
jpayne@68 13 import os
jpayne@68 14 import time
jpayne@68 15 import sys
jpayne@68 16 #import getpass
jpayne@68 17 import logging
jpayne@68 18 #from colorlog import ColoredFormatter
jpayne@68 19 # import EnvironmentModules # get_read_count_fastq
jpayne@68 20 from subprocess import Popen, PIPE
jpayne@68 21 from email.mime.text import MIMEText
jpayne@68 22
jpayne@68 23
jpayne@68 24 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 25 ## function definitions
jpayne@68 26
jpayne@68 27
jpayne@68 28 '''
jpayne@68 29 creates a logging instance
jpayne@68 30 https://docs.python.org/2/howto/logging.html
jpayne@68 31 https://pypi.python.org/pypi/colorlog
jpayne@68 32 '''
jpayne@68 33 def get_logger(log_name, log_file, log_level = "INFO", stdout = False, color = False):
jpayne@68 34 log = logging.getLogger(log_name)
jpayne@68 35 handler = None
jpayne@68 36 if stdout:
jpayne@68 37 handler = logging.StreamHandler(sys.stdout)
jpayne@68 38 else:
jpayne@68 39 handler = logging.FileHandler(log_file)
jpayne@68 40
jpayne@68 41 formatter = logging.Formatter('%(filename)-15s:%(process)d %(asctime)s %(levelname)s: %(message)s')
jpayne@68 42
jpayne@68 43 if color and 1==2:
jpayne@68 44 """
jpayne@68 45 formatter = ColoredFormatter("%(filename)-15s:%(process)d %(asctime)s %(log_color)s%(levelname)s: %(message)s", datefmt=None, reset=True,
jpayne@68 46 log_colors={
jpayne@68 47 'DEBUG': 'blue',
jpayne@68 48 'INFO': 'green',
jpayne@68 49 'WARNING': 'yellow',
jpayne@68 50 'ERROR': 'red',
jpayne@68 51 'CRITICAL': 'red, bg_white',
jpayne@68 52 },
jpayne@68 53 secondary_log_colors={},
jpayne@68 54 style='%')
jpayne@68 55 Not working in conda - 2017-04-29
jpayne@68 56 """
jpayne@68 57 handler.setFormatter(formatter)
jpayne@68 58
jpayne@68 59 log.addHandler(handler)
jpayne@68 60 log.setLevel(log_level)
jpayne@68 61
jpayne@68 62 return log
jpayne@68 63
jpayne@68 64
jpayne@68 65 '''
jpayne@68 66 Checkpoint the status plus a timestamp
jpayne@68 67 - appends the status
jpayne@68 68
jpayne@68 69 @param status_log: /path/to/status.log (or whatever you name it)
jpayne@68 70 @param status: status to append to status.log
jpayne@68 71
jpayne@68 72 '''
jpayne@68 73 def checkpoint_step(status_log, status):
jpayne@68 74 status_line = "%s,%s\n" % (status, time.strftime("%Y-%m-%d %H:%M:%S"))
jpayne@68 75
jpayne@68 76 with open(status_log, "a") as myfile:
jpayne@68 77 myfile.write(status_line)
jpayne@68 78
jpayne@68 79 '''
jpayne@68 80 returns the last step (status) from the pipeline
jpayne@68 81 @param status_log: /path/to/status.log (or whatever you name it)
jpayne@68 82 @param log: logger object
jpayne@68 83
jpayne@68 84 @return last status in the status log, "start" if nothing there
jpayne@68 85 '''
jpayne@68 86 def get_status(status_log, log = None):
jpayne@68 87 #status_log = "%s/%s" % (output_path, "test_status.log")
jpayne@68 88
jpayne@68 89 status = "start"
jpayne@68 90 timestamp = str(time.strftime("%Y-%m-%d %H:%M:%S"))
jpayne@68 91
jpayne@68 92 if os.path.isfile(status_log):
jpayne@68 93 fh = open(status_log, 'r')
jpayne@68 94 lines = fh.readlines()
jpayne@68 95 fh.close()
jpayne@68 96
jpayne@68 97 for line in lines:
jpayne@68 98 if line.startswith('#'): continue
jpayne@68 99 line_list = line.split(",")
jpayne@68 100 assert len(line_list) == 2
jpayne@68 101 status = str(line_list[0]).strip()
jpayne@68 102 timestamp = str(line_list[1]).strip()
jpayne@68 103
jpayne@68 104 if not status:
jpayne@68 105 status = "start"
jpayne@68 106
jpayne@68 107 if log:
jpayne@68 108 log.info("Last checkpointed step: %s (%s)", status, timestamp)
jpayne@68 109
jpayne@68 110 else:
jpayne@68 111 if log:
jpayne@68 112 log.info("Cannot find status.log (%s), assuming new run", status_log)
jpayne@68 113
jpayne@68 114 status = status.strip().lower()
jpayne@68 115
jpayne@68 116 return status
jpayne@68 117
jpayne@68 118
jpayne@68 119
jpayne@68 120 '''
jpayne@68 121 run a command from python
jpayne@68 122
jpayne@68 123 @param cmd: command to run
jpayne@68 124 @param live: False = run in dry mode (print command), True = run normally
jpayne@68 125 @param log: logger object
jpayne@68 126
jpayne@68 127 @return std_out, std_err, exit_code
jpayne@68 128 '''
jpayne@68 129 def run_command(cmd, live=False, log=None):
jpayne@68 130 stdOut = None
jpayne@68 131 stdErr = None
jpayne@68 132 exitCode = None
jpayne@68 133 #end = 0
jpayne@68 134 #elapsedSec = 0
jpayne@68 135
jpayne@68 136 if cmd:
jpayne@68 137 if not live:
jpayne@68 138 stdOut = "Not live: cmd = '%s'" % (cmd)
jpayne@68 139 exitCode = 0
jpayne@68 140 else:
jpayne@68 141 if log: log.info("cmd: %s" % (cmd))
jpayne@68 142
jpayne@68 143 p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
jpayne@68 144 stdOut, stdErr = p.communicate()
jpayne@68 145 exitCode = p.returncode
jpayne@68 146
jpayne@68 147 if log:
jpayne@68 148 log.info("Return values: exitCode=" + str(exitCode) + ", stdOut=" + str(stdOut) + ", stdErr=" + str(stdErr))
jpayne@68 149 if exitCode != 0:
jpayne@68 150 log.warn("- The exit code has non-zero value.")
jpayne@68 151
jpayne@68 152 else:
jpayne@68 153 if log:
jpayne@68 154 log.error("- No command to run.")
jpayne@68 155 return None, None, -1
jpayne@68 156
jpayne@68 157 return stdOut, stdErr, exitCode
jpayne@68 158
jpayne@68 159
jpayne@68 160 '''
jpayne@68 161 replacement for run_command
jpayne@68 162 - includes logging, convert_cmd & post_mortem
jpayne@68 163 '''
jpayne@68 164
jpayne@68 165 def run_cmd(cmd, log=None):
jpayne@68 166
jpayne@68 167 std_out = None
jpayne@68 168 std_err = None
jpayne@68 169 exit_code = 0
jpayne@68 170
jpayne@68 171 if cmd:
jpayne@68 172 # convert to work on genepool/denovo
jpayne@68 173 cmd = convert_cmd(cmd)
jpayne@68 174
jpayne@68 175 if log:
jpayne@68 176 log.info("- cmd: %s", cmd)
jpayne@68 177
jpayne@68 178 p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
jpayne@68 179 std_out, std_err = p.communicate()
jpayne@68 180 exit_code = p.returncode
jpayne@68 181
jpayne@68 182 post_mortem_cmd(cmd, exit_code, std_out, std_err, log)
jpayne@68 183
jpayne@68 184 return std_out, std_err, exit_code
jpayne@68 185
jpayne@68 186
jpayne@68 187 '''
jpayne@68 188 Simple function to output to the log what happened only if exit code > 0
jpayne@68 189
jpayne@68 190 Typical usage:
jpayne@68 191 std_out, std_err, exit_code = run_command(cmd, True)
jpayne@68 192 post_mortem_cmd(cmd, exit_code, std_out, std_err)
jpayne@68 193
jpayne@68 194 '''
jpayne@68 195 def post_mortem_cmd(cmd, exit_code, std_out, std_err, log = None):
jpayne@68 196 if exit_code > 0:
jpayne@68 197 if log:
jpayne@68 198 log.error("- cmd failed: %s", cmd)
jpayne@68 199 log.error("- exit code: %s", exit_code)
jpayne@68 200
jpayne@68 201 else:
jpayne@68 202 print "- cmd failed: %s" % (cmd)
jpayne@68 203 print "- exit code: %s" % (exit_code)
jpayne@68 204
jpayne@68 205
jpayne@68 206 if std_out:
jpayne@68 207 if log:
jpayne@68 208 log.error("- std_out: %s", std_out)
jpayne@68 209 else:
jpayne@68 210 print "- std_out: %s" % (std_out)
jpayne@68 211
jpayne@68 212 if std_err:
jpayne@68 213 if log:
jpayne@68 214 log.error("- std_err: %s", std_err)
jpayne@68 215 else:
jpayne@68 216 print "- std_err: %s" % (std_err)
jpayne@68 217
jpayne@68 218
jpayne@68 219 '''
jpayne@68 220 Convert command to use genepool or denovo (shifter) to run
jpayne@68 221 replace #placeholder; with shifter or module load command
jpayne@68 222 #placeholder.v; should specify the version to use
jpayne@68 223
jpayne@68 224 This should be the only place in the pipelines that specifies the images/modules translation
jpayne@68 225 '''
jpayne@68 226 def convert_cmd(cmd):
jpayne@68 227 new_cmd = cmd
jpayne@68 228
jpayne@68 229 shifter_img = {
jpayne@68 230 "#bbtools" : "shifter --image=bryce911/bbtools ",
jpayne@68 231 "#pigz" : "module load pigz;",
jpayne@68 232 "#jamo" : "shifter --image=registry.services.nersc.gov/htandra/jamo_dev:1.0 ", # works, but would like simple module to use - have one on Denovo but not Cori
jpayne@68 233 "#gnuplot" : "shifter --image=bryce911/bbtools ", # (1)
jpayne@68 234 "#spades/3.9.0" : "shifter --image=bryce911/spades3.9.0 ",
jpayne@68 235 "#spades/3.10.1" : "shifter --image=bryce911/spades3.10.1 ",
jpayne@68 236 "#spades/3.11.0" : "shifter --image=bryce911/spades-3.11.0 ", # GAA-3383
jpayne@68 237 "#spades/3.11.1-check" : "shifter --image=bryce911/spades3.11.1-check ", # development
jpayne@68 238 "#prodigal/2.6.3" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ", # RQCSUPPORT-1318
jpayne@68 239 "#prodigal/2.5.0" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ",
jpayne@68 240 "#prodigal/2.50" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ",
jpayne@68 241 "#lastal/869" : "shifter --image=bryce911/lastal:869 ",
jpayne@68 242 "#lastal/828" : "shifter --image=bryce911/lastal:869 ",
jpayne@68 243 #"#lastal" : "shifter --image=bryce911/lastal:869 ",
jpayne@68 244 "#R/3.3.2" : "module load R/3.3.2;",
jpayne@68 245 "#texlive" : "shifter --image=bryce911/bbtools ", # (1)
jpayne@68 246 "#java" : "shifter --image=bryce911/bbtools ", # (1)
jpayne@68 247 "#blast+/2.6.0" : "shifter --image=sulsj/ncbi-blastplus:2.6.0 ",
jpayne@68 248 "#blast" : "shifter --image=sulsj/ncbi-blastplus:2.7.0 ",
jpayne@68 249 "#megahit-1.1.1" : "shifter --image=foster505/megahit:v1.1.1-2-g02102e1 ",
jpayne@68 250 "#smrtanalysis/2.3.0_p5" : "shifter --image=registry.services.nersc.gov/jgi/smrtanalysis:2.3.0_p5 ", # meth - need more memory
jpayne@68 251 "#mummer/3.23" : "shifter --image=bryce911/mummer3.23 ", # 3.23
jpayne@68 252 "#hmmer" : "shifter --image=registry.services.nersc.gov/jgi/hmmer:latest ", # 3.1b2
jpayne@68 253 "#samtools/1.4" : "shifter --image=rmonti/samtools ",
jpayne@68 254 "#mothur/1.39.5" : "shifter --image=bryce911/mothur1.39.5 ",
jpayne@68 255 "#vsearch/2.4.3" : "shifter --image=bryce911/vsearch2.4.3 ",
jpayne@68 256 "#graphviz" : "shifter --image=bryce911/bbtools ",
jpayne@68 257 "#ssu-align/0.1.1" : "shifter --image=bryce911/ssu-align0.1.1 ", # openmpi/1.10 included in docker container
jpayne@68 258 "#smrtlink/4.0.0.190159" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:4.0.0.190159 /smrtlink/smrtcmds/bin/", # progs not in path
jpayne@68 259 "#smrtlink/5.0.1.9585" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:5.0.1.9585 /smrtlink/smrtcmds/bin/", # progs not in path, Tony created 2017-10-16
jpayne@68 260 "#smrtlink" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:5.0.1.9585 /smrtlink/smrtcmds/bin/", # progs not in path
jpayne@68 261 "#prodege" : "shifter --image=bryce911/prodege ", # 2.2.1
jpayne@68 262 #"#hmmer" : "shifter --image=registry.services.nersc.gov/jgi/hmmer ", # 3.1b2 - Feb 2015, latest as of Oct 2017
jpayne@68 263 "#checkm" : "shifter --image=registry.services.nersc.gov/jgi/checkm ",
jpayne@68 264 }
jpayne@68 265
jpayne@68 266
jpayne@68 267 # (1) - added as part of the bryce911 bbtools package
jpayne@68 268
jpayne@68 269 #cmd = "#bbtools-shijie;bbmap...."
jpayne@68 270 # this dict will be deprecated as of March 2018 when genepool passes into legend
jpayne@68 271 genepool_mod = {
jpayne@68 272 "#bbtools" : "module load bbtools",
jpayne@68 273 "#pigz" : "module load pigz",
jpayne@68 274 "#jamo" : "module load jamo",
jpayne@68 275 "#gnuplot" : "module load gnuplot/4.6.2", # sag,iso,sps,ce:gc_cov, gc_histogram, contig_gc
jpayne@68 276 "#spades/3.9.0" : "module load spades/3.9.0",
jpayne@68 277 "#spades/3.10.1" : "module load spades/3.10.1",
jpayne@68 278 "#spades/3.11.1" : "module load spades/3.11.1-check",
jpayne@68 279 "#prodigal/2.6.3" : "module load prodigal/2.50", # aka 2.50, also 2.60 is available
jpayne@68 280 "#prodigal/2.5.0" : "module load prodigal/2.50",
jpayne@68 281 "#prodigal/2.50" : "module load prodigal/2.50",
jpayne@68 282 #"#lastal" : "module load last/828",
jpayne@68 283 "#lastal/828" : "module load last/828",
jpayne@68 284 "#R/3.3.2" : "module unload R;module load R/3.3.1", # 3.3.2 not on genepool - RQCSUPPORT-1516 unload R for Kecia
jpayne@68 285 "#texlive" : "module load texlive",
jpayne@68 286 "#blast+/2.6.0" : "module load blast+/2.6.0",
jpayne@68 287 #"#blast+/2.7.0" : "module load blast+/2.7.0", # not created
jpayne@68 288 "#blast" : "module load blast+/2.6.0",
jpayne@68 289 "#java" : "", # java runs natively on genepool
jpayne@68 290 "#megahit-1.1.1" : "module load megahit/1.1.1",
jpayne@68 291 "#smrtanalysis/2.3.0_p5" : "module load smrtanalysis/2.3.0_p5",
jpayne@68 292 "#smrtanalysis/2.3.0_p5_xmx32g" : "module load smrtanalysis/2.3.0_p5;export _JAVA_OPTIONS='-Xmx32g'",
jpayne@68 293 "#mummer/3.23" : "module load mummer/3.23",
jpayne@68 294 "#hmmer" : "module load hmmer/3.1b2",
jpayne@68 295 "#samtools/1.4" : "module load samtools/1.4",
jpayne@68 296 "#mothur/1.39.5" : "module load mothur/1.32.1", # 1.26.0 default, 1.32.1
jpayne@68 297 "#vsearch/2.4.3" : "module load vsearch/2.3.0", # 2.3.0
jpayne@68 298 "#graphviz" : "module load graphviz",
jpayne@68 299 "#ssu-align/0.1.1" : "module load ssu-align",
jpayne@68 300 "#smrtlink/4.0.0.190159" : "module load smrtlink/4.0.0.190159",
jpayne@68 301 "#smrtlink" : "module load smrtlink/5.0.1.9585",
jpayne@68 302 "#smrtlink/5.0.1.9585" : "module load smrtlink/5.0.1.9585",
jpayne@68 303 "#prodege" : "module load R;/projectb/sandbox/rqc/prod/pipelines/external_tools/sag_decontam/prodege-2.2/bin/",
jpayne@68 304 "#checkm" : "module load hmmer prodigal pplacer", # checkm installed in python by default on genepool
jpayne@68 305 }
jpayne@68 306
jpayne@68 307 #bbtools;stats.sh
jpayne@68 308
jpayne@68 309 if cmd.startswith("#"):
jpayne@68 310
jpayne@68 311 cluster = "genepool"
jpayne@68 312 # any other env ids to use?
jpayne@68 313
jpayne@68 314 # cori, denovo, genepool
jpayne@68 315 cluster = os.environ.get('NERSC_HOST', 'unknown')
jpayne@68 316
jpayne@68 317
jpayne@68 318 f = cmd.find(";")
jpayne@68 319 mod = "" # command to replace
jpayne@68 320 if f > -1:
jpayne@68 321 mod = cmd[0:f]
jpayne@68 322
jpayne@68 323 if mod:
jpayne@68 324
jpayne@68 325
jpayne@68 326 # use module load jamo on denovo
jpayne@68 327 if mod == "#jamo" and cluster == "denovo":
jpayne@68 328 shifter_img[mod] = "module load jamo;"
jpayne@68 329
jpayne@68 330 if cluster in ("denovo", "cori"):
jpayne@68 331
jpayne@68 332 if mod in shifter_img:
jpayne@68 333 new_cmd = new_cmd.replace(mod + ";", shifter_img[mod])
jpayne@68 334
jpayne@68 335 else:
jpayne@68 336 if mod in genepool_mod:
jpayne@68 337 if genepool_mod[mod] == "":
jpayne@68 338 new_cmd = new_cmd.replace(mod + ";", "")
jpayne@68 339 else:
jpayne@68 340 new_cmd = new_cmd.replace(mod, genepool_mod[mod])
jpayne@68 341
jpayne@68 342 if new_cmd.startswith("#"):
jpayne@68 343 print "Command not found! %s" % new_cmd
jpayne@68 344 sys.exit(18)
jpayne@68 345
jpayne@68 346 #print new_cmd
jpayne@68 347 return new_cmd
jpayne@68 348
jpayne@68 349
jpayne@68 350 '''
jpayne@68 351 returns human readable file size
jpayne@68 352 @param num = file size (e.g. 1000)
jpayne@68 353
jpayne@68 354 @return: readable float e.g. 1.5 KB
jpayne@68 355 '''
jpayne@68 356 def human_size(num):
jpayne@68 357 if not num:
jpayne@68 358 num = 0.0
jpayne@68 359
jpayne@68 360 for x in ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'XB']:
jpayne@68 361 if num < 1024.0:
jpayne@68 362 return "%3.1f %s" % (num, x)
jpayne@68 363 num /= 1024.0
jpayne@68 364
jpayne@68 365 return "%3.1f %s" % (num, 'ZB')
jpayne@68 366
jpayne@68 367
jpayne@68 368
jpayne@68 369 '''
jpayne@68 370 send out email
jpayne@68 371 @param emailTo: email receipient (e.g. bryce@lbl.gov)
jpayne@68 372 @param emailSubject: subject line for the email
jpayne@68 373 @param emailBody: content of the email
jpayne@68 374 @param emailFrom: optional email from
jpayne@68 375
jpayne@68 376 '''
jpayne@68 377 def send_email(email_to, email_subject, email_body, email_from = 'rqc@jgi-psf.org', log = None):
jpayne@68 378 msg = ""
jpayne@68 379 err_flag = 0
jpayne@68 380
jpayne@68 381 if not email_to:
jpayne@68 382 msg = "- send_email: email_to parameter missing!"
jpayne@68 383
jpayne@68 384 if not email_subject:
jpayne@68 385 msg = "- send_email: email_subject parameter missing!"
jpayne@68 386
jpayne@68 387 if not email_body:
jpayne@68 388 msg = "- send_email: email_body parameter missing!"
jpayne@68 389
jpayne@68 390
jpayne@68 391 if err_flag == 0:
jpayne@68 392 msg = "- sending email to: %s" % (email_to)
jpayne@68 393
jpayne@68 394 if log:
jpayne@68 395 log.info(msg)
jpayne@68 396 else:
jpayne@68 397 print msg
jpayne@68 398
jpayne@68 399 if err_flag == 1:
jpayne@68 400 return 0
jpayne@68 401
jpayne@68 402 # assume html
jpayne@68 403 email_msg = MIMEText(email_body, "html") # vs "plain"
jpayne@68 404 email_msg['Subject'] = email_subject
jpayne@68 405 email_msg['From'] = email_from
jpayne@68 406 email_msg['To'] = email_to
jpayne@68 407
jpayne@68 408 p = Popen(["/usr/sbin/sendmail", "-t"], stdin = PIPE)
jpayne@68 409 p.communicate(email_msg.as_string())
jpayne@68 410
jpayne@68 411 return err_flag
jpayne@68 412
jpayne@68 413 '''
jpayne@68 414 Write to rqc_file (e.g. rqc-files.tmp) the file_key and file_value
jpayne@68 415
jpayne@68 416 @param rqc_file_log: full path to file containing key=file
jpayne@68 417 @param file_key: key for the entry
jpayne@68 418 @param file_value: value for the entry
jpayne@68 419
jpayne@68 420 '''
jpayne@68 421 def append_rqc_file(rqc_file_log, file_key, file_value, log=None):
jpayne@68 422 if file_key:
jpayne@68 423
jpayne@68 424 buffer = "%s = %s\n" % (file_key, file_value)
jpayne@68 425
jpayne@68 426 with open(rqc_file_log, "a") as myfile:
jpayne@68 427 myfile.write(buffer)
jpayne@68 428
jpayne@68 429 if log: log.info("append_rqc_file: %s:%s" % (file_key, file_value))
jpayne@68 430
jpayne@68 431 else:
jpayne@68 432
jpayne@68 433 if log: log.warning("key or value error: %s:%s" % (file_key, file_value))
jpayne@68 434
jpayne@68 435 '''
jpayne@68 436 Write to rqc_stats (e.g. rqc-stats.tmp) the stats_key and stats_value
jpayne@68 437 @param rqc_file_log: full path to file containing key=file
jpayne@68 438 @param file_key: key for the entry
jpayne@68 439 @param file_value: value for the entry
jpayne@68 440
jpayne@68 441 '''
jpayne@68 442 def append_rqc_stats(rqc_stats_log, stats_key, stats_value, log=None):
jpayne@68 443 if stats_key:
jpayne@68 444
jpayne@68 445 buffer = "%s = %s\n" % (stats_key, stats_value)
jpayne@68 446
jpayne@68 447 with open(rqc_stats_log, "a") as myfile:
jpayne@68 448 myfile.write(buffer)
jpayne@68 449
jpayne@68 450 if log: log.info("append_rqc_stats: %s:%s" % (stats_key, stats_value))
jpayne@68 451
jpayne@68 452 else:
jpayne@68 453
jpayne@68 454 if log: log.warning("key or value error: %s:%s" % (stats_key, stats_value))
jpayne@68 455
jpayne@68 456 '''
jpayne@68 457 Return the file system path to jgi-rqc-pipeline so we can use */tools and */lib
jpayne@68 458
jpayne@68 459 @return /path/to/jgi-rqc-pipelines
jpayne@68 460 '''
jpayne@68 461 def get_run_path():
jpayne@68 462 current_path = os.path.dirname(os.path.abspath(__file__))
jpayne@68 463 run_path = os.path.abspath(os.path.join(current_path, os.pardir))
jpayne@68 464
jpayne@68 465 return run_path
jpayne@68 466
jpayne@68 467
jpayne@68 468
jpayne@68 469 '''
jpayne@68 470 Simple read count using bbtools n_contigs field
jpayne@68 471 - slightly different than in rqc_utility
jpayne@68 472 n_scaffolds n_contigs scaf_bp contig_bp gap_pct scaf_N50 scaf_L50 ctg_N50 ctg_L50 scaf_N90 scaf_L90 ctg_N90 ctg_L90 scaf_max ctg_max scaf_n_gt50K scaf_pct_gt50K gc_avg gc_std
jpayne@68 473 1346616 1346616 405331416 405331415 0.000 1346616 301 1346615 301 1346616 301 1346615 301 301 301 0 0.000 0.44824 0.02675
jpayne@68 474
jpayne@68 475 '''
jpayne@68 476 def get_read_count_fastq(fastq, log = None):
jpayne@68 477 read_cnt = 0
jpayne@68 478
jpayne@68 479 if os.path.isfile(fastq):
jpayne@68 480
jpayne@68 481 # EnvironmentModules.module(["load", "bbtools"])
jpayne@68 482 # bbtools faster than zcat | wc because bbtools uses pigz
jpayne@68 483 # cmd = "stats.sh format=3 in=%s" % fastq
jpayne@68 484 cmd = "#bbtools;stats.sh format=3 in=%s" % fastq
jpayne@68 485 cmd = convert_cmd(cmd)
jpayne@68 486
jpayne@68 487
jpayne@68 488 if log:
jpayne@68 489 log.info("- cmd: %s", cmd)
jpayne@68 490
jpayne@68 491 std_out, std_err, exit_code = run_command(cmd, True)
jpayne@68 492
jpayne@68 493 # EnvironmentModules.module(["unload", "bbtools"])
jpayne@68 494
jpayne@68 495 if exit_code == 0 and std_out:
jpayne@68 496
jpayne@68 497 line_list = std_out.split("\n")
jpayne@68 498 #print line_list
jpayne@68 499 val_list = str(line_list[1]).split() #.split('\t')
jpayne@68 500 #print "v = %s" % val_list
jpayne@68 501
jpayne@68 502 read_cnt = int(val_list[1])
jpayne@68 503
jpayne@68 504 if log:
jpayne@68 505 log.info("- read count: %s", read_cnt)
jpayne@68 506
jpayne@68 507 else:
jpayne@68 508 if log:
jpayne@68 509 post_mortem_cmd(cmd, exit_code, std_out, std_err, log)
jpayne@68 510
jpayne@68 511
jpayne@68 512 else:
jpayne@68 513 log.error("- fastq: %s does not exist!", fastq)
jpayne@68 514
jpayne@68 515 return read_cnt
jpayne@68 516
jpayne@68 517
jpayne@68 518
jpayne@68 519 '''
jpayne@68 520 Subsampling calculation
jpayne@68 521 0 .. 250k reads = 100%
jpayne@68 522 250k .. 25m = 100% to 1%
jpayne@68 523 25m .. 600m = 1%
jpayne@68 524 600m+ .. oo < 1%
jpayne@68 525
jpayne@68 526 July 2014 - 15 runs > 600m (HiSeq-2500 Rapid) - 4 actual libraries / 85325 seq units
jpayne@68 527 - returns new subsampling rate
jpayne@68 528 '''
jpayne@68 529 def get_subsample_rate(read_count):
jpayne@68 530 subsample = 0
jpayne@68 531 subsample_rate = 0.01
jpayne@68 532 max_subsample = 6000000 # 4 hours of blast time
jpayne@68 533
jpayne@68 534 new_subsample_rate = 250000.0/read_count
jpayne@68 535 subsample_rate = max(new_subsample_rate, subsample_rate)
jpayne@68 536 subsample_rate = min(1, subsample_rate) # if subsample_rate > 1, then set to 1
jpayne@68 537
jpayne@68 538 subsample = int(read_count * subsample_rate)
jpayne@68 539
jpayne@68 540 if subsample > max_subsample:
jpayne@68 541 subsample = max_subsample
jpayne@68 542
jpayne@68 543 subsample_rate = subsample / float(read_count)
jpayne@68 544
jpayne@68 545 return subsample_rate
jpayne@68 546
jpayne@68 547
jpayne@68 548 '''
jpayne@68 549 Set color hash
jpayne@68 550 - need to update to remove "c" parameter - used in too many places
jpayne@68 551 '''
jpayne@68 552 def set_colors(c, use_color = False):
jpayne@68 553
jpayne@68 554 if use_color == False:
jpayne@68 555
jpayne@68 556 color = {
jpayne@68 557 'black' : "",
jpayne@68 558 'red' : "",
jpayne@68 559 'green' : "",
jpayne@68 560 'yellow' : "",
jpayne@68 561 'blue' : "",
jpayne@68 562 'pink' : "",
jpayne@68 563 'cyan' : "",
jpayne@68 564 'white' : "",
jpayne@68 565 '' : ""
jpayne@68 566 }
jpayne@68 567
jpayne@68 568 else:
jpayne@68 569
jpayne@68 570 color = {
jpayne@68 571 'black' : "\033[1;30m",
jpayne@68 572 'red' : "\033[1;31m",
jpayne@68 573 'green' : "\033[1;32m",
jpayne@68 574 'yellow' : "\033[1;33m",
jpayne@68 575 'blue' : "\033[1;34m",
jpayne@68 576 'pink' : "\033[1;35m",
jpayne@68 577 'cyan' : "\033[1;36m",
jpayne@68 578 'white' : "\033[1;37m",
jpayne@68 579 '' : "\033[m"
jpayne@68 580 }
jpayne@68 581
jpayne@68 582
jpayne@68 583 return color
jpayne@68 584
jpayne@68 585 '''
jpayne@68 586 New function that just returns colors
jpayne@68 587 '''
jpayne@68 588 def get_colors():
jpayne@68 589
jpayne@68 590 color = {
jpayne@68 591 'black' : "\033[1;30m",
jpayne@68 592 'red' : "\033[1;31m",
jpayne@68 593 'green' : "\033[1;32m",
jpayne@68 594 'yellow' : "\033[1;33m",
jpayne@68 595 'blue' : "\033[1;34m",
jpayne@68 596 'pink' : "\033[1;35m",
jpayne@68 597 'cyan' : "\033[1;36m",
jpayne@68 598 'white' : "\033[1;37m",
jpayne@68 599 '' : "\033[m"
jpayne@68 600 }
jpayne@68 601
jpayne@68 602
jpayne@68 603 return color
jpayne@68 604
jpayne@68 605
jpayne@68 606
jpayne@68 607 '''
jpayne@68 608 Returns msg_ok, msg_fail, msg_warn colored or not colored
jpayne@68 609 '''
jpayne@68 610 def get_msg_settings(color):
jpayne@68 611
jpayne@68 612 msg_ok = "[ "+color['green']+"OK"+color['']+" ]"
jpayne@68 613 msg_fail = "[ "+color['red']+"FAIL"+color['']+" ]"
jpayne@68 614 msg_warn = "[ "+color['yellow']+"WARN"+color['']+" ]"
jpayne@68 615
jpayne@68 616 return msg_ok, msg_fail, msg_warn
jpayne@68 617
jpayne@68 618
jpayne@68 619 '''
jpayne@68 620 Use RQC's ap_tool to get the status
jpayne@68 621 set mode = "-sa" to show all, even completed
jpayne@68 622 '''
jpayne@68 623 def get_analysis_project_id(seq_proj_id, target_analysis_project_id, target_analysis_task_id, output_path, log = None, mode = ""):
jpayne@68 624
jpayne@68 625 if log:
jpayne@68 626 log.info("get_analysis_project_id: spid = %s, tapid = %s, tatid = %s", seq_proj_id, target_analysis_project_id, target_analysis_task_id)
jpayne@68 627
jpayne@68 628 analysis_project_id = 0
jpayne@68 629 analysis_task_id = 0
jpayne@68 630 project_type = None
jpayne@68 631 task_type = None
jpayne@68 632
jpayne@68 633 ap_list = os.path.join(output_path, "ap-info.txt")
jpayne@68 634 AP_TOOL = "/global/dna/projectdirs/PI/rqc/prod/jgi-rqc-pipeline/tools/ap_tool.py"
jpayne@68 635 #AP_TOOL = "/global/homes/b/brycef/git/jgi-rqc-pipeline/tools/ap_tool.py"
jpayne@68 636 cmd = "%s -spid %s -m psv -tapid %s -tatid %s %s > %s 2>&1" % (AP_TOOL, seq_proj_id, target_analysis_project_id, target_analysis_task_id, mode, ap_list)
jpayne@68 637 if log:
jpayne@68 638 log.info("- cmd: %s", cmd)
jpayne@68 639 else:
jpayne@68 640 print "- cmd: %s" % cmd
jpayne@68 641 std_out, std_err, exit_code = run_command(cmd, True)
jpayne@68 642
jpayne@68 643 post_mortem_cmd(cmd, exit_code, std_out, std_err, log)
jpayne@68 644
jpayne@68 645 if os.path.isfile(ap_list):
jpayne@68 646
jpayne@68 647 ap_dict = {} # header = value
jpayne@68 648 cnt = 0
jpayne@68 649 fh = open(ap_list, "r")
jpayne@68 650 for line in fh:
jpayne@68 651 arr = line.strip().split("|")
jpayne@68 652 if cnt == 0:
jpayne@68 653 c2 = 0 # position of title in header
jpayne@68 654 for a in arr:
jpayne@68 655 ap_dict[a.lower()] = c2
jpayne@68 656 c2 += 1
jpayne@68 657
jpayne@68 658 else:
jpayne@68 659
jpayne@68 660 for a in ap_dict:
jpayne@68 661
jpayne@68 662 if ap_dict[a] + 1 > len(arr):
jpayne@68 663 pass
jpayne@68 664 else:
jpayne@68 665 ap_dict[a] = arr[ap_dict[a]]
jpayne@68 666
jpayne@68 667
jpayne@68 668 cnt += 1
jpayne@68 669
jpayne@68 670 fh.close()
jpayne@68 671
jpayne@68 672
jpayne@68 673 analysis_project_id = ap_dict.get("analysis project id")
jpayne@68 674 analysis_task_id = ap_dict.get("analysis task id")
jpayne@68 675 project_type = ap_dict.get("analysis product name")
jpayne@68 676 task_type = ap_dict.get("analysis task name")
jpayne@68 677
jpayne@68 678 # nno such project
jpayne@68 679 if cnt == 1:
jpayne@68 680 analysis_project_id = 0
jpayne@68 681 analysis_task_id = 0
jpayne@68 682
jpayne@68 683 if log:
jpayne@68 684 log.info("- project type: %s, task type: %s", project_type, task_type)
jpayne@68 685 log.info("- analysis_project_id: %s, analysis_task_id: %s", analysis_project_id, analysis_task_id)
jpayne@68 686
jpayne@68 687 try:
jpayne@68 688 analysis_project_id = int(analysis_project_id)
jpayne@68 689 analysis_task_id = int(analysis_task_id)
jpayne@68 690 except:
jpayne@68 691 analysis_project_id = 0
jpayne@68 692 analysis_task_id = 0
jpayne@68 693
jpayne@68 694
jpayne@68 695 # ap = 4, at = 8 means its using the column names but didn't find anything
jpayne@68 696 if analysis_project_id < 100:
jpayne@68 697 analysis_project_id = 0
jpayne@68 698 if analysis_task_id < 100:
jpayne@68 699 analysis_task_id = 0
jpayne@68 700
jpayne@68 701 return analysis_project_id, analysis_task_id
jpayne@68 702
jpayne@68 703
jpayne@68 704 '''
jpayne@68 705 For creating a dot file from the pipeline flow
jpayne@68 706 '''
jpayne@68 707 def append_flow(flow_file, orig_node, orig_label, next_node, next_label, link_label):
jpayne@68 708
jpayne@68 709 fh = open(flow_file, "a")
jpayne@68 710 fh.write("%s|%s|%s|%s|%s\n" % (orig_node, orig_label, next_node, next_label, link_label))
jpayne@68 711 fh.close()
jpayne@68 712
jpayne@68 713 '''
jpayne@68 714 Flow file format:
jpayne@68 715 # comment
jpayne@68 716 *label|PBMID Pipeline run for BTXXY<br><font point-size="10">Run Date: 2017-09-28 14:22:50</font>
jpayne@68 717 # origin node, origin label, next node, next label, link label
jpayne@68 718 input_h5|BTXXY H5<br><font point-size="10">3 smrtcells</font>|assembly|HGAP Assembly<FONT POINT-SIZE="10"><br>3 contigs, 13,283,382bp</FONT>|HGAP v4.0.1
jpayne@68 719
jpayne@68 720 nodes should be the output of the transformation between the nodes
jpayne@68 721 e.g. input fastq (25m reads) --[ bbtools subsampling ]--> subsampled fastq (10m reads)
jpayne@68 722
jpayne@68 723 creates a dot file, to convert to png use:
jpayne@68 724 $ module load graphviz
jpayne@68 725 $ dot -T png (dot file) > (png file)
jpayne@68 726
jpayne@68 727 More info on formatting the labels
jpayne@68 728 http://www.graphviz.org/content/node-shapes#html
jpayne@68 729 '''
jpayne@68 730 def dot_flow(flow_file, dot_file, log = None):
jpayne@68 731
jpayne@68 732 if not os.path.isfile(flow_file):
jpayne@68 733 if log:
jpayne@68 734 log.info("- cannot find flow file: %s", flow_file)
jpayne@68 735 else:
jpayne@68 736 print "Cannot find flow file: %s" % flow_file
jpayne@68 737 return
jpayne@68 738
jpayne@68 739
jpayne@68 740 fhw = open(dot_file, "w")
jpayne@68 741
jpayne@68 742 fhw.write("// dot file\n")
jpayne@68 743 fhw.write("digraph rqc {\n") # directed graph
jpayne@68 744 fhw.write(" node [shape=box];\n")
jpayne@68 745 fhw.write(" rankdir=LR;\n")
jpayne@68 746
jpayne@68 747 fh = open(flow_file, "r")
jpayne@68 748
jpayne@68 749 for line in fh:
jpayne@68 750 line = line.strip()
jpayne@68 751
jpayne@68 752 if not line:
jpayne@68 753 continue
jpayne@68 754
jpayne@68 755 if line.startswith("#"):
jpayne@68 756 continue
jpayne@68 757
jpayne@68 758 # graph label
jpayne@68 759 if line.startswith("*label"):
jpayne@68 760 arr = line.split("|")
jpayne@68 761 label = flow_replace(str(arr[1]))
jpayne@68 762
jpayne@68 763 fhw.write(" label=<%s>;\n" % label)
jpayne@68 764 fhw.write(" labelloc=top;\n")
jpayne@68 765
jpayne@68 766 else:
jpayne@68 767
jpayne@68 768 arr = line.split("|")
jpayne@68 769 #print arr
jpayne@68 770
jpayne@68 771 if len(arr) == 5:
jpayne@68 772
jpayne@68 773 org_node = arr[0]
jpayne@68 774 org_label = str(arr[1])
jpayne@68 775 next_node = arr[2]
jpayne@68 776 next_label = str(arr[3])
jpayne@68 777 link_label = str(arr[4])
jpayne@68 778
jpayne@68 779 # must be <br/> in the dot file, I have a habit of using <br>
jpayne@68 780 org_label = flow_replace(org_label)
jpayne@68 781 next_label = flow_replace(next_label)
jpayne@68 782 link_label = flow_replace(link_label)
jpayne@68 783
jpayne@68 784 # label are enclosed by < > instead of " " to handle html-ish markups
jpayne@68 785
jpayne@68 786 if next_node:
jpayne@68 787 link = " %s -> %s;\n" % (org_node, next_node)
jpayne@68 788 if link_label:
jpayne@68 789 link = " %s -> %s [label=<%s>];\n" % (org_node, next_node, link_label)
jpayne@68 790 fhw.write(link)
jpayne@68 791
jpayne@68 792 if org_label:
jpayne@68 793 label = " %s [label=<%s>];\n" % (org_node, org_label)
jpayne@68 794 fhw.write(label)
jpayne@68 795
jpayne@68 796 if next_label:
jpayne@68 797 label = " %s [label=<%s>];\n" % (next_node, next_label)
jpayne@68 798 fhw.write(label)
jpayne@68 799
jpayne@68 800
jpayne@68 801
jpayne@68 802 fh.close()
jpayne@68 803
jpayne@68 804 fhw.write("}\n")
jpayne@68 805
jpayne@68 806 fhw.close()
jpayne@68 807 if log:
jpayne@68 808 log.info("- created dot file: %s", dot_file)
jpayne@68 809
jpayne@68 810 return dot_file
jpayne@68 811
jpayne@68 812 '''
jpayne@68 813 simple replacements
jpayne@68 814 '''
jpayne@68 815 def flow_replace(my_str):
jpayne@68 816
jpayne@68 817 new_str = my_str.replace("<br>", "<br/>").replace("<smf>", "<font point-size=\"10\">").replace("</f>", "</font>")
jpayne@68 818
jpayne@68 819 return new_str
jpayne@68 820
jpayne@68 821
jpayne@68 822 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
jpayne@68 823 ## main program
jpayne@68 824
jpayne@68 825
jpayne@68 826 if __name__ == "__main__":
jpayne@68 827 # unit tests
jpayne@68 828
jpayne@68 829 print human_size(102192203)
jpayne@68 830 print human_size(250000000000)
jpayne@68 831 #print get_read_count_fastq("/global/projectb/scratch/brycef/sag/phix/11185.1.195330.UNKNOWN_matched.fastq.gz")
jpayne@68 832
jpayne@68 833 cmd = "#bbtools;bbduk.sh in=/global/dna/dm_archive/sdm/illumina//01/14/88/11488.1.208132.UNKNOWN.fastq.gz ref=/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa outm=/global/projectb/scratch/brycef/phix/11488/11488.1.208132.UNKNOWN_matched.fastq.gz outu=/global/projectb/scratch/brycef/phix/11488/11488.1.208132.UNKNOWN_unmatched.fastq.gz"
jpayne@68 834 print convert_cmd(cmd)
jpayne@68 835 cmd = "#pigz;pigz /global/projectb/scratch/brycef/align/BTOYH/genome/11463.6.208000.CAAGGTC-AGACCTT.filter-RNA.fastq.gz-genome.sam"
jpayne@68 836 print convert_cmd(cmd)
jpayne@68 837
jpayne@68 838 cmd = "#java;java -version"
jpayne@68 839 print convert_cmd(cmd)
jpayne@68 840
jpayne@68 841 dot_flow("/global/projectb/scratch/brycef/pbmid/BWOAU/f2.flow", "/global/projectb/scratch/brycef/pbmid/BWOAU/BWOUAx.dot")
jpayne@68 842
jpayne@68 843 sys.exit(0)
jpayne@68 844
jpayne@68 845