jpayne@68: #! /usr/bin/env python jpayne@68: # -*- coding: utf-8 -*- jpayne@68: """ jpayne@68: Function definitions common to all programs. jpayne@68: jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ jpayne@68: ## libraries to use jpayne@68: jpayne@68: #import re jpayne@68: import os jpayne@68: import time jpayne@68: import sys jpayne@68: #import getpass jpayne@68: import logging jpayne@68: #from colorlog import ColoredFormatter jpayne@68: # import EnvironmentModules # get_read_count_fastq jpayne@68: from subprocess import Popen, PIPE jpayne@68: from email.mime.text import MIMEText jpayne@68: jpayne@68: jpayne@68: ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ jpayne@68: ## function definitions jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: creates a logging instance jpayne@68: https://docs.python.org/2/howto/logging.html jpayne@68: https://pypi.python.org/pypi/colorlog jpayne@68: ''' jpayne@68: def get_logger(log_name, log_file, log_level = "INFO", stdout = False, color = False): jpayne@68: log = logging.getLogger(log_name) jpayne@68: handler = None jpayne@68: if stdout: jpayne@68: handler = logging.StreamHandler(sys.stdout) jpayne@68: else: jpayne@68: handler = logging.FileHandler(log_file) jpayne@68: jpayne@68: formatter = logging.Formatter('%(filename)-15s:%(process)d %(asctime)s %(levelname)s: %(message)s') jpayne@68: jpayne@68: if color and 1==2: jpayne@68: """ jpayne@68: formatter = ColoredFormatter("%(filename)-15s:%(process)d %(asctime)s %(log_color)s%(levelname)s: %(message)s", datefmt=None, reset=True, jpayne@68: log_colors={ jpayne@68: 'DEBUG': 'blue', jpayne@68: 'INFO': 'green', jpayne@68: 'WARNING': 'yellow', jpayne@68: 'ERROR': 'red', jpayne@68: 'CRITICAL': 'red, bg_white', jpayne@68: }, jpayne@68: secondary_log_colors={}, jpayne@68: style='%') jpayne@68: Not working in conda - 2017-04-29 jpayne@68: """ jpayne@68: handler.setFormatter(formatter) jpayne@68: jpayne@68: log.addHandler(handler) jpayne@68: log.setLevel(log_level) jpayne@68: jpayne@68: return log jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Checkpoint the status plus a timestamp jpayne@68: - appends the status jpayne@68: jpayne@68: @param status_log: /path/to/status.log (or whatever you name it) jpayne@68: @param status: status to append to status.log jpayne@68: jpayne@68: ''' jpayne@68: def checkpoint_step(status_log, status): jpayne@68: status_line = "%s,%s\n" % (status, time.strftime("%Y-%m-%d %H:%M:%S")) jpayne@68: jpayne@68: with open(status_log, "a") as myfile: jpayne@68: myfile.write(status_line) jpayne@68: jpayne@68: ''' jpayne@68: returns the last step (status) from the pipeline jpayne@68: @param status_log: /path/to/status.log (or whatever you name it) jpayne@68: @param log: logger object jpayne@68: jpayne@68: @return last status in the status log, "start" if nothing there jpayne@68: ''' jpayne@68: def get_status(status_log, log = None): jpayne@68: #status_log = "%s/%s" % (output_path, "test_status.log") jpayne@68: jpayne@68: status = "start" jpayne@68: timestamp = str(time.strftime("%Y-%m-%d %H:%M:%S")) jpayne@68: jpayne@68: if os.path.isfile(status_log): jpayne@68: fh = open(status_log, 'r') jpayne@68: lines = fh.readlines() jpayne@68: fh.close() jpayne@68: jpayne@68: for line in lines: jpayne@68: if line.startswith('#'): continue jpayne@68: line_list = line.split(",") jpayne@68: assert len(line_list) == 2 jpayne@68: status = str(line_list[0]).strip() jpayne@68: timestamp = str(line_list[1]).strip() jpayne@68: jpayne@68: if not status: jpayne@68: status = "start" jpayne@68: jpayne@68: if log: jpayne@68: log.info("Last checkpointed step: %s (%s)", status, timestamp) jpayne@68: jpayne@68: else: jpayne@68: if log: jpayne@68: log.info("Cannot find status.log (%s), assuming new run", status_log) jpayne@68: jpayne@68: status = status.strip().lower() jpayne@68: jpayne@68: return status jpayne@68: jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: run a command from python jpayne@68: jpayne@68: @param cmd: command to run jpayne@68: @param live: False = run in dry mode (print command), True = run normally jpayne@68: @param log: logger object jpayne@68: jpayne@68: @return std_out, std_err, exit_code jpayne@68: ''' jpayne@68: def run_command(cmd, live=False, log=None): jpayne@68: stdOut = None jpayne@68: stdErr = None jpayne@68: exitCode = None jpayne@68: #end = 0 jpayne@68: #elapsedSec = 0 jpayne@68: jpayne@68: if cmd: jpayne@68: if not live: jpayne@68: stdOut = "Not live: cmd = '%s'" % (cmd) jpayne@68: exitCode = 0 jpayne@68: else: jpayne@68: if log: log.info("cmd: %s" % (cmd)) jpayne@68: jpayne@68: p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) jpayne@68: stdOut, stdErr = p.communicate() jpayne@68: exitCode = p.returncode jpayne@68: jpayne@68: if log: jpayne@68: log.info("Return values: exitCode=" + str(exitCode) + ", stdOut=" + str(stdOut) + ", stdErr=" + str(stdErr)) jpayne@68: if exitCode != 0: jpayne@68: log.warn("- The exit code has non-zero value.") jpayne@68: jpayne@68: else: jpayne@68: if log: jpayne@68: log.error("- No command to run.") jpayne@68: return None, None, -1 jpayne@68: jpayne@68: return stdOut, stdErr, exitCode jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: replacement for run_command jpayne@68: - includes logging, convert_cmd & post_mortem jpayne@68: ''' jpayne@68: jpayne@68: def run_cmd(cmd, log=None): jpayne@68: jpayne@68: std_out = None jpayne@68: std_err = None jpayne@68: exit_code = 0 jpayne@68: jpayne@68: if cmd: jpayne@68: # convert to work on genepool/denovo jpayne@68: cmd = convert_cmd(cmd) jpayne@68: jpayne@68: if log: jpayne@68: log.info("- cmd: %s", cmd) jpayne@68: jpayne@68: p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True) jpayne@68: std_out, std_err = p.communicate() jpayne@68: exit_code = p.returncode jpayne@68: jpayne@68: post_mortem_cmd(cmd, exit_code, std_out, std_err, log) jpayne@68: jpayne@68: return std_out, std_err, exit_code jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Simple function to output to the log what happened only if exit code > 0 jpayne@68: jpayne@68: Typical usage: jpayne@68: std_out, std_err, exit_code = run_command(cmd, True) jpayne@68: post_mortem_cmd(cmd, exit_code, std_out, std_err) jpayne@68: jpayne@68: ''' jpayne@68: def post_mortem_cmd(cmd, exit_code, std_out, std_err, log = None): jpayne@68: if exit_code > 0: jpayne@68: if log: jpayne@68: log.error("- cmd failed: %s", cmd) jpayne@68: log.error("- exit code: %s", exit_code) jpayne@68: jpayne@68: else: jpayne@68: print "- cmd failed: %s" % (cmd) jpayne@68: print "- exit code: %s" % (exit_code) jpayne@68: jpayne@68: jpayne@68: if std_out: jpayne@68: if log: jpayne@68: log.error("- std_out: %s", std_out) jpayne@68: else: jpayne@68: print "- std_out: %s" % (std_out) jpayne@68: jpayne@68: if std_err: jpayne@68: if log: jpayne@68: log.error("- std_err: %s", std_err) jpayne@68: else: jpayne@68: print "- std_err: %s" % (std_err) jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Convert command to use genepool or denovo (shifter) to run jpayne@68: replace #placeholder; with shifter or module load command jpayne@68: #placeholder.v; should specify the version to use jpayne@68: jpayne@68: This should be the only place in the pipelines that specifies the images/modules translation jpayne@68: ''' jpayne@68: def convert_cmd(cmd): jpayne@68: new_cmd = cmd jpayne@68: jpayne@68: shifter_img = { jpayne@68: "#bbtools" : "shifter --image=bryce911/bbtools ", jpayne@68: "#pigz" : "module load pigz;", jpayne@68: "#jamo" : "shifter --image=registry.services.nersc.gov/htandra/jamo_dev:1.0 ", # works, but would like simple module to use - have one on Denovo but not Cori jpayne@68: "#gnuplot" : "shifter --image=bryce911/bbtools ", # (1) jpayne@68: "#spades/3.9.0" : "shifter --image=bryce911/spades3.9.0 ", jpayne@68: "#spades/3.10.1" : "shifter --image=bryce911/spades3.10.1 ", jpayne@68: "#spades/3.11.0" : "shifter --image=bryce911/spades-3.11.0 ", # GAA-3383 jpayne@68: "#spades/3.11.1-check" : "shifter --image=bryce911/spades3.11.1-check ", # development jpayne@68: "#prodigal/2.6.3" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ", # RQCSUPPORT-1318 jpayne@68: "#prodigal/2.5.0" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ", jpayne@68: "#prodigal/2.50" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ", jpayne@68: "#lastal/869" : "shifter --image=bryce911/lastal:869 ", jpayne@68: "#lastal/828" : "shifter --image=bryce911/lastal:869 ", jpayne@68: #"#lastal" : "shifter --image=bryce911/lastal:869 ", jpayne@68: "#R/3.3.2" : "module load R/3.3.2;", jpayne@68: "#texlive" : "shifter --image=bryce911/bbtools ", # (1) jpayne@68: "#java" : "shifter --image=bryce911/bbtools ", # (1) jpayne@68: "#blast+/2.6.0" : "shifter --image=sulsj/ncbi-blastplus:2.6.0 ", jpayne@68: "#blast" : "shifter --image=sulsj/ncbi-blastplus:2.7.0 ", jpayne@68: "#megahit-1.1.1" : "shifter --image=foster505/megahit:v1.1.1-2-g02102e1 ", jpayne@68: "#smrtanalysis/2.3.0_p5" : "shifter --image=registry.services.nersc.gov/jgi/smrtanalysis:2.3.0_p5 ", # meth - need more memory jpayne@68: "#mummer/3.23" : "shifter --image=bryce911/mummer3.23 ", # 3.23 jpayne@68: "#hmmer" : "shifter --image=registry.services.nersc.gov/jgi/hmmer:latest ", # 3.1b2 jpayne@68: "#samtools/1.4" : "shifter --image=rmonti/samtools ", jpayne@68: "#mothur/1.39.5" : "shifter --image=bryce911/mothur1.39.5 ", jpayne@68: "#vsearch/2.4.3" : "shifter --image=bryce911/vsearch2.4.3 ", jpayne@68: "#graphviz" : "shifter --image=bryce911/bbtools ", jpayne@68: "#ssu-align/0.1.1" : "shifter --image=bryce911/ssu-align0.1.1 ", # openmpi/1.10 included in docker container jpayne@68: "#smrtlink/4.0.0.190159" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:4.0.0.190159 /smrtlink/smrtcmds/bin/", # progs not in path jpayne@68: "#smrtlink/5.0.1.9585" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:5.0.1.9585 /smrtlink/smrtcmds/bin/", # progs not in path, Tony created 2017-10-16 jpayne@68: "#smrtlink" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:5.0.1.9585 /smrtlink/smrtcmds/bin/", # progs not in path jpayne@68: "#prodege" : "shifter --image=bryce911/prodege ", # 2.2.1 jpayne@68: #"#hmmer" : "shifter --image=registry.services.nersc.gov/jgi/hmmer ", # 3.1b2 - Feb 2015, latest as of Oct 2017 jpayne@68: "#checkm" : "shifter --image=registry.services.nersc.gov/jgi/checkm ", jpayne@68: } jpayne@68: jpayne@68: jpayne@68: # (1) - added as part of the bryce911 bbtools package jpayne@68: jpayne@68: #cmd = "#bbtools-shijie;bbmap...." jpayne@68: # this dict will be deprecated as of March 2018 when genepool passes into legend jpayne@68: genepool_mod = { jpayne@68: "#bbtools" : "module load bbtools", jpayne@68: "#pigz" : "module load pigz", jpayne@68: "#jamo" : "module load jamo", jpayne@68: "#gnuplot" : "module load gnuplot/4.6.2", # sag,iso,sps,ce:gc_cov, gc_histogram, contig_gc jpayne@68: "#spades/3.9.0" : "module load spades/3.9.0", jpayne@68: "#spades/3.10.1" : "module load spades/3.10.1", jpayne@68: "#spades/3.11.1" : "module load spades/3.11.1-check", jpayne@68: "#prodigal/2.6.3" : "module load prodigal/2.50", # aka 2.50, also 2.60 is available jpayne@68: "#prodigal/2.5.0" : "module load prodigal/2.50", jpayne@68: "#prodigal/2.50" : "module load prodigal/2.50", jpayne@68: #"#lastal" : "module load last/828", jpayne@68: "#lastal/828" : "module load last/828", jpayne@68: "#R/3.3.2" : "module unload R;module load R/3.3.1", # 3.3.2 not on genepool - RQCSUPPORT-1516 unload R for Kecia jpayne@68: "#texlive" : "module load texlive", jpayne@68: "#blast+/2.6.0" : "module load blast+/2.6.0", jpayne@68: #"#blast+/2.7.0" : "module load blast+/2.7.0", # not created jpayne@68: "#blast" : "module load blast+/2.6.0", jpayne@68: "#java" : "", # java runs natively on genepool jpayne@68: "#megahit-1.1.1" : "module load megahit/1.1.1", jpayne@68: "#smrtanalysis/2.3.0_p5" : "module load smrtanalysis/2.3.0_p5", jpayne@68: "#smrtanalysis/2.3.0_p5_xmx32g" : "module load smrtanalysis/2.3.0_p5;export _JAVA_OPTIONS='-Xmx32g'", jpayne@68: "#mummer/3.23" : "module load mummer/3.23", jpayne@68: "#hmmer" : "module load hmmer/3.1b2", jpayne@68: "#samtools/1.4" : "module load samtools/1.4", jpayne@68: "#mothur/1.39.5" : "module load mothur/1.32.1", # 1.26.0 default, 1.32.1 jpayne@68: "#vsearch/2.4.3" : "module load vsearch/2.3.0", # 2.3.0 jpayne@68: "#graphviz" : "module load graphviz", jpayne@68: "#ssu-align/0.1.1" : "module load ssu-align", jpayne@68: "#smrtlink/4.0.0.190159" : "module load smrtlink/4.0.0.190159", jpayne@68: "#smrtlink" : "module load smrtlink/5.0.1.9585", jpayne@68: "#smrtlink/5.0.1.9585" : "module load smrtlink/5.0.1.9585", jpayne@68: "#prodege" : "module load R;/projectb/sandbox/rqc/prod/pipelines/external_tools/sag_decontam/prodege-2.2/bin/", jpayne@68: "#checkm" : "module load hmmer prodigal pplacer", # checkm installed in python by default on genepool jpayne@68: } jpayne@68: jpayne@68: #bbtools;stats.sh jpayne@68: jpayne@68: if cmd.startswith("#"): jpayne@68: jpayne@68: cluster = "genepool" jpayne@68: # any other env ids to use? jpayne@68: jpayne@68: # cori, denovo, genepool jpayne@68: cluster = os.environ.get('NERSC_HOST', 'unknown') jpayne@68: jpayne@68: jpayne@68: f = cmd.find(";") jpayne@68: mod = "" # command to replace jpayne@68: if f > -1: jpayne@68: mod = cmd[0:f] jpayne@68: jpayne@68: if mod: jpayne@68: jpayne@68: jpayne@68: # use module load jamo on denovo jpayne@68: if mod == "#jamo" and cluster == "denovo": jpayne@68: shifter_img[mod] = "module load jamo;" jpayne@68: jpayne@68: if cluster in ("denovo", "cori"): jpayne@68: jpayne@68: if mod in shifter_img: jpayne@68: new_cmd = new_cmd.replace(mod + ";", shifter_img[mod]) jpayne@68: jpayne@68: else: jpayne@68: if mod in genepool_mod: jpayne@68: if genepool_mod[mod] == "": jpayne@68: new_cmd = new_cmd.replace(mod + ";", "") jpayne@68: else: jpayne@68: new_cmd = new_cmd.replace(mod, genepool_mod[mod]) jpayne@68: jpayne@68: if new_cmd.startswith("#"): jpayne@68: print "Command not found! %s" % new_cmd jpayne@68: sys.exit(18) jpayne@68: jpayne@68: #print new_cmd jpayne@68: return new_cmd jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: returns human readable file size jpayne@68: @param num = file size (e.g. 1000) jpayne@68: jpayne@68: @return: readable float e.g. 1.5 KB jpayne@68: ''' jpayne@68: def human_size(num): jpayne@68: if not num: jpayne@68: num = 0.0 jpayne@68: jpayne@68: for x in ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'XB']: jpayne@68: if num < 1024.0: jpayne@68: return "%3.1f %s" % (num, x) jpayne@68: num /= 1024.0 jpayne@68: jpayne@68: return "%3.1f %s" % (num, 'ZB') jpayne@68: jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: send out email jpayne@68: @param emailTo: email receipient (e.g. bryce@lbl.gov) jpayne@68: @param emailSubject: subject line for the email jpayne@68: @param emailBody: content of the email jpayne@68: @param emailFrom: optional email from jpayne@68: jpayne@68: ''' jpayne@68: def send_email(email_to, email_subject, email_body, email_from = 'rqc@jgi-psf.org', log = None): jpayne@68: msg = "" jpayne@68: err_flag = 0 jpayne@68: jpayne@68: if not email_to: jpayne@68: msg = "- send_email: email_to parameter missing!" jpayne@68: jpayne@68: if not email_subject: jpayne@68: msg = "- send_email: email_subject parameter missing!" jpayne@68: jpayne@68: if not email_body: jpayne@68: msg = "- send_email: email_body parameter missing!" jpayne@68: jpayne@68: jpayne@68: if err_flag == 0: jpayne@68: msg = "- sending email to: %s" % (email_to) jpayne@68: jpayne@68: if log: jpayne@68: log.info(msg) jpayne@68: else: jpayne@68: print msg jpayne@68: jpayne@68: if err_flag == 1: jpayne@68: return 0 jpayne@68: jpayne@68: # assume html jpayne@68: email_msg = MIMEText(email_body, "html") # vs "plain" jpayne@68: email_msg['Subject'] = email_subject jpayne@68: email_msg['From'] = email_from jpayne@68: email_msg['To'] = email_to jpayne@68: jpayne@68: p = Popen(["/usr/sbin/sendmail", "-t"], stdin = PIPE) jpayne@68: p.communicate(email_msg.as_string()) jpayne@68: jpayne@68: return err_flag jpayne@68: jpayne@68: ''' jpayne@68: Write to rqc_file (e.g. rqc-files.tmp) the file_key and file_value jpayne@68: jpayne@68: @param rqc_file_log: full path to file containing key=file jpayne@68: @param file_key: key for the entry jpayne@68: @param file_value: value for the entry jpayne@68: jpayne@68: ''' jpayne@68: def append_rqc_file(rqc_file_log, file_key, file_value, log=None): jpayne@68: if file_key: jpayne@68: jpayne@68: buffer = "%s = %s\n" % (file_key, file_value) jpayne@68: jpayne@68: with open(rqc_file_log, "a") as myfile: jpayne@68: myfile.write(buffer) jpayne@68: jpayne@68: if log: log.info("append_rqc_file: %s:%s" % (file_key, file_value)) jpayne@68: jpayne@68: else: jpayne@68: jpayne@68: if log: log.warning("key or value error: %s:%s" % (file_key, file_value)) jpayne@68: jpayne@68: ''' jpayne@68: Write to rqc_stats (e.g. rqc-stats.tmp) the stats_key and stats_value jpayne@68: @param rqc_file_log: full path to file containing key=file jpayne@68: @param file_key: key for the entry jpayne@68: @param file_value: value for the entry jpayne@68: jpayne@68: ''' jpayne@68: def append_rqc_stats(rqc_stats_log, stats_key, stats_value, log=None): jpayne@68: if stats_key: jpayne@68: jpayne@68: buffer = "%s = %s\n" % (stats_key, stats_value) jpayne@68: jpayne@68: with open(rqc_stats_log, "a") as myfile: jpayne@68: myfile.write(buffer) jpayne@68: jpayne@68: if log: log.info("append_rqc_stats: %s:%s" % (stats_key, stats_value)) jpayne@68: jpayne@68: else: jpayne@68: jpayne@68: if log: log.warning("key or value error: %s:%s" % (stats_key, stats_value)) jpayne@68: jpayne@68: ''' jpayne@68: Return the file system path to jgi-rqc-pipeline so we can use */tools and */lib jpayne@68: jpayne@68: @return /path/to/jgi-rqc-pipelines jpayne@68: ''' jpayne@68: def get_run_path(): jpayne@68: current_path = os.path.dirname(os.path.abspath(__file__)) jpayne@68: run_path = os.path.abspath(os.path.join(current_path, os.pardir)) jpayne@68: jpayne@68: return run_path jpayne@68: jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Simple read count using bbtools n_contigs field jpayne@68: - slightly different than in rqc_utility jpayne@68: n_scaffolds n_contigs scaf_bp contig_bp gap_pct scaf_N50 scaf_L50 ctg_N50 ctg_L50 scaf_N90 scaf_L90 ctg_N90 ctg_L90 scaf_max ctg_max scaf_n_gt50K scaf_pct_gt50K gc_avg gc_std jpayne@68: 1346616 1346616 405331416 405331415 0.000 1346616 301 1346615 301 1346616 301 1346615 301 301 301 0 0.000 0.44824 0.02675 jpayne@68: jpayne@68: ''' jpayne@68: def get_read_count_fastq(fastq, log = None): jpayne@68: read_cnt = 0 jpayne@68: jpayne@68: if os.path.isfile(fastq): jpayne@68: jpayne@68: # EnvironmentModules.module(["load", "bbtools"]) jpayne@68: # bbtools faster than zcat | wc because bbtools uses pigz jpayne@68: # cmd = "stats.sh format=3 in=%s" % fastq jpayne@68: cmd = "#bbtools;stats.sh format=3 in=%s" % fastq jpayne@68: cmd = convert_cmd(cmd) jpayne@68: jpayne@68: jpayne@68: if log: jpayne@68: log.info("- cmd: %s", cmd) jpayne@68: jpayne@68: std_out, std_err, exit_code = run_command(cmd, True) jpayne@68: jpayne@68: # EnvironmentModules.module(["unload", "bbtools"]) jpayne@68: jpayne@68: if exit_code == 0 and std_out: jpayne@68: jpayne@68: line_list = std_out.split("\n") jpayne@68: #print line_list jpayne@68: val_list = str(line_list[1]).split() #.split('\t') jpayne@68: #print "v = %s" % val_list jpayne@68: jpayne@68: read_cnt = int(val_list[1]) jpayne@68: jpayne@68: if log: jpayne@68: log.info("- read count: %s", read_cnt) jpayne@68: jpayne@68: else: jpayne@68: if log: jpayne@68: post_mortem_cmd(cmd, exit_code, std_out, std_err, log) jpayne@68: jpayne@68: jpayne@68: else: jpayne@68: log.error("- fastq: %s does not exist!", fastq) jpayne@68: jpayne@68: return read_cnt jpayne@68: jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Subsampling calculation jpayne@68: 0 .. 250k reads = 100% jpayne@68: 250k .. 25m = 100% to 1% jpayne@68: 25m .. 600m = 1% jpayne@68: 600m+ .. oo < 1% jpayne@68: jpayne@68: July 2014 - 15 runs > 600m (HiSeq-2500 Rapid) - 4 actual libraries / 85325 seq units jpayne@68: - returns new subsampling rate jpayne@68: ''' jpayne@68: def get_subsample_rate(read_count): jpayne@68: subsample = 0 jpayne@68: subsample_rate = 0.01 jpayne@68: max_subsample = 6000000 # 4 hours of blast time jpayne@68: jpayne@68: new_subsample_rate = 250000.0/read_count jpayne@68: subsample_rate = max(new_subsample_rate, subsample_rate) jpayne@68: subsample_rate = min(1, subsample_rate) # if subsample_rate > 1, then set to 1 jpayne@68: jpayne@68: subsample = int(read_count * subsample_rate) jpayne@68: jpayne@68: if subsample > max_subsample: jpayne@68: subsample = max_subsample jpayne@68: jpayne@68: subsample_rate = subsample / float(read_count) jpayne@68: jpayne@68: return subsample_rate jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Set color hash jpayne@68: - need to update to remove "c" parameter - used in too many places jpayne@68: ''' jpayne@68: def set_colors(c, use_color = False): jpayne@68: jpayne@68: if use_color == False: jpayne@68: jpayne@68: color = { jpayne@68: 'black' : "", jpayne@68: 'red' : "", jpayne@68: 'green' : "", jpayne@68: 'yellow' : "", jpayne@68: 'blue' : "", jpayne@68: 'pink' : "", jpayne@68: 'cyan' : "", jpayne@68: 'white' : "", jpayne@68: '' : "" jpayne@68: } jpayne@68: jpayne@68: else: jpayne@68: jpayne@68: color = { jpayne@68: 'black' : "\033[1;30m", jpayne@68: 'red' : "\033[1;31m", jpayne@68: 'green' : "\033[1;32m", jpayne@68: 'yellow' : "\033[1;33m", jpayne@68: 'blue' : "\033[1;34m", jpayne@68: 'pink' : "\033[1;35m", jpayne@68: 'cyan' : "\033[1;36m", jpayne@68: 'white' : "\033[1;37m", jpayne@68: '' : "\033[m" jpayne@68: } jpayne@68: jpayne@68: jpayne@68: return color jpayne@68: jpayne@68: ''' jpayne@68: New function that just returns colors jpayne@68: ''' jpayne@68: def get_colors(): jpayne@68: jpayne@68: color = { jpayne@68: 'black' : "\033[1;30m", jpayne@68: 'red' : "\033[1;31m", jpayne@68: 'green' : "\033[1;32m", jpayne@68: 'yellow' : "\033[1;33m", jpayne@68: 'blue' : "\033[1;34m", jpayne@68: 'pink' : "\033[1;35m", jpayne@68: 'cyan' : "\033[1;36m", jpayne@68: 'white' : "\033[1;37m", jpayne@68: '' : "\033[m" jpayne@68: } jpayne@68: jpayne@68: jpayne@68: return color jpayne@68: jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Returns msg_ok, msg_fail, msg_warn colored or not colored jpayne@68: ''' jpayne@68: def get_msg_settings(color): jpayne@68: jpayne@68: msg_ok = "[ "+color['green']+"OK"+color['']+" ]" jpayne@68: msg_fail = "[ "+color['red']+"FAIL"+color['']+" ]" jpayne@68: msg_warn = "[ "+color['yellow']+"WARN"+color['']+" ]" jpayne@68: jpayne@68: return msg_ok, msg_fail, msg_warn jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: Use RQC's ap_tool to get the status jpayne@68: set mode = "-sa" to show all, even completed jpayne@68: ''' jpayne@68: def get_analysis_project_id(seq_proj_id, target_analysis_project_id, target_analysis_task_id, output_path, log = None, mode = ""): jpayne@68: jpayne@68: if log: jpayne@68: log.info("get_analysis_project_id: spid = %s, tapid = %s, tatid = %s", seq_proj_id, target_analysis_project_id, target_analysis_task_id) jpayne@68: jpayne@68: analysis_project_id = 0 jpayne@68: analysis_task_id = 0 jpayne@68: project_type = None jpayne@68: task_type = None jpayne@68: jpayne@68: ap_list = os.path.join(output_path, "ap-info.txt") jpayne@68: AP_TOOL = "/global/dna/projectdirs/PI/rqc/prod/jgi-rqc-pipeline/tools/ap_tool.py" jpayne@68: #AP_TOOL = "/global/homes/b/brycef/git/jgi-rqc-pipeline/tools/ap_tool.py" jpayne@68: cmd = "%s -spid %s -m psv -tapid %s -tatid %s %s > %s 2>&1" % (AP_TOOL, seq_proj_id, target_analysis_project_id, target_analysis_task_id, mode, ap_list) jpayne@68: if log: jpayne@68: log.info("- cmd: %s", cmd) jpayne@68: else: jpayne@68: print "- cmd: %s" % cmd jpayne@68: std_out, std_err, exit_code = run_command(cmd, True) jpayne@68: jpayne@68: post_mortem_cmd(cmd, exit_code, std_out, std_err, log) jpayne@68: jpayne@68: if os.path.isfile(ap_list): jpayne@68: jpayne@68: ap_dict = {} # header = value jpayne@68: cnt = 0 jpayne@68: fh = open(ap_list, "r") jpayne@68: for line in fh: jpayne@68: arr = line.strip().split("|") jpayne@68: if cnt == 0: jpayne@68: c2 = 0 # position of title in header jpayne@68: for a in arr: jpayne@68: ap_dict[a.lower()] = c2 jpayne@68: c2 += 1 jpayne@68: jpayne@68: else: jpayne@68: jpayne@68: for a in ap_dict: jpayne@68: jpayne@68: if ap_dict[a] + 1 > len(arr): jpayne@68: pass jpayne@68: else: jpayne@68: ap_dict[a] = arr[ap_dict[a]] jpayne@68: jpayne@68: jpayne@68: cnt += 1 jpayne@68: jpayne@68: fh.close() jpayne@68: jpayne@68: jpayne@68: analysis_project_id = ap_dict.get("analysis project id") jpayne@68: analysis_task_id = ap_dict.get("analysis task id") jpayne@68: project_type = ap_dict.get("analysis product name") jpayne@68: task_type = ap_dict.get("analysis task name") jpayne@68: jpayne@68: # nno such project jpayne@68: if cnt == 1: jpayne@68: analysis_project_id = 0 jpayne@68: analysis_task_id = 0 jpayne@68: jpayne@68: if log: jpayne@68: log.info("- project type: %s, task type: %s", project_type, task_type) jpayne@68: log.info("- analysis_project_id: %s, analysis_task_id: %s", analysis_project_id, analysis_task_id) jpayne@68: jpayne@68: try: jpayne@68: analysis_project_id = int(analysis_project_id) jpayne@68: analysis_task_id = int(analysis_task_id) jpayne@68: except: jpayne@68: analysis_project_id = 0 jpayne@68: analysis_task_id = 0 jpayne@68: jpayne@68: jpayne@68: # ap = 4, at = 8 means its using the column names but didn't find anything jpayne@68: if analysis_project_id < 100: jpayne@68: analysis_project_id = 0 jpayne@68: if analysis_task_id < 100: jpayne@68: analysis_task_id = 0 jpayne@68: jpayne@68: return analysis_project_id, analysis_task_id jpayne@68: jpayne@68: jpayne@68: ''' jpayne@68: For creating a dot file from the pipeline flow jpayne@68: ''' jpayne@68: def append_flow(flow_file, orig_node, orig_label, next_node, next_label, link_label): jpayne@68: jpayne@68: fh = open(flow_file, "a") jpayne@68: fh.write("%s|%s|%s|%s|%s\n" % (orig_node, orig_label, next_node, next_label, link_label)) jpayne@68: fh.close() jpayne@68: jpayne@68: ''' jpayne@68: Flow file format: jpayne@68: # comment jpayne@68: *label|PBMID Pipeline run for BTXXY
Run Date: 2017-09-28 14:22:50 jpayne@68: # origin node, origin label, next node, next label, link label jpayne@68: input_h5|BTXXY H5
3 smrtcells|assembly|HGAP Assembly
3 contigs, 13,283,382bp
|HGAP v4.0.1 jpayne@68: jpayne@68: nodes should be the output of the transformation between the nodes jpayne@68: e.g. input fastq (25m reads) --[ bbtools subsampling ]--> subsampled fastq (10m reads) jpayne@68: jpayne@68: creates a dot file, to convert to png use: jpayne@68: $ module load graphviz jpayne@68: $ dot -T png (dot file) > (png file) jpayne@68: jpayne@68: More info on formatting the labels jpayne@68: http://www.graphviz.org/content/node-shapes#html jpayne@68: ''' jpayne@68: def dot_flow(flow_file, dot_file, log = None): jpayne@68: jpayne@68: if not os.path.isfile(flow_file): jpayne@68: if log: jpayne@68: log.info("- cannot find flow file: %s", flow_file) jpayne@68: else: jpayne@68: print "Cannot find flow file: %s" % flow_file jpayne@68: return jpayne@68: jpayne@68: jpayne@68: fhw = open(dot_file, "w") jpayne@68: jpayne@68: fhw.write("// dot file\n") jpayne@68: fhw.write("digraph rqc {\n") # directed graph jpayne@68: fhw.write(" node [shape=box];\n") jpayne@68: fhw.write(" rankdir=LR;\n") jpayne@68: jpayne@68: fh = open(flow_file, "r") jpayne@68: jpayne@68: for line in fh: jpayne@68: line = line.strip() jpayne@68: jpayne@68: if not line: jpayne@68: continue jpayne@68: jpayne@68: if line.startswith("#"): jpayne@68: continue jpayne@68: jpayne@68: # graph label jpayne@68: if line.startswith("*label"): jpayne@68: arr = line.split("|") jpayne@68: label = flow_replace(str(arr[1])) jpayne@68: jpayne@68: fhw.write(" label=<%s>;\n" % label) jpayne@68: fhw.write(" labelloc=top;\n") jpayne@68: jpayne@68: else: jpayne@68: jpayne@68: arr = line.split("|") jpayne@68: #print arr jpayne@68: jpayne@68: if len(arr) == 5: jpayne@68: jpayne@68: org_node = arr[0] jpayne@68: org_label = str(arr[1]) jpayne@68: next_node = arr[2] jpayne@68: next_label = str(arr[3]) jpayne@68: link_label = str(arr[4]) jpayne@68: jpayne@68: # must be
in the dot file, I have a habit of using
jpayne@68: org_label = flow_replace(org_label) jpayne@68: next_label = flow_replace(next_label) jpayne@68: link_label = flow_replace(link_label) jpayne@68: jpayne@68: # label are enclosed by < > instead of " " to handle html-ish markups jpayne@68: jpayne@68: if next_node: jpayne@68: link = " %s -> %s;\n" % (org_node, next_node) jpayne@68: if link_label: jpayne@68: link = " %s -> %s [label=<%s>];\n" % (org_node, next_node, link_label) jpayne@68: fhw.write(link) jpayne@68: jpayne@68: if org_label: jpayne@68: label = " %s [label=<%s>];\n" % (org_node, org_label) jpayne@68: fhw.write(label) jpayne@68: jpayne@68: if next_label: jpayne@68: label = " %s [label=<%s>];\n" % (next_node, next_label) jpayne@68: fhw.write(label) jpayne@68: jpayne@68: jpayne@68: jpayne@68: fh.close() jpayne@68: jpayne@68: fhw.write("}\n") jpayne@68: jpayne@68: fhw.close() jpayne@68: if log: jpayne@68: log.info("- created dot file: %s", dot_file) jpayne@68: jpayne@68: return dot_file jpayne@68: jpayne@68: ''' jpayne@68: simple replacements jpayne@68: ''' jpayne@68: def flow_replace(my_str): jpayne@68: jpayne@68: new_str = my_str.replace("
", "
").replace("", "").replace("", "") jpayne@68: jpayne@68: return new_str jpayne@68: jpayne@68: jpayne@68: ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ jpayne@68: ## main program jpayne@68: jpayne@68: jpayne@68: if __name__ == "__main__": jpayne@68: # unit tests jpayne@68: jpayne@68: print human_size(102192203) jpayne@68: print human_size(250000000000) jpayne@68: #print get_read_count_fastq("/global/projectb/scratch/brycef/sag/phix/11185.1.195330.UNKNOWN_matched.fastq.gz") jpayne@68: jpayne@68: cmd = "#bbtools;bbduk.sh in=/global/dna/dm_archive/sdm/illumina//01/14/88/11488.1.208132.UNKNOWN.fastq.gz ref=/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa outm=/global/projectb/scratch/brycef/phix/11488/11488.1.208132.UNKNOWN_matched.fastq.gz outu=/global/projectb/scratch/brycef/phix/11488/11488.1.208132.UNKNOWN_unmatched.fastq.gz" jpayne@68: print convert_cmd(cmd) jpayne@68: cmd = "#pigz;pigz /global/projectb/scratch/brycef/align/BTOYH/genome/11463.6.208000.CAAGGTC-AGACCTT.filter-RNA.fastq.gz-genome.sam" jpayne@68: print convert_cmd(cmd) jpayne@68: jpayne@68: cmd = "#java;java -version" jpayne@68: print convert_cmd(cmd) jpayne@68: jpayne@68: dot_flow("/global/projectb/scratch/brycef/pbmid/BWOAU/f2.flow", "/global/projectb/scratch/brycef/pbmid/BWOAU/BWOUAx.dot") jpayne@68: jpayne@68: sys.exit(0) jpayne@68: jpayne@68: