comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/opt/bbmap-39.01-1/pytools/lib/common.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 68:5028fdace37b
1 #! /usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """
4 Function definitions common to all programs.
5
6
7 """
8
9 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 ## libraries to use
11
12 #import re
13 import os
14 import time
15 import sys
16 #import getpass
17 import logging
18 #from colorlog import ColoredFormatter
19 # import EnvironmentModules # get_read_count_fastq
20 from subprocess import Popen, PIPE
21 from email.mime.text import MIMEText
22
23
24 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 ## function definitions
26
27
28 '''
29 creates a logging instance
30 https://docs.python.org/2/howto/logging.html
31 https://pypi.python.org/pypi/colorlog
32 '''
33 def get_logger(log_name, log_file, log_level = "INFO", stdout = False, color = False):
34 log = logging.getLogger(log_name)
35 handler = None
36 if stdout:
37 handler = logging.StreamHandler(sys.stdout)
38 else:
39 handler = logging.FileHandler(log_file)
40
41 formatter = logging.Formatter('%(filename)-15s:%(process)d %(asctime)s %(levelname)s: %(message)s')
42
43 if color and 1==2:
44 """
45 formatter = ColoredFormatter("%(filename)-15s:%(process)d %(asctime)s %(log_color)s%(levelname)s: %(message)s", datefmt=None, reset=True,
46 log_colors={
47 'DEBUG': 'blue',
48 'INFO': 'green',
49 'WARNING': 'yellow',
50 'ERROR': 'red',
51 'CRITICAL': 'red, bg_white',
52 },
53 secondary_log_colors={},
54 style='%')
55 Not working in conda - 2017-04-29
56 """
57 handler.setFormatter(formatter)
58
59 log.addHandler(handler)
60 log.setLevel(log_level)
61
62 return log
63
64
65 '''
66 Checkpoint the status plus a timestamp
67 - appends the status
68
69 @param status_log: /path/to/status.log (or whatever you name it)
70 @param status: status to append to status.log
71
72 '''
73 def checkpoint_step(status_log, status):
74 status_line = "%s,%s\n" % (status, time.strftime("%Y-%m-%d %H:%M:%S"))
75
76 with open(status_log, "a") as myfile:
77 myfile.write(status_line)
78
79 '''
80 returns the last step (status) from the pipeline
81 @param status_log: /path/to/status.log (or whatever you name it)
82 @param log: logger object
83
84 @return last status in the status log, "start" if nothing there
85 '''
86 def get_status(status_log, log = None):
87 #status_log = "%s/%s" % (output_path, "test_status.log")
88
89 status = "start"
90 timestamp = str(time.strftime("%Y-%m-%d %H:%M:%S"))
91
92 if os.path.isfile(status_log):
93 fh = open(status_log, 'r')
94 lines = fh.readlines()
95 fh.close()
96
97 for line in lines:
98 if line.startswith('#'): continue
99 line_list = line.split(",")
100 assert len(line_list) == 2
101 status = str(line_list[0]).strip()
102 timestamp = str(line_list[1]).strip()
103
104 if not status:
105 status = "start"
106
107 if log:
108 log.info("Last checkpointed step: %s (%s)", status, timestamp)
109
110 else:
111 if log:
112 log.info("Cannot find status.log (%s), assuming new run", status_log)
113
114 status = status.strip().lower()
115
116 return status
117
118
119
120 '''
121 run a command from python
122
123 @param cmd: command to run
124 @param live: False = run in dry mode (print command), True = run normally
125 @param log: logger object
126
127 @return std_out, std_err, exit_code
128 '''
129 def run_command(cmd, live=False, log=None):
130 stdOut = None
131 stdErr = None
132 exitCode = None
133 #end = 0
134 #elapsedSec = 0
135
136 if cmd:
137 if not live:
138 stdOut = "Not live: cmd = '%s'" % (cmd)
139 exitCode = 0
140 else:
141 if log: log.info("cmd: %s" % (cmd))
142
143 p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
144 stdOut, stdErr = p.communicate()
145 exitCode = p.returncode
146
147 if log:
148 log.info("Return values: exitCode=" + str(exitCode) + ", stdOut=" + str(stdOut) + ", stdErr=" + str(stdErr))
149 if exitCode != 0:
150 log.warn("- The exit code has non-zero value.")
151
152 else:
153 if log:
154 log.error("- No command to run.")
155 return None, None, -1
156
157 return stdOut, stdErr, exitCode
158
159
160 '''
161 replacement for run_command
162 - includes logging, convert_cmd & post_mortem
163 '''
164
165 def run_cmd(cmd, log=None):
166
167 std_out = None
168 std_err = None
169 exit_code = 0
170
171 if cmd:
172 # convert to work on genepool/denovo
173 cmd = convert_cmd(cmd)
174
175 if log:
176 log.info("- cmd: %s", cmd)
177
178 p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
179 std_out, std_err = p.communicate()
180 exit_code = p.returncode
181
182 post_mortem_cmd(cmd, exit_code, std_out, std_err, log)
183
184 return std_out, std_err, exit_code
185
186
187 '''
188 Simple function to output to the log what happened only if exit code > 0
189
190 Typical usage:
191 std_out, std_err, exit_code = run_command(cmd, True)
192 post_mortem_cmd(cmd, exit_code, std_out, std_err)
193
194 '''
195 def post_mortem_cmd(cmd, exit_code, std_out, std_err, log = None):
196 if exit_code > 0:
197 if log:
198 log.error("- cmd failed: %s", cmd)
199 log.error("- exit code: %s", exit_code)
200
201 else:
202 print "- cmd failed: %s" % (cmd)
203 print "- exit code: %s" % (exit_code)
204
205
206 if std_out:
207 if log:
208 log.error("- std_out: %s", std_out)
209 else:
210 print "- std_out: %s" % (std_out)
211
212 if std_err:
213 if log:
214 log.error("- std_err: %s", std_err)
215 else:
216 print "- std_err: %s" % (std_err)
217
218
219 '''
220 Convert command to use genepool or denovo (shifter) to run
221 replace #placeholder; with shifter or module load command
222 #placeholder.v; should specify the version to use
223
224 This should be the only place in the pipelines that specifies the images/modules translation
225 '''
226 def convert_cmd(cmd):
227 new_cmd = cmd
228
229 shifter_img = {
230 "#bbtools" : "shifter --image=bryce911/bbtools ",
231 "#pigz" : "module load pigz;",
232 "#jamo" : "shifter --image=registry.services.nersc.gov/htandra/jamo_dev:1.0 ", # works, but would like simple module to use - have one on Denovo but not Cori
233 "#gnuplot" : "shifter --image=bryce911/bbtools ", # (1)
234 "#spades/3.9.0" : "shifter --image=bryce911/spades3.9.0 ",
235 "#spades/3.10.1" : "shifter --image=bryce911/spades3.10.1 ",
236 "#spades/3.11.0" : "shifter --image=bryce911/spades-3.11.0 ", # GAA-3383
237 "#spades/3.11.1-check" : "shifter --image=bryce911/spades3.11.1-check ", # development
238 "#prodigal/2.6.3" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ", # RQCSUPPORT-1318
239 "#prodigal/2.5.0" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ",
240 "#prodigal/2.50" : "shifter --image=registry.services.nersc.gov/jgi/prodigal ",
241 "#lastal/869" : "shifter --image=bryce911/lastal:869 ",
242 "#lastal/828" : "shifter --image=bryce911/lastal:869 ",
243 #"#lastal" : "shifter --image=bryce911/lastal:869 ",
244 "#R/3.3.2" : "module load R/3.3.2;",
245 "#texlive" : "shifter --image=bryce911/bbtools ", # (1)
246 "#java" : "shifter --image=bryce911/bbtools ", # (1)
247 "#blast+/2.6.0" : "shifter --image=sulsj/ncbi-blastplus:2.6.0 ",
248 "#blast" : "shifter --image=sulsj/ncbi-blastplus:2.7.0 ",
249 "#megahit-1.1.1" : "shifter --image=foster505/megahit:v1.1.1-2-g02102e1 ",
250 "#smrtanalysis/2.3.0_p5" : "shifter --image=registry.services.nersc.gov/jgi/smrtanalysis:2.3.0_p5 ", # meth - need more memory
251 "#mummer/3.23" : "shifter --image=bryce911/mummer3.23 ", # 3.23
252 "#hmmer" : "shifter --image=registry.services.nersc.gov/jgi/hmmer:latest ", # 3.1b2
253 "#samtools/1.4" : "shifter --image=rmonti/samtools ",
254 "#mothur/1.39.5" : "shifter --image=bryce911/mothur1.39.5 ",
255 "#vsearch/2.4.3" : "shifter --image=bryce911/vsearch2.4.3 ",
256 "#graphviz" : "shifter --image=bryce911/bbtools ",
257 "#ssu-align/0.1.1" : "shifter --image=bryce911/ssu-align0.1.1 ", # openmpi/1.10 included in docker container
258 "#smrtlink/4.0.0.190159" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:4.0.0.190159 /smrtlink/smrtcmds/bin/", # progs not in path
259 "#smrtlink/5.0.1.9585" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:5.0.1.9585 /smrtlink/smrtcmds/bin/", # progs not in path, Tony created 2017-10-16
260 "#smrtlink" : "shifter --image=registry.services.nersc.gov/jgi/smrtlink:5.0.1.9585 /smrtlink/smrtcmds/bin/", # progs not in path
261 "#prodege" : "shifter --image=bryce911/prodege ", # 2.2.1
262 #"#hmmer" : "shifter --image=registry.services.nersc.gov/jgi/hmmer ", # 3.1b2 - Feb 2015, latest as of Oct 2017
263 "#checkm" : "shifter --image=registry.services.nersc.gov/jgi/checkm ",
264 }
265
266
267 # (1) - added as part of the bryce911 bbtools package
268
269 #cmd = "#bbtools-shijie;bbmap...."
270 # this dict will be deprecated as of March 2018 when genepool passes into legend
271 genepool_mod = {
272 "#bbtools" : "module load bbtools",
273 "#pigz" : "module load pigz",
274 "#jamo" : "module load jamo",
275 "#gnuplot" : "module load gnuplot/4.6.2", # sag,iso,sps,ce:gc_cov, gc_histogram, contig_gc
276 "#spades/3.9.0" : "module load spades/3.9.0",
277 "#spades/3.10.1" : "module load spades/3.10.1",
278 "#spades/3.11.1" : "module load spades/3.11.1-check",
279 "#prodigal/2.6.3" : "module load prodigal/2.50", # aka 2.50, also 2.60 is available
280 "#prodigal/2.5.0" : "module load prodigal/2.50",
281 "#prodigal/2.50" : "module load prodigal/2.50",
282 #"#lastal" : "module load last/828",
283 "#lastal/828" : "module load last/828",
284 "#R/3.3.2" : "module unload R;module load R/3.3.1", # 3.3.2 not on genepool - RQCSUPPORT-1516 unload R for Kecia
285 "#texlive" : "module load texlive",
286 "#blast+/2.6.0" : "module load blast+/2.6.0",
287 #"#blast+/2.7.0" : "module load blast+/2.7.0", # not created
288 "#blast" : "module load blast+/2.6.0",
289 "#java" : "", # java runs natively on genepool
290 "#megahit-1.1.1" : "module load megahit/1.1.1",
291 "#smrtanalysis/2.3.0_p5" : "module load smrtanalysis/2.3.0_p5",
292 "#smrtanalysis/2.3.0_p5_xmx32g" : "module load smrtanalysis/2.3.0_p5;export _JAVA_OPTIONS='-Xmx32g'",
293 "#mummer/3.23" : "module load mummer/3.23",
294 "#hmmer" : "module load hmmer/3.1b2",
295 "#samtools/1.4" : "module load samtools/1.4",
296 "#mothur/1.39.5" : "module load mothur/1.32.1", # 1.26.0 default, 1.32.1
297 "#vsearch/2.4.3" : "module load vsearch/2.3.0", # 2.3.0
298 "#graphviz" : "module load graphviz",
299 "#ssu-align/0.1.1" : "module load ssu-align",
300 "#smrtlink/4.0.0.190159" : "module load smrtlink/4.0.0.190159",
301 "#smrtlink" : "module load smrtlink/5.0.1.9585",
302 "#smrtlink/5.0.1.9585" : "module load smrtlink/5.0.1.9585",
303 "#prodege" : "module load R;/projectb/sandbox/rqc/prod/pipelines/external_tools/sag_decontam/prodege-2.2/bin/",
304 "#checkm" : "module load hmmer prodigal pplacer", # checkm installed in python by default on genepool
305 }
306
307 #bbtools;stats.sh
308
309 if cmd.startswith("#"):
310
311 cluster = "genepool"
312 # any other env ids to use?
313
314 # cori, denovo, genepool
315 cluster = os.environ.get('NERSC_HOST', 'unknown')
316
317
318 f = cmd.find(";")
319 mod = "" # command to replace
320 if f > -1:
321 mod = cmd[0:f]
322
323 if mod:
324
325
326 # use module load jamo on denovo
327 if mod == "#jamo" and cluster == "denovo":
328 shifter_img[mod] = "module load jamo;"
329
330 if cluster in ("denovo", "cori"):
331
332 if mod in shifter_img:
333 new_cmd = new_cmd.replace(mod + ";", shifter_img[mod])
334
335 else:
336 if mod in genepool_mod:
337 if genepool_mod[mod] == "":
338 new_cmd = new_cmd.replace(mod + ";", "")
339 else:
340 new_cmd = new_cmd.replace(mod, genepool_mod[mod])
341
342 if new_cmd.startswith("#"):
343 print "Command not found! %s" % new_cmd
344 sys.exit(18)
345
346 #print new_cmd
347 return new_cmd
348
349
350 '''
351 returns human readable file size
352 @param num = file size (e.g. 1000)
353
354 @return: readable float e.g. 1.5 KB
355 '''
356 def human_size(num):
357 if not num:
358 num = 0.0
359
360 for x in ['bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'XB']:
361 if num < 1024.0:
362 return "%3.1f %s" % (num, x)
363 num /= 1024.0
364
365 return "%3.1f %s" % (num, 'ZB')
366
367
368
369 '''
370 send out email
371 @param emailTo: email receipient (e.g. bryce@lbl.gov)
372 @param emailSubject: subject line for the email
373 @param emailBody: content of the email
374 @param emailFrom: optional email from
375
376 '''
377 def send_email(email_to, email_subject, email_body, email_from = 'rqc@jgi-psf.org', log = None):
378 msg = ""
379 err_flag = 0
380
381 if not email_to:
382 msg = "- send_email: email_to parameter missing!"
383
384 if not email_subject:
385 msg = "- send_email: email_subject parameter missing!"
386
387 if not email_body:
388 msg = "- send_email: email_body parameter missing!"
389
390
391 if err_flag == 0:
392 msg = "- sending email to: %s" % (email_to)
393
394 if log:
395 log.info(msg)
396 else:
397 print msg
398
399 if err_flag == 1:
400 return 0
401
402 # assume html
403 email_msg = MIMEText(email_body, "html") # vs "plain"
404 email_msg['Subject'] = email_subject
405 email_msg['From'] = email_from
406 email_msg['To'] = email_to
407
408 p = Popen(["/usr/sbin/sendmail", "-t"], stdin = PIPE)
409 p.communicate(email_msg.as_string())
410
411 return err_flag
412
413 '''
414 Write to rqc_file (e.g. rqc-files.tmp) the file_key and file_value
415
416 @param rqc_file_log: full path to file containing key=file
417 @param file_key: key for the entry
418 @param file_value: value for the entry
419
420 '''
421 def append_rqc_file(rqc_file_log, file_key, file_value, log=None):
422 if file_key:
423
424 buffer = "%s = %s\n" % (file_key, file_value)
425
426 with open(rqc_file_log, "a") as myfile:
427 myfile.write(buffer)
428
429 if log: log.info("append_rqc_file: %s:%s" % (file_key, file_value))
430
431 else:
432
433 if log: log.warning("key or value error: %s:%s" % (file_key, file_value))
434
435 '''
436 Write to rqc_stats (e.g. rqc-stats.tmp) the stats_key and stats_value
437 @param rqc_file_log: full path to file containing key=file
438 @param file_key: key for the entry
439 @param file_value: value for the entry
440
441 '''
442 def append_rqc_stats(rqc_stats_log, stats_key, stats_value, log=None):
443 if stats_key:
444
445 buffer = "%s = %s\n" % (stats_key, stats_value)
446
447 with open(rqc_stats_log, "a") as myfile:
448 myfile.write(buffer)
449
450 if log: log.info("append_rqc_stats: %s:%s" % (stats_key, stats_value))
451
452 else:
453
454 if log: log.warning("key or value error: %s:%s" % (stats_key, stats_value))
455
456 '''
457 Return the file system path to jgi-rqc-pipeline so we can use */tools and */lib
458
459 @return /path/to/jgi-rqc-pipelines
460 '''
461 def get_run_path():
462 current_path = os.path.dirname(os.path.abspath(__file__))
463 run_path = os.path.abspath(os.path.join(current_path, os.pardir))
464
465 return run_path
466
467
468
469 '''
470 Simple read count using bbtools n_contigs field
471 - slightly different than in rqc_utility
472 n_scaffolds n_contigs scaf_bp contig_bp gap_pct scaf_N50 scaf_L50 ctg_N50 ctg_L50 scaf_N90 scaf_L90 ctg_N90 ctg_L90 scaf_max ctg_max scaf_n_gt50K scaf_pct_gt50K gc_avg gc_std
473 1346616 1346616 405331416 405331415 0.000 1346616 301 1346615 301 1346616 301 1346615 301 301 301 0 0.000 0.44824 0.02675
474
475 '''
476 def get_read_count_fastq(fastq, log = None):
477 read_cnt = 0
478
479 if os.path.isfile(fastq):
480
481 # EnvironmentModules.module(["load", "bbtools"])
482 # bbtools faster than zcat | wc because bbtools uses pigz
483 # cmd = "stats.sh format=3 in=%s" % fastq
484 cmd = "#bbtools;stats.sh format=3 in=%s" % fastq
485 cmd = convert_cmd(cmd)
486
487
488 if log:
489 log.info("- cmd: %s", cmd)
490
491 std_out, std_err, exit_code = run_command(cmd, True)
492
493 # EnvironmentModules.module(["unload", "bbtools"])
494
495 if exit_code == 0 and std_out:
496
497 line_list = std_out.split("\n")
498 #print line_list
499 val_list = str(line_list[1]).split() #.split('\t')
500 #print "v = %s" % val_list
501
502 read_cnt = int(val_list[1])
503
504 if log:
505 log.info("- read count: %s", read_cnt)
506
507 else:
508 if log:
509 post_mortem_cmd(cmd, exit_code, std_out, std_err, log)
510
511
512 else:
513 log.error("- fastq: %s does not exist!", fastq)
514
515 return read_cnt
516
517
518
519 '''
520 Subsampling calculation
521 0 .. 250k reads = 100%
522 250k .. 25m = 100% to 1%
523 25m .. 600m = 1%
524 600m+ .. oo < 1%
525
526 July 2014 - 15 runs > 600m (HiSeq-2500 Rapid) - 4 actual libraries / 85325 seq units
527 - returns new subsampling rate
528 '''
529 def get_subsample_rate(read_count):
530 subsample = 0
531 subsample_rate = 0.01
532 max_subsample = 6000000 # 4 hours of blast time
533
534 new_subsample_rate = 250000.0/read_count
535 subsample_rate = max(new_subsample_rate, subsample_rate)
536 subsample_rate = min(1, subsample_rate) # if subsample_rate > 1, then set to 1
537
538 subsample = int(read_count * subsample_rate)
539
540 if subsample > max_subsample:
541 subsample = max_subsample
542
543 subsample_rate = subsample / float(read_count)
544
545 return subsample_rate
546
547
548 '''
549 Set color hash
550 - need to update to remove "c" parameter - used in too many places
551 '''
552 def set_colors(c, use_color = False):
553
554 if use_color == False:
555
556 color = {
557 'black' : "",
558 'red' : "",
559 'green' : "",
560 'yellow' : "",
561 'blue' : "",
562 'pink' : "",
563 'cyan' : "",
564 'white' : "",
565 '' : ""
566 }
567
568 else:
569
570 color = {
571 'black' : "\033[1;30m",
572 'red' : "\033[1;31m",
573 'green' : "\033[1;32m",
574 'yellow' : "\033[1;33m",
575 'blue' : "\033[1;34m",
576 'pink' : "\033[1;35m",
577 'cyan' : "\033[1;36m",
578 'white' : "\033[1;37m",
579 '' : "\033[m"
580 }
581
582
583 return color
584
585 '''
586 New function that just returns colors
587 '''
588 def get_colors():
589
590 color = {
591 'black' : "\033[1;30m",
592 'red' : "\033[1;31m",
593 'green' : "\033[1;32m",
594 'yellow' : "\033[1;33m",
595 'blue' : "\033[1;34m",
596 'pink' : "\033[1;35m",
597 'cyan' : "\033[1;36m",
598 'white' : "\033[1;37m",
599 '' : "\033[m"
600 }
601
602
603 return color
604
605
606
607 '''
608 Returns msg_ok, msg_fail, msg_warn colored or not colored
609 '''
610 def get_msg_settings(color):
611
612 msg_ok = "[ "+color['green']+"OK"+color['']+" ]"
613 msg_fail = "[ "+color['red']+"FAIL"+color['']+" ]"
614 msg_warn = "[ "+color['yellow']+"WARN"+color['']+" ]"
615
616 return msg_ok, msg_fail, msg_warn
617
618
619 '''
620 Use RQC's ap_tool to get the status
621 set mode = "-sa" to show all, even completed
622 '''
623 def get_analysis_project_id(seq_proj_id, target_analysis_project_id, target_analysis_task_id, output_path, log = None, mode = ""):
624
625 if log:
626 log.info("get_analysis_project_id: spid = %s, tapid = %s, tatid = %s", seq_proj_id, target_analysis_project_id, target_analysis_task_id)
627
628 analysis_project_id = 0
629 analysis_task_id = 0
630 project_type = None
631 task_type = None
632
633 ap_list = os.path.join(output_path, "ap-info.txt")
634 AP_TOOL = "/global/dna/projectdirs/PI/rqc/prod/jgi-rqc-pipeline/tools/ap_tool.py"
635 #AP_TOOL = "/global/homes/b/brycef/git/jgi-rqc-pipeline/tools/ap_tool.py"
636 cmd = "%s -spid %s -m psv -tapid %s -tatid %s %s > %s 2>&1" % (AP_TOOL, seq_proj_id, target_analysis_project_id, target_analysis_task_id, mode, ap_list)
637 if log:
638 log.info("- cmd: %s", cmd)
639 else:
640 print "- cmd: %s" % cmd
641 std_out, std_err, exit_code = run_command(cmd, True)
642
643 post_mortem_cmd(cmd, exit_code, std_out, std_err, log)
644
645 if os.path.isfile(ap_list):
646
647 ap_dict = {} # header = value
648 cnt = 0
649 fh = open(ap_list, "r")
650 for line in fh:
651 arr = line.strip().split("|")
652 if cnt == 0:
653 c2 = 0 # position of title in header
654 for a in arr:
655 ap_dict[a.lower()] = c2
656 c2 += 1
657
658 else:
659
660 for a in ap_dict:
661
662 if ap_dict[a] + 1 > len(arr):
663 pass
664 else:
665 ap_dict[a] = arr[ap_dict[a]]
666
667
668 cnt += 1
669
670 fh.close()
671
672
673 analysis_project_id = ap_dict.get("analysis project id")
674 analysis_task_id = ap_dict.get("analysis task id")
675 project_type = ap_dict.get("analysis product name")
676 task_type = ap_dict.get("analysis task name")
677
678 # nno such project
679 if cnt == 1:
680 analysis_project_id = 0
681 analysis_task_id = 0
682
683 if log:
684 log.info("- project type: %s, task type: %s", project_type, task_type)
685 log.info("- analysis_project_id: %s, analysis_task_id: %s", analysis_project_id, analysis_task_id)
686
687 try:
688 analysis_project_id = int(analysis_project_id)
689 analysis_task_id = int(analysis_task_id)
690 except:
691 analysis_project_id = 0
692 analysis_task_id = 0
693
694
695 # ap = 4, at = 8 means its using the column names but didn't find anything
696 if analysis_project_id < 100:
697 analysis_project_id = 0
698 if analysis_task_id < 100:
699 analysis_task_id = 0
700
701 return analysis_project_id, analysis_task_id
702
703
704 '''
705 For creating a dot file from the pipeline flow
706 '''
707 def append_flow(flow_file, orig_node, orig_label, next_node, next_label, link_label):
708
709 fh = open(flow_file, "a")
710 fh.write("%s|%s|%s|%s|%s\n" % (orig_node, orig_label, next_node, next_label, link_label))
711 fh.close()
712
713 '''
714 Flow file format:
715 # comment
716 *label|PBMID Pipeline run for BTXXY<br><font point-size="10">Run Date: 2017-09-28 14:22:50</font>
717 # origin node, origin label, next node, next label, link label
718 input_h5|BTXXY H5<br><font point-size="10">3 smrtcells</font>|assembly|HGAP Assembly<FONT POINT-SIZE="10"><br>3 contigs, 13,283,382bp</FONT>|HGAP v4.0.1
719
720 nodes should be the output of the transformation between the nodes
721 e.g. input fastq (25m reads) --[ bbtools subsampling ]--> subsampled fastq (10m reads)
722
723 creates a dot file, to convert to png use:
724 $ module load graphviz
725 $ dot -T png (dot file) > (png file)
726
727 More info on formatting the labels
728 http://www.graphviz.org/content/node-shapes#html
729 '''
730 def dot_flow(flow_file, dot_file, log = None):
731
732 if not os.path.isfile(flow_file):
733 if log:
734 log.info("- cannot find flow file: %s", flow_file)
735 else:
736 print "Cannot find flow file: %s" % flow_file
737 return
738
739
740 fhw = open(dot_file, "w")
741
742 fhw.write("// dot file\n")
743 fhw.write("digraph rqc {\n") # directed graph
744 fhw.write(" node [shape=box];\n")
745 fhw.write(" rankdir=LR;\n")
746
747 fh = open(flow_file, "r")
748
749 for line in fh:
750 line = line.strip()
751
752 if not line:
753 continue
754
755 if line.startswith("#"):
756 continue
757
758 # graph label
759 if line.startswith("*label"):
760 arr = line.split("|")
761 label = flow_replace(str(arr[1]))
762
763 fhw.write(" label=<%s>;\n" % label)
764 fhw.write(" labelloc=top;\n")
765
766 else:
767
768 arr = line.split("|")
769 #print arr
770
771 if len(arr) == 5:
772
773 org_node = arr[0]
774 org_label = str(arr[1])
775 next_node = arr[2]
776 next_label = str(arr[3])
777 link_label = str(arr[4])
778
779 # must be <br/> in the dot file, I have a habit of using <br>
780 org_label = flow_replace(org_label)
781 next_label = flow_replace(next_label)
782 link_label = flow_replace(link_label)
783
784 # label are enclosed by < > instead of " " to handle html-ish markups
785
786 if next_node:
787 link = " %s -> %s;\n" % (org_node, next_node)
788 if link_label:
789 link = " %s -> %s [label=<%s>];\n" % (org_node, next_node, link_label)
790 fhw.write(link)
791
792 if org_label:
793 label = " %s [label=<%s>];\n" % (org_node, org_label)
794 fhw.write(label)
795
796 if next_label:
797 label = " %s [label=<%s>];\n" % (next_node, next_label)
798 fhw.write(label)
799
800
801
802 fh.close()
803
804 fhw.write("}\n")
805
806 fhw.close()
807 if log:
808 log.info("- created dot file: %s", dot_file)
809
810 return dot_file
811
812 '''
813 simple replacements
814 '''
815 def flow_replace(my_str):
816
817 new_str = my_str.replace("<br>", "<br/>").replace("<smf>", "<font point-size=\"10\">").replace("</f>", "</font>")
818
819 return new_str
820
821
822 ## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
823 ## main program
824
825
826 if __name__ == "__main__":
827 # unit tests
828
829 print human_size(102192203)
830 print human_size(250000000000)
831 #print get_read_count_fastq("/global/projectb/scratch/brycef/sag/phix/11185.1.195330.UNKNOWN_matched.fastq.gz")
832
833 cmd = "#bbtools;bbduk.sh in=/global/dna/dm_archive/sdm/illumina//01/14/88/11488.1.208132.UNKNOWN.fastq.gz ref=/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa outm=/global/projectb/scratch/brycef/phix/11488/11488.1.208132.UNKNOWN_matched.fastq.gz outu=/global/projectb/scratch/brycef/phix/11488/11488.1.208132.UNKNOWN_unmatched.fastq.gz"
834 print convert_cmd(cmd)
835 cmd = "#pigz;pigz /global/projectb/scratch/brycef/align/BTOYH/genome/11463.6.208000.CAAGGTC-AGACCTT.filter-RNA.fastq.gz-genome.sam"
836 print convert_cmd(cmd)
837
838 cmd = "#java;java -version"
839 print convert_cmd(cmd)
840
841 dot_flow("/global/projectb/scratch/brycef/pbmid/BWOAU/f2.flow", "/global/projectb/scratch/brycef/pbmid/BWOAU/BWOUAx.dot")
842
843 sys.exit(0)
844
845