annotate scripts/modules/utils.py @ 0:8be2feb96994

"planemo upload commit cb65588391944306ff3cb32a23e1c28f65122014"
author cstrittmatter
date Fri, 11 Mar 2022 15:50:35 -0500
parents
children
rev   line source
cstrittmatter@0 1 import pickle
cstrittmatter@0 2 import traceback
cstrittmatter@0 3 import shlex
cstrittmatter@0 4 import subprocess
cstrittmatter@0 5 from threading import Timer
cstrittmatter@0 6 import shutil
cstrittmatter@0 7 import time
cstrittmatter@0 8 import functools
cstrittmatter@0 9 import os.path
cstrittmatter@0 10 import sys
cstrittmatter@0 11 import argparse
cstrittmatter@0 12
cstrittmatter@0 13
def start_logger(workdir):
    """Route ``sys.stdout`` through a ``Logger`` that also writes into *workdir*.

    Parameters
    ----------
    workdir : str
        Directory in which the log file is created.

    Returns
    -------
    tuple
        ``(logfile, time_str)``: the path of the log file and the timestamp
        string (``%Y%m%d-%H%M%S``) embedded in its name.
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    # The Logger captures the current sys.stdout in its constructor, so it must
    # be built before sys.stdout is rebound.
    tee = Logger(workdir, timestamp)
    sys.stdout = tee
    return tee.getLogFile(), timestamp
cstrittmatter@0 19
cstrittmatter@0 20
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message is sent to the stream that was ``sys.stdout`` when the
    instance was created and to a log file named ``run.<time_str>.log`` inside
    *out_directory*.
    """

    def __init__(self, out_directory, time_str):
        """Open the log file and remember the current ``sys.stdout``.

        Parameters
        ----------
        out_directory : str
            Directory where the log file is created.
        time_str : str
            Timestamp embedded in the log file name.
        """
        self.logfile = os.path.join(out_directory, str('run.' + time_str + '.log'))
        self.terminal = sys.stdout
        self.log = open(self.logfile, "w")

    def write(self, message):
        """Write *message* to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)
        # Flush on every write so the log file is up to date even if the
        # program dies unexpectedly.
        self.log.flush()

    def flush(self):
        """Flush both underlying streams.

        Forwarding the flush keeps explicit ``sys.stdout.flush()`` calls
        effective once this object has replaced ``sys.stdout``.
        """
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

    def getLogFile(self):
        """Return the path of the log file."""
        return self.logfile
cstrittmatter@0 37
cstrittmatter@0 38
def checkPrograms(programs_version_dictionary):
    """Check that the required programs are in PATH with acceptable versions.

    Parameters
    ----------
    programs_version_dictionary : dict
        Maps a program name to a list ``[version_flag, comparison, version]``.
        ``version_flag`` is the command line option that prints the version
        (``None`` skips the version check), ``comparison`` is ``'>='`` for a
        minimum version (anything else requires an exact match) and
        ``version`` is the reference version string. For ``.jar`` programs the
        located path is appended to the list.

    Returns
    -------
    list
        One message per problem found (program missing or version not
        accepted); empty when everything is fine.
    """
    print('\n' + 'Checking dependencies...')
    programs = programs_version_dictionary
    which_program = ['which', '']
    listMissings = []
    for program in programs:
        which_program[1] = program
        run_successfully, stdout, stderr = runCommandPopenCommunicate(which_program, False, None, False)
        if not run_successfully:
            listMissings.append(program + ' not found in PATH.')
            continue

        program_path = stdout.splitlines()[0]
        print(program_path)
        if programs[program][0] is None:
            print(program + ' (impossible to determine programme version) found at: ' + program_path)
            continue

        if program.endswith('.jar'):
            check_version = ['java', '-jar', program_path, programs[program][0]]
            programs[program].append(program_path)
        else:
            check_version = [program_path, programs[program][0]]
        run_successfully, stdout, stderr = runCommandPopenCommunicate(check_version, False, None, False)
        # Some programs print their version on stderr
        if stdout == '':
            stdout = stderr

        # The position of the version token depends on the program
        if program in ['wget', 'awk']:
            version_line = stdout.splitlines()[0].split(' ', 3)[2]
        elif program in ['prefetch', 'fastq-dump']:
            version_line = stdout.splitlines()[1].split(' ')[-1]
        else:
            version_line = stdout.splitlines()[0].split(' ')[-1]
        for character in ['"', 'v', 'V', '+', ',']:
            version_line = version_line.replace(character, '')
        print(program + ' (' + version_line + ') found')

        version_problem = 'It is required ' + program + ' with version ' + programs[program][1] + ' ' + \
                          programs[program][2]
        if programs[program][1] == '>=':
            program_found_version = version_line.split('.')
            program_version_required = programs[program][2].split('.')
            if len(program_version_required) == 3 and len(program_found_version) >= 3:
                # Drop suffixes such as "1_5" from the third component
                program_found_version[2] = program_found_version[2].split('_')[0]
            # Missing trailing components count as 0 (e.g. "4" is treated as "4.0")
            while len(program_found_version) < len(program_version_required):
                program_found_version.append('0')
            for i in range(0, len(program_version_required)):
                found = int(program_found_version[i])
                required = int(program_version_required[i])
                if found > required:
                    break
                elif found < required:
                    listMissings.append(version_problem)
                    # The first differing component decides: stop here so the
                    # same program is not reported more than once
                    break
        else:
            if version_line != programs[program][2]:
                listMissings.append(version_problem)
    return listMissings
cstrittmatter@0 93
cstrittmatter@0 94
def requiredPrograms():
    """Exit the program if the required external software is not usable.

    The only requirement declared here is ``rematch.py`` with version >= 4.0.
    Every problem reported by ``checkPrograms`` is listed in the exit message.
    """
    required = {'rematch.py': ['--version', '>=', '4.0']}
    problems = checkPrograms(required)
    if problems:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(problems))
cstrittmatter@0 101
cstrittmatter@0 102
def general_information(logfile, version, outdir, time_str):
    """Print the run banner and check the required programs.

    Prints the start time, the log file location, the command line, the
    working directory and the version information, then calls
    ``requiredPrograms``.

    Parameters
    ----------
    logfile : str
        Path of the log file, printed as is.
    version : str
        Version of the script, passed on to ``script_version_git``.
    outdir, time_str
        Accepted but not used inside this function.

    Returns
    -------
    str
        Path of ``patho_typing.py`` in the parent directory of the directory
        holding this module.
    """
    modules_folder = os.path.dirname(os.path.realpath(__file__))
    script_path = os.path.join(os.path.dirname(modules_folder), 'patho_typing.py')

    print('\n' + '==========> patho_typing <==========')
    print('\n' + 'Program start: ' + time.ctime())

    # Where the log file is stored
    print('\n' + 'LOGFILE:')
    print(logfile)

    # Command used to start the program
    print('\n' + 'COMMAND:')
    print(sys.executable + ' ' + ' '.join(sys.argv))

    # Directory from which the program was launched
    print('\n' + 'PRESENT DIRECTORY:')
    present_directory = os.path.abspath(os.getcwd())
    print(present_directory)

    # Program version
    print('\n' + 'VERSION:')
    script_version_git(version, present_directory, script_path)

    # External dependencies
    requiredPrograms()

    return script_path
cstrittmatter@0 131
cstrittmatter@0 132
def setPATHvariable(doNotUseProvidedSoftware, script_path):
    """Optionally prepend the bundled software folders to PATH and print PATH.

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        When False, the ``src/bowtie2-2.2.9``, ``src/samtools-1.3.1/bin`` and
        ``src/bcftools-1.3.1/bin`` folders located next to the script are put
        in front of the current PATH. When True, PATH is left untouched.
    script_path : str
        Path to the main script; its directory is the base of the ``src``
        folder.

    Raises
    ------
    KeyError
        If the PATH environment variable is not defined.
    """
    path_variable = os.environ['PATH']
    script_folder = os.path.dirname(script_path)
    # Set path to use the provided software
    if not doNotUseProvidedSoftware:
        bowtie2 = os.path.join(script_folder, 'src', 'bowtie2-2.2.9')
        samtools = os.path.join(script_folder, 'src', 'samtools-1.3.1', 'bin')
        bcftools = os.path.join(script_folder, 'src', 'bcftools-1.3.1', 'bin')

        # os.pathsep is the platform's PATH separator (':' on POSIX)
        os.environ['PATH'] = os.pathsep.join([bowtie2, samtools, bcftools, path_variable])

    # Print PATH variable
    print('\n' + 'PATH variable:')
    print(os.environ['PATH'])
cstrittmatter@0 147
cstrittmatter@0 148
def script_version_git(version, current_directory, script_path, no_git_info=False):
    """
    Print script version and get GitHub commit information

    Parameters
    ----------
    version : str
        Version of the script, e.g. "4.0"
    current_directory : str
        Path to the directory where the script started to run; the working
        directory is restored to it after the git commands
    script_path : str
        Path to the script running
    no_git_info : bool, default False
        True if it is not necessary to retrieve the GitHub commit information

    Returns
    -------
    None
    """
    print('Version {}'.format(version))

    if not no_git_info:
        try:
            # The git commands are run from two directory levels above script_path
            os.chdir(os.path.dirname(os.path.dirname(script_path)))
            command = ['git', 'log', '-1', '--date=local', '--pretty=format:"%h (%H) - Commit by %cn, %cd) : %s"']
            _, stdout, _ = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
            command = ['git', 'remote', 'show', 'origin']
            _, stdout, _ = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
        except Exception:
            # Best effort only: a missing git executable or directory must not
            # stop the run, but KeyboardInterrupt/SystemExit are not swallowed.
            print('HARMLESS WARNING: git command possibly not found. The GitHub repository information will not be'
                  ' obtained.')
        finally:
            os.chdir(current_directory)
cstrittmatter@0 184
cstrittmatter@0 185
def runTime(start_time):
    """Print the time elapsed since *start_time* and return it.

    Parameters
    ----------
    start_time : float
        Start moment, as returned by ``time.time()``.

    Returns
    -------
    float
        Elapsed time in seconds, rounded to two decimal places.
    """
    end_time = time.time()
    time_taken = end_time - start_time
    hours, rest = divmod(time_taken, 3600)
    minutes, seconds = divmod(rest, 60)
    # divmod on floats returns floats; hours and minutes are whole numbers, so
    # print them as integers ("1h:2m" rather than "1.0h:2.0m")
    print('Runtime :' + str(int(hours)) + 'h:' + str(int(minutes)) + 'm:' + str(round(seconds, 2)) + 's')
    return round(time_taken, 2)
cstrittmatter@0 193
cstrittmatter@0 194
def timer(function, name):
    """Wrap *function* so that its run time is printed and returned.

    Parameters
    ----------
    function : callable
        Function to time. Its return value must be iterable because it is
        converted to a list.
    name : str
        Label printed in the "RUNNING" and "END" messages.

    Returns
    -------
    callable
        Wrapper returning a list whose first item is the elapsed time (as
        returned by ``runTime``) followed by the items returned by *function*.
    """
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        print('\n' + 'RUNNING {0}\n'.format(name))
        started = time.time()

        # Materialise the results before stopping the clock
        outcome = list(function(*args, **kwargs))

        elapsed = runTime(started)
        print('END {0}'.format(name))

        return [elapsed] + outcome
    return wrapper
cstrittmatter@0 209
cstrittmatter@0 210
def removeDirectory(directory):
    """Delete *directory* and all of its contents, if it is an existing directory."""
    if not os.path.isdir(directory):
        return
    shutil.rmtree(directory)
cstrittmatter@0 214
cstrittmatter@0 215
def saveVariableToPickle(variableToStore, pickleFile):
    """Serialise *variableToStore* with pickle into the file *pickleFile*.

    The file is opened in binary write mode, replacing any previous content.
    """
    with open(pickleFile, 'wb') as handle:
        pickle.dump(variableToStore, handle)
cstrittmatter@0 219
cstrittmatter@0 220
def extractVariableFromPickle(pickleFile):
    """Load and return the object stored with pickle in *pickleFile*."""
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # files produced by this program.
    with open(pickleFile, 'rb') as handle:
        return pickle.load(handle)
cstrittmatter@0 225
cstrittmatter@0 226
def trace_unhandled_exceptions(func):
    """Decorator that prints the traceback of anything *func* raises.

    The wrapper returns whatever *func* returns. When *func* raises, the
    function name and the traceback are printed and ``None`` is returned; the
    exception is not propagated.
    """
    @functools.wraps(func)
    def wrapped_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except BaseException:
            # Deliberately catches everything, including SystemExit raised
            # through sys.exit(), so the failure is reported here instead of
            # ending the caller.
            print('Exception in ' + func.__name__)
            traceback.print_exc()
    return wrapped_func
cstrittmatter@0 236
cstrittmatter@0 237
def kill_subprocess_Popen(subprocess_Popen, command):
    """Report that *command* ran out of time and kill its process.

    Parameters
    ----------
    subprocess_Popen : object
        Process handle; its ``kill()`` method is called.
    command : str or list
        Command being run, only used in the printed message.
    """
    message = 'Command run out of time: ' + str(command)
    print(message)
    subprocess_Popen.kill()
cstrittmatter@0 241
cstrittmatter@0 242
def runCommandPopenCommunicate(command, shell_True, timeout_sec_None, print_comand_True):
    """Run an external command and capture its output.

    Parameters
    ----------
    command : str or list
        Command to run. A list is joined with spaces and the result is split
        again with ``shlex.split``, so quotes inside list items are interpreted
        as shell-style quoting.
    shell_True : bool
        Run the command through the shell.
    timeout_sec_None : float or None
        Seconds after which the process is killed; ``None`` means no timeout.
    print_comand_True : bool
        Print the command before running it.

    Returns
    -------
    tuple
        ``(run_successfully, stdout, stderr)`` where ``run_successfully`` is
        True when the return code is 0 and both outputs are UTF-8 decoded text.
        On failure the captured stdout and stderr are also printed.
    """
    run_successfully = False
    if not isinstance(command, str):
        command = ' '.join(command)
    command = shlex.split(command)

    if print_comand_True:
        print('Running: ' + ' '.join(command))

    if shell_True:
        command = ' '.join(command)
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    else:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    not_killed_by_timer = True
    if timeout_sec_None is None:
        stdout, stderr = proc.communicate()
    else:
        # Thread.isAlive() no longer exists in Python >= 3.9, and checking
        # whether the timer thread is alive right after cancel() is racy, so
        # the timeout is recorded explicitly by the callback.
        timer_fired = []

        def _on_timeout():
            timer_fired.append(True)
            kill_subprocess_Popen(proc, command)

        time_counter = Timer(timeout_sec_None, _on_timeout)
        time_counter.start()
        try:
            stdout, stderr = proc.communicate()
        finally:
            time_counter.cancel()
        not_killed_by_timer = not timer_fired

    stdout = stdout.decode("utf-8")
    stderr = stderr.decode("utf-8")

    if proc.returncode == 0:
        run_successfully = True
    else:
        # The timeout callback already printed the command
        if not print_comand_True and not_killed_by_timer:
            print('Running: ' + str(command))
        if len(stdout) > 0:
            print('STDOUT')
            print(stdout)
        if len(stderr) > 0:
            print('STDERR')
            print(stderr)
    return run_successfully, stdout, stderr
cstrittmatter@0 283
cstrittmatter@0 284
def required_length(tuple_length_options, argument_name):
    """Build an argparse action that checks how many values an option received.

    Parameters
    ----------
    tuple_length_options : tuple
        Accepted numbers of values.
    argument_name : str
        Option name used in the error message.

    Returns
    -------
    type
        ``argparse.Action`` subclass to pass as ``action=``. It stores the
        values in the namespace, or raises ``argparse.ArgumentTypeError`` when
        the number of values is not accepted.
    """
    class RequiredLength(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if len(values) not in tuple_length_options:
                # argument_name comes from the enclosing function; the action
                # instance itself has no such attribute
                msg = 'Option {argument_name} requires one of the following number of' \
                      ' arguments: {tuple_length_options}'.format(argument_name=argument_name,
                                                                  tuple_length_options=tuple_length_options)
                raise argparse.ArgumentTypeError(msg)
            setattr(args, self.dest, values)
    return RequiredLength
cstrittmatter@0 295
cstrittmatter@0 296
def get_sequence_information(fasta_file, length_extra_seq):
    """Read a fasta file into dictionaries indexed by sequence order.

    Parameters
    ----------
    fasta_file : str
        Path to the fasta file.
    length_extra_seq : int
        A sequence is kept only when its length minus twice this value is
        greater than 0; otherwise it is reported and ignored.

    Returns
    -------
    tuple
        ``(sequence_dict, headers)``. ``sequence_dict`` maps the 1-based
        position of each kept sequence in the file to
        ``{'header': lower-cased header, 'sequence': upper-cased sequence,
        'length': int}``. ``headers`` maps each lower-cased header to that
        position. Positions of ignored sequences are not reused.

    Raises
    ------
    SystemExit
        On duplicated headers, on a blank line followed by more content, or on
        sequence data found before the first header.
    """
    sequence_dict = {}
    headers = {}

    def _store(counter, record):
        # Keep the finished record only if it is longer than the extra sequence
        if record['length'] - 2 * length_extra_seq > 0:
            sequence_dict[counter] = record
            headers[record['header'].lower()] = counter
        else:
            print(record['header'] + ' sequence ignored due to length <= 0')

    # Plain text mode already applies universal newlines; the 'U' flag is
    # rejected by Python >= 3.11
    with open(fasta_file, 'rt') as reader:
        blank_line_found = False
        sequence_counter = 0
        record = None
        for line in reader:
            line = line.splitlines()[0]
            if len(line) == 0:
                blank_line_found = True
                continue
            if blank_line_found:
                # Blank lines are only tolerated at the end of the file
                sys.exit('It was found a blank line between the fasta file above line ' + line)

            if line.startswith('>'):
                if record is not None:
                    _store(sequence_counter, record)

                header = line[1:].lower()
                if header in headers:
                    sys.exit('Found duplicated sequence headers')

                sequence_counter += 1
                record = {'header': header, 'sequence': '', 'length': 0}
            else:
                if record is None:
                    sys.exit('Found sequence data before the first fasta header')
                record['sequence'] += line.upper()
                record['length'] += len(line)

    if record is not None:
        _store(sequence_counter, record)

    return sequence_dict, headers
cstrittmatter@0 340
cstrittmatter@0 341
def simplify_sequence_dict(sequence_dict):
    """Re-index the sequence records by their header.

    The 'header' key is removed from every record, and the records are the
    same objects held by *sequence_dict*, so the input is modified in place.

    Returns
    -------
    dict
        Maps each header to its record (without the 'header' key).
    """
    by_header = {}
    for record in list(sequence_dict.values()):
        by_header[record.pop('header')] = record
    return by_header
cstrittmatter@0 348
cstrittmatter@0 349
def chunkstring(string, length):
    """Return a generator over consecutive pieces of *string* of size *length*.

    The last piece may be shorter than *length*.
    """
    starts = range(0, len(string), length)
    return (string[start:start + length] for start in starts)
cstrittmatter@0 352
cstrittmatter@0 353
def clean_headers_sequences(sequence_dict):
    """Replace problematic characters in the sequence headers with "_".

    The headers inside *sequence_dict* are updated in place.

    Returns
    -------
    tuple
        ``(sequence_dict, new_headers)`` where ``new_headers`` maps every
        lower-cased (cleaned) header to its key in *sequence_dict*.
    """
    problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]

    headers_changed = False
    new_headers = {}
    for key, record in sequence_dict.items():
        original_header = record['header']
        cleaned_header = original_header
        for character in problematic_characters:
            cleaned_header = cleaned_header.replace(character, '_')
        # The header only differs when at least one problematic character was present
        if cleaned_header != original_header:
            record['header'] = cleaned_header
            headers_changed = True
        new_headers[record['header'].lower()] = key

    if headers_changed:
        print('At least one of the those characters was found. Replacing those with _' + '\n')

    return sequence_dict, new_headers