"""Shared helpers: stdout logging, dependency checks, subprocess execution,
pickling and FASTA parsing."""

import pickle
import traceback
import shlex
import subprocess
from threading import Timer
import shutil
import time
import functools
import os.path
import sys
import argparse


def start_logger(workdir):
    """
    Replace ``sys.stdout`` with a :class:`Logger` writing into `workdir`.

    Parameters
    ----------
    workdir : str
        Directory where the log file is created

    Returns
    -------
    logfile : str
        Path to the log file
    time_str : str
        Timestamp (``%Y%m%d-%H%M%S``) used in the log file name
    """
    time_str = time.strftime("%Y%m%d-%H%M%S")
    sys.stdout = Logger(workdir, time_str)
    logfile = sys.stdout.getLogFile()
    return logfile, time_str


class Logger(object):
    """File-like object that duplicates everything written to it into the
    stream that was ``sys.stdout`` at construction time and into a log file."""

    def __init__(self, out_directory, time_str):
        self.logfile = os.path.join(out_directory, str('run.' + time_str + '.log'))
        # Keep the stream being wrapped so output still reaches it.
        self.terminal = sys.stdout
        self.log = open(self.logfile, "w")

    def write(self, message):
        """Write `message` to both streams; the log file is flushed at once."""
        self.terminal.write(message)
        self.log.write(message)
        self.log.flush()

    def flush(self):
        """Flush both underlying streams."""
        self.terminal.flush()
        self.log.flush()

    def getLogFile(self):
        """Return the path to the log file."""
        return self.logfile


def _extract_version(program, output):
    """Pull the version token out of the text printed by `program`."""
    lines = output.splitlines()
    if program in ('wget', 'awk'):
        version = lines[0].split(' ', 3)[2]
    elif program in ('prefetch', 'fastq-dump'):
        version = lines[1].split(' ')[-1]
    else:
        version = lines[0].split(' ')[-1]
    for character in ('"', 'v', 'V', '+', ','):
        version = version.replace(character, '')
    return version


def _version_meets_minimum(found_version, required_version):
    """Return True if dotted `found_version` is >= dotted `required_version`."""
    found = found_version.split('.')
    required = required_version.split('.')
    if len(required) == 3 and len(found) >= 3:
        # Drop a "_suffix" from the third component before converting to int.
        found[2] = found[2].split('_')[0]
    # Missing trailing components count as 0 (e.g. "1.3" versus "1.3.1").
    found += ['0'] * (len(required) - len(found))
    for found_part, required_part in zip(found, required):
        if int(found_part) > int(required_part):
            return True
        if int(found_part) < int(required_part):
            return False
    return True


def checkPrograms(programs_version_dictionary):
    """
    Check that the required programs are in PATH with acceptable versions.

    Parameters
    ----------
    programs_version_dictionary : dict
        Maps the program name to a list ``[version_flag, comparison,
        required_version]``. A ``version_flag`` of None skips the version
        check. For ``.jar`` programs the path found is appended to the list.

    Returns
    -------
    listMissings : list
        One message per program that is missing or has an unsuitable version
    """
    print('\n' + 'Checking dependencies...')
    programs = programs_version_dictionary
    listMissings = []
    for program in programs:
        run_successfully, stdout, _ = runCommandPopenCommunicate(['which', program], False, None, False)
        if not run_successfully:
            listMissings.append(program + ' not found in PATH.')
            continue

        program_path = stdout.splitlines()[0]
        print(program_path)
        requirement = programs[program]
        if requirement[0] is None:
            print(program + ' (impossible to determine programme version) found at: ' + program_path)
            continue

        if program.endswith('.jar'):
            check_version = ['java', '-jar', program_path, requirement[0]]
            requirement.append(program_path)
        else:
            check_version = [program_path, requirement[0]]
        _, stdout, stderr = runCommandPopenCommunicate(check_version, False, None, False)
        if stdout == '':
            # Nothing on stdout: fall back to whatever was printed on stderr.
            stdout = stderr

        version_line = _extract_version(program, stdout)
        print(program + ' (' + version_line + ') found')

        if requirement[1] == '>=':
            version_ok = _version_meets_minimum(version_line, requirement[2])
        else:
            version_ok = version_line == requirement[2]
        if not version_ok:
            # Reported once per program.
            listMissings.append('It is required ' + program + ' with version ' + requirement[1] + ' ' +
                                requirement[2])
    return listMissings


def requiredPrograms():
    """Exit with the list of problems if rematch.py (>= 4.0) is not usable."""
    programs_version_dictionary = {}
    programs_version_dictionary['rematch.py'] = ['--version', '>=', '4.0']
    missingPrograms = checkPrograms(programs_version_dictionary)
    if missingPrograms:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(missingPrograms))


def general_information(logfile, version, outdir, time_str):
    """
    Print the run header (start time, log file, command, working directory,
    version) and check the required programs.

    Parameters
    ----------
    logfile : str
        Path to the log file, printed as is
    version : str
        Version of the script
    outdir : str
        Not used by this function
    time_str : str
        Not used by this function

    Returns
    -------
    script_path : str
        Path built as ``<parent of this file's directory>/patho_typing.py``
    """
    print('\n' + '==========> patho_typing <==========')
    print('\n' + 'Program start: ' + time.ctime())

    # Tell where the logfile will be stored
    print('\n' + 'LOGFILE:')
    print(logfile)

    # Print command
    print('\n' + 'COMMAND:')
    script_path = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'patho_typing.py')
    print(sys.executable + ' ' + ' '.join(sys.argv))

    # Print the directory from which the program was launched
    print('\n' + 'PRESENT DIRECTORY:')
    present_directory = os.path.abspath(os.getcwd())
    print(present_directory)

    # Print program version
    print('\n' + 'VERSION:')
    script_version_git(version, present_directory, script_path)

    # Check programs
    requiredPrograms()

    return script_path


def setPATHvariable(doNotUseProvidedSoftware, script_path):
    """
    Optionally prepend the bundled bowtie2, samtools and bcftools folders
    (under ``<script folder>/src``) to PATH, then print PATH.

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        True to leave PATH untouched
    script_path : str
        Path to the script; its folder is where ``src`` is looked for
    """
    path_variable = os.environ['PATH']
    script_folder = os.path.dirname(script_path)
    # Set path to use provided software
    if not doNotUseProvidedSoftware:
        bowtie2 = os.path.join(script_folder, 'src', 'bowtie2-2.2.9')
        samtools = os.path.join(script_folder, 'src', 'samtools-1.3.1', 'bin')
        bcftools = os.path.join(script_folder, 'src', 'bcftools-1.3.1', 'bin')

        os.environ['PATH'] = str(os.pathsep.join([bowtie2, samtools, bcftools, path_variable]))

    # Print PATH variable
    print('\n' + 'PATH variable:')
    print(os.environ['PATH'])


def script_version_git(version, current_directory, script_path, no_git_info=False):
    """
    Print script version and get GitHub commit information

    Parameters
    ----------
    version : str
        Version of the script, e.g. "4.0"
    current_directory : str
        Path to the directory where the script started to run
    script_path : str
        Path to the script running
    no_git_info : bool, default False
        True if it is not necessary to retrieve the GitHub commit information

    Returns
    -------

    """
    print('Version {}'.format(version))

    if not no_git_info:
        try:
            os.chdir(os.path.dirname(os.path.dirname(script_path)))
            command = ['git', 'log', '-1', '--date=local', '--pretty=format:"%h (%H) - Commit by %cn, %cd) : %s"']
            run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
            command = ['git', 'remote', 'show', 'origin']
            run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
        except Exception:
            # Best effort only: e.g. git missing or the folder not reachable.
            print('HARMLESS WARNING: git command possibly not found. The GitHub repository information will not be'
                  ' obtained.')
        finally:
            # Always go back to where the script was started.
            os.chdir(current_directory)


def runTime(start_time):
    """
    Print the time elapsed since `start_time`.

    Parameters
    ----------
    start_time : float
        Value of ``time.time()`` when the measured work started

    Returns
    -------
    time_taken : float
        Elapsed seconds, rounded to two decimals
    """
    end_time = time.time()
    time_taken = end_time - start_time
    hours, rest = divmod(time_taken, 3600)
    minutes, seconds = divmod(rest, 60)
    print('Runtime :' + str(hours) + 'h:' + str(minutes) + 'm:' + str(round(seconds, 2)) + 's')
    return round(time_taken, 2)


def timer(function, name):
    """
    Wrap `function` so that its run time is printed and returned.

    Parameters
    ----------
    function : callable
        Function to wrap; it must return an iterable
    name : str
        Label printed before and after the call

    Returns
    -------
    wrapper : callable
        Returns a list: the run time followed by the items `function` returned
    """
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        print('\n' + 'RUNNING {0}\n'.format(name))
        start_time = time.time()

        results = list(function(*args, **kwargs))  # guarantees return is a list to allow .insert()

        time_taken = runTime(start_time)
        print('END {0}'.format(name))

        results.insert(0, time_taken)
        return results
    return wrapper


def removeDirectory(directory):
    """Remove `directory` and its contents if it is an existing directory."""
    if os.path.isdir(directory):
        shutil.rmtree(directory)


def saveVariableToPickle(variableToStore, pickleFile):
    """Pickle `variableToStore` into the file `pickleFile`."""
    with open(pickleFile, 'wb') as writer:
        pickle.dump(variableToStore, writer)


def extractVariableFromPickle(pickleFile):
    """Return the object pickled in `pickleFile`.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(pickleFile, 'rb') as reader:
        variable = pickle.load(reader)
    return variable


def trace_unhandled_exceptions(func):
    """
    Wrap `func` so that anything it raises is printed instead of propagated.

    Returns
    -------
    wrapped_func : callable
        Returns what `func` returns, or None when `func` raised
    """
    @functools.wraps(func)
    def wrapped_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except BaseException:
            # Deliberately catches everything, including the SystemExit
            # raised by sys.exit(), so the failure is reported here.
            print('Exception in ' + func.__name__)
            traceback.print_exc()
    return wrapped_func


def kill_subprocess_Popen(subprocess_Popen, command):
    """Report that `command` ran out of time and kill its process."""
    print('Command run out of time: ' + str(command))
    subprocess_Popen.kill()


def runCommandPopenCommunicate(command, shell_True, timeout_sec_None, print_comand_True):
    """
    Run a command and capture its output.

    Parameters
    ----------
    command : str or list
        Command to run. A list is joined with spaces and split again with
        shlex, so empty items vanish and quoted text stays a single argument
    shell_True : bool
        True to run the command through the shell
    timeout_sec_None : int or float or None
        Seconds after which the process is killed; None for no limit
    print_comand_True : bool
        True to print the command before running it

    Returns
    -------
    run_successfully : bool
        True if the process exit code was 0
    stdout : str
        Decoded standard output
    stderr : str
        Decoded standard error
    """
    run_successfully = False
    if not isinstance(command, str):
        command = ' '.join(command)
    command = shlex.split(command)

    if print_comand_True:
        print('Running: ' + ' '.join(command))

    if shell_True:
        command = ' '.join(command)
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    else:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if timeout_sec_None is None:
        not_killed_by_timer = True
        stdout, stderr = proc.communicate()
    else:
        # Filled by the timeout callback, so whether the process was killed is
        # known for sure (the timer thread state is not a reliable signal).
        killed = []

        def _kill_on_timeout():
            killed.append(True)
            kill_subprocess_Popen(proc, command)

        time_counter = Timer(timeout_sec_None, _kill_on_timeout)
        time_counter.start()
        try:
            stdout, stderr = proc.communicate()
        finally:
            time_counter.cancel()
        not_killed_by_timer = not killed

    # Undecodable bytes are replaced rather than aborting the whole run.
    stdout = stdout.decode("utf-8", "replace")
    stderr = stderr.decode("utf-8", "replace")

    if proc.returncode == 0:
        run_successfully = True
    else:
        if not print_comand_True and not_killed_by_timer:
            print('Running: ' + str(command))
        if len(stdout) > 0:
            print('STDOUT')
            print(stdout)
        if len(stderr) > 0:
            print('STDERR')
            print(stderr)
    return run_successfully, stdout, stderr


def required_length(tuple_length_options, argument_name):
    """
    Build an argparse action that restricts the number of values given.

    Parameters
    ----------
    tuple_length_options : tuple
        Allowed numbers of values
    argument_name : str
        Option name used in the error message

    Returns
    -------
    RequiredLength : class
        ``argparse.Action`` subclass raising ``argparse.ArgumentTypeError``
        when the number of values is not allowed
    """
    class RequiredLength(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if len(values) not in tuple_length_options:
                msg = 'Option {argument_name} requires one of the following number of' \
                      ' arguments: {tuple_length_options}'.format(argument_name=argument_name,
                                                                  tuple_length_options=tuple_length_options)
                raise argparse.ArgumentTypeError(msg)
            setattr(args, self.dest, values)
    return RequiredLength


def get_sequence_information(fasta_file, length_extra_seq):
    """
    Read the sequences of a FASTA file.

    The program exits if a header is duplicated, if text follows a blank line
    or if sequence data comes before the first header.

    Parameters
    ----------
    fasta_file : str
        Path to the FASTA file
    length_extra_seq : int
        A sequence is kept only if its length minus twice this value is > 0

    Returns
    -------
    sequence_dict : dict
        Maps the 1-based sequence counter to a dictionary with the keys
        'header' (lower case), 'sequence' (upper case) and 'length'
    headers : dict
        Maps the lower case header of each kept sequence to its counter
    """
    sequence_dict = {}
    headers = {}

    def store(counter, record):
        # Keep the record only if something is left without the extra ends.
        if record['length'] - 2 * length_extra_seq > 0:
            sequence_dict[counter] = record
            headers[record['header'].lower()] = counter
        else:
            print(record['header'] + ' sequence ignored due to length <= 0')

    with open(fasta_file, 'rt') as reader:
        blank_line_found = False
        sequence_counter = 0
        current = None
        for line in reader:
            line = line.rstrip('\r\n')
            if not line:
                blank_line_found = True
                continue
            if blank_line_found:
                # Blank lines are only accepted at the end of the file.
                sys.exit('It was found a blank line between the fasta file above line ' + line)

            if line.startswith('>'):
                if current is not None:
                    store(sequence_counter, current)

                header = line[1:].lower()
                if header in headers:
                    sys.exit('Found duplicated sequence headers')

                sequence_counter += 1
                current = {'header': header, 'sequence': '', 'length': 0}
            elif current is None:
                sys.exit('Found sequence data before the first fasta header: ' + line)
            else:
                current['sequence'] += line.upper()
                current['length'] += len(line)

        if current is not None:
            store(sequence_counter, current)

    return sequence_dict, headers


def simplify_sequence_dict(sequence_dict):
    """
    Re-key the sequences by header.

    Parameters
    ----------
    sequence_dict : dict
        Maps a counter to a dictionary holding at least the key 'header';
        it is left unchanged

    Returns
    -------
    simple_sequence_dict : dict
        Maps each header to a copy of its record without the 'header' key
    """
    simple_sequence_dict = {}
    for info in sequence_dict.values():
        simple_sequence_dict[info['header']] = {key: value for key, value in info.items() if key != 'header'}
    return simple_sequence_dict


def chunkstring(string, length):
    """Return a generator of consecutive pieces of `string`, each `length`
    characters long (the last one may be shorter)."""
    return (string[0 + i:length + i] for i in range(0, len(string), length))


def clean_headers_sequences(sequence_dict):
    """
    Replace problematic characters in the sequence headers with "_".

    Parameters
    ----------
    sequence_dict : dict
        Maps a counter to a dictionary holding at least the key 'header';
        the headers are changed in place

    Returns
    -------
    sequence_dict : dict
        The same dictionary that was given
    new_headers : dict
        Maps each (possibly changed) lower case header to its counter
    """
    problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]

    headers_changed = False
    new_headers = {}
    for counter, info in sequence_dict.items():
        if any(character in info['header'] for character in problematic_characters):
            for character in problematic_characters:
                info['header'] = info['header'].replace(character, '_')
            headers_changed = True
        new_headers[info['header'].lower()] = counter

    if headers_changed:
        print('At least one of the those characters was found. Replacing those with _' + '\n')

    return sequence_dict, new_headers