Mercurial > repos > cstrittmatter > test_galtrakr_eurl_vtec_wgs_pt_23
comparison scripts/modules/utils.py @ 0:8be2feb96994
"planemo upload commit cb65588391944306ff3cb32a23e1c28f65122014"
author | cstrittmatter |
---|---|
date | Fri, 11 Mar 2022 15:50:35 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8be2feb96994 |
---|---|
1 import pickle | |
2 import traceback | |
3 import shlex | |
4 import subprocess | |
5 from threading import Timer | |
6 import shutil | |
7 import time | |
8 import functools | |
9 import os.path | |
10 import sys | |
11 import argparse | |
12 | |
13 | |
def start_logger(workdir):
    """
    Redirect ``sys.stdout`` through a ``Logger`` writing into ``workdir``

    Parameters
    ----------
    workdir : str
        Directory in which the run log file is created

    Returns
    -------
    logfile : str
        Path to the log file that now receives a copy of everything printed
    time_str : str
        Timestamp (``YYYYmmdd-HHMMSS``) used to name the log file
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    tee = Logger(workdir, timestamp)
    # From here on every print() goes through the Logger instance.
    sys.stdout = tee
    return tee.getLogFile(), timestamp
19 | |
20 | |
class Logger(object):
    """
    File-like object that duplicates everything written to it

    Each message is sent both to the stream that was ``sys.stdout`` when the
    instance was created and to a ``run.<time_str>.log`` file.
    """

    def __init__(self, out_directory, time_str):
        """
        Parameters
        ----------
        out_directory : str
            Directory where the log file is created
        time_str : str
            Timestamp inserted in the log file name
        """
        self.logfile = os.path.join(out_directory, 'run.' + time_str + '.log')
        # Keep the stream that was active at construction time so output is
        # still shown to the user.
        self.terminal = sys.stdout
        self.log = open(self.logfile, "w")

    def write(self, message):
        """Write ``message`` to the terminal and to the log file."""
        self.terminal.write(message)
        self.log.write(message)
        # Flush right away so the log is complete even if the run crashes.
        self.log.flush()

    def flush(self):
        """Flush both underlying streams.

        Previously a no-op, which meant ``sys.stdout.flush()`` never reached
        the real terminal stream once the Logger was installed.
        """
        self.terminal.flush()
        self.log.flush()

    def getLogFile(self):
        """Return the path to the log file."""
        return self.logfile
37 | |
38 | |
def _found_version_too_old(version_line, required_version):
    """Return True when dotted ``version_line`` is lower than ``required_version``."""
    found = version_line.split('.')
    required = required_version.split('.')
    # Pad the found version with zeros so that e.g. "4" can be compared with
    # "4.0" or "4.0.0" instead of raising IndexError.
    while len(found) < len(required):
        found.append('0')
    if len(required) == 3:
        # Drop suffixes such as "1_beta" from the third component.
        found[2] = found[2].split('_')[0]
    for found_part, required_part in zip(found, required):
        if int(found_part) > int(required_part):
            return False
        if int(found_part) < int(required_part):
            return True
    return False


def checkPrograms(programs_version_dictionary):
    """
    Check that the required programs are in PATH with an acceptable version

    Parameters
    ----------
    programs_version_dictionary : dict
        Maps program name to a list ``[version_flag, comparison, version]``.
        When ``version_flag`` is None the version is not checked. When
        ``comparison`` is ``'>='`` the found version must be at least
        ``version``; otherwise it must be exactly equal to it. For ``.jar``
        programs the path found is appended to the list.

    Returns
    -------
    listMissings : list
        One message for each program that is missing or has a wrong version
    """
    print('\n' + 'Checking dependencies...')
    programs = programs_version_dictionary
    listMissings = []
    for program in programs:
        run_successfully, stdout, stderr = runCommandPopenCommunicate(['which', program], False, None, False)
        if not run_successfully:
            listMissings.append(program + ' not found in PATH.')
            continue

        program_path = stdout.splitlines()[0]
        print(program_path)
        if programs[program][0] is None:
            print(program + ' (impossible to determine programme version) found at: ' + program_path)
            continue

        if program.endswith('.jar'):
            check_version = ['java', '-jar', program_path, programs[program][0]]
            # Record where the jar lives so that callers can run it later.
            programs[program].append(program_path)
        else:
            check_version = [program_path, programs[program][0]]
        run_successfully, stdout, stderr = runCommandPopenCommunicate(check_version, False, None, False)
        if stdout == '':
            # Some programs print their version to stderr.
            stdout = stderr

        if program in ['wget', 'awk']:
            version_line = stdout.splitlines()[0].split(' ', 3)[2]
        elif program in ['prefetch', 'fastq-dump']:
            version_line = stdout.splitlines()[1].split(' ')[-1]
        else:
            version_line = stdout.splitlines()[0].split(' ')[-1]
        for character in ['"', 'v', 'V', '+', ',']:
            version_line = version_line.replace(character, '')
        print(program + ' (' + version_line + ') found')

        version_message = ('It is required ' + program + ' with version ' + programs[program][1] + ' ' +
                           programs[program][2])
        if programs[program][1] == '>=':
            # Report at most once per program: the previous loop kept comparing
            # after a failure and could append the same message several times.
            if _found_version_too_old(version_line, programs[program][2]):
                listMissings.append(version_message)
        elif version_line != programs[program][2]:
            listMissings.append(version_message)
    return listMissings
93 | |
94 | |
def requiredPrograms():
    """
    Abort the run when a mandatory external program is missing

    Exits through ``sys.exit`` with the list of problems reported by
    ``checkPrograms``; returns None when everything is available.
    """
    required = {'rematch.py': ['--version', '>=', '4.0']}
    problems = checkPrograms(required)
    if problems:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(problems))
101 | |
102 | |
def general_information(logfile, version, outdir, time_str):
    """
    Print the run header and check the software requirements

    Parameters
    ----------
    logfile : str
        Path to the log file, printed for the user
    version : str
        Version of the script
    outdir : str
        Not used by this function
    time_str : str
        Not used by this function

    Returns
    -------
    script_path : str
        Path to ``patho_typing.py``, expected one directory above the one
        holding this module
    """
    modules_parent = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    script_path = os.path.join(modules_parent, 'patho_typing.py')

    print('\n==========> patho_typing <==========')
    print('\nProgram start: ' + time.ctime())

    # Where the log file is stored
    print('\nLOGFILE:')
    print(logfile)

    # Command line used
    print('\nCOMMAND:')
    print(sys.executable + ' ' + ' '.join(sys.argv))

    # Directory from which the program was launched
    print('\nPRESENT DIRECTORY:')
    present_directory = os.path.abspath(os.getcwd())
    print(present_directory)

    # Program version
    print('\nVERSION:')
    script_version_git(version, present_directory, script_path)

    # External programs
    requiredPrograms()

    return script_path
131 | |
132 | |
def setPATHvariable(doNotUseProvidedSoftware, script_path):
    """
    Prepend the bundled software directories to PATH and print the result

    Parameters
    ----------
    doNotUseProvidedSoftware : bool
        True to leave PATH untouched
    script_path : str
        Path to the main script; the bundled software is looked for in the
        ``src`` directory next to it

    Returns
    -------

    """
    # PATH may be unset in minimal environments; treat that as empty instead
    # of failing with KeyError.
    path_variable = os.environ.get('PATH', '')
    if not doNotUseProvidedSoftware:
        bundled_root = os.path.join(os.path.dirname(script_path), 'src')
        entries = [
            os.path.join(bundled_root, 'bowtie2-2.2.9'),
            os.path.join(bundled_root, 'samtools-1.3.1', 'bin'),
            os.path.join(bundled_root, 'bcftools-1.3.1', 'bin'),
        ]
        if path_variable:
            entries.append(path_variable)
        # os.pathsep is ':' on POSIX, so the result is unchanged there.
        os.environ['PATH'] = os.pathsep.join(entries)

    # Print PATH variable
    print('\n' + 'PATH variable:')
    print(os.environ.get('PATH', ''))
147 | |
148 | |
def script_version_git(version, current_directory, script_path, no_git_info=False):
    """
    Print script version and get GitHub commit information

    Parameters
    ----------
    version : str
        Version of the script, e.g. "4.0"
    current_directory : str
        Path to the directory where the script started to run; the working
        directory is restored to it after querying git
    script_path : str
        Path to the script running
    no_git_info : bool, default False
        True if it is not necessary to retrieve the GitHub commit information

    Returns
    -------

    """
    print('Version {}'.format(version))

    if no_git_info:
        return

    try:
        # git is run from two directory levels above the script file.
        os.chdir(os.path.dirname(os.path.dirname(script_path)))
        command = ['git', 'log', '-1', '--date=local', '--pretty=format:"%h (%H) - Commit by %cn, %cd) : %s"']
        run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
        print(stdout)
        command = ['git', 'remote', 'show', 'origin']
        run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
        print(stdout)
    except Exception:
        # Best effort only. A bare except here would also swallow
        # KeyboardInterrupt and SystemExit, which must still stop the run.
        print('HARMLESS WARNING: git command possibly not found. The GitHub repository information will not be'
              ' obtained.')
    finally:
        os.chdir(current_directory)
184 | |
185 | |
def runTime(start_time):
    """
    Print the time elapsed since ``start_time`` and return it

    Parameters
    ----------
    start_time : float
        Value of ``time.time()`` taken when the measured work started

    Returns
    -------
    time_taken : float
        Elapsed seconds, rounded to two decimal places
    """
    elapsed = time.time() - start_time
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Runtime :{0}h:{1}m:{2}s'.format(hours, minutes, round(seconds, 2)))
    return round(elapsed, 2)
193 | |
194 | |
def timer(function, name):
    """
    Wrap ``function`` so that its run time is printed and returned

    Parameters
    ----------
    function : callable
        Function to wrap; it must return an iterable
    name : str
        Label printed before and after the function runs

    Returns
    -------
    wrapper : callable
        Function returning a list whose first item is the run time in seconds,
        followed by the items returned by ``function``
    """
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        print('\n' + 'RUNNING {0}\n'.format(name))
        began = time.time()

        # Materialise the result before stopping the clock.
        outcome = list(function(*args, **kwargs))

        elapsed = runTime(began)
        print('END {0}'.format(name))

        return [elapsed] + outcome
    return wrapper
209 | |
210 | |
def removeDirectory(directory):
    """Delete ``directory`` and its content; do nothing if it is not a directory."""
    if not os.path.isdir(directory):
        return
    shutil.rmtree(directory)
214 | |
215 | |
def saveVariableToPickle(variableToStore, pickleFile):
    """
    Serialise ``variableToStore`` with pickle into ``pickleFile``

    Parameters
    ----------
    variableToStore : object
        Any picklable object
    pickleFile : str
        Path to the file to write (overwritten if it exists)
    """
    with open(pickleFile, 'wb') as handle:
        pickle.dump(variableToStore, handle)
219 | |
220 | |
def extractVariableFromPickle(pickleFile):
    """
    Load and return the object stored in ``pickleFile``

    Parameters
    ----------
    pickleFile : str
        Path to a file written with pickle

    Returns
    -------
    variable : object
        The unpickled object
    """
    # NOTE(review): unpickling can execute arbitrary code, so this should only
    # be given files produced by a trusted run.
    with open(pickleFile, 'rb') as handle:
        return pickle.load(handle)
225 | |
226 | |
def trace_unhandled_exceptions(func):
    """
    Decorator that prints the traceback of anything raised by ``func``

    Parameters
    ----------
    func : callable
        Function to protect

    Returns
    -------
    wrapped_func : callable
        Function that returns whatever ``func`` returns, or None when ``func``
        raised (the exception is printed, not propagated)
    """
    @functools.wraps(func)
    def wrapped_func(*args, **kwargs):
        try:
            # Hand the result back to the caller; it used to be dropped.
            return func(*args, **kwargs)
        except BaseException:
            # Deliberately as broad as the previous bare except, so the set of
            # exceptions that get printed instead of propagated is unchanged.
            print('Exception in ' + func.__name__)
            traceback.print_exc()
            return None
    return wrapped_func
236 | |
237 | |
def kill_subprocess_Popen(subprocess_Popen, command):
    """
    Report that ``command`` ran out of time and kill its process

    Parameters
    ----------
    subprocess_Popen : subprocess.Popen
        Process to kill
    command : list or str
        Command being run, only used for the printed message
    """
    print('Command run out of time: {0}'.format(command))
    subprocess_Popen.kill()
241 | |
242 | |
def runCommandPopenCommunicate(command, shell_True, timeout_sec_None, print_comand_True):
    """
    Run an external command and capture its output

    Parameters
    ----------
    command : list or str
        Command to run. A list is joined with spaces and then split again with
        ``shlex``, so quotes embedded in an item are interpreted as shell-like
        quoting.
    shell_True : bool
        True to run the command through the shell
    timeout_sec_None : int, float or None
        Seconds after which the process is killed; None for no time limit
    print_comand_True : bool
        True to print the command before running it

    Returns
    -------
    run_successfully : bool
        True if the process exit status was 0
    stdout : str
        Decoded standard output
    stderr : str
        Decoded standard error
    """
    run_successfully = False
    if not isinstance(command, str):
        command = ' '.join(command)
    command = shlex.split(command)

    if print_comand_True:
        print('Running: ' + ' '.join(command))

    if shell_True:
        command = ' '.join(command)
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    else:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Filled by the timer callback. Asking the Timer whether it is alive after
    # cancel() is racy, and Timer.isAlive() no longer exists on Python >= 3.9.
    killed_by_timer = []

    def _kill_on_timeout():
        killed_by_timer.append(True)
        print('Command run out of time: ' + str(command))
        proc.kill()

    if timeout_sec_None is None:
        stdout, stderr = proc.communicate()
    else:
        time_counter = Timer(timeout_sec_None, _kill_on_timeout)
        time_counter.start()
        try:
            stdout, stderr = proc.communicate()
        finally:
            # Never leave the timer running once the process has ended.
            time_counter.cancel()
    not_killed_by_timer = not killed_by_timer

    # Replace undecodable bytes instead of failing on unexpected tool output.
    stdout = stdout.decode("utf-8", errors="replace")
    stderr = stderr.decode("utf-8", errors="replace")

    if proc.returncode == 0:
        run_successfully = True
    else:
        # Show what failed, unless it was already printed above or the timeout
        # message has already identified the command.
        if not print_comand_True and not_killed_by_timer:
            print('Running: ' + str(command))
        if len(stdout) > 0:
            print('STDOUT')
            print(stdout)
        if len(stderr) > 0:
            print('STDERR')
            print(stderr)
    return run_successfully, stdout, stderr
283 | |
284 | |
def required_length(tuple_length_options, argument_name):
    """
    Build an argparse action that validates the number of values given

    Parameters
    ----------
    tuple_length_options : tuple
        Accepted numbers of values, e.g. ``(1, 2)``
    argument_name : str
        Name of the option, used in the error message

    Returns
    -------
    RequiredLength : class
        ``argparse.Action`` subclass to pass as ``action=`` to
        ``add_argument``. It raises ``argparse.ArgumentTypeError`` when the
        number of values is not accepted; otherwise it stores the values.
    """
    class RequiredLength(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if len(values) not in tuple_length_options:
                # argument_name comes from the enclosing function; the Action
                # instance has no such attribute, so reading it from self made
                # this error path fail with AttributeError.
                msg = ('Option {argument_name} requires one of the following number of'
                       ' arguments: {tuple_length_options}').format(
                           argument_name=argument_name,
                           tuple_length_options=tuple_length_options)
                raise argparse.ArgumentTypeError(msg)
            setattr(args, self.dest, values)
    return RequiredLength
295 | |
296 | |
def get_sequence_information(fasta_file, length_extra_seq):
    """
    Read the sequences of a fasta file

    Parameters
    ----------
    fasta_file : str
        Path to the fasta file
    length_extra_seq : int
        A sequence is kept only if its length minus twice this value is
        greater than zero

    Returns
    -------
    sequence_dict : dict
        Maps the 1-based position of each kept sequence in the file to a
        dictionary with keys ``header`` (lower case, without ``>``),
        ``sequence`` (upper case) and ``length``
    headers : dict
        Maps each kept header to its key in ``sequence_dict``

    Notes
    -----
    Exits through ``sys.exit`` when a header is duplicated, when a blank line
    is followed by more content, or when sequence data precedes the first
    header.
    """
    sequence_dict = {}
    headers = {}

    def _store(counter, record, chunks):
        """Keep the finished record if it is long enough, otherwise report it."""
        record['sequence'] = ''.join(chunks)
        if record['length'] - 2 * length_extra_seq > 0:
            sequence_dict[counter] = record
            headers[record['header'].lower()] = counter
        else:
            print(record['header'] + ' sequence ignored due to length <= 0')

    # Plain text mode already applies universal newlines; the old 'U' flag is
    # rejected by Python >= 3.11.
    with open(fasta_file, 'rt') as reader:
        blank_line_found = False
        sequence_counter = 0
        current = None
        chunks = []
        for line in reader:
            line = line.rstrip('\r\n')
            if not line:
                blank_line_found = True
                continue
            if blank_line_found:
                # Blank lines are only tolerated at the end of the file.
                sys.exit('It was found a blank line between the fasta file above line ' + line)

            if line.startswith('>'):
                if current is not None:
                    _store(sequence_counter, current, chunks)

                header = line[1:].lower()
                if header in headers:
                    sys.exit('Found duplicated sequence headers')

                # Ignored sequences still consume a number.
                sequence_counter += 1
                current = {'header': header, 'sequence': '', 'length': 0}
                chunks = []
            else:
                if current is None:
                    sys.exit('Found sequence data before the first fasta header: ' + line)
                chunks.append(line.upper())
                current['length'] += len(line)

        if current is not None:
            _store(sequence_counter, current, chunks)

    return sequence_dict, headers
340 | |
341 | |
def simplify_sequence_dict(sequence_dict):
    """
    Re-key the sequences by header

    Parameters
    ----------
    sequence_dict : dict
        Maps any key to a dictionary holding at least the key ``header``

    Returns
    -------
    simple_sequence_dict : dict
        Maps each header to a copy of its record without the ``header`` key
    """
    simple_sequence_dict = {}
    for info in sequence_dict.values():
        # Build a new record: deleting 'header' in place would also strip it
        # from the dictionaries the caller still holds.
        simple_sequence_dict[info['header']] = {key: value for key, value in info.items() if key != 'header'}
    return simple_sequence_dict
348 | |
349 | |
def chunkstring(string, length):
    """
    Split ``string`` into consecutive pieces of at most ``length`` characters

    Returns
    -------
    generator
        Yields the pieces in order; the last one may be shorter
    """
    starts = range(0, len(string), length)
    return (string[start:start + length] for start in starts)
352 | |
353 | |
def clean_headers_sequences(sequence_dict):
    """
    Replace problematic characters in the sequence headers with ``_``

    Parameters
    ----------
    sequence_dict : dict
        Maps any key to a dictionary holding at least the key ``header``;
        the headers are modified in place

    Returns
    -------
    sequence_dict : dict
        The same dictionary that was given, with cleaned headers
    new_headers : dict
        Maps each (lower case) cleaned header to its key in ``sequence_dict``
    """
    problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]
    to_underscore = str.maketrans({character: '_' for character in problematic_characters})

    headers_changed = False
    new_headers = {}
    for key, record in sequence_dict.items():
        cleaned = record['header'].translate(to_underscore)
        if cleaned != record['header']:
            record['header'] = cleaned
            headers_changed = True
        new_headers[record['header'].lower()] = key

    if headers_changed:
        print('At least one of the those characters was found. Replacing those with _' + '\n')

    return sequence_dict, new_headers