CSP2/bin/compileMUMmer.py @ 0:01431fa12065 ("planemo upload")
author: rliterman
date: Mon, 02 Dec 2024 10:40:55 -0500
#!/usr/bin/env python3

import sys
import os
import pandas as pd
import re
from pybedtools import BedTool,helpers,Interval
import warnings
import numpy as np
import hashlib
from Bio import SeqIO
import subprocess
import uuid
import traceback
import shutil
import time
import argparse

warnings.filterwarnings("ignore")

def parseMUmmerReport(mum_report_dir,report_id):
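    """Parse a MUMmer/dnadiff .report file into its summary statistics.

    Returns a flat list: total bases and percent aligned for the reference
    and query, gSNP/gIndel counts, and per-genome counts of breakpoints,
    relocations, translocations, inversions, insertions, and tandem
    insertions.
    """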
    report_data = []

    with open(mum_report_dir+"/"+report_id+".report", 'r') as report_file:

        for line in report_file.readlines():
            split_line = re.split(r'\s+', line.strip())
            if len(split_line) == 3:
                report_data.append(
                    {
                        'Measure': split_line[0].split("(", 1)[0],
                        'Ref_Value': split_line[1].split("(", 1)[0],
                        'Query_Value': split_line[2].split("(", 1)[0]
                    })

    report_data = pd.DataFrame(report_data)
    report_data = report_data[report_data['Measure'].isin(['TotalSeqs','TotalBases','AlignedBases',
                                                           'TotalSNPs','TotalGSNPs','TotalIndels','TotalGIndels',
                                                           'Breakpoints','Relocations','Translocations','Inversions',
                                                           'Insertions','TandemIns'])]

    report_data = report_data.drop_duplicates(subset=['Measure']) # Drop M-to-M

    report_data = report_data.astype({"Ref_Value":"int","Query_Value":"int"})

    # Fetch assembly data
    ref_bases = report_data[report_data['Measure'] == 'TotalBases']['Ref_Value'].iloc[0]
    query_bases = report_data[report_data['Measure'] == 'TotalBases']['Query_Value'].iloc[0]

    # Fetch alignment data
    ref_aligned = report_data[report_data['Measure'] == 'AlignedBases']['Ref_Value'].iloc[0]
    query_aligned = report_data[report_data['Measure'] == 'AlignedBases']['Query_Value'].iloc[0]

    percent_ref_aligned = 100*(float(ref_aligned)/ref_bases)
    percent_query_aligned = 100*(float(query_aligned)/query_bases)

    # Fetch SNP data
    g_snps = report_data[report_data['Measure'] == 'TotalGSNPs']['Ref_Value'].iloc[0]
    g_indels = report_data[report_data['Measure'] == 'TotalGIndels']['Ref_Value'].iloc[0]

    ref_breakpoints = report_data[report_data['Measure'] == 'Breakpoints']['Ref_Value'].iloc[0]
    query_breakpoints = report_data[report_data['Measure'] == 'Breakpoints']['Query_Value'].iloc[0]

    ref_relocations = report_data[report_data['Measure'] == 'Relocations']['Ref_Value'].iloc[0]
    query_relocations = report_data[report_data['Measure'] == 'Relocations']['Query_Value'].iloc[0]

    ref_translocations = report_data[report_data['Measure'] == 'Translocations']['Ref_Value'].iloc[0]
    query_translocations = report_data[report_data['Measure'] == 'Translocations']['Query_Value'].iloc[0]

    ref_inversions = report_data[report_data['Measure'] == 'Inversions']['Ref_Value'].iloc[0]
    query_inversions = report_data[report_data['Measure'] == 'Inversions']['Query_Value'].iloc[0]

    ref_insertions = report_data[report_data['Measure'] == 'Insertions']['Ref_Value'].iloc[0]
    query_insertions = report_data[report_data['Measure'] == 'Insertions']['Query_Value'].iloc[0]

    ref_tandem = report_data[report_data['Measure'] == 'TandemIns']['Ref_Value'].iloc[0]
    query_tandem = report_data[report_data['Measure'] == 'TandemIns']['Query_Value'].iloc[0]

    return [ref_bases,percent_ref_aligned,
            query_bases,percent_query_aligned,
            g_snps,g_indels,
            ref_breakpoints,query_breakpoints,
            ref_relocations,query_relocations,
            ref_translocations,query_translocations,
            ref_inversions,query_inversions,
            ref_insertions,query_insertions,
            ref_tandem,query_tandem]

def parseMUmmerCoords(mum_coords_dir,report_id,reference_chr_bed,query_chr_bed):
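    """Parse a MUMmer .1coords file into a 0-based, BED-compatible DataFrame.

    Alignment blocks are merged with pybedtools to find covered regions;
    reference and query regions with no alignment are appended as rows with
    "." placeholders so every base of both assemblies is accounted for.
    """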

    coords_file = pd.read_csv(mum_coords_dir+"/"+report_id+".1coords",sep="\t",index_col=False,
                              names=['Ref_Start','Ref_End','Query_Start','Query_End',
                                     'Ref_Aligned','Query_Aligned','Perc_Iden',
                                     'Ref_Length','Query_Length','Ref_Cov',
                                     'Query_Cov','Ref_Contig','Query_Contig'],
                              dtype={
                                  'Ref_Start': int,
                                  'Ref_End': int,
                                  'Query_Start': int,
                                  'Query_End': int,
                                  'Ref_Aligned': int,
                                  'Query_Aligned': int,
                                  'Perc_Iden': float,
                                  'Ref_Length': int,
                                  'Query_Length': int,
                                  'Ref_Cov': float,
                                  'Query_Cov': float,
                                  'Ref_Contig': str,
                                  'Query_Contig': str
                              })

    if coords_file.shape[0] > 0:

        coords_file['Ref_Start'] = (coords_file['Ref_Start'] - 1)
        coords_file = coords_file.apply(adjust_query_row, axis=1)

        merged_ref_bed = BedTool.from_dataframe(coords_file[['Ref_Contig','Ref_Start','Ref_End']]).sort().merge()
        merged_query_bed = BedTool.from_dataframe(coords_file[['Query_Contig','Query_Start','Query_End']]).sort().merge()

        covered_ref_regions = reference_chr_bed.intersect(merged_ref_bed)
        uncovered_ref_regions = reference_chr_bed.subtract(merged_ref_bed)

        covered_query_regions = query_chr_bed.intersect(merged_query_bed)
        uncovered_query_regions = query_chr_bed.subtract(merged_query_bed)
    else:
        merged_ref_bed = BedTool([])
        merged_query_bed = BedTool([])

        covered_ref_regions = BedTool([])
        uncovered_ref_regions = BedTool([])

        covered_query_regions = BedTool([])
        uncovered_query_regions = BedTool([])

    ref_genome_size = calculate_total_length(reference_chr_bed)
    ref_covered_size = calculate_total_length(covered_ref_regions)
    ref_uncovered_size = calculate_total_length(uncovered_ref_regions)

    query_genome_size = calculate_total_length(query_chr_bed)
    query_covered_size = calculate_total_length(covered_query_regions)
    query_uncovered_size = calculate_total_length(uncovered_query_regions)

    assert ref_covered_size + ref_uncovered_size == ref_genome_size
    assert query_covered_size + query_uncovered_size == query_genome_size

    # Create return_df
    return_columns = ['Ref_Contig','Ref_Start','Ref_End','Ref_Length','Ref_Aligned',
                      'Query_Contig','Query_Start','Query_End','Query_Length','Query_Aligned','Perc_Iden']

    return_df = pd.DataFrame(columns=return_columns)

    if coords_file.shape[0] > 0:
        return_df = pd.concat([return_df,coords_file[return_columns]],ignore_index=True)

    if uncovered_ref_regions.count() > 0:
        return_uncovered_df = (uncovered_ref_regions.to_dataframe()
                               .sort_values(['chrom','start'])
                               .rename(columns={'chrom':'Ref_Contig','start':'Ref_Start','end':'Ref_End'})
                               .assign(Ref_Aligned=".",Query_Aligned=".",Query_Contig=".",Query_Start=".",
                                       Query_End=".",Ref_Length=".",Query_Length=".",Perc_Iden="."))
        return_df = pd.concat([return_df,return_uncovered_df[return_columns]],ignore_index=True)

    if uncovered_query_regions.count() > 0:
        return_uncovered_df = (uncovered_query_regions.to_dataframe()
                               .sort_values(['chrom','start'])
                               .rename(columns={'chrom':'Query_Contig','start':'Query_Start','end':'Query_End'})
                               .assign(Ref_Aligned=".",Query_Aligned=".",Ref_Contig=".",Ref_Start=".",
                                       Ref_End=".",Ref_Length=".",Query_Length=".",Perc_Iden="."))
        return_df = pd.concat([return_df,return_uncovered_df[return_columns]],ignore_index=True)

    return return_df

def adjust_query_row(row):
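    """Convert query coordinates to 0-based, half-open intervals.

    Reverse-strand alignments report Query_Start > Query_End, so the two
    values are swapped before the 0-based offset is applied.
    """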
    if set(['Query_Start', 'Query_End', 'Query_Contig']).issubset(row.index):
        if row['Query_Start'] > row['Query_End']:
            row['Query_Start'], row['Query_End'] = row['Query_End'] - 1, row['Query_Start']
        else:
            row['Query_Start'] = row['Query_Start'] - 1
    return row

def parseMUmmerSNPs(mum_snps_dir,report_id,coords_file):
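    """Parse a MUMmer .snps file and bin each site as a SNP, indel, or invalid call.

    Returns a DataFrame of per-site records joined to their alignments via
    makeSNPCoords(), or, when 10,000 or more total variants are found, just
    the (snp, indel, invalid) counts as a tuple so the costly join is skipped.
    """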

    return_columns = ['Ref_Contig','Start_Ref','Ref_Pos',
                      'Query_Contig','Start_Query','Query_Pos',
                      'Ref_Loc','Query_Loc',
                      'Ref_Start','Ref_End',
                      'Query_Start','Query_End',
                      'Ref_Base','Query_Base',
                      'Dist_to_Ref_End','Dist_to_Query_End',
                      'Ref_Aligned','Query_Aligned',
                      'Query_Direction','Perc_Iden','Cat']

    snp_file = pd.read_csv(mum_snps_dir+"/"+report_id+".snps",sep="\t",index_col=False,
                           names=['Ref_Pos','Ref_Base','Query_Base','Query_Pos',
                                  'SNP_Buffer','Dist_to_End','Ref_Length','Query_Length',
                                  'Ref_Direction','Query_Direction','Ref_Contig','Query_Contig'])

    int_columns = ['Ref_Pos', 'Query_Pos', 'Dist_to_End','Ref_Length', 'Query_Length']
    contig_columns = ['Ref_Contig','Query_Contig']

    if snp_file.shape[0] == 0:
        return pd.DataFrame(columns=return_columns)
    else:
        for col in int_columns:
            snp_file.loc[:, col] = snp_file.loc[:, col].astype(float).astype(int)
        for col in contig_columns:
            snp_file.loc[:, col] = snp_file.loc[:, col].astype(str)
            coords_file.loc[:, col] = coords_file.loc[:, col].astype(str)

        snp_file['Start_Ref'] = snp_file['Ref_Pos'] - 1
        snp_file['Start_Query'] = snp_file['Query_Pos'] - 1

        snp_file['Dist_to_Ref_End'] = [min([x,y]) for x,y in zip(snp_file['Ref_Pos'], snp_file['Ref_Length'] - snp_file['Ref_Pos'])]
        snp_file['Dist_to_Query_End'] = [min([x,y]) for x,y in zip(snp_file['Query_Pos'],snp_file['Query_Length'] - snp_file['Query_Pos'])]

        snp_file['Ref_Loc'] = ["/".join([str(x[0]),str(x[1])]) for x in list(zip(snp_file.Ref_Contig, snp_file.Ref_Pos))]
        snp_file['Query_Loc'] = ["/".join([str(x[0]),str(x[1])]) for x in list(zip(snp_file.Query_Contig, snp_file.Query_Pos))]

        total_snp_count = snp_file.shape[0]

        valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', "."]

        invalid_file = snp_file[(~snp_file['Ref_Base'].isin(valid_bases)) | (~snp_file['Query_Base'].isin(valid_bases))]
        snp_file = snp_file[(snp_file['Ref_Base'].isin(valid_bases)) & (snp_file['Query_Base'].isin(valid_bases))]
        indel_file = snp_file[(snp_file['Query_Base'] == ".") | (snp_file['Ref_Base'] == ".")]
        snp_file = snp_file[~((snp_file['Query_Base'] == ".") | (snp_file['Ref_Base'] == "."))]

        if total_snp_count >= 10000:
            return (snp_file.shape[0],indel_file.shape[0],invalid_file.shape[0])

        else:
            if snp_file.shape[0] == 0:
                snp_coords = pd.DataFrame(columns=return_columns)
            else:
                snp_coords = makeSNPCoords(snp_file,coords_file,snp_file.shape[0])
                snp_coords['Cat'] = "SNP"

            if indel_file.shape[0] == 0:
                indel_coords = pd.DataFrame(columns=return_columns)
            else:
                indel_coords = makeSNPCoords(indel_file,coords_file,indel_file.shape[0])
                indel_coords['Cat'] = "Indel"

            if invalid_file.shape[0] == 0:
                invalid_coords = pd.DataFrame(columns=return_columns)
            else:
                invalid_coords = makeSNPCoords(invalid_file,coords_file,invalid_file.shape[0])
                invalid_coords['Cat'] = "Invalid"

            return_df = pd.concat([snp_coords,indel_coords,invalid_coords],ignore_index=True)[return_columns]

            assert return_df.shape[0] == total_snp_count
            return return_df

def makeSNPCoords(snp_df, coords_df, row_count):
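    """Join variant rows to the alignment blocks that contain them.

    If the merge yields more rows than the input (a site that falls in more
    than one alignment), keep only the hit with the longest alignment and
    highest percent identity per Ref_Loc/Query_Loc pair.
    """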
    snp_coords = pd.merge(snp_df, coords_df, how='left').dropna()

    condition = ((snp_coords['Ref_Start'] <= snp_coords['Ref_Pos']) &
                 (snp_coords['Ref_Pos'] <= snp_coords['Ref_End']) &
                 (snp_coords['Query_Start'] <= snp_coords['Query_Pos']) &
                 (snp_coords['Query_Pos'] <= snp_coords['Query_End']))
    snp_coords = snp_coords[condition]

    if row_count != snp_coords.shape[0]:
        snp_coords.sort_values(by=['Ref_Aligned', 'Perc_Iden'], ascending=False, inplace=True)
        snp_coords.drop_duplicates(subset=['Ref_Loc','Query_Loc'], keep='first', inplace=True)

    return snp_coords

def fasta_info(file_path):
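    """Collect assembly statistics for a FASTA file.

    Returns [path, contig count, total bases, N50, N90, L50, L90, SHA-256
    digest of the raw file contents].
    """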
    records = list(SeqIO.parse(file_path, 'fasta'))
    contig_count = int(len(records))
    lengths = sorted([len(record) for record in records], reverse=True)
    assembly_bases = sum(lengths)

    with open(file_path, 'rb') as file:
        sha256 = hashlib.sha256(file.read()).hexdigest()

    cumulative_length = 0
    n50 = None
    n90 = None
    l50 = None
    l90 = None

    for i, length in enumerate(lengths, start=1):
        cumulative_length += length
        if cumulative_length >= assembly_bases * 0.5 and n50 is None:
            n50 = length
            l50 = i
        if cumulative_length >= assembly_bases * 0.9 and n90 is None:
            n90 = length
            l90 = i
        if n50 is not None and n90 is not None:
            break

    return [file_path,contig_count,assembly_bases,n50,n90,l50,l90,sha256]

def fasta_to_bedtool(fasta_file):
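    """Build a sorted BedTool with one full-length interval per contig."""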
    intervals = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        chrom_name = record.id
        chrom_length = len(record.seq)
        interval = Interval(chrom_name, 0, chrom_length)
        intervals.append(interval)
    bedtool = BedTool(intervals).sort()
    return bedtool

def calculate_total_length(bedtool):
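    """Sum the lengths of all intervals in a BedTool (total bases covered)."""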
    return sum(len(interval) for interval in bedtool)

def get_kmers(command, log_file, retries=10, delay=5, timeout=30):
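    """Run a k-mer counting shell command and return the set of k-mers it prints.

    The command is retried up to `retries` times on timeout, non-zero exit,
    or an effectively empty result; if every attempt fails, RuntimeError is
    raised.
    """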
    kmer_set = set()

    for attempt in range(retries):
        try:
            # The command string contains a shell pipeline, so it must run
            # through the shell; check=True turns non-zero exits into
            # CalledProcessError so the retry logic below can catch them
            process = subprocess.run(command, shell=True, check=True, capture_output=True, text=True, timeout=timeout)
            kmer_set.update(process.stdout.strip().split('\n'))

            if len(kmer_set) > 1:
                return kmer_set

        except subprocess.TimeoutExpired:
            with open(log_file, "a") as log:
                log.write(f"Kmer command timed out after {timeout} seconds\n")
            time.sleep(delay)

        except subprocess.CalledProcessError as e:
            with open(log_file, "a") as log:
                log.write(f"Kmer command errored out: {e}\n")
            time.sleep(delay)

        # Wait before retrying if this is not the last attempt
        if attempt < retries - 1:
            time.sleep(delay)
        else:
            raise RuntimeError(f"Command failed after {retries} attempts: {command}")

    return None

def compare_kmers(query_file,reference_file,log_file):
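    """Count exact k-mers in both assemblies with BBTools' kmercountexact.sh.

    Returns [ref k-mer count, query k-mer count, unique to each, shared
    count, similarity], where similarity is the Jaccard index of the two
    k-mer sets expressed as a percentage.
    """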

    # Each command is a single shell pipeline; passing "|" as a list element
    # would hand it to kmercountexact.sh as a literal argument instead of
    # piping, so the pipeline is built as one string for shell execution
    ref_command = f"kmercountexact.sh in={reference_file} threads=1 fastadump=f out=stdout | cut -f1"
    query_command = f"kmercountexact.sh in={query_file} threads=1 fastadump=f out=stdout | cut -f1"

    ref_kmers = get_kmers(ref_command,log_file)
    with open(log_file, "a") as log:
        log.write("Fetched reference kmers...\n")

    query_kmers = get_kmers(query_command,log_file)
    with open(log_file, "a") as log:
        log.write("Fetched query kmers...\n")

    intersection = len(ref_kmers.intersection(query_kmers))
    similarity = 100*(intersection/(len(ref_kmers) + len(query_kmers) - intersection))
    unique_ref = len(ref_kmers.difference(query_kmers))
    unique_query = len(query_kmers.difference(ref_kmers))

    return [len(ref_kmers),len(query_kmers),
            unique_ref,unique_query,
            intersection,similarity]

#### 01: Read in arguments ####
run_failed = False

parser = argparse.ArgumentParser(description='CSP2 MUMmer Compilation')
parser.add_argument('--query', type=str, help='Query')
parser.add_argument('--query_fasta', type=str, help='Query Fasta')
parser.add_argument('--reference', type=str, help='Reference')
parser.add_argument('--reference_fasta', type=str, help='Reference Fasta')
parser.add_argument('--mummer_dir', type=str, help='MUMmer Directory')
parser.add_argument('--snpdiffs_dir', type=str, help='snpdiffs Directory')
parser.add_argument('--temp_dir', type=str, default='', help='Temporary Directory')
parser.add_argument('--log_file', type=str, help='Log File')
args = parser.parse_args()

query = args.query
query_fasta = args.query_fasta
reference = args.reference
reference_fasta = args.reference_fasta

mummer_dir = os.path.normpath(os.path.abspath(args.mummer_dir))
snpdiffs_dir = os.path.normpath(os.path.abspath(args.snpdiffs_dir))
log_file = os.path.normpath(os.path.abspath(args.log_file))

with open(log_file, "w") as log:
    log.write(f"CSP2 MUMmer Compilation Log: {query}__vs__{reference}\n")
    log.write(f"\t- Query Fasta: {query_fasta}\n")
    log.write(f"\t- Reference Fasta: {reference_fasta}\n")
    log.write(f"\t- MUMmer Directory: {mummer_dir}\n")
    log.write(f"\t- snpdiffs Directory: {snpdiffs_dir}\n")

# Set TMP
if args.temp_dir != "":
    random_temp_id = str(uuid.uuid4())
    temp_dir = f"{os.path.normpath(os.path.abspath(args.temp_dir))}/{random_temp_id}"
    try:
        os.mkdir(temp_dir)
        helpers.set_tempdir(temp_dir)
        with open(log_file, "a") as log:
            log.write(f"\t- Temporary Directory: {temp_dir}\n")

    except OSError as e:
        run_failed = True
        print(f"Error: Failed to create directory '{temp_dir}': {e}")
else:
    temp_dir = ""

with open(log_file, "a") as log:
    log.write("-------------------------------------------------------\n\n")

try:
    # Get query data
    query_data = [query] + fasta_info(query_fasta)
    query_string = [x+":"+str(y) for x,y in zip(['Query_ID','Query_Assembly','Query_Contig_Count','Query_Assembly_Bases',
                                                 'Query_N50','Query_N90','Query_L50','Query_L90','Query_SHA256'],query_data)]

    with open(log_file, "a") as log:
        log.write("Fetched Query Data...\n")

    # Get reference data
    reference_data = [reference] + fasta_info(reference_fasta)
    reference_string = [x+":"+str(y) for x,y in zip(['Reference_ID','Reference_Assembly','Reference_Contig_Count','Reference_Assembly_Bases',
                                                     'Reference_N50','Reference_N90','Reference_L50','Reference_L90','Reference_SHA256'],reference_data)]

    with open(log_file, "a") as log:
        log.write("Fetched Reference Data...\n")

    # Get kmer distance
    [ref_kmers,query_kmers,
     unique_ref_kmers,unique_query_kmers,
     kmer_intersection,kmer_similarity] = compare_kmers(query_fasta,reference_fasta,log_file)

    with open(log_file, "a") as log:
        log.write("Fetched Kmer Data...\n")

    # Create query and reference BED files using fasta seq lengths
    query_chr_bed = fasta_to_bedtool(query_fasta)
    reference_chr_bed = fasta_to_bedtool(reference_fasta)

    with open(log_file, "a") as log:
        log.write("Created BED files...\n")

    # Set report ID
    report_id = query + "__vs__" + reference
    snpdiffs_file = snpdiffs_dir + "/" + report_id + ".snpdiffs"

    # Create NA variables for all downstream options
    median_percent_identity = "NA"
    median_alignment_length = "NA"

    total_snp_count = "NA"
    total_indel_count = "NA"
    total_invalid_count = "NA"

    # Empty defaults so the output step below cannot hit a NameError when
    # nothing aligns and the coords/SNP parsing blocks are skipped
    coords_file = pd.DataFrame()
    processed_snps = pd.DataFrame()

    #### 02: Read in MUMmer report data ####
    [ref_bases,percent_ref_aligned,
     query_bases,percent_query_aligned,
     g_snps,g_indels,
     ref_breakpoints,query_breakpoints,
     ref_relocations,query_relocations,
     ref_translocations,query_translocations,
     ref_inversions,query_inversions,
     ref_insertions,query_insertions,
     ref_tandem,query_tandem] = parseMUmmerReport(mummer_dir,report_id)

    with open(log_file, "a") as log:
        log.write("Parsed MUMmer report...\n")

    if percent_ref_aligned > 0:

        #### 03: Process MUMmer coords file ####
        coords_file = parseMUmmerCoords(mummer_dir,report_id,reference_chr_bed,query_chr_bed)
        with open(log_file, "a") as log:
            log.write("Parsed MUMmer coords...\n")

        # .copy() keeps this slice independent of coords_file so the
        # astype assignments below do not trigger chained-assignment issues
        aligned_coords = coords_file[~((coords_file['Query_Contig'] == ".") | (coords_file['Ref_Contig'] == "."))].copy()
        if aligned_coords.shape[0] > 0:
            aligned_coords['Perc_Iden'] = aligned_coords['Perc_Iden'].astype(float)
            aligned_coords['Ref_Aligned'] = aligned_coords['Ref_Aligned'].astype(int)

            median_percent_identity = np.median(aligned_coords['Perc_Iden'])
            median_alignment_length = np.median(aligned_coords['Ref_Aligned'])

            #### 04: Process MUMmer SNP file ####
            processed_snps = parseMUmmerSNPs(mummer_dir,report_id,aligned_coords)
            with open(log_file, "a") as log:
                log.write("Parsed MUMmer SNPs...\n")
            # Check if processed_snps is a df or a tuple
            if isinstance(processed_snps, tuple):
                total_snp_count,total_indel_count,total_invalid_count = processed_snps
            else:
                total_snp_count = processed_snps[processed_snps['Cat'] == "SNP"].shape[0]
                total_indel_count = processed_snps[processed_snps['Cat'] == "Indel"].shape[0]
                total_invalid_count = processed_snps[processed_snps['Cat'] == "Invalid"].shape[0]

    # Clean up pybedtools temp
    helpers.cleanup(verbose=False,remove_all=False)
    with open(log_file, "a") as log:
        log.write("Cleaned up TMP...\n")

    # Create header
    percent_ref_aligned = f"{percent_ref_aligned:.2f}" if percent_ref_aligned != "NA" else percent_ref_aligned
    percent_query_aligned = f"{percent_query_aligned:.2f}" if percent_query_aligned != "NA" else percent_query_aligned
    median_percent_identity = f"{median_percent_identity:.2f}" if median_percent_identity != "NA" else median_percent_identity
    median_alignment_length = f"{median_alignment_length:.2f}" if median_alignment_length != "NA" else median_alignment_length
    total_snp_count = f"{total_snp_count:.0f}" if total_snp_count != "NA" else total_snp_count
    total_indel_count = f"{total_indel_count:.0f}" if total_indel_count != "NA" else total_indel_count
    total_invalid_count = f"{total_invalid_count:.0f}" if total_invalid_count != "NA" else total_invalid_count
    ref_breakpoints = f"{ref_breakpoints:.0f}"
    ref_relocations = f"{ref_relocations:.0f}"
    ref_translocations = f"{ref_translocations:.0f}"
    ref_inversions = f"{ref_inversions:.0f}"
    ref_insertions = f"{ref_insertions:.0f}"
    ref_tandem = f"{ref_tandem:.0f}"
    query_breakpoints = f"{query_breakpoints:.0f}"
    query_relocations = f"{query_relocations:.0f}"
    query_translocations = f"{query_translocations:.0f}"
    query_inversions = f"{query_inversions:.0f}"
    query_insertions = f"{query_insertions:.0f}"
    query_tandem = f"{query_tandem:.0f}"
    g_snps = f"{g_snps:.0f}"
    g_indels = f"{g_indels:.0f}"
    ref_kmers = f"{ref_kmers:.0f}"
    query_kmers = f"{query_kmers:.0f}"
    unique_ref_kmers = f"{unique_ref_kmers:.0f}"
    unique_query_kmers = f"{unique_query_kmers:.0f}"
    kmer_intersection = f"{kmer_intersection:.0f}"
    kmer_similarity = f"{kmer_similarity:.2f}"

    snpdiffs_header = []
    snpdiffs_header.append("#\t" +
                           "\t".join(query_string) +
                           "\t" + "\t".join(reference_string) +
                           "\t" + "\t".join([
                               "SNPs:"+total_snp_count,
                               "Reference_Percent_Aligned:"+percent_ref_aligned,
                               "Query_Percent_Aligned:"+percent_query_aligned,
                               "Median_Percent_Identity:"+median_percent_identity,
                               "Median_Alignment_Length:"+median_alignment_length,
                               "Kmer_Similarity:"+kmer_similarity,
                               "Shared_Kmers:"+kmer_intersection,
                               "Reference_Unique_Kmers:"+unique_ref_kmers,
                               "Query_Unique_Kmers:"+unique_query_kmers,
                               "Reference_Breakpoints:"+ref_breakpoints,
                               "Query_Breakpoints:"+query_breakpoints,
                               "Reference_Relocations:"+ref_relocations,
                               "Query_Relocations:"+query_relocations,
                               "Reference_Translocations:"+ref_translocations,
                               "Query_Translocations:"+query_translocations,
                               "Reference_Inversions:"+ref_inversions,
                               "Query_Inversions:"+query_inversions,
                               "Reference_Insertions:"+ref_insertions,
                               "Query_Insertions:"+query_insertions,
                               "Reference_Tandem:"+ref_tandem,
                               "Query_Tandem:"+query_tandem,
                               "Indels:"+total_indel_count,
                               "Invalid:"+total_invalid_count,
                               "gSNPs:"+g_snps,
                               "gIndels:"+g_indels]))

    with open(log_file, "a") as log:
        log.write("Prepped snpdiffs output...\n")

    with open(snpdiffs_file,"w") as file:
        file.write("\n".join(snpdiffs_header) + "\n")
        for index, row in coords_file.iterrows():
            file.write("##\t" + "\t".join(map(str, row))+"\n")
        if isinstance(processed_snps, tuple):
            pass
        else:
            for index, row in processed_snps.iterrows():
                file.write("\t".join(map(str, row))+"\n")

    with open(log_file, "a") as log:
        log.write("Saved snpdiffs file...\n")

    print(",".join([query,reference,snpdiffs_file]))

except Exception:
    run_failed = True
    print("Exception occurred:\n", traceback.format_exc())
finally:
    helpers.cleanup(verbose=False, remove_all=False)
    # Only remove the temp directory if it was actually created above
    if temp_dir != "" and os.path.isdir(temp_dir):
        shutil.rmtree(temp_dir)
    if run_failed:
        sys.exit(1)
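
# Example invocation (hypothetical paths for illustration; the real values
# are supplied by the calling CSP2 workflow):
#
#   compileMUMmer.py \
#       --query SampleA --query_fasta SampleA.fasta \
#       --reference Ref1 --reference_fasta Ref1.fasta \
#       --mummer_dir ./mummer --snpdiffs_dir ./snpdiffs \
#       --temp_dir ./tmp --log_file SampleA__vs__Ref1.log
#
# On success the script prints "query,reference,snpdiffs_path" to stdout.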