CSP2/bin/compileMUMmer.py @ 0:01431fa12065

commit message: "planemo upload"
author: rliterman
date: Mon, 02 Dec 2024 10:40:55 -0500
#!/usr/bin/env python3

import sys
import os
import pandas as pd
import re
from pybedtools import BedTool, helpers, Interval
import warnings
import numpy as np
import hashlib
from Bio import SeqIO
import subprocess
import uuid
import traceback
import shutil
import time
import argparse

warnings.filterwarnings("ignore")

def parseMUmmerReport(mum_report_dir, report_id):
    report_data = []

    with open(mum_report_dir + "/" + report_id + ".report", 'r') as report_file:

        for line in report_file.readlines():
            split_line = re.split(r'\s+', line.strip())
            if len(split_line) == 3:
                report_data.append(
                    {
                        'Measure': split_line[0].split("(", 1)[0],
                        'Ref_Value': split_line[1].split("(", 1)[0],
                        'Query_Value': split_line[2].split("(", 1)[0]
                    })

    report_data = pd.DataFrame(report_data)
    report_data = report_data[report_data['Measure'].isin(['TotalSeqs', 'TotalBases', 'AlignedBases',
                                                           'TotalSNPs', 'TotalGSNPs', 'TotalIndels', 'TotalGIndels',
                                                           'Breakpoints', 'Relocations', 'Translocations', 'Inversions',
                                                           'Insertions', 'TandemIns'])]

    report_data = report_data.drop_duplicates(subset=['Measure'])  # Drop M-to-M

    report_data = report_data.astype({"Ref_Value": "int", "Query_Value": "int"})

    # Fetch assembly data
    ref_bases = report_data[report_data['Measure'] == 'TotalBases']['Ref_Value'].iloc[0]
    query_bases = report_data[report_data['Measure'] == 'TotalBases']['Query_Value'].iloc[0]

    # Fetch alignment data
    ref_aligned = report_data[report_data['Measure'] == 'AlignedBases']['Ref_Value'].iloc[0]
    query_aligned = report_data[report_data['Measure'] == 'AlignedBases']['Query_Value'].iloc[0]

    percent_ref_aligned = 100 * (float(ref_aligned) / ref_bases)
    percent_query_aligned = 100 * (float(query_aligned) / query_bases)

    # Fetch SNP data
    g_snps = report_data[report_data['Measure'] == 'TotalGSNPs']['Ref_Value'].iloc[0]
    g_indels = report_data[report_data['Measure'] == 'TotalGIndels']['Ref_Value'].iloc[0]

    ref_breakpoints = report_data[report_data['Measure'] == 'Breakpoints']['Ref_Value'].iloc[0]
    query_breakpoints = report_data[report_data['Measure'] == 'Breakpoints']['Query_Value'].iloc[0]

    ref_relocations = report_data[report_data['Measure'] == 'Relocations']['Ref_Value'].iloc[0]
    query_relocations = report_data[report_data['Measure'] == 'Relocations']['Query_Value'].iloc[0]

    ref_translocations = report_data[report_data['Measure'] == 'Translocations']['Ref_Value'].iloc[0]
    query_translocations = report_data[report_data['Measure'] == 'Translocations']['Query_Value'].iloc[0]

    ref_inversions = report_data[report_data['Measure'] == 'Inversions']['Ref_Value'].iloc[0]
    query_inversions = report_data[report_data['Measure'] == 'Inversions']['Query_Value'].iloc[0]

    ref_insertions = report_data[report_data['Measure'] == 'Insertions']['Ref_Value'].iloc[0]
    query_insertions = report_data[report_data['Measure'] == 'Insertions']['Query_Value'].iloc[0]

    ref_tandem = report_data[report_data['Measure'] == 'TandemIns']['Ref_Value'].iloc[0]
    query_tandem = report_data[report_data['Measure'] == 'TandemIns']['Query_Value'].iloc[0]

    return [ref_bases, percent_ref_aligned,
            query_bases, percent_query_aligned,
            g_snps, g_indels,
            ref_breakpoints, query_breakpoints,
            ref_relocations, query_relocations,
            ref_translocations, query_translocations,
            ref_inversions, query_inversions,
            ref_insertions, query_insertions,
            ref_tandem, query_tandem]
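
# Example (hypothetical paths/IDs): for a dnadiff-style report written to
# /data/mummer/sampleA__vs__ref.report, the parser returns the stats list in
# the order unpacked by the main script below, e.g.:
#
#   stats = parseMUmmerReport("/data/mummer", "sampleA__vs__ref")
#   ref_bases, percent_ref_aligned = stats[0], stats[1]
#
# Report lines such as "AlignedBases  4500000(97.83%)  4480000(98.01%)" are
# split on whitespace, and the parenthesized percentages are trimmed before
# the integer cast.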

def parseMUmmerCoords(mum_coords_dir, report_id, reference_chr_bed, query_chr_bed):

    coords_file = pd.read_csv(mum_coords_dir + "/" + report_id + ".1coords", sep="\t", index_col=False,
                              names=['Ref_Start', 'Ref_End', 'Query_Start', 'Query_End',
                                     'Ref_Aligned', 'Query_Aligned', 'Perc_Iden',
                                     'Ref_Length', 'Query_Length', 'Ref_Cov',
                                     'Query_Cov', 'Ref_Contig', 'Query_Contig'],
                              dtype={
                                  'Ref_Start': int,
                                  'Ref_End': int,
                                  'Query_Start': int,
                                  'Query_End': int,
                                  'Ref_Aligned': int,
                                  'Query_Aligned': int,
                                  'Perc_Iden': float,
                                  'Ref_Length': int,
                                  'Query_Length': int,
                                  'Ref_Cov': float,
                                  'Query_Cov': float,
                                  'Ref_Contig': str,
                                  'Query_Contig': str
                              })

    if coords_file.shape[0] > 0:

        # Convert 1-based MUMmer coordinates to 0-based, half-open BED intervals
        coords_file['Ref_Start'] = (coords_file['Ref_Start'] - 1)
        coords_file = coords_file.apply(adjust_query_row, axis=1)

        merged_ref_bed = BedTool.from_dataframe(coords_file[['Ref_Contig', 'Ref_Start', 'Ref_End']]).sort().merge()
        merged_query_bed = BedTool.from_dataframe(coords_file[['Query_Contig', 'Query_Start', 'Query_End']]).sort().merge()

        covered_ref_regions = reference_chr_bed.intersect(merged_ref_bed)
        uncovered_ref_regions = reference_chr_bed.subtract(merged_ref_bed)

        covered_query_regions = query_chr_bed.intersect(merged_query_bed)
        uncovered_query_regions = query_chr_bed.subtract(merged_query_bed)
    else:
        # No alignments: nothing is covered, so the uncovered regions span each
        # full assembly (otherwise the length assertions below could not hold)
        covered_ref_regions = BedTool([])
        uncovered_ref_regions = reference_chr_bed

        covered_query_regions = BedTool([])
        uncovered_query_regions = query_chr_bed

    ref_genome_size = calculate_total_length(reference_chr_bed)
    ref_covered_size = calculate_total_length(covered_ref_regions)
    ref_uncovered_size = calculate_total_length(uncovered_ref_regions)

    query_genome_size = calculate_total_length(query_chr_bed)
    query_covered_size = calculate_total_length(covered_query_regions)
    query_uncovered_size = calculate_total_length(uncovered_query_regions)

    assert ref_covered_size + ref_uncovered_size == ref_genome_size
    assert query_covered_size + query_uncovered_size == query_genome_size

    # Create return_df
    return_columns = ['Ref_Contig', 'Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned',
                      'Query_Contig', 'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned', 'Perc_Iden']

    return_df = pd.DataFrame(columns=return_columns)

    if coords_file.shape[0] > 0:
        return_df = pd.concat([return_df, coords_file[return_columns]], ignore_index=True)

    if uncovered_ref_regions.count() > 0:
        return_uncovered_df = uncovered_ref_regions.to_dataframe().sort_values(['chrom', 'start']).rename(
            columns={'chrom': 'Ref_Contig', 'start': 'Ref_Start', 'end': 'Ref_End'}).assign(
            Ref_Aligned=".", Query_Aligned=".", Query_Contig=".", Query_Start=".", Query_End=".",
            Ref_Length=".", Query_Length=".", Perc_Iden=".")
        return_df = pd.concat([return_df, return_uncovered_df[return_columns]], ignore_index=True)

    if uncovered_query_regions.count() > 0:
        return_uncovered_df = uncovered_query_regions.to_dataframe().sort_values(['chrom', 'start']).rename(
            columns={'chrom': 'Query_Contig', 'start': 'Query_Start', 'end': 'Query_End'}).assign(
            Ref_Aligned=".", Query_Aligned=".", Ref_Contig=".", Ref_Start=".", Ref_End=".",
            Ref_Length=".", Query_Length=".", Perc_Iden=".")
        return_df = pd.concat([return_df, return_uncovered_df[return_columns]], ignore_index=True)

    return return_df
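
# Coordinate convention (illustrative): a forward alignment that MUMmer
# reports 1-based as Ref_Start=1, Ref_End=1000 becomes the 0-based,
# half-open BED interval (0, 1000). Reference or query stretches with no
# alignment are appended as rows of "." placeholders, so every base of both
# assemblies is accounted for in the returned table.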

def adjust_query_row(row):
    # Convert query coordinates to 0-based, half-open intervals; reverse-strand
    # alignments arrive with Query_Start > Query_End and are swapped first
    if set(['Query_Start', 'Query_End', 'Query_Contig']).issubset(row.index):
        if row['Query_Start'] > row['Query_End']:
            row['Query_Start'], row['Query_End'] = row['Query_End'] - 1, row['Query_Start']
        else:
            row['Query_Start'] = row['Query_Start'] - 1
    return row
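
# Worked example: a reverse-strand hit arriving as Query_Start=10,
# Query_End=5 is rewritten to Query_Start=4, Query_End=10, the same 0-based,
# half-open interval produced by the forward-strand hit Query_Start=5,
# Query_End=10 -> (4, 10).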

def parseMUmmerSNPs(mum_snps_dir, report_id, coords_file):

    return_columns = ['Ref_Contig', 'Start_Ref', 'Ref_Pos',
                      'Query_Contig', 'Start_Query', 'Query_Pos',
                      'Ref_Loc', 'Query_Loc',
                      'Ref_Start', 'Ref_End',
                      'Query_Start', 'Query_End',
                      'Ref_Base', 'Query_Base',
                      'Dist_to_Ref_End', 'Dist_to_Query_End',
                      'Ref_Aligned', 'Query_Aligned',
                      'Query_Direction', 'Perc_Iden', 'Cat']

    snp_file = pd.read_csv(mum_snps_dir + "/" + report_id + ".snps", sep="\t", index_col=False,
                           names=['Ref_Pos', 'Ref_Base', 'Query_Base', 'Query_Pos',
                                  'SNP_Buffer', 'Dist_to_End', 'Ref_Length', 'Query_Length',
                                  'Ref_Direction', 'Query_Direction', 'Ref_Contig', 'Query_Contig'])

    int_columns = ['Ref_Pos', 'Query_Pos', 'Dist_to_End', 'Ref_Length', 'Query_Length']
    contig_columns = ['Ref_Contig', 'Query_Contig']

    if snp_file.shape[0] == 0:
        return pd.DataFrame(columns=return_columns)
    else:
        for col in int_columns:
            snp_file.loc[:, col] = snp_file.loc[:, col].astype(float).astype(int)
        for col in contig_columns:
            snp_file.loc[:, col] = snp_file.loc[:, col].astype(str)
            coords_file.loc[:, col] = coords_file.loc[:, col].astype(str)

        snp_file['Start_Ref'] = snp_file['Ref_Pos'] - 1
        snp_file['Start_Query'] = snp_file['Query_Pos'] - 1

        # Distance from each variant to the nearest end of its contig
        snp_file['Dist_to_Ref_End'] = [min([x, y]) for x, y in zip(snp_file['Ref_Pos'], snp_file['Ref_Length'] - snp_file['Ref_Pos'])]
        snp_file['Dist_to_Query_End'] = [min([x, y]) for x, y in zip(snp_file['Query_Pos'], snp_file['Query_Length'] - snp_file['Query_Pos'])]

        snp_file['Ref_Loc'] = ["/".join([str(x[0]), str(x[1])]) for x in list(zip(snp_file.Ref_Contig, snp_file.Ref_Pos))]
        snp_file['Query_Loc'] = ["/".join([str(x[0]), str(x[1])]) for x in list(zip(snp_file.Query_Contig, snp_file.Query_Pos))]

        total_snp_count = snp_file.shape[0]

        valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', "."]

        # Split variants into invalid (ambiguous base), indel ("." on either
        # side), and substitution (SNP) categories
        invalid_file = snp_file[(~snp_file['Ref_Base'].isin(valid_bases)) | (~snp_file['Query_Base'].isin(valid_bases))]
        snp_file = snp_file[(snp_file['Ref_Base'].isin(valid_bases)) & (snp_file['Query_Base'].isin(valid_bases))]
        indel_file = snp_file[(snp_file['Query_Base'] == ".") | (snp_file['Ref_Base'] == ".")]
        snp_file = snp_file[~((snp_file['Query_Base'] == ".") | (snp_file['Ref_Base'] == "."))]

        # With 10,000+ raw variants, skip per-site annotation and return only
        # the category counts as a tuple
        if total_snp_count >= 10000:
            return (snp_file.shape[0], indel_file.shape[0], invalid_file.shape[0])

        else:
            if snp_file.shape[0] == 0:
                snp_coords = pd.DataFrame(columns=return_columns)
            else:
                snp_coords = makeSNPCoords(snp_file, coords_file, snp_file.shape[0])
                snp_coords['Cat'] = "SNP"

            if indel_file.shape[0] == 0:
                indel_coords = pd.DataFrame(columns=return_columns)
            else:
                indel_coords = makeSNPCoords(indel_file, coords_file, indel_file.shape[0])
                indel_coords['Cat'] = "Indel"

            if invalid_file.shape[0] == 0:
                invalid_coords = pd.DataFrame(columns=return_columns)
            else:
                invalid_coords = makeSNPCoords(invalid_file, coords_file, invalid_file.shape[0])
                invalid_coords['Cat'] = "Invalid"

            return_df = pd.concat([snp_coords, indel_coords, invalid_coords], ignore_index=True)[return_columns]

            assert return_df.shape[0] == total_snp_count
            return return_df

def makeSNPCoords(snp_df, coords_df, row_count):
    snp_coords = pd.merge(snp_df, coords_df, how='left').dropna()

    # Keep only merges where the variant actually falls inside the alignment block
    condition = ((snp_coords['Ref_Start'] <= snp_coords['Ref_Pos']) &
                 (snp_coords['Ref_Pos'] <= snp_coords['Ref_End']) &
                 (snp_coords['Query_Start'] <= snp_coords['Query_Pos']) &
                 (snp_coords['Query_Pos'] <= snp_coords['Query_End']))
    snp_coords = snp_coords[condition]

    if row_count != snp_coords.shape[0]:
        snp_coords.sort_values(by=['Ref_Aligned', 'Perc_Iden'], ascending=False, inplace=True)
        snp_coords.drop_duplicates(subset=['Ref_Loc', 'Query_Loc'], keep='first', inplace=True)

    return snp_coords
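
# Rationale for the dedup above: a variant can fall inside two overlapping
# alignment blocks, in which case the merge yields one row per block.
# Sorting by Ref_Aligned and Perc_Iden (descending) before dropping
# duplicates keeps, for each Ref_Loc/Query_Loc pair, the call from the
# longest, most identical alignment.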

def fasta_info(file_path):
    records = list(SeqIO.parse(file_path, 'fasta'))
    contig_count = int(len(records))
    lengths = sorted([len(record) for record in records], reverse=True)
    assembly_bases = sum(lengths)

    with open(file_path, 'rb') as file:
        sha256 = hashlib.sha256(file.read()).hexdigest()

    cumulative_length = 0
    n50 = None
    n90 = None
    l50 = None
    l90 = None

    for i, length in enumerate(lengths, start=1):
        cumulative_length += length
        if cumulative_length >= assembly_bases * 0.5 and n50 is None:
            n50 = length
            l50 = i
        if cumulative_length >= assembly_bases * 0.9 and n90 is None:
            n90 = length
            l90 = i
        if n50 is not None and n90 is not None:
            break

    return [file_path, contig_count, assembly_bases, n50, n90, l50, l90, sha256]
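
# Example: for contig lengths [50, 30, 20] (assembly_bases = 100), the
# cumulative sums are 50, 80, 100, so N50 = 50 with L50 = 1 (50 >= 100*0.5),
# and N90 = 20 with L90 = 3 (the 90% threshold is only reached at the third
# contig).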

def fasta_to_bedtool(fasta_file):
    intervals = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        chrom_name = record.id
        chrom_length = len(record.seq)
        interval = Interval(chrom_name, 0, chrom_length)
        intervals.append(interval)
    bedtool = BedTool(intervals).sort()
    return bedtool

def calculate_total_length(bedtool):
    return sum(len(interval) for interval in bedtool)

def get_kmers(command, log_file, retries=10, delay=5, timeout=30):
    # `command` is a shell pipeline string; shell=True is needed so the pipe is
    # interpreted by the shell rather than passed to kmercountexact.sh as
    # literal arguments, and check=True makes a non-zero exit raise
    # CalledProcessError (without it, that handler below is unreachable)
    kmer_set = set()

    for attempt in range(retries):
        try:
            process = subprocess.run(command, shell=True, check=True, capture_output=True, text=True, timeout=timeout)
            kmer_set.update(process.stdout.strip().split('\n'))

            if len(kmer_set) > 1:
                return kmer_set

        except subprocess.TimeoutExpired:
            with open(log_file, "a") as log:
                log.write(f"Kmer command timed out after {timeout} seconds\n")
            time.sleep(delay)

        except subprocess.CalledProcessError as e:
            with open(log_file, "a") as log:
                log.write(f"Kmer command errored out: {e}\n")
            time.sleep(delay)

        # Wait before retrying if this is not the last attempt
        if attempt < retries - 1:
            time.sleep(delay)
        else:
            raise RuntimeError(f"Command failed after {retries} attempts: {command}")

    return None
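
# Example (hypothetical command): get_kmers runs a shell pipeline and
# accumulates the unique lines of stdout across attempts, e.g.:
#
#   kmers = get_kmers("kmercountexact.sh in=sample.fasta threads=1 "
#                     "fastadump=f out=stdout | cut -f1", "run.log")
#
# An attempt yielding fewer than two distinct lines is treated as a failure
# and retried after `delay` seconds, up to `retries` attempts.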

def compare_kmers(query_file, reference_file, log_file):

    # Shell pipeline strings (see get_kmers): kmercountexact.sh emits
    # "kmer<TAB>count" lines, and cut keeps just the kmer column
    ref_command = f"kmercountexact.sh in={reference_file} threads=1 fastadump=f out=stdout | cut -f1"
    query_command = f"kmercountexact.sh in={query_file} threads=1 fastadump=f out=stdout | cut -f1"

    ref_kmers = get_kmers(ref_command, log_file)
    with open(log_file, "a") as log:
        log.write("Fetched reference kmers...\n")

    query_kmers = get_kmers(query_command, log_file)
    with open(log_file, "a") as log:
        log.write("Fetched query kmers...\n")

    intersection = len(ref_kmers.intersection(query_kmers))
    similarity = 100 * (intersection / (len(ref_kmers) + len(query_kmers) - intersection))
    unique_ref = len(ref_kmers.difference(query_kmers))
    unique_query = len(query_kmers.difference(ref_kmers))

    return [len(ref_kmers), len(query_kmers),
            unique_ref, unique_query,
            intersection, similarity]
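
# The similarity reported above is the Jaccard index, scaled to a percent:
#
#   J = |R & Q| / |R | Q| = intersection / (len(R) + len(Q) - intersection)
#
# e.g. 900 shared kmers between sets of 1,000 and 950 kmers gives
# 100 * 900 / (1000 + 950 - 900) = ~85.7% similarity.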

#### 01: Read in arguments ####
run_failed = False

parser = argparse.ArgumentParser(description='CSP2 MUMmer Compilation')
parser.add_argument('--query', type=str, help='Query')
parser.add_argument('--query_fasta', type=str, help='Query Fasta')
parser.add_argument('--reference', type=str, help='Reference')
parser.add_argument('--reference_fasta', type=str, help='Reference Fasta')
parser.add_argument('--mummer_dir', type=str, help='MUMmer Directory')
parser.add_argument('--snpdiffs_dir', type=str, help='snpdiffs Directory')
parser.add_argument('--temp_dir', type=str, default='', help='Temporary Directory')
parser.add_argument('--log_file', type=str, help='Log File')
args = parser.parse_args()

query = args.query
query_fasta = args.query_fasta
reference = args.reference
reference_fasta = args.reference_fasta

mummer_dir = os.path.normpath(os.path.abspath(args.mummer_dir))
snpdiffs_dir = os.path.normpath(os.path.abspath(args.snpdiffs_dir))
log_file = os.path.normpath(os.path.abspath(args.log_file))

with open(log_file, "w") as log:
    log.write(f"CSP2 MUMmer Compilation Log: {query}__vs__{reference}\n")
    log.write(f"\t- Query Fasta: {query_fasta}\n")
    log.write(f"\t- Reference Fasta: {reference_fasta}\n")
    log.write(f"\t- MUMmer Directory: {mummer_dir}\n")
    log.write(f"\t- snpdiffs Directory: {snpdiffs_dir}\n")

# Set TMP
if args.temp_dir != "":
    random_temp_id = str(uuid.uuid4())
    temp_dir = f"{os.path.normpath(os.path.abspath(args.temp_dir))}/{random_temp_id}"
    try:
        os.mkdir(temp_dir)
        helpers.set_tempdir(temp_dir)
        with open(log_file, "a") as log:
            log.write(f"\t- Temporary Directory: {temp_dir}\n")

    except OSError as e:
        run_failed = True
        print(f"Error: Failed to create directory '{temp_dir}': {e}")
else:
    temp_dir = ""

with open(log_file, "a") as log:
    log.write("-------------------------------------------------------\n\n")

try:
    # Get query data
    query_data = [query] + fasta_info(query_fasta)
    query_string = [x + ":" + str(y) for x, y in zip(['Query_ID', 'Query_Assembly', 'Query_Contig_Count', 'Query_Assembly_Bases',
                                                      'Query_N50', 'Query_N90', 'Query_L50', 'Query_L90', 'Query_SHA256'], query_data)]

    with open(log_file, "a") as log:
        log.write("Fetched Query Data...\n")

    # Get reference data
    reference_data = [reference] + fasta_info(reference_fasta)
    reference_string = [x + ":" + str(y) for x, y in zip(['Reference_ID', 'Reference_Assembly', 'Reference_Contig_Count', 'Reference_Assembly_Bases',
                                                          'Reference_N50', 'Reference_N90', 'Reference_L50', 'Reference_L90', 'Reference_SHA256'], reference_data)]

    with open(log_file, "a") as log:
        log.write("Fetched Reference Data...\n")

    # Get kmer distance
    [ref_kmers, query_kmers,
     unique_ref_kmers, unique_query_kmers,
     kmer_intersection, kmer_similarity] = compare_kmers(query_fasta, reference_fasta, log_file)

    with open(log_file, "a") as log:
        log.write("Fetched Kmer Data...\n")

    # Create query and reference BED files using fasta seq lengths
    query_chr_bed = fasta_to_bedtool(query_fasta)
    reference_chr_bed = fasta_to_bedtool(reference_fasta)

    with open(log_file, "a") as log:
        log.write("Created BED files...\n")

    # Set report ID
    report_id = query + "__vs__" + reference
    snpdiffs_file = snpdiffs_dir + "/" + report_id + ".snpdiffs"

    # Create NA variables for all downstream options
    median_percent_identity = "NA"
    median_alignment_length = "NA"

    total_snp_count = "NA"
    total_indel_count = "NA"
    total_invalid_count = "NA"

    # Empty defaults so the snpdiffs writer below still works when nothing aligned
    coords_file = pd.DataFrame()
    processed_snps = pd.DataFrame()

    #### 02: Read in MUMmer report data ####
    [ref_bases, percent_ref_aligned,
     query_bases, percent_query_aligned,
     g_snps, g_indels,
     ref_breakpoints, query_breakpoints,
     ref_relocations, query_relocations,
     ref_translocations, query_translocations,
     ref_inversions, query_inversions,
     ref_insertions, query_insertions,
     ref_tandem, query_tandem] = parseMUmmerReport(mummer_dir, report_id)

    with open(log_file, "a") as log:
        log.write("Parsed MUMmer report...\n")

    if percent_ref_aligned > 0:

        #### 03: Process MUMmer coords file ####
        coords_file = parseMUmmerCoords(mummer_dir, report_id, reference_chr_bed, query_chr_bed)
        with open(log_file, "a") as log:
            log.write("Parsed MUMmer coords...\n")

        aligned_coords = coords_file[~((coords_file['Query_Contig'] == ".") | (coords_file['Ref_Contig'] == "."))]
        if aligned_coords.shape[0] > 0:
            aligned_coords['Perc_Iden'] = aligned_coords['Perc_Iden'].astype(float)
            aligned_coords['Ref_Aligned'] = aligned_coords['Ref_Aligned'].astype(int)

            median_percent_identity = np.median(aligned_coords['Perc_Iden'])
            median_alignment_length = np.median(aligned_coords['Ref_Aligned'])

        #### 04: Process MUMmer SNP file ####
        processed_snps = parseMUmmerSNPs(mummer_dir, report_id, aligned_coords)
        with open(log_file, "a") as log:
            log.write("Parsed MUMmer SNPs...\n")

        # Check if processed_snps is a df or a tuple (counts only, 10,000+ variants)
        if isinstance(processed_snps, tuple):
            total_snp_count, total_indel_count, total_invalid_count = processed_snps
        else:
            total_snp_count = processed_snps[processed_snps['Cat'] == "SNP"].shape[0]
            total_indel_count = processed_snps[processed_snps['Cat'] == "Indel"].shape[0]
            total_invalid_count = processed_snps[processed_snps['Cat'] == "Invalid"].shape[0]

    # Clean up pybedtools temp
    helpers.cleanup(verbose=False, remove_all=False)
    with open(log_file, "a") as log:
        log.write("Cleaned up TMP...\n")

    # Create header
    percent_ref_aligned = f"{percent_ref_aligned:.2f}" if percent_ref_aligned != "NA" else percent_ref_aligned
    percent_query_aligned = f"{percent_query_aligned:.2f}" if percent_query_aligned != "NA" else percent_query_aligned
    median_percent_identity = f"{median_percent_identity:.2f}" if median_percent_identity != "NA" else median_percent_identity
    median_alignment_length = f"{median_alignment_length:.2f}" if median_alignment_length != "NA" else median_alignment_length
    total_snp_count = f"{total_snp_count:.0f}" if total_snp_count != "NA" else total_snp_count
    total_indel_count = f"{total_indel_count:.0f}" if total_indel_count != "NA" else total_indel_count
    total_invalid_count = f"{total_invalid_count:.0f}" if total_invalid_count != "NA" else total_invalid_count
    ref_breakpoints = f"{ref_breakpoints:.0f}"
    ref_relocations = f"{ref_relocations:.0f}"
    ref_translocations = f"{ref_translocations:.0f}"
    ref_inversions = f"{ref_inversions:.0f}"
    ref_insertions = f"{ref_insertions:.0f}"
    ref_tandem = f"{ref_tandem:.0f}"
    query_breakpoints = f"{query_breakpoints:.0f}"
    query_relocations = f"{query_relocations:.0f}"
    query_translocations = f"{query_translocations:.0f}"
    query_inversions = f"{query_inversions:.0f}"
    query_insertions = f"{query_insertions:.0f}"
    query_tandem = f"{query_tandem:.0f}"
    g_snps = f"{g_snps:.0f}"
    g_indels = f"{g_indels:.0f}"
    ref_kmers = f"{ref_kmers:.0f}"
    query_kmers = f"{query_kmers:.0f}"
    unique_ref_kmers = f"{unique_ref_kmers:.0f}"
    unique_query_kmers = f"{unique_query_kmers:.0f}"
    kmer_intersection = f"{kmer_intersection:.0f}"
    kmer_similarity = f"{kmer_similarity:.2f}"

    snpdiffs_header = []
    snpdiffs_header.append("#\t" +
                           "\t".join(query_string) +
                           "\t" + "\t".join(reference_string) +
                           "\t" + "\t".join([
                               "SNPs:" + total_snp_count,
                               "Reference_Percent_Aligned:" + percent_ref_aligned,
                               "Query_Percent_Aligned:" + percent_query_aligned,
                               "Median_Percent_Identity:" + median_percent_identity,
                               "Median_Alignment_Length:" + median_alignment_length,
                               "Kmer_Similarity:" + kmer_similarity,
                               "Shared_Kmers:" + kmer_intersection,
                               "Reference_Unique_Kmers:" + unique_ref_kmers,
                               "Query_Unique_Kmers:" + unique_query_kmers,
                               "Reference_Breakpoints:" + ref_breakpoints,
                               "Query_Breakpoints:" + query_breakpoints,
                               "Reference_Relocations:" + ref_relocations,
                               "Query_Relocations:" + query_relocations,
                               "Reference_Translocations:" + ref_translocations,
                               "Query_Translocations:" + query_translocations,
                               "Reference_Inversions:" + ref_inversions,
                               "Query_Inversions:" + query_inversions,
                               "Reference_Insertions:" + ref_insertions,
                               "Query_Insertions:" + query_insertions,
                               "Reference_Tandem:" + ref_tandem,
                               "Query_Tandem:" + query_tandem,
                               "Indels:" + total_indel_count,
                               "Invalid:" + total_invalid_count,
                               "gSNPs:" + g_snps,
                               "gIndels:" + g_indels]))

    with open(log_file, "a") as log:
        log.write("Prepped snpdiffs output...\n")

    with open(snpdiffs_file, "w") as file:
        file.write("\n".join(snpdiffs_header) + "\n")
        for index, row in coords_file.iterrows():
            file.write("##\t" + "\t".join(map(str, row)) + "\n")
        # When 10,000+ raw variants were found, only the counts are recorded
        # in the header (processed_snps is a tuple), so no per-site rows are written
        if not isinstance(processed_snps, tuple):
            for index, row in processed_snps.iterrows():
                file.write("\t".join(map(str, row)) + "\n")

    with open(log_file, "a") as log:
        log.write("Saved snpdiffs file...\n")

    print(",".join([query, reference, snpdiffs_file]))

except Exception:
    run_failed = True
    print("Exception occurred:\n", traceback.format_exc())
finally:
    helpers.cleanup(verbose=False, remove_all=False)
    if temp_dir != "":
        shutil.rmtree(temp_dir)
    if run_failed:
        sys.exit(1)