#!/usr/bin/env python3

import sys
import os
import re
import hashlib
import subprocess
import uuid
import traceback
import shutil
import time
import argparse
import warnings

import numpy as np
import pandas as pd
from Bio import SeqIO
from pybedtools import BedTool, helpers, Interval

warnings.filterwarnings("ignore")

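# parseMUmmerReport: parse the dnadiff .report file for a query/reference
# pair and return assembly sizes, percent aligned, gSNP/gIndel counts, and
# structural rearrangement counts (breakpoints, relocations, translocations,
# inversions, insertions, tandem insertions).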
def parseMUmmerReport(mum_report_dir, report_id):
    report_data = []

    with open(mum_report_dir + "/" + report_id + ".report", 'r') as report_file:
        for line in report_file.readlines():
            split_line = re.split(r'\s+', line.strip())
            if len(split_line) == 3:
                report_data.append(
                    {
                        'Measure': split_line[0].split("(", 1)[0],
                        'Ref_Value': split_line[1].split("(", 1)[0],
                        'Query_Value': split_line[2].split("(", 1)[0]
                    })

    report_data = pd.DataFrame(report_data)
    report_data = report_data[report_data['Measure'].isin(['TotalSeqs', 'TotalBases', 'AlignedBases',
                                                           'TotalSNPs', 'TotalGSNPs', 'TotalIndels', 'TotalGIndels',
                                                           'Breakpoints', 'Relocations', 'Translocations', 'Inversions',
                                                           'Insertions', 'TandemIns'])]

    report_data = report_data.drop_duplicates(subset=['Measure'])  # Drop M-to-M

    report_data = report_data.astype({"Ref_Value": "int", "Query_Value": "int"})

    # Fetch assembly data
    ref_bases = report_data[report_data['Measure'] == 'TotalBases']['Ref_Value'].iloc[0]
    query_bases = report_data[report_data['Measure'] == 'TotalBases']['Query_Value'].iloc[0]

    # Fetch alignment data
    ref_aligned = report_data[report_data['Measure'] == 'AlignedBases']['Ref_Value'].iloc[0]
    query_aligned = report_data[report_data['Measure'] == 'AlignedBases']['Query_Value'].iloc[0]

    percent_ref_aligned = 100 * (float(ref_aligned) / ref_bases)
    percent_query_aligned = 100 * (float(query_aligned) / query_bases)

    # Fetch SNP data
    g_snps = report_data[report_data['Measure'] == 'TotalGSNPs']['Ref_Value'].iloc[0]
    g_indels = report_data[report_data['Measure'] == 'TotalGIndels']['Ref_Value'].iloc[0]

    ref_breakpoints = report_data[report_data['Measure'] == 'Breakpoints']['Ref_Value'].iloc[0]
    query_breakpoints = report_data[report_data['Measure'] == 'Breakpoints']['Query_Value'].iloc[0]

    ref_relocations = report_data[report_data['Measure'] == 'Relocations']['Ref_Value'].iloc[0]
    query_relocations = report_data[report_data['Measure'] == 'Relocations']['Query_Value'].iloc[0]

    ref_translocations = report_data[report_data['Measure'] == 'Translocations']['Ref_Value'].iloc[0]
    query_translocations = report_data[report_data['Measure'] == 'Translocations']['Query_Value'].iloc[0]

    ref_inversions = report_data[report_data['Measure'] == 'Inversions']['Ref_Value'].iloc[0]
    query_inversions = report_data[report_data['Measure'] == 'Inversions']['Query_Value'].iloc[0]

    ref_insertions = report_data[report_data['Measure'] == 'Insertions']['Ref_Value'].iloc[0]
    query_insertions = report_data[report_data['Measure'] == 'Insertions']['Query_Value'].iloc[0]

    ref_tandem = report_data[report_data['Measure'] == 'TandemIns']['Ref_Value'].iloc[0]
    query_tandem = report_data[report_data['Measure'] == 'TandemIns']['Query_Value'].iloc[0]

    return [ref_bases, percent_ref_aligned,
            query_bases, percent_query_aligned,
            g_snps, g_indels,
            ref_breakpoints, query_breakpoints,
            ref_relocations, query_relocations,
            ref_translocations, query_translocations,
            ref_inversions, query_inversions,
            ref_insertions, query_insertions,
            ref_tandem, query_tandem]

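# parseMUmmerCoords: read the show-coords .1coords table, convert alignments
# to 0-based BED-style intervals, and return one row per aligned block plus
# placeholder rows ('.') for regions of either genome not covered by any
# alignment.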
def parseMUmmerCoords(mum_coords_dir, report_id, reference_chr_bed, query_chr_bed):

    coords_file = pd.read_csv(mum_coords_dir + "/" + report_id + ".1coords", sep="\t", index_col=False,
                              names=['Ref_Start', 'Ref_End', 'Query_Start', 'Query_End',
                                     'Ref_Aligned', 'Query_Aligned', 'Perc_Iden',
                                     'Ref_Length', 'Query_Length', 'Ref_Cov',
                                     'Query_Cov', 'Ref_Contig', 'Query_Contig'],
                              dtype={
                                  'Ref_Start': int,
                                  'Ref_End': int,
                                  'Query_Start': int,
                                  'Query_End': int,
                                  'Ref_Aligned': int,
                                  'Query_Aligned': int,
                                  'Perc_Iden': float,
                                  'Ref_Length': int,
                                  'Query_Length': int,
                                  'Ref_Cov': float,
                                  'Query_Cov': float,
                                  'Ref_Contig': str,
                                  'Query_Contig': str
                              })

    if coords_file.shape[0] > 0:
        coords_file['Ref_Start'] = (coords_file['Ref_Start'] - 1)
        coords_file = coords_file.apply(adjust_query_row, axis=1)

        merged_ref_bed = BedTool.from_dataframe(coords_file[['Ref_Contig', 'Ref_Start', 'Ref_End']]).sort().merge()
        merged_query_bed = BedTool.from_dataframe(coords_file[['Query_Contig', 'Query_Start', 'Query_End']]).sort().merge()

        covered_ref_regions = reference_chr_bed.intersect(merged_ref_bed)
        uncovered_ref_regions = reference_chr_bed.subtract(merged_ref_bed)

        covered_query_regions = query_chr_bed.intersect(merged_query_bed)
        uncovered_query_regions = query_chr_bed.subtract(merged_query_bed)
    else:
        covered_ref_regions = BedTool([])
        covered_query_regions = BedTool([])

        # With no alignments, every contig is entirely uncovered; using the
        # whole-genome BEDs here keeps the size assertions below valid.
        uncovered_ref_regions = reference_chr_bed
        uncovered_query_regions = query_chr_bed

    ref_genome_size = calculate_total_length(reference_chr_bed)
    ref_covered_size = calculate_total_length(covered_ref_regions)
    ref_uncovered_size = calculate_total_length(uncovered_ref_regions)

    query_genome_size = calculate_total_length(query_chr_bed)
    query_covered_size = calculate_total_length(covered_query_regions)
    query_uncovered_size = calculate_total_length(uncovered_query_regions)

    assert ref_covered_size + ref_uncovered_size == ref_genome_size
    assert query_covered_size + query_uncovered_size == query_genome_size

    # Create return_df
    return_columns = ['Ref_Contig', 'Ref_Start', 'Ref_End', 'Ref_Length', 'Ref_Aligned',
                      'Query_Contig', 'Query_Start', 'Query_End', 'Query_Length', 'Query_Aligned', 'Perc_Iden']

    return_df = pd.DataFrame(columns=return_columns)

    if coords_file.shape[0] > 0:
        return_df = pd.concat([return_df, coords_file[return_columns]], ignore_index=True)

    if uncovered_ref_regions.count() > 0:
        return_uncovered_df = (uncovered_ref_regions.to_dataframe()
                               .sort_values(['chrom', 'start'])
                               .rename(columns={'chrom': 'Ref_Contig', 'start': 'Ref_Start', 'end': 'Ref_End'})
                               .assign(Ref_Aligned=".", Query_Aligned=".",
                                       Query_Contig=".", Query_Start=".", Query_End=".",
                                       Ref_Length=".", Query_Length=".", Perc_Iden="."))
        return_df = pd.concat([return_df, return_uncovered_df[return_columns]], ignore_index=True)

    if uncovered_query_regions.count() > 0:
        return_uncovered_df = (uncovered_query_regions.to_dataframe()
                               .sort_values(['chrom', 'start'])
                               .rename(columns={'chrom': 'Query_Contig', 'start': 'Query_Start', 'end': 'Query_End'})
                               .assign(Ref_Aligned=".", Query_Aligned=".",
                                       Ref_Contig=".", Ref_Start=".", Ref_End=".",
                                       Ref_Length=".", Query_Length=".", Perc_Iden="."))
        return_df = pd.concat([return_df, return_uncovered_df[return_columns]], ignore_index=True)

    return return_df

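# adjust_query_row: shift a query interval to 0-based coordinates, flipping
# start/end first for reverse-strand alignments (where start > end).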
def adjust_query_row(row):
    if set(['Query_Start', 'Query_End', 'Query_Contig']).issubset(row.index):
        if row['Query_Start'] > row['Query_End']:
            row['Query_Start'], row['Query_End'] = row['Query_End'] - 1, row['Query_Start']
        else:
            row['Query_Start'] = row['Query_Start'] - 1
    return row

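# parseMUmmerSNPs: read the show-snps .snps table and classify each record
# as a SNP, an indel ('.' allele), or invalid (non-ACGT base), mapping each
# back to its alignment block via makeSNPCoords. With 10,000 or more total
# records, only the (snp, indel, invalid) counts are returned.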
def parseMUmmerSNPs(mum_snps_dir, report_id, coords_file):

    return_columns = ['Ref_Contig', 'Start_Ref', 'Ref_Pos',
                      'Query_Contig', 'Start_Query', 'Query_Pos',
                      'Ref_Loc', 'Query_Loc',
                      'Ref_Start', 'Ref_End',
                      'Query_Start', 'Query_End',
                      'Ref_Base', 'Query_Base',
                      'Dist_to_Ref_End', 'Dist_to_Query_End',
                      'Ref_Aligned', 'Query_Aligned',
                      'Query_Direction', 'Perc_Iden', 'Cat']

    snp_file = pd.read_csv(mum_snps_dir + "/" + report_id + ".snps", sep="\t", index_col=False,
                           names=['Ref_Pos', 'Ref_Base', 'Query_Base', 'Query_Pos',
                                  'SNP_Buffer', 'Dist_to_End', 'Ref_Length', 'Query_Length',
                                  'Ref_Direction', 'Query_Direction', 'Ref_Contig', 'Query_Contig'])

    int_columns = ['Ref_Pos', 'Query_Pos', 'Dist_to_End', 'Ref_Length', 'Query_Length']
    contig_columns = ['Ref_Contig', 'Query_Contig']

    if snp_file.shape[0] == 0:
        return pd.DataFrame(columns=return_columns)
    else:
        for col in int_columns:
            snp_file.loc[:, col] = snp_file.loc[:, col].astype(float).astype(int)
        for col in contig_columns:
            snp_file.loc[:, col] = snp_file.loc[:, col].astype(str)
            coords_file.loc[:, col] = coords_file.loc[:, col].astype(str)

        snp_file['Start_Ref'] = snp_file['Ref_Pos'] - 1
        snp_file['Start_Query'] = snp_file['Query_Pos'] - 1

        snp_file['Dist_to_Ref_End'] = [min([x, y]) for x, y in zip(snp_file['Ref_Pos'], snp_file['Ref_Length'] - snp_file['Ref_Pos'])]
        snp_file['Dist_to_Query_End'] = [min([x, y]) for x, y in zip(snp_file['Query_Pos'], snp_file['Query_Length'] - snp_file['Query_Pos'])]

        snp_file['Ref_Loc'] = ["/".join([str(x[0]), str(x[1])]) for x in list(zip(snp_file.Ref_Contig, snp_file.Ref_Pos))]
        snp_file['Query_Loc'] = ["/".join([str(x[0]), str(x[1])]) for x in list(zip(snp_file.Query_Contig, snp_file.Query_Pos))]

        total_snp_count = snp_file.shape[0]

        valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', "."]

        invalid_file = snp_file[(~snp_file['Ref_Base'].isin(valid_bases)) | (~snp_file['Query_Base'].isin(valid_bases))]
        snp_file = snp_file[(snp_file['Ref_Base'].isin(valid_bases)) & (snp_file['Query_Base'].isin(valid_bases))]
        indel_file = snp_file[(snp_file['Query_Base'] == ".") | (snp_file['Ref_Base'] == ".")]
        snp_file = snp_file[~((snp_file['Query_Base'] == ".") | (snp_file['Ref_Base'] == "."))]

        if total_snp_count >= 10000:
            return (snp_file.shape[0], indel_file.shape[0], invalid_file.shape[0])

        else:
            if snp_file.shape[0] == 0:
                snp_coords = pd.DataFrame(columns=return_columns)
            else:
                snp_coords = makeSNPCoords(snp_file, coords_file, snp_file.shape[0])
                snp_coords['Cat'] = "SNP"

            if indel_file.shape[0] == 0:
                indel_coords = pd.DataFrame(columns=return_columns)
            else:
                indel_coords = makeSNPCoords(indel_file, coords_file, indel_file.shape[0])
                indel_coords['Cat'] = "Indel"

            if invalid_file.shape[0] == 0:
                invalid_coords = pd.DataFrame(columns=return_columns)
            else:
                invalid_coords = makeSNPCoords(invalid_file, coords_file, invalid_file.shape[0])
                invalid_coords['Cat'] = "Invalid"

            return_df = pd.concat([snp_coords, indel_coords, invalid_coords], ignore_index=True)[return_columns]

            assert return_df.shape[0] == total_snp_count
            return return_df

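# makeSNPCoords: left-join variant records onto their alignment blocks, keep
# joins where the variant lies within the block on both genomes, and resolve
# any duplicate joins in favor of the longest, highest-identity alignment.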
def makeSNPCoords(snp_df, coords_df, row_count):
    snp_coords = pd.merge(snp_df, coords_df, how='left').dropna()

    condition = ((snp_coords['Ref_Start'] <= snp_coords['Ref_Pos']) &
                 (snp_coords['Ref_Pos'] <= snp_coords['Ref_End']) &
                 (snp_coords['Query_Start'] <= snp_coords['Query_Pos']) &
                 (snp_coords['Query_Pos'] <= snp_coords['Query_End']))
    snp_coords = snp_coords[condition]

    if row_count != snp_coords.shape[0]:
        snp_coords.sort_values(by=['Ref_Aligned', 'Perc_Iden'], ascending=False, inplace=True)
        snp_coords.drop_duplicates(subset=['Ref_Loc', 'Query_Loc'], keep='first', inplace=True)

    return snp_coords

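# fasta_info: gather assembly stats for a FASTA file (contig count, total
# bases, N50/N90, L50/L90) plus a SHA-256 checksum of the file contents.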
def fasta_info(file_path):
    records = list(SeqIO.parse(file_path, 'fasta'))
    contig_count = int(len(records))
    lengths = sorted([len(record) for record in records], reverse=True)
    assembly_bases = sum(lengths)

    with open(file_path, 'rb') as file:
        sha256 = hashlib.sha256(file.read()).hexdigest()

    cumulative_length = 0
    n50 = None
    n90 = None
    l50 = None
    l90 = None

    for i, length in enumerate(lengths, start=1):
        cumulative_length += length
        if cumulative_length >= assembly_bases * 0.5 and n50 is None:
            n50 = length
            l50 = i
        if cumulative_length >= assembly_bases * 0.9 and n90 is None:
            n90 = length
            l90 = i
        if n50 is not None and n90 is not None:
            break

    return [file_path, contig_count, assembly_bases, n50, n90, l50, l90, sha256]

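# fasta_to_bedtool: build a sorted BedTool with one full-length interval per
# FASTA record; calculate_total_length sums interval lengths over a BedTool.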
def fasta_to_bedtool(fasta_file):
    intervals = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        chrom_name = record.id
        chrom_length = len(record.seq)
        interval = Interval(chrom_name, 0, chrom_length)
        intervals.append(interval)
    bedtool = BedTool(intervals).sort()
    return bedtool

def calculate_total_length(bedtool):
    return sum(len(interval) for interval in bedtool)

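# get_kmers: run a kmer-counting shell command and return its stdout lines as
# a set, retrying up to `retries` times on timeouts or non-zero exit codes.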
def get_kmers(command, log_file, retries=10, delay=5, timeout=30):
    kmer_set = set()

    for attempt in range(retries):
        try:
            # shell=True so the pipe in the kmer command is honored;
            # check=True so non-zero exits raise and trigger a retry
            process = subprocess.run(command, shell=True, check=True,
                                     capture_output=True, text=True, timeout=timeout)
            kmer_set.update(process.stdout.strip().split('\n'))

            if len(kmer_set) > 1:
                return kmer_set

        except subprocess.TimeoutExpired:
            with open(log_file, "a") as log:
                log.write(f"Kmer command timed out after {timeout} seconds\n")

        except subprocess.CalledProcessError as e:
            with open(log_file, "a") as log:
                log.write(f"Kmer command errored out: {e}\n")

        # Wait before retrying if this is not the last attempt
        if attempt < retries - 1:
            time.sleep(delay)
        else:
            raise RuntimeError(f"Command failed after {retries} attempts: {command}")

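# compare_kmers: count kmers in each assembly with BBTools kmercountexact.sh
# and return the two set sizes, unique-kmer counts, the intersection size,
# and a Jaccard similarity percentage.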
def compare_kmers(query_file, reference_file, log_file):

    # kmercountexact.sh writes one "kmer<TAB>count" pair per line; cut -f1
    # keeps the kmer strings. The pipe requires a shell command string
    # (a plain argument list would pass '|' to kmercountexact.sh itself).
    ref_command = f"kmercountexact.sh in={reference_file} threads=1 fastadump=f out=stdout | cut -f1"
    query_command = f"kmercountexact.sh in={query_file} threads=1 fastadump=f out=stdout | cut -f1"

    ref_kmers = get_kmers(ref_command, log_file)
    with open(log_file, "a") as log:
        log.write("Fetched reference kmers...\n")

    query_kmers = get_kmers(query_command, log_file)
    with open(log_file, "a") as log:
        log.write("Fetched query kmers...\n")

    intersection = len(ref_kmers.intersection(query_kmers))
    similarity = 100 * (intersection / (len(ref_kmers) + len(query_kmers) - intersection))
    unique_ref = len(ref_kmers.difference(query_kmers))
    unique_query = len(query_kmers.difference(ref_kmers))

    return [len(ref_kmers), len(query_kmers),
            unique_ref, unique_query,
            intersection, similarity]

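# ---------------------------------------------------------------------------
# Script body: compile MUMmer and kmer results for one query/reference pair
# into a single .snpdiffs file (header line, "##" alignment rows, variant rows).
# ---------------------------------------------------------------------------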
#### 01: Read in arguments ####
run_failed = False

parser = argparse.ArgumentParser(description='CSP2 MUMmer Compilation')
parser.add_argument('--query', type=str, help='Query')
parser.add_argument('--query_fasta', type=str, help='Query Fasta')
parser.add_argument('--reference', type=str, help='Reference')
parser.add_argument('--reference_fasta', type=str, help='Reference Fasta')
parser.add_argument('--mummer_dir', type=str, help='MUMmer Directory')
parser.add_argument('--snpdiffs_dir', type=str, help='snpdiffs Directory')
parser.add_argument('--temp_dir', type=str, default='', help='Temporary Directory')
parser.add_argument('--log_file', type=str, help='Log File')
args = parser.parse_args()

query = args.query
query_fasta = args.query_fasta
reference = args.reference
reference_fasta = args.reference_fasta

mummer_dir = os.path.normpath(os.path.abspath(args.mummer_dir))
snpdiffs_dir = os.path.normpath(os.path.abspath(args.snpdiffs_dir))
log_file = os.path.normpath(os.path.abspath(args.log_file))

with open(log_file, "w") as log:
    log.write(f"CSP2 MUMmer Compilation Log: {query}__vs__{reference}\n")
    log.write(f"\t - Query Fasta: {query_fasta}\n")
    log.write(f"\t - Reference Fasta: {reference_fasta}\n")
    log.write(f"\t - MUMmer Directory: {mummer_dir}\n")
    log.write(f"\t - snpdiffs Directory: {snpdiffs_dir}\n")

# Set TMP
if args.temp_dir != "":
    random_temp_id = str(uuid.uuid4())
    temp_dir = f"{os.path.normpath(os.path.abspath(args.temp_dir))}/{random_temp_id}"
    try:
        os.mkdir(temp_dir)
        helpers.set_tempdir(temp_dir)
        with open(log_file, "a") as log:
            log.write(f"\t - Temporary Directory: {temp_dir}\n")

    except OSError as e:
        run_failed = True
        print(f"Error: Failed to create directory '{temp_dir}': {e}")
else:
    temp_dir = ""

with open(log_file, "a") as log:
    log.write("-------------------------------------------------------\n\n")

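# Main workflow: assembly stats -> kmer comparison -> whole-genome BEDs ->
# MUMmer report/coords/SNP parsing -> snpdiffs output. Any exception marks
# the run as failed; temp files are cleaned up in the finally block.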
try:
    # Get query data
    query_data = [query] + fasta_info(query_fasta)
    query_string = [x + ":" + str(y) for x, y in zip(['Query_ID', 'Query_Assembly', 'Query_Contig_Count', 'Query_Assembly_Bases',
                                                      'Query_N50', 'Query_N90', 'Query_L50', 'Query_L90', 'Query_SHA256'], query_data)]

    with open(log_file, "a") as log:
        log.write("Fetched Query Data...\n")

    # Get reference data
    reference_data = [reference] + fasta_info(reference_fasta)
    reference_string = [x + ":" + str(y) for x, y in zip(['Reference_ID', 'Reference_Assembly', 'Reference_Contig_Count', 'Reference_Assembly_Bases',
                                                          'Reference_N50', 'Reference_N90', 'Reference_L50', 'Reference_L90', 'Reference_SHA256'], reference_data)]

    with open(log_file, "a") as log:
        log.write("Fetched Reference Data...\n")

    # Get kmer distance
    [ref_kmers, query_kmers,
     unique_ref_kmers, unique_query_kmers,
     kmer_intersection, kmer_similarity] = compare_kmers(query_fasta, reference_fasta, log_file)

    with open(log_file, "a") as log:
        log.write("Fetched Kmer Data...\n")

    # Create whole-genome BED files from the FASTA sequence lengths
    query_chr_bed = fasta_to_bedtool(query_fasta)
    reference_chr_bed = fasta_to_bedtool(reference_fasta)

    with open(log_file, "a") as log:
        log.write("Created BED files...\n")

    # Set report ID
    report_id = query + "__vs__" + reference
    snpdiffs_file = snpdiffs_dir + "/" + report_id + ".snpdiffs"

    # Create NA variables for all downstream options
    median_percent_identity = "NA"
    median_alignment_length = "NA"

    total_snp_count = "NA"
    total_indel_count = "NA"
    total_invalid_count = "NA"

    # Empty defaults so the output step below stays safe even when nothing
    # aligned and the coords/SNP parsing is skipped
    coords_file = pd.DataFrame()
    processed_snps = ()

    #### 02: Read in MUMmer report data ####
    [ref_bases, percent_ref_aligned,
     query_bases, percent_query_aligned,
     g_snps, g_indels,
     ref_breakpoints, query_breakpoints,
     ref_relocations, query_relocations,
     ref_translocations, query_translocations,
     ref_inversions, query_inversions,
     ref_insertions, query_insertions,
     ref_tandem, query_tandem] = parseMUmmerReport(mummer_dir, report_id)

    with open(log_file, "a") as log:
        log.write("Parsed MUMmer report...\n")

    if percent_ref_aligned > 0:

        #### 03: Process MUMmer coords file ####
        coords_file = parseMUmmerCoords(mummer_dir, report_id, reference_chr_bed, query_chr_bed)
        with open(log_file, "a") as log:
            log.write("Parsed MUMmer coords...\n")

        # .copy() avoids pandas chained-assignment issues on the slice
        aligned_coords = coords_file[~((coords_file['Query_Contig'] == ".") | (coords_file['Ref_Contig'] == "."))].copy()
        if aligned_coords.shape[0] > 0:
            aligned_coords['Perc_Iden'] = aligned_coords['Perc_Iden'].astype(float)
            aligned_coords['Ref_Aligned'] = aligned_coords['Ref_Aligned'].astype(int)

            median_percent_identity = np.median(aligned_coords['Perc_Iden'])
            median_alignment_length = np.median(aligned_coords['Ref_Aligned'])

        #### 04: Process MUMmer SNP file ####
        processed_snps = parseMUmmerSNPs(mummer_dir, report_id, aligned_coords)
        with open(log_file, "a") as log:
            log.write("Parsed MUMmer SNPs...\n")

        # parseMUmmerSNPs returns a DataFrame, or a tuple of counts when
        # there are 10,000 or more variants
        if isinstance(processed_snps, tuple):
            total_snp_count, total_indel_count, total_invalid_count = processed_snps
        else:
            total_snp_count = processed_snps[processed_snps['Cat'] == "SNP"].shape[0]
            total_indel_count = processed_snps[processed_snps['Cat'] == "Indel"].shape[0]
            total_invalid_count = processed_snps[processed_snps['Cat'] == "Invalid"].shape[0]

    # Clean up pybedtools temp
    helpers.cleanup(verbose=False, remove_all=False)
    with open(log_file, "a") as log:
        log.write("Cleaned up TMP...\n")

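    # Stats remain "NA" when nothing aligned; the conditional f-strings
    # below only format numeric values.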
    # Create header
    percent_ref_aligned = f"{percent_ref_aligned:.2f}" if percent_ref_aligned != "NA" else percent_ref_aligned
    percent_query_aligned = f"{percent_query_aligned:.2f}" if percent_query_aligned != "NA" else percent_query_aligned
    median_percent_identity = f"{median_percent_identity:.2f}" if median_percent_identity != "NA" else median_percent_identity
    median_alignment_length = f"{median_alignment_length:.2f}" if median_alignment_length != "NA" else median_alignment_length
    total_snp_count = f"{total_snp_count:.0f}" if total_snp_count != "NA" else total_snp_count
    total_indel_count = f"{total_indel_count:.0f}" if total_indel_count != "NA" else total_indel_count
    total_invalid_count = f"{total_invalid_count:.0f}" if total_invalid_count != "NA" else total_invalid_count
    ref_breakpoints = f"{ref_breakpoints:.0f}"
    ref_relocations = f"{ref_relocations:.0f}"
    ref_translocations = f"{ref_translocations:.0f}"
    ref_inversions = f"{ref_inversions:.0f}"
    ref_insertions = f"{ref_insertions:.0f}"
    ref_tandem = f"{ref_tandem:.0f}"
    query_breakpoints = f"{query_breakpoints:.0f}"
    query_relocations = f"{query_relocations:.0f}"
    query_translocations = f"{query_translocations:.0f}"
    query_inversions = f"{query_inversions:.0f}"
    query_insertions = f"{query_insertions:.0f}"
    query_tandem = f"{query_tandem:.0f}"
    g_snps = f"{g_snps:.0f}"
    g_indels = f"{g_indels:.0f}"
    ref_kmers = f"{ref_kmers:.0f}"
    query_kmers = f"{query_kmers:.0f}"
    unique_ref_kmers = f"{unique_ref_kmers:.0f}"
    unique_query_kmers = f"{unique_query_kmers:.0f}"
    kmer_intersection = f"{kmer_intersection:.0f}"
    kmer_similarity = f"{kmer_similarity:.2f}"

    snpdiffs_header = []
    snpdiffs_header.append("#\t" +
                           "\t".join(query_string) +
                           "\t" + "\t".join(reference_string) +
                           "\t" + "\t".join([
                               "SNPs:" + total_snp_count,
                               "Reference_Percent_Aligned:" + percent_ref_aligned,
                               "Query_Percent_Aligned:" + percent_query_aligned,
                               "Median_Percent_Identity:" + median_percent_identity,
                               "Median_Alignment_Length:" + median_alignment_length,
                               "Kmer_Similarity:" + kmer_similarity,
                               "Shared_Kmers:" + kmer_intersection,
                               "Reference_Unique_Kmers:" + unique_ref_kmers,
                               "Query_Unique_Kmers:" + unique_query_kmers,
                               "Reference_Breakpoints:" + ref_breakpoints,
                               "Query_Breakpoints:" + query_breakpoints,
                               "Reference_Relocations:" + ref_relocations,
                               "Query_Relocations:" + query_relocations,
                               "Reference_Translocations:" + ref_translocations,
                               "Query_Translocations:" + query_translocations,
                               "Reference_Inversions:" + ref_inversions,
                               "Query_Inversions:" + query_inversions,
                               "Reference_Insertions:" + ref_insertions,
                               "Query_Insertions:" + query_insertions,
                               "Reference_Tandem:" + ref_tandem,
                               "Query_Tandem:" + query_tandem,
                               "Indels:" + total_indel_count,
                               "Invalid:" + total_invalid_count,
                               "gSNPs:" + g_snps,
                               "gIndels:" + g_indels]))

    with open(log_file, "a") as log:
        log.write("Prepped snpdiffs output...\n")

    with open(snpdiffs_file, "w") as file:
        file.write("\n".join(snpdiffs_header) + "\n")
        # Alignment rows carry a "##" prefix; variant rows are written bare
        for index, row in coords_file.iterrows():
            file.write("##\t" + "\t".join(map(str, row)) + "\n")
        if not isinstance(processed_snps, tuple):
            for index, row in processed_snps.iterrows():
                file.write("\t".join(map(str, row)) + "\n")

    with open(log_file, "a") as log:
        log.write("Saved snpdiffs file...\n")

    print(",".join([query, reference, snpdiffs_file]))

except Exception:
    run_failed = True
    print("Exception occurred:\n", traceback.format_exc())
finally:
    helpers.cleanup(verbose=False, remove_all=False)
    if temp_dir != "" and os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    if run_failed:
        sys.exit(1)