test_galtrakr_eurl_vtec_wgs_pt_23: scripts/ReMatCh/rematch.py annotate

annotate scripts/ReMatCh/rematch.py @ 1:5224b4d51065 tip

"planemo upload commit cb65588391944306ff3cb32a23e1c28f65122014-dirty"

author	cstrittmatter
date	Fri, 11 Mar 2022 15:52:43 -0500
parents	8be2feb96994
children

rev	line source
cstrittmatter@0	1 #!/usr/bin/env python3
cstrittmatter@0	2
cstrittmatter@0	3 # -- coding: utf-8 --
cstrittmatter@0	4
cstrittmatter@0	5 """
cstrittmatter@0	6 rematch.py - Reads mapping against target sequences, checking mapping
cstrittmatter@0	7 and consensus sequences production
cstrittmatter@0	8 <https://github.com/B-UMMI/ReMatCh/>
cstrittmatter@0	9
cstrittmatter@0	10 Copyright (C) 2019 Miguel Machado <mpmachado@medicina.ulisboa.pt>
cstrittmatter@0	11
cstrittmatter@0	12 Last modified: August 08, 2019
cstrittmatter@0	13
cstrittmatter@0	14 This program is free software: you can redistribute it and/or modify
cstrittmatter@0	15 it under the terms of the GNU General Public License as published by
cstrittmatter@0	16 the Free Software Foundation, either version 3 of the License, or
cstrittmatter@0	17 (at your option) any later version.
cstrittmatter@0	18
cstrittmatter@0	19 This program is distributed in the hope that it will be useful,
cstrittmatter@0	20 but WITHOUT ANY WARRANTY; without even the implied warranty of
cstrittmatter@0	21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
cstrittmatter@0	22 GNU General Public License for more details.
cstrittmatter@0	23
cstrittmatter@0	24 You should have received a copy of the GNU General Public License
cstrittmatter@0	25 along with this program. If not, see <http://www.gnu.org/licenses/>.
cstrittmatter@0	26 """
cstrittmatter@0	27
cstrittmatter@0	28 import os
cstrittmatter@0	29 import sys
cstrittmatter@0	30 import time
cstrittmatter@0	31 import argparse
cstrittmatter@0	32
cstrittmatter@0	33 try:
cstrittmatter@0	34 from __init__ import __version__
cstrittmatter@0	35
cstrittmatter@0	36 import modules.utils as utils
cstrittmatter@0	37 import modules.seqFromWebTaxon as seq_from_web_taxon
cstrittmatter@0	38 import modules.download as download
cstrittmatter@0	39 import modules.rematch_module as rematch_module
cstrittmatter@0	40 import modules.checkMLST as check_mlst
cstrittmatter@0	41 except ImportError:
cstrittmatter@0	42 from ReMatCh.__init__ import __version__
cstrittmatter@0	43
cstrittmatter@0	44 from ReMatCh.modules import utils as utils
cstrittmatter@0	45 from ReMatCh.modules import seqFromWebTaxon as seq_from_web_taxon
cstrittmatter@0	46 from ReMatCh.modules import download as download
cstrittmatter@0	47 from ReMatCh.modules import rematch_module as rematch_module
cstrittmatter@0	48 from ReMatCh.modules import checkMLST as check_mlst
cstrittmatter@0	49
cstrittmatter@0	50
cstrittmatter@0	51 def search_fastq_files(directory):
cstrittmatter@0	52 files_extensions = ['.fastq.gz', '.fq.gz']
cstrittmatter@0	53 pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]
cstrittmatter@0	54
cstrittmatter@0	55 list_ids = {}
cstrittmatter@0	56 directories = [d for d in os.listdir(directory) if
cstrittmatter@0	57 not d.startswith('.') and os.path.isdir(os.path.join(directory, d, ''))]
cstrittmatter@0	58 for directory_found in directories:
cstrittmatter@0	59 if directory_found != 'pubmlst':
cstrittmatter@0	60 directory_path = os.path.join(directory, directory_found, '')
cstrittmatter@0	61
cstrittmatter@0	62 fastq_found = []
cstrittmatter@0	63 files = [f for f in os.listdir(directory_path) if
cstrittmatter@0	64 not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))]
cstrittmatter@0	65 for file_found in files:
cstrittmatter@0	66 if file_found.endswith(tuple(files_extensions)):
cstrittmatter@0	67 fastq_found.append(file_found)
cstrittmatter@0	68
cstrittmatter@0	69 if len(fastq_found) == 1:
cstrittmatter@0	70 list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastq_found]
cstrittmatter@0	71 elif len(fastq_found) >= 2:
cstrittmatter@0	72 file_pair = []
cstrittmatter@0	73
cstrittmatter@0	74 # Search pairs
cstrittmatter@0	75 for pe_separation in pair_end_files_separation:
cstrittmatter@0	76 for fastq in fastq_found:
cstrittmatter@0	77 if pe_separation[0] in fastq or pe_separation[1] in fastq:
cstrittmatter@0	78 file_pair.append(fastq)
cstrittmatter@0	79
cstrittmatter@0	80 if len(file_pair) == 2:
cstrittmatter@0	81 break
cstrittmatter@0	82 else:
cstrittmatter@0	83 file_pair = []
cstrittmatter@0	84
cstrittmatter@0	85 # Search single
cstrittmatter@0	86 if len(file_pair) == 0:
cstrittmatter@0	87 for pe_separation in pair_end_files_separation:
cstrittmatter@0	88 for fastq in fastq_found:
cstrittmatter@0	89 if pe_separation[0] not in fastq or pe_separation[1] not in fastq:
cstrittmatter@0	90 file_pair.append(fastq)
cstrittmatter@0	91
cstrittmatter@0	92 if len(file_pair) >= 1:
cstrittmatter@0	93 file_pair = file_pair[0]
cstrittmatter@0	94
cstrittmatter@0	95 if len(file_pair) >= 1:
cstrittmatter@0	96 list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]
cstrittmatter@0	97
cstrittmatter@0	98 return list_ids
cstrittmatter@0	99
cstrittmatter@0	100
cstrittmatter@0	101 def get_list_ids_from_file(file_list_ids):
cstrittmatter@0	102 list_ids = []
cstrittmatter@0	103
cstrittmatter@0	104 with open(file_list_ids, 'rtU') as lines:
cstrittmatter@0	105 for line in lines:
cstrittmatter@0	106 line = line.rstrip('\r\n')
cstrittmatter@0	107 if len(line) > 0:
cstrittmatter@0	108 list_ids.append(line)
cstrittmatter@0	109
cstrittmatter@0	110 if len(list_ids) == 0:
cstrittmatter@0	111 sys.exit('No runIDs were found in ' + file_list_ids)
cstrittmatter@0	112
cstrittmatter@0	113 return list_ids
cstrittmatter@0	114
cstrittmatter@0	115
cstrittmatter@0	116 def get_taxon_run_ids(taxon_name, outputfile):
cstrittmatter@0	117 seq_from_web_taxon.run_seq_from_web_taxon(taxon_name, outputfile, True, True, True, False)
cstrittmatter@0	118
cstrittmatter@0	119 run_ids = []
cstrittmatter@0	120 with open(outputfile, 'rtU') as reader:
cstrittmatter@0	121 for line in reader:
cstrittmatter@0	122 line = line.rstrip('\r\n')
cstrittmatter@0	123 if len(line) > 0:
cstrittmatter@0	124 if not line.startswith('#'):
cstrittmatter@0	125 line = line.split('\t')
cstrittmatter@0	126 run_ids.append(line[0])
cstrittmatter@0	127
cstrittmatter@0	128 return run_ids
cstrittmatter@0	129
cstrittmatter@0	130
cstrittmatter@0	131 def get_list_ids(workdir, file_list_ids, taxon_name):
cstrittmatter@0	132 searched_fastq_files = False
cstrittmatter@0	133 list_ids = []
cstrittmatter@0	134 if file_list_ids is None and taxon_name is None:
cstrittmatter@0	135 list_ids = search_fastq_files(workdir)
cstrittmatter@0	136 searched_fastq_files = True
cstrittmatter@0	137 elif file_list_ids is not None:
cstrittmatter@0	138 list_ids = get_list_ids_from_file(os.path.abspath(file_list_ids))
cstrittmatter@0	139 elif taxon_name is not None and file_list_ids is None:
cstrittmatter@0	140 list_ids = get_taxon_run_ids(taxon_name, os.path.join(workdir, 'IDs_list.seqFromWebTaxon.tab'))
cstrittmatter@0	141
cstrittmatter@0	142 if len(list_ids) == 0:
cstrittmatter@0	143 sys.exit('No IDs were found')
cstrittmatter@0	144 return list_ids, searched_fastq_files
cstrittmatter@0	145
cstrittmatter@0	146
cstrittmatter@0	147 def format_gene_info(gene_specific_info, minimum_gene_coverage, minimum_gene_identity, reported_data_type, summary,
cstrittmatter@0	148 sample, genes_present):
cstrittmatter@0	149 info = None
cstrittmatter@0	150 if gene_specific_info['gene_coverage'] >= minimum_gene_coverage and \
cstrittmatter@0	151 gene_specific_info['gene_identity'] >= minimum_gene_identity:
cstrittmatter@0	152 if summary and sample not in genes_present:
cstrittmatter@0	153 genes_present[sample] = {}
cstrittmatter@0	154
cstrittmatter@0	155 if gene_specific_info['gene_number_positions_multiple_alleles'] == 0:
cstrittmatter@0	156 s = str(gene_specific_info[reported_data_type])
cstrittmatter@0	157 info = str(s)
cstrittmatter@0	158 if summary:
cstrittmatter@0	159 genes_present[sample][gene_specific_info['header']] = str(s)
cstrittmatter@0	160 else:
cstrittmatter@0	161 s = 'multiAlleles_' + str(gene_specific_info[reported_data_type])
cstrittmatter@0	162 info = str(s)
cstrittmatter@0	163 if summary:
cstrittmatter@0	164 genes_present[sample][gene_specific_info['header']] = str(s)
cstrittmatter@0	165 else:
cstrittmatter@0	166 info = 'absent_' + str(gene_specific_info[reported_data_type])
cstrittmatter@0	167
cstrittmatter@0	168 return info, genes_present
cstrittmatter@0	169
cstrittmatter@0	170
cstrittmatter@0	171 def write_data_by_gene(gene_list_reference, minimum_gene_coverage, sample, data_by_gene, outdir, time_str, run_times,
cstrittmatter@0	172 minimum_gene_identity, reported_data_type, summary, genes_present):
cstrittmatter@0	173 combined_report = \
cstrittmatter@0	174 os.path.join(outdir,
cstrittmatter@0	175 'combined_report.data_by_gene.' + run_times + '.' + reported_data_type + '.' + time_str + '.tab')
cstrittmatter@0	176
cstrittmatter@0	177 if reported_data_type == 'coverage_depth':
cstrittmatter@0	178 reported_data_type = 'gene_mean_read_coverage'
cstrittmatter@0	179 elif reported_data_type == 'sequence_coverage':
cstrittmatter@0	180 reported_data_type = 'gene_coverage'
cstrittmatter@0	181
cstrittmatter@0	182 combined_report_exist = os.path.isfile(combined_report)
cstrittmatter@0	183 with open(combined_report, 'at') as writer:
cstrittmatter@0	184 seq_list = sorted(gene_list_reference.keys())
cstrittmatter@0	185 if not combined_report_exist:
cstrittmatter@0	186 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in seq_list]) + '\n')
cstrittmatter@0	187
cstrittmatter@0	188 results = {}
cstrittmatter@0	189 headers = []
cstrittmatter@0	190 for i in data_by_gene:
cstrittmatter@0	191 results[data_by_gene[i]['header']], genes_present = format_gene_info(data_by_gene[i], minimum_gene_coverage,
cstrittmatter@0	192 minimum_gene_identity,
cstrittmatter@0	193 reported_data_type, summary, sample,
cstrittmatter@0	194 genes_present)
cstrittmatter@0	195 headers.append(data_by_gene[i]['header'])
cstrittmatter@0	196
cstrittmatter@0	197 if len(headers) != gene_list_reference:
cstrittmatter@0	198 for gene in gene_list_reference:
cstrittmatter@0	199 if gene not in headers:
cstrittmatter@0	200 results[gene] = 'NA'
cstrittmatter@0	201
cstrittmatter@0	202 writer.write(sample + '\t' + '\t'.join([results[seq] for seq in seq_list]) + '\n')
cstrittmatter@0	203
cstrittmatter@0	204 return genes_present
cstrittmatter@0	205
cstrittmatter@0	206
cstrittmatter@0	207 def write_sample_report(sample, outdir, time_str, file_size, run_successfully_fastq, run_successfully_rematch_first,
cstrittmatter@0	208 run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first,
cstrittmatter@0	209 time_taken_rematch_second, time_taken_sample, sequencing_information, sample_data_general_first,
cstrittmatter@0	210 sample_data_general_second, fastq_used):
cstrittmatter@0	211 sample_report = os.path.join(outdir, 'sample_report.' + time_str + '.tab')
cstrittmatter@0	212 report_exist = os.path.isfile(sample_report)
cstrittmatter@0	213
cstrittmatter@0	214 header_general = ['sample', 'sample_run_successfully', 'sample_run_time', 'files_size', 'download_run_successfully',
cstrittmatter@0	215 'download_run_time', 'rematch_run_successfully_first', 'rematch_run_time_first',
cstrittmatter@0	216 'rematch_run_successfully_second', 'rematch_run_time_second']
cstrittmatter@0	217 header_data_general = ['number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage']
cstrittmatter@0	218 header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source',
cstrittmatter@0	219 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download']
cstrittmatter@0	220
cstrittmatter@0	221 with open(sample_report, 'at') as writer:
cstrittmatter@0	222 if not report_exist:
cstrittmatter@0	223 writer.write('#' + '\t'.join(header_general) + '\t' + '_first\t'.join(header_data_general) + '_first\t' +
cstrittmatter@0	224 '_second\t'.join(header_data_general) + '_second\t' + '\t'.join(header_sequencing) + '\t' +
cstrittmatter@0	225 'fastq_used' + '\n')
cstrittmatter@0	226
cstrittmatter@0	227 writer.write('\t'.join([sample,
cstrittmatter@0	228 str(all([run_successfully_fastq is not False,
cstrittmatter@0	229 run_successfully_rematch_first is not False,
cstrittmatter@0	230 run_successfully_rematch_second is not False])),
cstrittmatter@0	231 str(time_taken_sample),
cstrittmatter@0	232 str(file_size),
cstrittmatter@0	233 str(run_successfully_fastq),
cstrittmatter@0	234 str(time_taken_fastq),
cstrittmatter@0	235 str(run_successfully_rematch_first),
cstrittmatter@0	236 str(time_taken_rematch_first),
cstrittmatter@0	237 str(run_successfully_rematch_second),
cstrittmatter@0	238 str(time_taken_rematch_second)]) +
cstrittmatter@0	239 '\t' + '\t'.join([str(sample_data_general_first[i]) for i in header_data_general]) +
cstrittmatter@0	240 '\t' + '\t'.join([str(sample_data_general_second[i]) for i in header_data_general]) +
cstrittmatter@0	241 '\t' + '\t'.join([str(sequencing_information[i]) for i in header_sequencing]) +
cstrittmatter@0	242 '\t' + ','.join(fastq_used) + '\n')
cstrittmatter@0	243
cstrittmatter@0	244
cstrittmatter@0	245 def concatenate_extra_seq_2_consensus(consensus_sequence, reference_sequence, extra_seq_length, outdir):
cstrittmatter@0	246 reference_dict, ignore, ignore = rematch_module.get_sequence_information(reference_sequence, extra_seq_length)
cstrittmatter@0	247 consensus_dict, genes, ignore = rematch_module.get_sequence_information(consensus_sequence, 0)
cstrittmatter@0	248 number_consensus_with_sequences = 0
cstrittmatter@0	249 for k, values_consensus in list(consensus_dict.items()):
cstrittmatter@0	250 for values_reference in list(reference_dict.values()):
cstrittmatter@0	251 if values_reference['header'] == values_consensus['header']:
cstrittmatter@0	252 if len(set(consensus_dict[k]['sequence'])) > 1:
cstrittmatter@0	253 number_consensus_with_sequences += 1
cstrittmatter@0	254 if extra_seq_length <= len(values_reference['sequence']):
cstrittmatter@0	255 right_extra_seq = \
cstrittmatter@0	256 '' if extra_seq_length == 0 else values_reference['sequence'][-extra_seq_length:]
cstrittmatter@0	257 consensus_dict[k]['sequence'] = \
cstrittmatter@0	258 values_reference['sequence'][:extra_seq_length] + \
cstrittmatter@0	259 consensus_dict[k]['sequence'] + \
cstrittmatter@0	260 right_extra_seq
cstrittmatter@0	261 consensus_dict[k]['length'] += extra_seq_length + len(right_extra_seq)
cstrittmatter@0	262
cstrittmatter@0	263 consensus_concatenated = os.path.join(outdir, 'consensus_concatenated_extraSeq.fasta')
cstrittmatter@0	264 with open(consensus_concatenated, 'wt') as writer:
cstrittmatter@0	265 for i in consensus_dict:
cstrittmatter@0	266 writer.write('>' + consensus_dict[i]['header'] + '\n')
cstrittmatter@0	267 fasta_sequence_lines = rematch_module.chunkstring(consensus_dict[i]['sequence'], 80)
cstrittmatter@0	268 for line in fasta_sequence_lines:
cstrittmatter@0	269 writer.write(line + '\n')
cstrittmatter@0	270
cstrittmatter@0	271 return consensus_concatenated, genes, consensus_dict, number_consensus_with_sequences
cstrittmatter@0	272
cstrittmatter@0	273
cstrittmatter@0	274 def clean_headers_reference_file(reference_file, outdir, extra_seq):
cstrittmatter@0	275 problematic_characters = ["\|", " ", ",", ".", "(", ")", "'", "/", ":"]
cstrittmatter@0	276 print('Checking if reference sequences contain ' + str(problematic_characters) + '\n')
cstrittmatter@0	277 # headers_changed = False
cstrittmatter@0	278 new_reference_file = str(reference_file)
cstrittmatter@0	279 sequences, genes, headers_changed = rematch_module.get_sequence_information(reference_file, extra_seq)
cstrittmatter@0	280 if headers_changed:
cstrittmatter@0	281 print('At least one of the those characters was found. Replacing those with _' + '\n')
cstrittmatter@0	282 new_reference_file = \
cstrittmatter@0	283 os.path.join(outdir, os.path.splitext(os.path.basename(reference_file))[0] + '.headers_renamed.fasta')
cstrittmatter@0	284 with open(new_reference_file, 'wt') as writer:
cstrittmatter@0	285 for i in sequences:
cstrittmatter@0	286 writer.write('>' + sequences[i]['header'] + '\n')
cstrittmatter@0	287 fasta_sequence_lines = rematch_module.chunkstring(sequences[i]['sequence'], 80)
cstrittmatter@0	288 for line in fasta_sequence_lines:
cstrittmatter@0	289 writer.write(line + '\n')
cstrittmatter@0	290 return new_reference_file, genes, sequences
cstrittmatter@0	291
cstrittmatter@0	292
cstrittmatter@0	293 def write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, loci_order, outdir, time_str):
cstrittmatter@0	294 mlst_report = os.path.join(outdir, 'mlst_report.' + time_str + '.tab')
cstrittmatter@0	295 mlst_report_exist = os.path.isfile(mlst_report)
cstrittmatter@0	296 with open(mlst_report, 'at') as writer:
cstrittmatter@0	297 if not mlst_report_exist:
cstrittmatter@0	298 writer.write('\t'.join(['#sample', 'ReMatCh_run', 'consensus_type', 'ST'] + loci_order) + '\n')
cstrittmatter@0	299 writer.write('\t'.join([sample, run_times, consensus_type, str(st)] + alleles_profile.split(',')) + '\n')
cstrittmatter@0	300
cstrittmatter@0	301
cstrittmatter@0	302 def run_get_st(sample, mlst_dicts, consensus_sequences, mlst_consensus, run_times, outdir, time_str):
cstrittmatter@0	303 if mlst_consensus == 'all':
cstrittmatter@0	304 for consensus_type in consensus_sequences:
cstrittmatter@0	305 print('Searching MLST for ' + consensus_type + ' consensus')
cstrittmatter@0	306 st, alleles_profile = check_mlst.get_st(mlst_dicts, consensus_sequences[consensus_type])
cstrittmatter@0	307 write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, mlst_dicts[2], outdir, time_str)
cstrittmatter@0	308 print('ST found: ' + str(st) + ' (' + alleles_profile + ')')
cstrittmatter@0	309 else:
cstrittmatter@0	310 st, alleles_profile = check_mlst.get_st(mlst_dicts, consensus_sequences[mlst_consensus])
cstrittmatter@0	311 write_mlst_report(sample, run_times, mlst_consensus, st, alleles_profile, mlst_dicts[2], outdir, time_str)
cstrittmatter@0	312 print('ST found for ' + mlst_consensus + ' consensus: ' + str(st) + ' (' + alleles_profile + ')')
cstrittmatter@0	313
cstrittmatter@0	314
cstrittmatter@0	315 def write_summary_report(outdir, reported_data_type, time_str, gene_list_reference, genes_present):
cstrittmatter@0	316 with open(os.path.join(outdir,
cstrittmatter@0	317 'summary.{reported_data_type}.{time_str}.tab'.format(reported_data_type=reported_data_type,
cstrittmatter@0	318 time_str=time_str)), 'wt') as writer:
cstrittmatter@0	319 seq_list = []
cstrittmatter@0	320 for info in list(genes_present.values()):
cstrittmatter@0	321 seq_list.extend(list(info.keys()))
cstrittmatter@0	322 seq_list = list(set(seq_list))
cstrittmatter@0	323 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in sorted(seq_list)]) + '\n')
cstrittmatter@0	324 for sample, info in list(genes_present.items()):
cstrittmatter@0	325 data = []
cstrittmatter@0	326 for seq in sorted(seq_list):
cstrittmatter@0	327 if seq in info:
cstrittmatter@0	328 data.append(info[seq])
cstrittmatter@0	329 else:
cstrittmatter@0	330 data.append('NF')
cstrittmatter@0	331 writer.write(sample + '\t' + '\t'.join(data) + '\n')
cstrittmatter@0	332
cstrittmatter@0	333
cstrittmatter@0	334 def run_rematch(args):
cstrittmatter@0	335 workdir = os.path.abspath(args.workdir)
cstrittmatter@0	336 if not os.path.isdir(workdir):
cstrittmatter@0	337 os.makedirs(workdir)
cstrittmatter@0	338
cstrittmatter@0	339 aspera_key = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None
cstrittmatter@0	340
cstrittmatter@0	341 # Start logger
cstrittmatter@0	342 logfile, time_str = utils.start_logger(workdir)
cstrittmatter@0	343
cstrittmatter@0	344 # Get general information
cstrittmatter@0	345 script_path = utils.general_information(logfile, __version__, workdir, time_str, args.doNotUseProvidedSoftware,
cstrittmatter@0	346 aspera_key, args.downloadCramBam, args.SRA, args.SRAopt)
cstrittmatter@0	347
cstrittmatter@0	348 # Set list_ids
cstrittmatter@0	349 list_ids, searched_fastq_files = get_list_ids(workdir, args.listIDs.name if args.listIDs is not None else None,
cstrittmatter@0	350 args.taxon)
cstrittmatter@0	351
cstrittmatter@0	352 mlst_sequences = None
cstrittmatter@0	353 mlst_dicts = None
cstrittmatter@0	354 if args.mlst is not None:
cstrittmatter@0	355 time_taken_pub_mlst, mlst_dicts, mlst_sequences = check_mlst.download_pub_mlst_xml(args.mlst,
cstrittmatter@0	356 args.mlstSchemaNumber,
cstrittmatter@0	357 workdir)
cstrittmatter@0	358 args.softClip_recodeRun = 'first'
cstrittmatter@0	359
cstrittmatter@0	360 if args.reference is None:
cstrittmatter@0	361 if args.mlst is not None:
cstrittmatter@0	362 reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path)
cstrittmatter@0	363 args.extraSeq = 200
cstrittmatter@0	364 if reference_file is None:
cstrittmatter@0	365 print('It was not found provided MLST scheme sequences for ' + args.mlst)
cstrittmatter@0	366 print('Trying to obtain reference MLST sequences from PubMLST')
cstrittmatter@0	367 if len(mlst_sequences) > 0:
cstrittmatter@0	368 reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str)
cstrittmatter@0	369 args.extraSeq = 0
cstrittmatter@0	370 else:
cstrittmatter@0	371 sys.exit('It was not possible to download MLST sequences from PubMLST!')
cstrittmatter@0	372 else:
cstrittmatter@0	373 print('Using provided scheme as referece: ' + reference_file)
cstrittmatter@0	374 else:
cstrittmatter@0	375 sys.exit('Need to provide at least one of the following options: "--reference" and "--mlst"')
cstrittmatter@0	376 else:
cstrittmatter@0	377 reference_file = os.path.abspath(args.reference.name)
cstrittmatter@0	378
cstrittmatter@0	379 # Run ReMatCh for each sample
cstrittmatter@0	380 print('\n' + 'STARTING ReMatCh' + '\n')
cstrittmatter@0	381
cstrittmatter@0	382 # Clean sequences headers
cstrittmatter@0	383 reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir,
cstrittmatter@0	384 args.extraSeq)
cstrittmatter@0	385
cstrittmatter@0	386 if args.mlst is not None:
cstrittmatter@0	387 problem_genes = False
cstrittmatter@0	388 for header in mlst_sequences:
cstrittmatter@0	389 if header not in gene_list_reference:
cstrittmatter@0	390 print('MLST gene {header} not found between reference sequences'.format(header=header))
cstrittmatter@0	391 problem_genes = True
cstrittmatter@0	392 if problem_genes:
cstrittmatter@0	393 sys.exit('Missing MLST genes from reference sequences (at least sequences names do not match)!')
cstrittmatter@0	394
cstrittmatter@0	395 if len(gene_list_reference) == 0:
cstrittmatter@0	396 sys.exit('No sequences left')
cstrittmatter@0	397
cstrittmatter@0	398 # To use in combined report
cstrittmatter@0	399
cstrittmatter@0	400 number_samples_successfully = 0
cstrittmatter@0	401 genes_present_coverage_depth = {}
cstrittmatter@0	402 genes_present_sequence_coverage = {}
cstrittmatter@0	403 for sample in list_ids:
cstrittmatter@0	404 sample_start_time = time.time()
cstrittmatter@0	405 print('\n\n' + 'Sample ID: ' + sample)
cstrittmatter@0	406
cstrittmatter@0	407 # Create sample outdir
cstrittmatter@0	408 sample_outdir = os.path.join(workdir, sample, '')
cstrittmatter@0	409 if not os.path.isdir(sample_outdir):
cstrittmatter@0	410 os.mkdir(sample_outdir)
cstrittmatter@0	411
cstrittmatter@0	412 run_successfully_fastq = None
cstrittmatter@0	413 time_taken_fastq = 0
cstrittmatter@0	414 sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
cstrittmatter@0	415 'library_layout': None, 'library_source': None, 'extra_run_accession': None,
cstrittmatter@0	416 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None}
cstrittmatter@0	417 if not searched_fastq_files:
cstrittmatter@0	418 # Download Files
cstrittmatter@0	419 time_taken_fastq, run_successfully_fastq, fastq_files, sequencing_information = \
cstrittmatter@0	420 download.run_download(sample, args.downloadLibrariesType, aspera_key, sample_outdir,
cstrittmatter@0	421 args.downloadCramBam, args.threads, args.downloadInstrumentPlatform, args.SRA,
cstrittmatter@0	422 args.SRAopt)
cstrittmatter@0	423 else:
cstrittmatter@0	424 fastq_files = list_ids[sample]
cstrittmatter@0	425
cstrittmatter@0	426 file_size = None
cstrittmatter@0	427
cstrittmatter@0	428 run_successfully_rematch_first = None
cstrittmatter@0	429 run_successfully_rematch_second = None
cstrittmatter@0	430 time_taken_rematch_first = 0
cstrittmatter@0	431 time_taken_rematch_second = 0
cstrittmatter@0	432 sample_data_general_first = None
cstrittmatter@0	433 sample_data_general_second = None
cstrittmatter@0	434 if run_successfully_fastq is not False:
cstrittmatter@0	435 file_size = sum(os.path.getsize(fastq) for fastq in fastq_files)
cstrittmatter@0	436 # Run ReMatCh
cstrittmatter@0	437 time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, \
cstrittmatter@0	438 consensus_files, consensus_sequences = \
cstrittmatter@0	439 rematch_module.run_rematch_module(sample, fastq_files, reference_file, args.threads, sample_outdir,
cstrittmatter@0	440 args.extraSeq, args.minCovPresence, args.minCovCall,
cstrittmatter@0	441 args.minFrequencyDominantAllele, args.minGeneCoverage,
cstrittmatter@0	442 args.debug, args.numMapLoc, args.minGeneIdentity,
cstrittmatter@0	443 'first', args.softClip_baseQuality, args.softClip_recodeRun,
cstrittmatter@0	444 reference_dict, args.softClip_cigarFlagRecode,
cstrittmatter@0	445 args.bowtieAlgo, args.bowtieOPT,
cstrittmatter@0	446 gene_list_reference, args.notWriteConsensus, clean_run=True)
cstrittmatter@0	447 if run_successfully_rematch_first:
cstrittmatter@0	448 if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'):
cstrittmatter@0	449 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str)
cstrittmatter@0	450 genes_present_coverage_depth = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample,
cstrittmatter@0	451 data_by_gene, workdir, time_str, 'first_run',
cstrittmatter@0	452 args.minGeneIdentity, 'coverage_depth', args.summary,
cstrittmatter@0	453 genes_present_coverage_depth)
cstrittmatter@0	454 if args.reportSequenceCoverage:
cstrittmatter@0	455 genes_present_sequence_coverage = write_data_by_gene(gene_list_reference, args.minGeneCoverage,
cstrittmatter@0	456 sample, data_by_gene, workdir, time_str,
cstrittmatter@0	457 'first_run', args.minGeneIdentity,
cstrittmatter@0	458 'sequence_coverage', args.summary,
cstrittmatter@0	459 genes_present_sequence_coverage)
cstrittmatter@0	460 if args.doubleRun:
cstrittmatter@0	461 rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '')
cstrittmatter@0	462 if not os.path.isdir(rematch_second_outdir):
cstrittmatter@0	463 os.mkdir(rematch_second_outdir)
cstrittmatter@0	464 consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict, \
cstrittmatter@0	465 number_consensus_with_sequences = \
cstrittmatter@0	466 concatenate_extra_seq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq,
cstrittmatter@0	467 rematch_second_outdir)
cstrittmatter@0	468 if len(consensus_concatenated_gene_list) > 0:
cstrittmatter@0	469 if args.mlst is None or \
cstrittmatter@0	470 (args.mlst is not None and number_consensus_with_sequences == len(gene_list_reference)):
cstrittmatter@0	471 time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, \
cstrittmatter@0	472 sample_data_general_second, consensus_files, consensus_sequences = \
cstrittmatter@0	473 rematch_module.run_rematch_module(sample, fastq_files, consensus_concatenated_fasta,
cstrittmatter@0	474 args.threads, rematch_second_outdir, args.extraSeq,
cstrittmatter@0	475 args.minCovPresence, args.minCovCall,
cstrittmatter@0	476 args.minFrequencyDominantAllele, args.minGeneCoverage,
cstrittmatter@0	477 args.debug, args.numMapLoc,
cstrittmatter@0	478 args.minGeneIdentity, 'second',
cstrittmatter@0	479 args.softClip_baseQuality, args.softClip_recodeRun,
cstrittmatter@0	480 consensus_concatenated_dict,
cstrittmatter@0	481 args.softClip_cigarFlagRecode,
cstrittmatter@0	482 args.bowtieAlgo, args.bowtieOPT,
cstrittmatter@0	483 gene_list_reference, args.notWriteConsensus,
cstrittmatter@0	484 clean_run=True)
cstrittmatter@0	485 if not args.debug:
cstrittmatter@0	486 os.remove(consensus_concatenated_fasta)
cstrittmatter@0	487 if run_successfully_rematch_second:
cstrittmatter@0	488 if args.mlst is not None and (args.mlstRun == 'second' or args.mlstRun == 'all'):
cstrittmatter@0	489 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second',
cstrittmatter@0	490 workdir, time_str)
cstrittmatter@0	491 _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene,
cstrittmatter@0	492 workdir, time_str, 'second_run', args.minGeneIdentity,
cstrittmatter@0	493 'coverage_depth', False, {})
cstrittmatter@0	494 if args.reportSequenceCoverage:
cstrittmatter@0	495 _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample,
cstrittmatter@0	496 data_by_gene, workdir, time_str, 'second_run',
cstrittmatter@0	497 args.minGeneIdentity, 'sequence_coverage', False, {})
cstrittmatter@0	498 else:
cstrittmatter@0	499 print('Some sequences missing after ReMatCh module first run. Second run will not be'
cstrittmatter@0	500 ' performed')
cstrittmatter@0	501 if os.path.isfile(consensus_concatenated_fasta):
cstrittmatter@0	502 os.remove(consensus_concatenated_fasta)
cstrittmatter@0	503 if os.path.isdir(rematch_second_outdir):
cstrittmatter@0	504 utils.remove_directory(rematch_second_outdir)
cstrittmatter@0	505 else:
cstrittmatter@0	506 print('No sequences left after ReMatCh module first run. Second run will not be performed')
cstrittmatter@0	507 if os.path.isfile(consensus_concatenated_fasta):
cstrittmatter@0	508 os.remove(consensus_concatenated_fasta)
cstrittmatter@0	509 if os.path.isdir(rematch_second_outdir):
cstrittmatter@0	510 utils.remove_directory(rematch_second_outdir)
cstrittmatter@0	511
cstrittmatter@0	512 if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
cstrittmatter@0	513 for fastq in fastq_files:
cstrittmatter@0	514 if os.path.isfile(fastq):
cstrittmatter@0	515 os.remove(fastq)
cstrittmatter@0	516
cstrittmatter@0	517 time_taken = utils.run_time(sample_start_time)
cstrittmatter@0	518
cstrittmatter@0	519 write_sample_report(sample, workdir, time_str, file_size, run_successfully_fastq,
cstrittmatter@0	520 run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq,
cstrittmatter@0	521 time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencing_information,
cstrittmatter@0	522 sample_data_general_first if run_successfully_rematch_first else
cstrittmatter@0	523 {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
cstrittmatter@0	524 'mean_sample_coverage': None},
cstrittmatter@0	525 sample_data_general_second if run_successfully_rematch_second else
cstrittmatter@0	526 {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
cstrittmatter@0	527 'mean_sample_coverage': None},
cstrittmatter@0	528 fastq_files if fastq_files is not None else '')
cstrittmatter@0	529
cstrittmatter@0	530 if all([run_successfully_fastq is not False,
cstrittmatter@0	531 run_successfully_rematch_first is not False,
cstrittmatter@0	532 run_successfully_rematch_second is not False]):
cstrittmatter@0	533 number_samples_successfully += 1
cstrittmatter@0	534
cstrittmatter@0	535 if args.summary:
cstrittmatter@0	536 write_summary_report(workdir, 'coverage_depth', time_str, gene_list_reference, genes_present_coverage_depth)
cstrittmatter@0	537 if args.reportSequenceCoverage:
cstrittmatter@0	538 write_summary_report(workdir, 'sequence_coverage', time_str, gene_list_reference,
cstrittmatter@0	539 genes_present_sequence_coverage)
cstrittmatter@0	540
cstrittmatter@0	541 return number_samples_successfully, len(list_ids)
cstrittmatter@0	542
cstrittmatter@0	543
cstrittmatter@0	544 def main():
cstrittmatter@0	545 if sys.version_info[0] < 3:
cstrittmatter@0	546 sys.exit('Must be using Python 3. Try calling "python3 rematch.py"')
cstrittmatter@0	547
cstrittmatter@0	548 parser = argparse.ArgumentParser(prog='rematch.py',
cstrittmatter@0	549 description='Reads mapping against target sequences, checking mapping and'
cstrittmatter@0	550 ' consensus sequences production',
cstrittmatter@0	551 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
cstrittmatter@0	552 parser.add_argument('--version', help='Version information', action='version',
cstrittmatter@0	553 version='{prog} v{version}'.format(prog=parser.prog, version=__version__))
cstrittmatter@0	554
cstrittmatter@0	555 parser_optional_general = parser.add_argument_group('General facultative options')
cstrittmatter@0	556 parser_optional_general.add_argument('-r', '--reference', type=argparse.FileType('r'),
cstrittmatter@0	557 metavar='/path/to/reference_sequence.fasta',
cstrittmatter@0	558 help='Fasta file containing reference sequences', required=False)
cstrittmatter@0	559 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
cstrittmatter@0	560 help='Path to the directory where ReMatCh will run and produce the outputs'
cstrittmatter@0	561 ' with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end'
cstrittmatter@0	562 ' direction coded as _R1_001 / _R2_001 or _1 / _2) already'
cstrittmatter@0	563 ' present (organized in sample folders) or to be downloaded',
cstrittmatter@0	564 required=False, default='.')
cstrittmatter@0	565 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
cstrittmatter@0	566 required=False, default=1)
cstrittmatter@0	567 parser_optional_general.add_argument('--mlst', type=str, metavar='"Streptococcus agalactiae"',
cstrittmatter@0	568 help='Species name (same as in PubMLST) to be used in MLST'
cstrittmatter@0	569 ' determination. ReMatCh will use Bowtie2 very-sensitive-local mapping'
cstrittmatter@0	570 ' parameters and will recode the soft clip CIGAR flags of the first run',
cstrittmatter@0	571 required=False)
cstrittmatter@0	572 parser_optional_general.add_argument('--doNotUseProvidedSoftware', action='store_true',
cstrittmatter@0	573 help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are'
cstrittmatter@0	574 ' provided with it')
cstrittmatter@0	575
cstrittmatter@0	576 parser_optional_download_exclusive = parser.add_mutually_exclusive_group()
cstrittmatter@0	577 parser_optional_download_exclusive.add_argument('-l', '--listIDs', type=argparse.FileType('r'),
cstrittmatter@0	578 metavar='/path/to/list_IDs.txt',
cstrittmatter@0	579 help='Path to list containing the IDs to be'
cstrittmatter@0	580 ' downloaded (one per line)', required=False)
cstrittmatter@0	581 parser_optional_download_exclusive.add_argument('-t', '--taxon', type=str, metavar='"Streptococcus agalactiae"',
cstrittmatter@0	582 help='Taxon name for which ReMatCh will download fastq files',
cstrittmatter@0	583 required=False)
cstrittmatter@0	584
cstrittmatter@0	585 parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options')
cstrittmatter@0	586 parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N',
cstrittmatter@0	587 help='Sequence length added to both ends of target sequences (usefull to'
cstrittmatter@0	588 ' improve reads mapping to the target one) that will be trimmed in'
cstrittmatter@0	589 ' ReMatCh outputs', required=False, default=0)
cstrittmatter@0	590 parser_optional_rematch.add_argument('--minCovPresence', type=int, metavar='N',
cstrittmatter@0	591 help='Reference position minimum coverage depth to consider the position to be'
cstrittmatter@0	592 ' present in the sample', required=False, default=5)
cstrittmatter@0	593 parser_optional_rematch.add_argument('--minCovCall', type=int, metavar='N',
cstrittmatter@0	594 help='Reference position minimum coverage depth to perform a base call. Lower'
cstrittmatter@0	595 ' coverage will be coded as N', required=False, default=10)
cstrittmatter@0	596 parser_optional_rematch.add_argument('--minFrequencyDominantAllele', type=float, metavar='0.6',
cstrittmatter@0	597 help='Minimum relative frequency of the dominant allele coverage depth (value'
cstrittmatter@0	598 ' between [0, 1]). Positions with lower values will be considered as'
cstrittmatter@0	599 ' having multiple alleles (and will be coded as N)', required=False,
cstrittmatter@0	600 default=0.6)
cstrittmatter@0	601 parser_optional_rematch.add_argument('--minGeneCoverage', type=int, metavar='N',
cstrittmatter@0	602 help='Minimum percentage of target reference gene sequence covered'
cstrittmatter@0	603 ' by --minCovPresence to consider a gene to be present (value'
cstrittmatter@0	604 ' between [0, 100])', required=False, default=70)
cstrittmatter@0	605 parser_optional_rematch.add_argument('--minGeneIdentity', type=int, metavar='N',
cstrittmatter@0	606 help='Minimum percentage of identity of reference gene sequence covered'
cstrittmatter@0	607 ' by --minCovCall to consider a gene to be present (value'
cstrittmatter@0	608 ' between [0, 100]). One INDEL will be considered as one difference',
cstrittmatter@0	609 required=False, default=80)
cstrittmatter@0	610 parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help=argparse.SUPPRESS, required=False,
cstrittmatter@0	611 default=1)
cstrittmatter@0	612 # parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help='Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)', required=False, default=1)
cstrittmatter@0	613 parser_optional_rematch.add_argument('--doubleRun', action='store_true',
cstrittmatter@0	614 help='Tells ReMatCh to run a second time using as reference the noMatter'
cstrittmatter@0	615 ' consensus sequence produced in the first run. This will improve'
cstrittmatter@0	616 ' consensus sequence determination for sequences with high percentage of'
cstrittmatter@0	617 ' target reference gene sequence covered')
cstrittmatter@0	618 parser_optional_rematch.add_argument('--reportSequenceCoverage', action='store_true',
cstrittmatter@0	619 help='Produce an extra combined_report.data_by_gene with the sequence coverage'
cstrittmatter@0	620 ' instead of coverage depth')
cstrittmatter@0	621 parser_optional_rematch.add_argument('--summary', action='store_true',
cstrittmatter@0	622 help='Produce extra report files containing only sequences present in at least'
cstrittmatter@0	623 ' one sample (usefull when using a large number of reference'
cstrittmatter@0	624 ' sequences, and only for first run)')
cstrittmatter@0	625 parser_optional_rematch.add_argument('--notWriteConsensus', action='store_true',
cstrittmatter@0	626 help='Do not write consensus sequences')
cstrittmatter@0	627 parser_optional_rematch.add_argument('--bowtieAlgo', type=str, metavar='"--very-sensitive-local"',
cstrittmatter@0	628 help='Bowtie2 alignment mode. It can be an end-to-end alignment (unclipped'
cstrittmatter@0	629 ' alignment) or local alignment (soft clipped alignment). Also, can'
cstrittmatter@0	630 ' choose between fast or sensitive alignments. Please check Bowtie2'
cstrittmatter@0	631 ' manual for extra'
cstrittmatter@0	632 ' information: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml .'
cstrittmatter@0	633 ' This option should be provided between quotes and starting with'
cstrittmatter@0	634 ' an empty space (like --bowtieAlgo " --very-fast") or using equal'
cstrittmatter@0	635 ' sign (like --bowtieAlgo="--very-fast")',
cstrittmatter@0	636 required=False, default='--very-sensitive-local')
cstrittmatter@0	637 parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"',
cstrittmatter@0	638 help='Extra Bowtie2 options. This option should be provided between quotes and'
cstrittmatter@0	639 ' starting with an empty space (like --bowtieOPT " --no-mixed") or using'
cstrittmatter@0	640 ' equal sign (like --bowtieOPT="--no-mixed")',
cstrittmatter@0	641 required=False)
cstrittmatter@0	642 parser_optional_rematch.add_argument('--debug', action='store_true',
cstrittmatter@0	643 help='DeBug Mode: do not remove temporary files')
cstrittmatter@0	644
cstrittmatter@0	645 parser_optional_mlst = parser.add_argument_group('MLST facultative options')
cstrittmatter@0	646 parser_optional_rematch.add_argument('--mlstReference', action='store_true',
cstrittmatter@0	647 help='If the curated scheme for MLST alleles is available, tells ReMatCh to'
cstrittmatter@0	648 ' use these as reference (force Bowtie2 to run with very-sensitive-local'
cstrittmatter@0	649 ' parameters, and sets --extraSeq to 200), otherwise ReMatCh uses the'
cstrittmatter@0	650 ' first alleles of each MLST gene fragment in PubMLST as reference'
cstrittmatter@0	651 ' sequences (force Bowtie2 to run with very-sensitive-local parameters,'
cstrittmatter@0	652 ' and sets --extraSeq to 0)')
cstrittmatter@0	653 parser_optional_mlst.add_argument('--mlstSchemaNumber', type=int, metavar='N',
cstrittmatter@0	654 help='Number of the species PubMLST schema to be used in case of multiple schemes'
cstrittmatter@0	655 ' available (by default will use the first schema)', required=False)
cstrittmatter@0	656 parser_optional_mlst.add_argument('--mlstConsensus', choices=['noMatter', 'correct', 'alignment', 'all'], type=str,
cstrittmatter@0	657 metavar='noMatter',
cstrittmatter@0	658 help='Consensus sequence to be used in MLST'
cstrittmatter@0	659 ' determination (available options: %(choices)s)', required=False,
cstrittmatter@0	660 default='noMatter')
cstrittmatter@0	661 parser_optional_mlst.add_argument('--mlstRun', choices=['first', 'second', 'all'], type=str, metavar='first',
cstrittmatter@0	662 help='ReMatCh run outputs to be used in MLST determination (available'
cstrittmatter@0	663 ' options: %(choices)s)', required=False, default='all')
cstrittmatter@0	664
cstrittmatter@0	665 parser_optional_download = parser.add_argument_group('Download facultative options')
cstrittmatter@0	666 parser_optional_download.add_argument('-a', '--asperaKey', type=argparse.FileType('r'),
cstrittmatter@0	667 metavar='/path/to/asperaweb_id_dsa.openssh',
cstrittmatter@0	668 help='Tells ReMatCh to download fastq files from ENA using Aspera'
cstrittmatter@0	669 ' Connect. With this option, the path to Private-key file'
cstrittmatter@0	670 ' asperaweb_id_dsa.openssh must be provided (normaly found in'
cstrittmatter@0	671 ' ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).', required=False)
cstrittmatter@0	672 parser_optional_download.add_argument('-k', '--keepDownloadedFastq', action='store_true',
cstrittmatter@0	673 help='Tells ReMatCh to keep the fastq files downloaded')
cstrittmatter@0	674 parser_optional_download.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED',
cstrittmatter@0	675 help='Tells ReMatCh to download files with specific library'
cstrittmatter@0	676 ' layout (available options: %(choices)s)',
cstrittmatter@0	677 choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH')
cstrittmatter@0	678 parser_optional_download.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA',
cstrittmatter@0	679 help='Tells ReMatCh to download files with specific library layout (available'
cstrittmatter@0	680 ' options: %(choices)s)', choices=['ILLUMINA', 'ALL'], required=False,
cstrittmatter@0	681 default='ILLUMINA')
cstrittmatter@0	682 parser_optional_download.add_argument('--downloadCramBam', action='store_true',
cstrittmatter@0	683 help='Tells ReMatCh to also download cram/bam files and convert them to fastq'
cstrittmatter@0	684 ' files')
cstrittmatter@0	685
cstrittmatter@0	686 parser_optional_sra = parser.add_mutually_exclusive_group()
cstrittmatter@0	687 parser_optional_sra.add_argument('--SRA', action='store_true',
cstrittmatter@0	688 help='Tells getSeqENA.py to download reads in fastq format only from NCBI SRA'
cstrittmatter@0	689 ' database (not recommended)')
cstrittmatter@0	690 parser_optional_sra.add_argument('--SRAopt', action='store_true',
cstrittmatter@0	691 help='Tells getSeqENA.py to download reads from NCBI SRA if the download from ENA'
cstrittmatter@0	692 ' fails')
cstrittmatter@0	693
cstrittmatter@0	694 parser_optional_soft_clip = parser.add_argument_group('Soft clip facultative options')
cstrittmatter@0	695 parser_optional_soft_clip.add_argument('--softClip_baseQuality', type=int, metavar='N',
cstrittmatter@0	696 help='Base quality phred score in reads soft clipped regions',
cstrittmatter@0	697 required=False,
cstrittmatter@0	698 default=7)
cstrittmatter@0	699 parser_optional_soft_clip.add_argument('--softClip_recodeRun', type=str, metavar='first',
cstrittmatter@0	700 help='ReMatCh run to recode soft clipped regions (available'
cstrittmatter@0	701 ' options: %(choices)s)', choices=['first', 'second', 'both', 'none'],
cstrittmatter@0	702 required=False, default='none')
cstrittmatter@0	703 parser_optional_soft_clip.add_argument('--softClip_cigarFlagRecode', type=str, metavar='M',
cstrittmatter@0	704 help='CIGAR flag to recode CIGAR soft clip (available options: %(choices)s)',
cstrittmatter@0	705 choices=['M', 'I', 'X'], required=False, default='X')
cstrittmatter@0	706
cstrittmatter@0	707 args = parser.parse_args()
cstrittmatter@0	708
cstrittmatter@0	709 msg = []
cstrittmatter@0	710 if args.reference is None and not args.mlstReference:
cstrittmatter@0	711 msg.append('At least --reference or --mlstReference should be provided')
cstrittmatter@0	712 elif args.reference is not None and args.mlstReference:
cstrittmatter@0	713 msg.append('Only --reference or --mlstReference should be provided')
cstrittmatter@0	714 else:
cstrittmatter@0	715 if args.mlstReference:
cstrittmatter@0	716 if args.mlst is None:
cstrittmatter@0	717 msg.append('Please provide species name using --mlst')
cstrittmatter@0	718 if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1:
cstrittmatter@0	719 msg.append('--minFrequencyDominantAllele should be a value between [0, 1]')
cstrittmatter@0	720 if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
cstrittmatter@0	721 msg.append('--minGeneCoverage should be a value between [0, 100]')
cstrittmatter@0	722 if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
cstrittmatter@0	723 msg.append('--minGeneIdentity should be a value between [0, 100]')
cstrittmatter@0	724 if args.notWriteConsensus and args.doubleRun:
cstrittmatter@0	725 msg.append('--notWriteConsensus and --doubleRun cannot be used together.'
cstrittmatter@0	726 ' Maybe you only want to use --doubleRun')
cstrittmatter@0	727
cstrittmatter@0	728 if len(msg) > 0:
cstrittmatter@0	729 argparse.ArgumentParser.error('\n'.join(msg))
cstrittmatter@0	730
cstrittmatter@0	731 start_time = time.time()
cstrittmatter@0	732
cstrittmatter@0	733 number_samples_successfully, samples_total_number = run_rematch(args)
cstrittmatter@0	734
cstrittmatter@0	735 print('\n' + 'END ReMatCh')
cstrittmatter@0	736 print('\n' +
cstrittmatter@0	737 str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) + ' run successfully')
cstrittmatter@0	738 time_taken = utils.run_time(start_time)
cstrittmatter@0	739 del time_taken
cstrittmatter@0	740
cstrittmatter@0	741 if number_samples_successfully == 0:
cstrittmatter@0	742 sys.exit('No samples run successfully!')
cstrittmatter@0	743
cstrittmatter@0	744
cstrittmatter@0	745 if __name__ == "__main__":
cstrittmatter@0	746 main()

Mercurial > repos > cstrittmatter > test_galtrakr_eurl_vtec_wgs_pt_23

annotate scripts/ReMatCh/rematch.py @ 1:5224b4d51065 tip