comparison subsamplr.py @ 4:a90a883f88f9

"planemo upload for repository https://toolrepo.galaxytrakr.org/"
author jpayne
date Fri, 19 Feb 2021 15:28:20 -0500
parents b2915e7e9dfa
children 3852b3edc8a4
comparison
equal deleted inserted replaced
3:504004e78363 4:a90a883f88f9
67 with ExitStack() as stack: 67 with ExitStack() as stack:
68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files 68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files
69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input 69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input
70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files 70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files
71 71
72 for file in ins:
73 print(file.name)
74
72 # https://en.m.wikipedia.org/wiki/Reservoir_sampling 75 # https://en.m.wikipedia.org/wiki/Reservoir_sampling
73 76
74 reservoir = [] 77 reservoir = []
75 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file 78 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file
76 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads 79 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads
77 # from the open files 80 # from the open files
78 for readpair in zip(*inns): 81 reads = 0
82 for i, readpair in enumerate(zip(*inns)):
83 reads += len(readpair[0][1])
79 reservoir.append(readpair) 84 reservoir.append(readpair)
80 if coverage(reservoir, gen_size) > cov: 85 if reads / gen_size > cov:
81 break 86 break
82 87
83 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage 88 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage
84 #W = exp(log(random.random()) / k) 89 #W = exp(log(random.random()) / k)
85 90