Mercurial > repos > jpayne > gtsubsampler
changeset 4:a90a883f88f9
"planemo upload for repository https://toolrepo.galaxytrakr.org/"
author | jpayne |
---|---|
date | Fri, 19 Feb 2021 15:28:20 -0500 |
parents | 504004e78363 |
children | 3852b3edc8a4 |
files | subsamplr.py |
diffstat | 1 files changed, 7 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/subsamplr.py Fri Feb 19 14:57:58 2021 -0500
+++ b/subsamplr.py Fri Feb 19 15:28:20 2021 -0500
@@ -69,15 +69,20 @@
         inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input
         outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files
 
+        for file in ins:
+            print(file.name)
+
         # https://en.m.wikipedia.org/wiki/Reservoir_sampling
 
         reservoir = [] # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file
 
         # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads
         # from the open files
-        for readpair in zip(*inns):
+        reads = 0
+        for i, readpair in enumerate(zip(*inns)):
+            reads += len(readpair[0][1])
             reservoir.append(readpair)
-            if coverage(reservoir, gen_size) > cov:
+            if reads / gen_size > cov:
                 break
 
         k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage