Mercurial > repos > jpayne > gtsubsampler
comparison subsamplr.py @ 4:a90a883f88f9
"planemo upload for repository https://toolrepo.galaxytrakr.org/"
author | jpayne |
---|---|
date | Fri, 19 Feb 2021 15:28:20 -0500 |
parents | b2915e7e9dfa |
children | 3852b3edc8a4 |
comparison
equal
deleted
inserted
replaced
3:504004e78363 | 4:a90a883f88f9 |
---|---|
67 with ExitStack() as stack: | 67 with ExitStack() as stack: |
68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files | 68 ins = [stack.enter_context(openn(path, 'r')) for openn, path in zip(file_openers, ins)] # opened input files |
69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input | 69 inns = [iter(grouper(inn, 4)) for inn in ins] # stateful 4-ply iterator over lines in the input |
70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files | 70 outs = [stack.enter_context(openn(path, 'w')) for openn, path in zip(file_openers, outs)] # opened output files |
71 | 71 |
72 for file in ins: | |
73 print(file.name) | |
74 | |
72 # https://en.m.wikipedia.org/wiki/Reservoir_sampling | 75 # https://en.m.wikipedia.org/wiki/Reservoir_sampling |
73 | 76 |
74 reservoir = [] | 77 reservoir = [] |
75 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file | 78 # this is going to be 1 or 2-tuples of 4-tuples representing the 4 lines of the fastq file |
76 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads | 79 # we determine its current coverage (and thus its reservoir size) to fill it, which consumes reads |
77 # from the open files | 80 # from the open files |
78 for readpair in zip(*inns): | 81 reads = 0 |
82 for i, readpair in enumerate(zip(*inns)): | |
83 reads += len(readpair[0][1]) | |
79 reservoir.append(readpair) | 84 reservoir.append(readpair) |
80 if coverage(reservoir, gen_size) > cov: | 85 if reads / gen_size > cov: |
81 break | 86 break |
82 | 87 |
83 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage | 88 k = len(reservoir) # this is about how big the reservoir needs to be to get cov coverage |
84 #W = exp(log(random.random()) / k) | 89 #W = exp(log(random.random()) / k) |
85 | 90 |