annotate CSP2/bin/fetchReads.py @ 0:01431fa12065

"planemo upload"
author rliterman
date Mon, 02 Dec 2024 10:40:55 -0500
parents
children 792274118b2e
rev   line source
#!/usr/bin/env python3
"""List the read files in a directory as CSV rows on stdout.

Each row is ``<sample>,<Paired|Single>,<path(s)>``. For paired reads the
forward and reverse paths are joined with ``;``. Files whose basename starts
with ``Undetermined_`` are ignored.
"""

import os
import sys
from glob import escape, glob
import argparse


def strip_suffix(text, suffix):
    """Return *text* without a trailing *suffix*.

    *text* is returned unchanged when *suffix* is empty or is not at the end.
    Only the trailing occurrence is removed, so the same substring appearing
    earlier (for example in a directory name) is left alone.
    """
    if suffix and text.endswith(suffix):
        return text[:len(text) - len(suffix)]
    return text


def sample_name(path, suffix, trim_name):
    """Build the sample name for *path*.

    Takes the basename, drops the trailing *suffix*, then removes every
    occurrence of *trim_name* (skipped when *trim_name* is empty or None).
    """
    name = strip_suffix(os.path.basename(path), suffix)
    if trim_name:
        name = name.replace(trim_name, "")
    return name


def find_read_files(read_dir, read_filetype):
    """Return the sorted paths in *read_dir* that end with *read_filetype*.

    Files whose basename starts with ``Undetermined_`` are excluded.
    """
    # Escape the directory so characters such as '[' or '*' in its name are
    # matched literally rather than interpreted as glob syntax.
    pattern = os.path.join(escape(read_dir), "*" + read_filetype)
    return [
        path for path in sorted(glob(pattern))
        if not os.path.basename(path).startswith("Undetermined_")
    ]


def build_rows(read_files, read_filetype, forward_suffix, reverse_suffix,
               trim_name):
    """Classify *read_files* and return the CSV rows to print.

    A forward file and a reverse file are paired when their paths are equal
    once the respective suffix is removed from the end. Paired rows come
    first, sorted by path; every remaining file follows as a ``Single`` row
    in the order given by *read_files*.
    """
    forward_stems = {
        strip_suffix(path, forward_suffix)
        for path in read_files if path.endswith(forward_suffix)
    }
    reverse_stems = {
        strip_suffix(path, reverse_suffix)
        for path in read_files if path.endswith(reverse_suffix)
    }

    rows = []
    paired_paths = set()
    # Sort the stems: set iteration order is not stable between runs.
    for stem in sorted(forward_stems & reverse_stems):
        left = stem + forward_suffix
        right = stem + reverse_suffix
        paired_paths.update((left, right))
        base = sample_name(left, forward_suffix, trim_name)
        rows.append(",".join([base, "Paired", ";".join([left, right])]))

    for single in read_files:
        if single in paired_paths:
            continue
        # An unpaired forward read drops the forward suffix; anything else
        # only drops the file extension.
        if single.endswith(forward_suffix):
            suffix = forward_suffix
        else:
            suffix = read_filetype
        base = sample_name(single, suffix, trim_name)
        rows.append(",".join([base, "Single", single]))

    return rows


def main(argv=None):
    """Parse the command line and print one CSV row per sample.

    Exits with a message when ``--read_dir`` is not a directory or when it
    contains no files of the requested type.
    """
    # Parse args
    parser = argparse.ArgumentParser(description='Fetch Reads')
    parser.add_argument('--read_dir', type=str, required=True, help='path to directory containing read files')
    parser.add_argument('--read_filetype', type=str, required=True, help='read filetype information')
    parser.add_argument('--forward_suffix', type=str, required=True, help='forward suffix')
    parser.add_argument('--reverse_suffix', type=str, required=True, help='reverse suffix')
    parser.add_argument('--trim_name', type=str, default='', help='trim name')
    args = parser.parse_args(argv)

    # Get path to directory containing read files
    read_dir = os.path.abspath(args.read_dir)
    if not os.path.isdir(read_dir):
        sys.exit("--read_dir is not a valid path: " + str(read_dir))

    # Get read filetype information
    read_filetype = args.read_filetype
    if not read_filetype.startswith("."):
        read_filetype = "." + read_filetype

    # Check if sequence files exist in directory, ignoring undetermined reads
    read_files = find_read_files(read_dir, read_filetype)
    if not read_files:
        sys.exit("No "+read_filetype +" files detected in "+read_dir)

    for row in build_rows(read_files, read_filetype, args.forward_suffix,
                          args.reverse_suffix, args.trim_name):
        print(row)


if __name__ == "__main__":
    main()