#!/usr/bin/env python3
"""Fetch Reads: list the read files in a directory as paired or single-end.

One comma-separated row is printed to stdout per sample:

    <sample>,Paired,<forward path>;<reverse path>
    <sample>,Single,<path>

Files whose basename starts with ``Undetermined_`` are ignored.
"""

import os
import sys
from glob import glob
from glob import escape
import argparse


def strip_suffix(text: str, suffix: str) -> str:
    """Return *text* without a trailing *suffix*.

    Only a suffix at the very end of the string is removed; occurrences
    elsewhere (for example in a parent directory name) are left untouched.
    *text* is returned unchanged when *suffix* is empty or absent.
    """
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text


def sample_name(path: str, suffix: str, trim_name: str = "") -> str:
    """Derive the sample name for *path*.

    The basename has its trailing *suffix* stripped, then every occurrence
    of *trim_name* is removed (an empty *trim_name* removes nothing).
    """
    base = strip_suffix(os.path.basename(path), suffix)
    return base.replace(trim_name, "") if trim_name else base


def classify_reads(read_files: list, read_filetype: str, forward_suffix: str,
                   reverse_suffix: str, trim_name: str = "") -> list:
    """Group read files into paired-end and single-end output rows.

    Args:
        read_files: Paths of the candidate read files. Single-end rows are
            emitted in this order.
        read_filetype: Extension (with leading dot) shared by all files.
        forward_suffix: Suffix marking a forward read file.
        reverse_suffix: Suffix marking a reverse read file.
        trim_name: Substring removed from every sample name.

    Returns:
        A list of ``name,Paired,fwd;rev`` rows (sorted by path stem so the
        output is reproducible between runs) followed by ``name,Single,path``
        rows for every file that is not part of a pair.
    """
    forward_stems = {strip_suffix(path, forward_suffix)
                     for path in read_files if path.endswith(forward_suffix)}
    reverse_stems = {strip_suffix(path, reverse_suffix)
                     for path in read_files if path.endswith(reverse_suffix)}

    rows = []
    paired_paths = set()
    # A stem present with both suffixes is a pair; sort because set order
    # varies from run to run under string hash randomisation.
    for stem in sorted(forward_stems & reverse_stems):
        left = stem + forward_suffix
        right = stem + reverse_suffix
        paired_paths.update((left, right))
        name = sample_name(left, forward_suffix, trim_name)
        rows.append(",".join([name, "Paired", ";".join([left, right])]))

    for single in read_files:
        if single in paired_paths:
            continue
        # An unpaired forward read drops the whole forward suffix; anything
        # else (including an unpaired reverse read) only drops the filetype.
        if single.endswith(forward_suffix):
            suffix = forward_suffix
        else:
            suffix = read_filetype
        name = sample_name(single, suffix, trim_name)
        rows.append(",".join([name, "Single", single]))

    return rows


def parse_args(argv=None):
    """Parse command-line arguments (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser(description='Fetch Reads')
    # Required: without it the script cannot do anything useful, and a usage
    # error is clearer than a TypeError from os.path.abspath(None).
    parser.add_argument('--read_dir', type=str, required=True, help='path to directory containing read files')
    parser.add_argument('--read_filetype', default='fastq.gz', type=str, help='read filetype information')
    parser.add_argument('--forward_suffix', default='_1.fastq.gz', type=str, help='forward suffix')
    parser.add_argument('--reverse_suffix', default='_2.fastq.gz', type=str, help='reverse suffix')
    parser.add_argument('--trim_name', nargs='?', const="", default="", type=str, help='Trim name')
    return parser.parse_args(argv)


def main(argv=None):
    """Script entry point: validate inputs, find read files, print the rows."""
    args = parse_args(argv)

    # Get path to directory containing read files
    read_dir = os.path.abspath(args.read_dir)
    if not os.path.isdir(read_dir):
        sys.exit("--read_dir is not a valid path: " + str(read_dir))

    # Get read filetype information
    read_filetype = args.read_filetype
    if not read_filetype.startswith("."):
        read_filetype = "." + read_filetype

    # Check if sequence files exist in directory, ignoring undetermined reads.
    # The directory is escaped so glob metacharacters in its name are literal.
    read_files = sorted(glob(escape(read_dir) + "/*" + read_filetype))
    read_files = [r for r in read_files
                  if not os.path.basename(r).startswith("Undetermined_")]

    if not read_files:
        sys.exit("No " + read_filetype + " files detected in " + read_dir)

    for row in classify_reads(read_files, read_filetype, args.forward_suffix,
                              args.reverse_suffix, args.trim_name):
        print(row)


if __name__ == "__main__":
    main()