#!/usr/bin/env python3
"""Fetch Reads: list the read files in a directory as paired or single-end.

For every read file found in ``--read_dir`` one comma-separated line is
printed to stdout:

    <sample>,Paired,<forward path>;<reverse path>
    <sample>,Single,<path>

A forward file and a reverse file are paired when their paths are identical
once the forward/reverse suffix is removed.  Files whose basename starts
with ``Undetermined_`` are ignored.
"""

import argparse
import os
import sys
from glob import escape, glob


def _strip_suffix(text, suffix):
    """Return *text* without a trailing *suffix* (unchanged if not present)."""
    if suffix and text.endswith(suffix):
        # Slice by length rather than str.replace(): replace() would also
        # remove occurrences of the suffix text in the middle of the path.
        return text[:len(text) - len(suffix)]
    return text


def _sample_name(path, suffix, trim_name):
    """Derive the sample name: basename minus *suffix*, with *trim_name* removed."""
    name = _strip_suffix(os.path.basename(path), suffix)
    if trim_name:
        # Every occurrence of the trim string is removed from the name.
        name = name.replace(trim_name, "")
    return name


def collect_read_rows(read_files, forward_suffix, reverse_suffix,
                      read_filetype, trim_name=""):
    """Classify read files and return the output lines.

    Args:
        read_files: paths of the candidate read files; single-end rows are
            returned in this order.
        forward_suffix: file-name ending that marks a forward read.
        reverse_suffix: file-name ending that marks a reverse read.
        read_filetype: extension (with leading dot) stripped from the name
            of single-end files that do not end with *forward_suffix*.
        trim_name: text removed from every sample name (may be empty).

    Returns:
        A list of strings: first one ``name,Paired,fwd;rev`` row per pair
        (sorted by path so the output is reproducible), then one
        ``name,Single,path`` row per unpaired file.
    """
    trim_name = trim_name or ""

    forward_stems = {_strip_suffix(f, forward_suffix)
                     for f in read_files if f.endswith(forward_suffix)}
    reverse_stems = {_strip_suffix(f, reverse_suffix)
                     for f in read_files if f.endswith(reverse_suffix)}

    rows = []
    paired_paths = set()
    # A stem present on both sides has a forward and a reverse mate.
    for stem in sorted(forward_stems & reverse_stems):
        forward_path = stem + forward_suffix
        reverse_path = stem + reverse_suffix
        paired_paths.update((forward_path, reverse_path))
        name = _sample_name(forward_path, forward_suffix, trim_name)
        rows.append(",".join(
            [name, "Paired", ";".join([forward_path, reverse_path])]))

    for path in read_files:
        if path in paired_paths:
            continue
        # Unpaired forward reads lose the forward suffix; anything else
        # (including unpaired reverse reads) only loses the file extension.
        if path.endswith(forward_suffix):
            name = _sample_name(path, forward_suffix, trim_name)
        else:
            name = _sample_name(path, read_filetype, trim_name)
        rows.append(",".join([name, "Single", path]))

    return rows


def find_read_files(read_dir, read_filetype):
    """Return the sorted read files in *read_dir* ending with *read_filetype*.

    Files whose basename starts with ``Undetermined_`` are skipped.
    """
    # Escape the directory so glob metacharacters in its name are literal.
    pattern = os.path.join(escape(read_dir), "*" + read_filetype)
    return [r for r in sorted(glob(pattern))
            if not os.path.basename(r).startswith("Undetermined_")]


def parse_args(argv=None):
    """Parse the command-line arguments (``sys.argv`` when *argv* is None)."""
    parser = argparse.ArgumentParser(description='Fetch Reads')
    parser.add_argument('--read_dir', type=str, required=True,
                        help='path to directory containing read files')
    parser.add_argument('--read_filetype', default='fastq.gz', type=str,
                        help='read filetype information')
    parser.add_argument('--forward_suffix', default='_1.fastq.gz', type=str,
                        help='forward suffix')
    parser.add_argument('--reverse_suffix', default='_2.fastq.gz', type=str,
                        help='reverse suffix')
    parser.add_argument('--trim_name', nargs='?', const="", default="",
                        type=str, help='Trim name')
    return parser.parse_args(argv)


def main(argv=None):
    """Run the script: print one line per sample, or exit with an error.

    Exits with a message when ``--read_dir`` is not a directory or when it
    contains no files of the requested type.
    """
    args = parse_args(argv)

    # Get path to directory containing read files
    read_dir = os.path.abspath(args.read_dir)
    if not os.path.isdir(read_dir):
        sys.exit("--read_dir is not a valid path: " + str(read_dir))

    # The file type is matched as an extension, so make sure it has a dot.
    read_filetype = args.read_filetype
    if not read_filetype.startswith("."):
        read_filetype = "." + read_filetype

    # Check if sequence files exist in directory, ignoring undetermined reads
    read_files = find_read_files(read_dir, read_filetype)
    if not read_files:
        sys.exit("No " + read_filetype + " files detected in " + read_dir)

    rows = collect_read_rows(read_files, args.forward_suffix,
                             args.reverse_suffix, read_filetype,
                             args.trim_name)
    for row in rows:
        print(row)


if __name__ == "__main__":
    main()