annotate 0.4.2/bin/prepare_nanopore_fastq_dir.py @ 0:082e0091e813 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:27:47 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python3
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
2
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
3 import os
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
4 import re
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
5 import glob
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
6 import argparse
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
7 import logging
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
8
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
# Default search location and the per-instrument sub-directories beneath it.
# These are only used when no -i option is supplied on the command line.
_DEFAULT_SEARCH_ROOT = '/projects/nanopore/raw'
_NANOPORE_MACHINES = ['RazorCrest', 'Revolution', 'ObiWan', 'MinIT',
                      'Mayhem', 'CaptainMarvel', 'MinION', 'MinION_Padmini',
                      'RogueOne']


def _read_flowcell_ids(path):
    """Return the non-blank, whitespace-stripped lines of the file at *path*.

    An empty list is returned when *path* is not a regular file, is empty,
    or contains only blank lines.
    """
    if not (os.path.isfile(path) and os.path.getsize(path) > 0):
        return []
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        return [flowcell for flowcell in stripped if flowcell]


def _find_fastq_pass(search_path, flowcell):
    """Return the sorted 'fastq_pass' directories found for *flowcell*.

    Two layouts are tried in order and the first one that yields any match
    wins:

      1. <search_path>/<flowcell>/<dir>/<dir>/fastq_pass
      2. <search_path>/<dir>/<dir>/<flowcell>/fastq_pass
         (the flowcell is the direct parent of 'fastq_pass')

    NOTE: glob.glob() is called without recursive=True, so "**" matches
    exactly one path component, just like "*".
    NOTE: "[!fast5]" is a character class, not a negated word; the component
    "*[!fast5]*" matches any name containing at least one character other
    than 'f', 'a', 's', 't' or '5'. Both patterns are kept exactly as they
    were so that the set of directories discovered does not change.
    """
    patterns = (
        os.path.join(search_path, flowcell, "**", "*[!fast5]*", 'fastq_pass'),
        os.path.join(search_path, "**", "*[!fast5]*", flowcell, 'fastq_pass'),
    )
    for pattern in patterns:
        # Sort so that the choice of "first match" is deterministic.
        matches = sorted(glob.glob(pattern))
        if matches:
            return matches
    return []


def _link_fastq_pass(target, output, flowcell):
    """Create <output>/<flowcell>/fastq_pass as a symlink to *target*.

    Nothing is changed when the link path already exists (including as a
    dangling symlink).
    """
    sym_link_dir = os.path.join(output, flowcell)
    sym_link_dir_dest = os.path.join(sym_link_dir, 'fastq_pass')

    # Test the link itself rather than its parent directory: a pre-existing
    # (possibly empty) flowcell directory must not prevent link creation.
    if os.path.lexists(sym_link_dir_dest):
        logging.info(f"Soft link {sym_link_dir_dest} already exists! Skipped!!")
        return

    os.makedirs(sym_link_dir, exist_ok=True)
    # A relative target would be resolved relative to the link's own
    # directory and end up dangling, so always link to an absolute path.
    os.symlink(os.path.abspath(target), sym_link_dir_dest,
               target_is_directory=True)
    logging.info(f"New soft link created: {sym_link_dir_dest}")


def main():
    """Symlink the 'fastq_pass' directory of each listed Nanopore flowcell.

    Command line options:
      -f  text file with one flowcell ID per line (required).
      -i  one or more directories to search; may be repeated and each use
          may carry several paths. When omitted (or given without a value)
          the default root is searched once per known instrument.
      -o  output directory; <output>/<flowcell>/fastq_pass links are
          created beneath it (required).

    Flowcells for which no 'fastq_pass' directory was found in any search
    path are reported in a warning at the end of the run.
    """
    # READ IN ARGUMENTS
    # The trailing backslashes below are line continuations inside the string
    # literal, so the example command reaches argparse as a single line.
    desc = """
    Takes in a file with flowcell ID, one per line and creates soft links
    to 'fastq_pass' directory at target location.

    Ex:

    prepare_nanopore_fastq_dir.py \
    -o /hpc/scratch/Kranti.Konganti/np_test \
    -f flowcells.txt

    where flowcells.txt contains the following lines:

    FAL11127
    FAL11151

    """
    parser = argparse.ArgumentParser(prog='prepare_nanopore_fastq_dir.py',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description=desc)
    required = parser.add_argument_group('required arguments')

    required.add_argument("-f", dest='flowcells', required=True,
                          help="Path to a text file containing Nanopore flowcell IDs, one per line")
    required.add_argument("-i", dest='inputdir',
                          required=False, action='append', nargs='*',
                          help="Path to search directory. This directory location is where" +
                          " the presence of 'fastq_pass' will be searched for each flowcell.")
    required.add_argument("-o", dest='outputdir',
                          required=True,
                          help="Path to output directory. This directory is created by the script" +
                          " and new soft links (symlinks) are created in this directory.")

    args = parser.parse_args()
    flowcells = args.flowcells
    output = args.outputdir

    logging.basicConfig(format='%(asctime)s - %(levelname)s => %(message)s', level=logging.DEBUG)

    # action='append' combined with nargs='*' produces a list of lists
    # (one inner list per -i occurrence); flatten it so that both
    # "-i a -i b" and "-i a b" yield two separate search paths.
    custom_paths = [path for group in (args.inputdir or []) for path in group]

    if custom_paths:
        search_paths = custom_paths
    else:
        search_paths = [os.path.join(_DEFAULT_SEARCH_ROOT, machine)
                        for machine in _NANOPORE_MACHINES]
        logging.info("Searching default path(s). Use -i option if custom path should be searched.")

    # Read the flowcell list once instead of once per search path.
    flowcell_ids = _read_flowcell_ids(flowcells)
    if not flowcell_ids:
        logging.error(f"File {flowcells} is empty or does not exist!\n")
        return

    # Maps flowcell ID -> whether a fastq_pass directory was found in any
    # of the search paths.
    fastq_pass_found = dict.fromkeys(flowcell_ids, False)

    for search_path in search_paths:
        logging.info(f"Searching path: {search_path}")

        for flowcell in flowcell_ids:
            matches = _find_fastq_pass(search_path, flowcell)
            if not matches:
                continue

            fastq_pass_found[flowcell] = True
            if len(matches) > 1:
                # Several candidates: link the first one and say so rather
                # than gluing all paths together into an invalid target.
                logging.warning(f"Multiple fastq_pass directories found for {flowcell}; " +
                                f"using {matches[0]}. Ignored: " + ', '.join(matches[1:]))
            _link_fastq_pass(matches[0], output, flowcell)

    was_fastq_pass_found = [flowcell for flowcell, found in fastq_pass_found.items()
                            if not found]

    if was_fastq_pass_found:
        logging.warning("Did not find fastq_pass folder for the supplied flowcells: " +
                        ', '.join(was_fastq_pass_found))

    if was_fastq_pass_found and len(was_fastq_pass_found) == len(fastq_pass_found):
        logging.error(f"None of the supplied flowcells were found! The output directory, {output} may not have been created!")
    else:
        logging.info(f"NOTE: Now you can use {output} directory as --input to cpipes.\n")


if __name__ == "__main__":
    main()