#!/usr/bin/env python3
"""Create soft links to Nanopore 'fastq_pass' directories, one per flowcell."""

import os
import re
import glob
import argparse
import logging

# Root searched (once per machine sub-directory) when -i is not supplied.
DEFAULT_SEARCH_ROOT = '/projects/nanopore/raw'
NANOPORE_MACHINES = ('RazorCrest', 'Revolution', 'ObiWan', 'MinIT',
                     'Mayhem', 'CaptainMarvel', 'MinION', 'MinION_Padmini', 'RogueOne')

# Raw string: '\s' in a plain string literal is an invalid escape sequence.
BLANK_LINE = re.compile(r'^\s*$')

DESCRIPTION = """
Takes in a file with flowcell ID, one per line and creates soft links
to 'fastq_pass' directory at target location.

Ex:

prepare_nanopore_fastq_dir.py \\
    -o /hpc/scratch/Kranti.Konganti/np_test \\
    -f flowcells.txt

where flowcells.txt contains the following lines:

FAL11127
FAL11151

"""


class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                     argparse.RawDescriptionHelpFormatter):
    """Show argument defaults while keeping the description's line breaks."""


def _parse_args():
    """Parse the command line (sys.argv) and return the argparse namespace."""
    parser = argparse.ArgumentParser(prog='prepare_nanopore_fastq_dir.py',
                                     formatter_class=_HelpFormatter,
                                     description=DESCRIPTION)
    required = parser.add_argument_group('required arguments')

    required.add_argument("-f", dest='flowcells', required=True,
                          help="Path to a text file containing Nanopore flowcell IDs, one per line")
    # -i is optional, so it is not listed under the 'required arguments' group.
    parser.add_argument("-i", dest='inputdir',
                        required=False, action='append', nargs='*',
                        help="Path to search directory. This directory location is where" +
                        " the presence of 'fastq_pass' will be searched for each flowcell.")
    required.add_argument("-o", dest='outputdir',
                          required=True,
                          help="Path to output directory. This directory is created by the script" +
                          " and new soft links (symlinks) are created in this directory.")
    return parser.parse_args()


def _read_flowcells(path):
    """Return the non-blank, stripped lines of *path*; [] if it is missing or empty."""
    if not os.path.isfile(path) or os.path.getsize(path) == 0:
        return []
    with open(path, 'r') as handle:
        return [line.strip() for line in handle if not BLANK_LINE.match(line)]


def _find_fastq_pass(search_dir, flowcell):
    """Return the sorted 'fastq_pass' directories found for *flowcell* under *search_dir*.

    Two fixed-depth layouts are tried, in order:
      <search_dir>/<flowcell>/<dir>/<dir>/fastq_pass
      <search_dir>/<dir>/<dir>/<flowcell>/fastq_pass
    The second covers the case where the queried flowcell is the parent
    directory of 'fastq_pass'.
    """
    # glob is called without recursive=True, so '**' matches exactly one path
    # component, like '*'. '[!fast5]' is a character class: the component must
    # contain a character other than f, a, s, t or 5 (presumably meant to skip
    # 'fast5' directories -- TODO confirm intent). Patterns are kept unchanged.
    patterns = (
        os.path.join(search_dir, flowcell, "**", "*[!fast5]*", 'fastq_pass'),
        os.path.join(search_dir, "**", "*[!fast5]*", flowcell, 'fastq_pass'),
    )
    for pattern in patterns:
        matches = sorted(glob.glob(pattern))
        if matches:
            return matches
    return []


def _link_fastq_pass(source, output, flowcell):
    """Create <output>/<flowcell>/fastq_pass as a soft link to *source* unless it exists."""
    link_dir = os.path.join(output, flowcell)
    link_path = os.path.join(link_dir, 'fastq_pass')

    # Test the link itself, not its parent directory: lexists() is also true
    # for a dangling link, which os.symlink() would refuse to overwrite.
    if os.path.lexists(link_path):
        logging.info("Soft link %s already exists! Skipped!!", link_path)
        return

    os.makedirs(link_dir, exist_ok=True)
    # A relative target would be resolved relative to link_dir, not the
    # current directory, so always store an absolute path in the link.
    os.symlink(os.path.abspath(source), link_path, target_is_directory=True)
    logging.info("New soft link created: %s", link_path)


def main():
    """Link each requested flowcell's 'fastq_pass' directory into the output directory.

    Reads flowcell IDs from the file given with -f, looks for a 'fastq_pass'
    directory for each ID under every search path (all -i values, or the
    default per-machine paths when -i is absent or given no paths), and
    creates <output>/<flowcell>/fastq_pass soft links. Flowcells that were not
    found under any search path are reported in a warning at the end.
    """
    args = _parse_args()
    flowcells_file = args.flowcells
    output = args.outputdir

    logging.basicConfig(format='%(asctime)s - %(levelname)s => %(message)s', level=logging.DEBUG)

    # action='append' + nargs='*' yields a list of lists; flatten it so that
    # both "-i a -i b" and "-i a b" contribute separate search paths.
    search_dirs = [path for group in (args.inputdir or []) for path in group]
    if not search_dirs:
        search_dirs = [os.path.join(DEFAULT_SEARCH_ROOT, machine) for machine in NANOPORE_MACHINES]
        logging.info("Searching default path(s). Use -i option if custom path should be searched.")

    # Read the flowcell list once rather than once per search path.
    flowcells = _read_flowcells(flowcells_file)
    if not flowcells:
        logging.error("File %s is empty or does not exist!\n", flowcells_file)
        return

    found = dict.fromkeys(flowcells, False)

    for search_dir in search_dirs:
        logging.info("Searching path: %s", search_dir)
        for flowcell in flowcells:
            matches = _find_fastq_pass(search_dir, flowcell)
            if not matches:
                continue
            if len(matches) > 1:
                logging.warning("Multiple fastq_pass directories found for flowcell %s; using %s",
                                flowcell, matches[0])
            found[flowcell] = True
            _link_fastq_pass(matches[0], output, flowcell)

    missing = [flowcell for flowcell, was_found in found.items() if not was_found]

    if missing:
        logging.warning("Did not find fastq_pass folder for the supplied flowcells: " +
                        ', '.join(missing))

    if missing and len(missing) == len(found):
        logging.error("None of the supplied flowcells were found! "
                      "The output directory, %s may not have been created!", output)
    else:
        logging.info("NOTE: Now you can use %s directory as --input to cpipes.\n", output)


if __name__ == "__main__":
    main()