annotate 0.2.1/bin/prepare_nanopore_fastq_dir.py @ 70:4e72dfe54475

"planemo upload"
author kkonganti
date Wed, 13 Jul 2022 14:13:16 -0400
parents 77494b0fa3c7
children
rev   line source
kkonganti@0 1 #!/usr/bin/env python3
kkonganti@0 2
kkonganti@0 3 import os
kkonganti@0 4 import re
kkonganti@0 5 import glob
kkonganti@0 6 import argparse
kkonganti@0 7 import logging
kkonganti@0 8
# Default location of raw Nanopore runs and the per-machine sub-directories
# that are searched when the user does not supply any -i path.
_DEFAULT_SEARCH_ROOT = '/projects/nanopore/raw'
_NANOPORE_MACHINES = ['RazorCrest', 'Revolution', 'ObiWan', 'MinIT',
                      'Mayhem', 'CaptainMarvel', 'MinION', 'MinION_Padmini', 'RogueOne']


def _find_fastq_pass(search_root, flowcell):
    """Return the sorted 'fastq_pass' directories found for one flowcell.

    Two layouts are tried, in order:

    1. ``<search_root>/<flowcell>/*/<name>/fastq_pass``
    2. ``<search_root>/*/<name>/<flowcell>/fastq_pass`` (the flowcell ID is
       the direct parent directory of 'fastq_pass').

    The first layout that yields any match wins. An empty list is returned
    when neither layout matches.
    """
    # NOTE: glob is called without recursive=True, so "**" behaves exactly
    # like "*" (one directory level). "[!fast5]" is a character class, so
    # "*[!fast5]*" only requires a directory name to contain at least one
    # character other than 'f', 'a', 's', 't' or '5'.
    patterns = (
        os.path.join(search_root, flowcell, "**", "*[!fast5]*", 'fastq_pass'),
        os.path.join(search_root, "**", "*[!fast5]*", flowcell, 'fastq_pass'),
    )
    for pattern in patterns:
        # Sort so the choice among several matches is deterministic.
        matches = sorted(glob.glob(pattern))
        if matches:
            return matches
    return []


def main(argv=None):
    """Create per-flowcell soft links to Nanopore 'fastq_pass' directories.

    Reads flowcell IDs (one per line, blank lines ignored) from the file
    given with -f, searches every search path for a matching 'fastq_pass'
    directory and creates ``<outputdir>/<flowcell>/fastq_pass`` as a symlink
    to the first match. Flowcells for which nothing was found are reported
    in a final warning.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When None, argparse falls back to ``sys.argv[1:]``.

    Returns:
        None. Problems are reported through the logging module.
    """
    # READ IN ARGUMENTS
    desc = """
    Takes in a file with flowcell ID, one per line and creates soft links
    to 'fastq_pass' directory at target location.

    Ex:

    prepare_nanopore_fastq_dir.py \
        -o /hpc/scratch/Kranti.Konganti/np_test \
        -f flowcells.txt

    where flowcells.txt contains the following lines:

    FAL11127
    FAL11151

    """
    parser = argparse.ArgumentParser(prog='prepare_nanopore_fastq_dir.py',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description=desc)
    required = parser.add_argument_group('required arguments')

    required.add_argument("-f", dest='flowcells', required=True,
                          help="Path to a text file containing Nanopore flowcell IDs, one per line")
    # -i is optional, so it is registered on the parser itself instead of
    # being listed under "required arguments" in the help output.
    parser.add_argument("-i", dest='inputdir',
                        required=False, action='append', nargs='*',
                        help="Path to search directory. This directory location is where" +
                        " the presence of 'fastq_pass' will be searched for each flowcell.")
    required.add_argument("-o", dest='outputdir',
                          required=True,
                          help="Path to output directory. This directory is created by the script" +
                          " and new soft links (symlinks) are created in this directory.")

    args = parser.parse_args(argv)
    flowcells = args.flowcells
    output = args.outputdir
    # action='append' combined with nargs='*' yields a list of lists
    # (one inner list per -i occurrence). Flatten it so that both
    # "-i a -i b" and "-i a b" result in two separate search paths.
    inputs = [path for group in (args.inputdir or []) for path in group]

    logging.basicConfig(format='%(asctime)s - %(levelname)s => %(message)s', level=logging.DEBUG)

    if inputs:
        search_roots = inputs
    else:
        search_roots = [os.path.join(_DEFAULT_SEARCH_ROOT, machine)
                        for machine in _NANOPORE_MACHINES]
        logging.info("Searching default path(s). Use -i option if custom path should be searched.")

    # Validate and read the flowcell list once, up front, instead of once
    # per search path.
    flowcell_ids = []
    if os.path.isfile(flowcells) and os.path.getsize(flowcells) > 0:
        with open(flowcells, 'r') as fcells:
            flowcell_ids = [line.strip() for line in fcells if line.strip()]
    if not flowcell_ids:
        logging.error(f"File {flowcells} is empty or does not exist!\n")
        # Nothing to do; do not fall through to the success note below.
        return

    # Maps flowcell ID -> True once a fastq_pass directory was found in any
    # of the search paths, False otherwise.
    fastq_pass_found = {}

    for search_root in search_roots:
        logging.info(f"Searching path: {search_root}")

        for flowcell in flowcell_ids:
            matches = _find_fastq_pass(search_root, flowcell)
            if not matches:
                # Do not overwrite a hit recorded for an earlier search path.
                fastq_pass_found.setdefault(flowcell, False)
                continue

            fastq_pass_found[flowcell] = True
            if len(matches) > 1:
                logging.warning(f"Found {len(matches)} fastq_pass directories for flowcell" +
                                f" {flowcell}. Using the first one: {matches[0]}")

            sym_link_dir = os.path.join(output, flowcell)
            sym_link_dir_dest = os.path.join(sym_link_dir, 'fastq_pass')
            # Test the link itself (lexists also sees dangling links) rather
            # than its parent directory.
            if os.path.lexists(sym_link_dir_dest):
                logging.info(f"Soft link {sym_link_dir_dest} already exists! Skipped!!")
                continue

            os.makedirs(sym_link_dir, exist_ok=True)
            # A relative target would be resolved relative to the link's own
            # directory, so always link to the absolute path.
            os.symlink(
                os.path.abspath(matches[0]),
                sym_link_dir_dest, target_is_directory=True
            )
            logging.info(f"New soft link created: {sym_link_dir_dest}")

    not_found = [flowcell for flowcell, found in fastq_pass_found.items() if not found]

    if not_found:
        logging.warning("Did not find fastq_pass folder for the supplied flowcells: " +
                        ', '.join(not_found))

    if not_found and len(not_found) == len(fastq_pass_found):
        logging.error(f"None of the supplied flowcells were found! The output directory, {output} may not have been created!")
    else:
        logging.info(f"NOTE: Now you can use {output} directory as --input to cpipes.\n")
# Script entry point: run the command-line workflow only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()