diff 0.4.2/bin/prepare_nanopore_fastq_dir.py @ 130:04f6ac8ca13c

planemo upload
author kkonganti
date Wed, 03 Jul 2024 15:16:39 -0400
parents 52045ea4679d
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0.4.2/bin/prepare_nanopore_fastq_dir.py	Wed Jul 03 15:16:39 2024 -0400
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import glob
+import argparse
+import logging
+
+def main():
+    """Create 'fastq_pass' soft links for the Nanopore flowcells listed in a file.
+
+    Parses -f (file of flowcell IDs, one per line), -i (optional search
+    directories) and -o (output directory), looks for a 'fastq_pass'
+    directory for every flowcell under every search path, and links the
+    first match as <outputdir>/<flowcell>/fastq_pass. Progress and problems
+    are reported through logging; nothing is returned.
+    """
+    # READ IN ARGUMENTS
+    desc = """
+    Takes in a file with flowcell ID, one per line and creates soft links
+    to 'fastq_pass' directory at target location.
+
+    Ex:
+
+    prepare_nanopore_fastq_dir.py \
+        -o /hpc/scratch/Kranti.Konganti/np_test \
+        -f flowcells.txt
+
+    where flowcells.txt contains the following lines:
+
+    FAL11127
+    FAL11151
+
+    """
+    parser = argparse.ArgumentParser(prog='prepare_nanopore_fastq_dir.py',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        description=desc)
+    required = parser.add_argument_group('required arguments')
+
+    required.add_argument("-f", dest='flowcells', required=True,
+        help="Path to a text file containing Nanopore flowcell IDs, one per line")
+    required.add_argument("-i", dest='inputdir',
+        required=False, action='append', nargs='*',
+        help="Path to search directory. This directory location is where" +
+            " the presence of 'fastq_pass' will be searched for each flowcell.")
+    required.add_argument("-o", dest='outputdir',
+        required=True,
+        help="Path to output directory. This directory is created by the script" +
+            " and new soft links (symlinks) are created in this directory.")
+
+    args = parser.parse_args()
+    flowcells = args.flowcells
+    output = args.outputdir
+
+    logging.basicConfig(format='%(asctime)s - %(levelname)s => %(message)s', level=logging.DEBUG)
+
+    # -i uses action='append' with nargs='*', so flatten its list of lists into individual paths.
+    search_dirs = [path for group in (args.inputdir or []) for path in group]
+    if not search_dirs:
+        nanopore_machines = ['RazorCrest', 'Revolution', 'ObiWan', 'MinIT',
+            'Mayhem', 'CaptainMarvel', 'MinION', 'MinION_Padmini', 'RogueOne']
+        logging.info("Searching default path(s). Use -i option if custom path should be searched.")
+        search_dirs = [os.path.join('/projects/nanopore/raw', machine) for machine in nanopore_machines]
+
+    # Read the flowcell IDs once (blank lines skipped) instead of per search path.
+    flowcell_ids = []
+    if os.path.exists(flowcells) and os.path.getsize(flowcells) > 0:
+        with open(flowcells, 'r') as fcells:
+            flowcell_ids = [line.strip() for line in fcells if line.strip()]
+    if not flowcell_ids:
+        # Stop here so the closing "NOTE" message is not logged after a failure.
+        logging.error(f"File {flowcells} is empty or does not exist!\n")
+        return
+
+    # Maps flowcell ID -> 1 if a fastq_pass directory was found in any search path, else 0.
+    fastq_pass_found = {}
+    for search_dir in search_dirs:
+        logging.info(f"Searching path: {search_dir}")
+        for flowcell in flowcell_ids:
+            # NOTE(review): without recursive=True, '**' matches a single path
+            # component, and '[!fast5]' is a character class rather than an
+            # exclusion of names containing 'fast5' - confirm this is intended.
+            fastq_pass = glob.glob(os.path.join(search_dir, flowcell, "**", "*[!fast5]*", 'fastq_pass'))
+            if not fastq_pass:
+                # Try one more time since the flowcell user is trying to query
+                # may be the parent directory of fastq_pass
+                fastq_pass = glob.glob(os.path.join(search_dir, "**", "*[!fast5]*", flowcell, 'fastq_pass'))
+            if not fastq_pass:
+                # Record the miss without overwriting a hit from another search path.
+                fastq_pass_found.setdefault(flowcell, 0)
+                continue
+            fastq_pass_found[flowcell] = 1
+            # Several directories can match; link one deterministic choice, never a concatenation.
+            fastq_pass.sort()
+            if len(fastq_pass) > 1:
+                logging.warning(f"Multiple fastq_pass folders found for {flowcell}. Using: {fastq_pass[0]}")
+            sym_link_dir = os.path.join(output, flowcell)
+            sym_link_dir_dest = os.path.join(sym_link_dir, 'fastq_pass')
+            # Test the link itself (lexists also sees dangling links), not its parent directory.
+            if os.path.lexists(sym_link_dir_dest):
+                logging.info(f"Soft link {sym_link_dir_dest} already exists! Skipped!!")
+                continue
+            os.makedirs(sym_link_dir, exist_ok=True)
+            # A relative target would be resolved against the link's own directory.
+            os.symlink(os.path.abspath(fastq_pass[0]), sym_link_dir_dest, target_is_directory=True)
+            logging.info(f"New soft link created: {sym_link_dir_dest}")
+
+    was_fastq_pass_found = [fcell for fcell, found in fastq_pass_found.items() if not found]
+    if was_fastq_pass_found:
+        logging.warning("Did not find fastq_pass folder for the supplied flowcells: " + ', '.join(was_fastq_pass_found))
+
+    if was_fastq_pass_found and len(was_fastq_pass_found) == len(fastq_pass_found):
+        logging.error(f"None of the supplied flowcells were found! The output directory, {output} may not have been created!")
+    else:
+        logging.info(f"NOTE: Now you can use {output} directory as --input to cpipes.\n")
+if __name__ == "__main__":  # run main() only when executed as a script, not when imported
+    main()
\ No newline at end of file