view lexmapr2.py @ 23:e4215818a909 tip

"planemo upload"
author kkonganti
date Wed, 09 Nov 2022 10:29:25 -0500
parents 0be9a7117ba5
children
line wrap: on
line source
"""Entry script"""

__version__ = "1.1.0"
import argparse, datetime, json, logging, os, pandas, sys
import lexmapr.pipeline, lexmapr.run_summary
from lexmapr.definitions import arg_bins


def valid_input(file_path):
    """Validate a command-line input path and exit on failure.

    Args:
        file_path (str): path supplied on the command line.

    Returns:
        str: the path with surrounding whitespace removed.

    Exits:
        If the extension is not .csv/.tsv or the file does not exist.
    """
    # Strip first so the path we validate is the same path we return
    file_path = file_path.strip()
    _, file_ext = os.path.splitext(file_path)
    if file_ext.lower() not in (".csv", ".tsv"):
        sys.exit("Please supply a CSV or TSV input file with the correct file extension")
    if not os.path.exists(file_path):
        sys.exit(f'Input file named "{file_path}" not found')
    return file_path


def valid_json(file_path):
    """Read and parse a JSON file, exiting on failure.

    Args:
        file_path (str): path to a JSON file of user-defined bins.

    Returns:
        The deserialized JSON content (typically a dict of bins).

    Exits:
        If the file is missing or does not contain valid JSON.
    """
    # Single flat try: a JSONDecodeError raised inside the `with` still
    # closes the file before the handler runs, so nesting is unnecessary.
    try:
        with open(file_path, "r") as json_file:
            return json.load(json_file)
    except json.decoder.JSONDecodeError:
        sys.exit("User-defined bins not in readable JSON format")
    except FileNotFoundError:
        sys.exit(f'File named "{file_path}" not found')


def valid_list(list_str):
    """Return the comma-separated ontology short names as an uppercase list.

    Args:
        list_str (str): comma-separated ontology names from the command line.

    Returns:
        list[str]: each name stripped of whitespace and upper-cased.
    """
    ontologies = []
    for token in list_str.split(","):
        ontologies.append(token.strip().upper())
    return ontologies


if __name__ == "__main__":
    # Parse arguments, initiate log file and start run
    arg_parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    arg_parser.add_argument("input", help="input CSV or TSV file; required", type=valid_input)
    arg_parser.add_argument(
        "-o", "--output", metavar="\b", help="    output TSV file path; default is stdout"
    )
    arg_parser.add_argument(
        "-a", "--no_ancestors", action="store_true", help="remove ancestral terms from output"
    )
    arg_parser.add_argument(
        "-b", "--bin", action="store_true", help="classify samples into default bins"
    )
    arg_parser.add_argument(
        "-e",
        "--embl_ontol",
        metavar="\b",
        type=valid_list,
        help="    user-defined comma-separated ontology short names",
    )
    arg_parser.add_argument("-f", "--full", action="store_true", help="full output format")
    arg_parser.add_argument(
        "-g", "--graph", action="store_true", help="visualize summaries of mapping and binning"
    )
    arg_parser.add_argument(
        "-j",
        "--graph_only",
        action="store_true",
        help="only perform visualization with LexMapr output",
    )
    arg_parser.add_argument(
        "-r", "--remake_cache", action="store_true", help="remake cached resources"
    )
    arg_parser.add_argument(
        "-u",
        "--user_bin",
        metavar="\b",
        type=valid_json,
        help="    path to JSON file with user-defined bins",
    )
    arg_parser.add_argument(
        "-w",
        "--num_words",
        metavar="\b",
        type=int,  # CLI values arrive as str; keep them consistent with the int default
        default=3,
        help="    number of word combinations to sample",
    )
    arg_parser.add_argument(
        "-p",
        "--cpus",
        metavar="\b",
        type=int,  # CLI values arrive as str; keep them consistent with the int default
        default=8,
        help="    number of CPUs to try and parallelize permutations on",
    )
    arg_parser.add_argument("-v", "--version", action="version", version="%(prog)s " + __version__)

    # TODO: encoding argument added to logging.basicConfig in Python 3.9; now defaults to open()
    run_args = arg_parser.parse_args()
    if run_args.user_bin is not None:
        # User-supplied bins imply binning and replace the default bin definitions
        run_args.bin = True
        arg_bins = run_args.user_bin

    logging.basicConfig(filename="lexmapr_run.log", level=logging.DEBUG)

    if run_args.graph_only:
        # Graph-only mode: the "input" file is expected to be prior LexMapr TSV output
        try:
            mapping_results = pandas.read_csv(run_args.input, delimiter="\t")
        except (OSError, ValueError):
            # OSError: unreadable file; ValueError covers pandas parser errors
            # and UnicodeDecodeError for non-text input
            sys.exit("Input file not readable or not in expected format")
        needed_columns = ["Matched_Components", "Match_Status (Macro Level)"] + list(
            arg_bins.keys()
        )
        missing_columns = set(needed_columns).difference(set(mapping_results.columns))
        if missing_columns:
            sys.exit(f"Missing column(s) {missing_columns} from input file")
        t0 = datetime.datetime.now()
        logging.info(f"Run start: {t0}")
        logging.info("Graphing only")
        print("\nGraphing only...")
        lexmapr.run_summary.figure_folder()
        lexmapr.run_summary.report_results(run_args.input, list(arg_bins.keys()))
        lexmapr.run_summary.visualize_results(run_args.input, list(arg_bins.keys()))
        print("\t" + f"Done! {datetime.datetime.now()-t0} passed".ljust(60) + "\n")
    else:
        logging.info(f"Run start: {datetime.datetime.now()}")
        lexmapr.pipeline.run(run_args)

    logging.info(f"Run end: {datetime.datetime.now()}\n")