Mercurial > repos > kkonganti > lexmapr2_from_cfsan

--- a/cfsan_lexmapr2.xml	Thu Sep 15 14:50:34 2022 -0400
+++ b/cfsan_lexmapr2.xml	Wed Nov 09 09:05:28 2022 -0500
@@ -56,10 +56,10 @@
             <when value="false">
             </when>
         </conditional>
-        <param name="bins" truevalue="-b" type="boolean" checked="true"
-            label="Classify samples into default bins" help="Default: true"/>
-        <param name="graph" truevalue="-g" type="boolean" checked="true"
-            label="Visualize summary of mapping and binning" help="Default: true"/>
+        <param name="bins" truevalue="-b" type="boolean" checked="false"
+            label="Classify samples into default bins" help="Default: false. Turning on this option may increase run time."/>
+        <param name="graph" truevalue="-g" type="boolean" checked="false"
+            label="Visualize summary of mapping and binning" help="Default: false. Turning on this option may increase run time."/>
         <param name="full" truevalue="-f" type="boolean" checked="true"
             label="Generate full output format" help="Default: true"/>
         <param name="no_ancestors" truevalue="-a" type="boolean" checked="false"
--- a/lexmapr2.py	Thu Sep 15 14:50:34 2022 -0400
+++ b/lexmapr2.py	Wed Nov 09 09:05:28 2022 -0500
@@ -1,58 +1,93 @@
 """Entry script"""

-__version__ = '1.0.0'
+__version__ = "1.1.0"
 import argparse, datetime, json, logging, os, pandas, sys
 import lexmapr.pipeline, lexmapr.run_summary
 from lexmapr.definitions import arg_bins


 def valid_input(file_path):
-    '''Exits if input file is invalid'''
+    """Exits if input file is invalid"""
     _, file_ext = os.path.splitext(file_path)
-    if file_ext.lower() != '.csv' and file_ext.lower() != '.tsv':
-        sys.exit('Please supply a CSV or TSV input file with the correct file extension')
+    if file_ext.lower() != ".csv" and file_ext.lower() != ".tsv":
+        sys.exit("Please supply a CSV or TSV input file with the correct file extension")
     if not os.path.exists(file_path):
-        sys.exit(f'Input file named \"{file_path}\" not found')
-    return(file_path.strip())
+        sys.exit(f'Input file named "{file_path}" not found')
+    return file_path.strip()
+

 def valid_json(file_path):
-    '''Outputs read JSON file and exits if file is invalid'''
+    """Outputs read JSON file and exits if file is invalid"""
     try:
-        with open(file_path, 'r') as JSON_file:
+        with open(file_path, "r") as JSON_file:
             try:
-                return(json.load(JSON_file))
-            except(json.decoder.JSONDecodeError):
-                sys.exit(f'User-defined bins not in readable JSON format')
-    except(FileNotFoundError):
-        sys.exit(f'File named \"{file_path}\" not found')
+                return json.load(JSON_file)
+            except (json.decoder.JSONDecodeError):
+                sys.exit(f"User-defined bins not in readable JSON format")
+    except (FileNotFoundError):
+        sys.exit(f'File named "{file_path}" not found')
+

 def valid_list(list_str):
-    '''Return list of user-defined ontologies'''
-    return([x.strip().upper() for x in list_str.split(',')])
+    """Return list of user-defined ontologies"""
+    return [x.strip().upper() for x in list_str.split(",")]
+

 if __name__ == "__main__":
     # Parse arguments, initiate log file and start run
     arg_parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    arg_parser.add_argument('input', help='input CSV or TSV file; required', type=valid_input)
-    arg_parser.add_argument('-o', '--output', metavar='\b',
-                            help='    output TSV file path; default is stdout')
-    arg_parser.add_argument('-a', '--no_ancestors', action='store_true',
-                            help='remove ancestral terms from output')
-    arg_parser.add_argument('-b', '--bin', action='store_true',
-                            help='classify samples into default bins')
-    arg_parser.add_argument('-e', '--embl_ontol', metavar='\b', type=valid_list,
-                            help='    user-defined comma-separated ontology short names')
-    arg_parser.add_argument('-f', '--full', action='store_true', help='full output format')
-    arg_parser.add_argument('-g', '--graph', action='store_true',
-                            help='visualize summaries of mapping and binning')
-    arg_parser.add_argument('-j', '--graph_only', action='store_true',
-                            help='only perform visualization with LexMapr output')
-    arg_parser.add_argument('-r', '--remake_cache', action='store_true',
-                            help='remake cached resources')
-    arg_parser.add_argument('-u', '--user_bin', metavar='\b', type=valid_json,
-                            help='    path to JSON file with user-defined bins')
-    arg_parser.add_argument('-v', '--version', action='version',
-                            version='%(prog)s '+__version__)
+    arg_parser.add_argument("input", help="input CSV or TSV file; required", type=valid_input)
+    arg_parser.add_argument(
+        "-o", "--output", metavar="\b", help="    output TSV file path; default is stdout"
+    )
+    arg_parser.add_argument(
+        "-a", "--no_ancestors", action="store_true", help="remove ancestral terms from output"
+    )
+    arg_parser.add_argument(
+        "-b", "--bin", action="store_true", help="classify samples into default bins"
+    )
+    arg_parser.add_argument(
+        "-e",
+        "--embl_ontol",
+        metavar="\b",
+        type=valid_list,
+        help="    user-defined comma-separated ontology short names",
+    )
+    arg_parser.add_argument("-f", "--full", action="store_true", help="full output format")
+    arg_parser.add_argument(
+        "-g", "--graph", action="store_true", help="visualize summaries of mapping and binning"
+    )
+    arg_parser.add_argument(
+        "-j",
+        "--graph_only",
+        action="store_true",
+        help="only perform visualization with LexMapr output",
+    )
+    arg_parser.add_argument(
+        "-r", "--remake_cache", action="store_true", help="remake cached resources"
+    )
+    arg_parser.add_argument(
+        "-u",
+        "--user_bin",
+        metavar="\b",
+        type=valid_json,
+        help="    path to JSON file with user-defined bins",
+    )
+    arg_parser.add_argument(
+        "-w",
+        "--num_words",
+        metavar="\b",
+        default=3,
+        help="    number of word combinations to sample",
+    )
+    arg_parser.add_argument(
+        "-p",
+        "--cpus",
+        metavar="\b",
+        default=8,
+        help="    number of CPUs to try and parallelize permuations on",
+    )
+    arg_parser.add_argument("-v", "--version", action="version", version="%(prog)s " + __version__)

     # TODO: encoding argument added to logging.basicConfig in Python 3.9; until it is passed, the log file uses open()'s default encoding
     run_args = arg_parser.parse_args()
@@ -60,27 +95,29 @@
         run_args.bin = True
         arg_bins = run_args.user_bin

-    logging.basicConfig(filename='lexmapr_run.log', level=logging.DEBUG)
+    logging.basicConfig(filename="lexmapr_run.log", level=logging.DEBUG)

     if run_args.graph_only:
         try:
-            mapping_results = pandas.read_csv(run_args.input, delimiter='\t')
+            mapping_results = pandas.read_csv(run_args.input, delimiter="\t")
         except:
-            sys.exit('Input file not readable or not in expected format')
-        needed_columns = ['Matched_Components','Match_Status (Macro Level)']+list(arg_bins.keys())
+            sys.exit("Input file not readable or not in expected format")
+        needed_columns = ["Matched_Components", "Match_Status (Macro Level)"] + list(
+            arg_bins.keys()
+        )
         missing_columns = set(needed_columns).difference(set(mapping_results.columns))
         if missing_columns:
-            sys.exit(f'Missing column(s) {missing_columns} from input file')
+            sys.exit(f"Missing column(s) {missing_columns} from input file")
         t0 = datetime.datetime.now()
-        logging.info(f'Run start: {t0}')
-        logging.info('Graphing only')
-        print('\nGraphing only...')
+        logging.info(f"Run start: {t0}")
+        logging.info("Graphing only")
+        print("\nGraphing only...")
         lexmapr.run_summary.figure_folder()
         lexmapr.run_summary.report_results(run_args.input, list(arg_bins.keys()))
         lexmapr.run_summary.visualize_results(run_args.input, list(arg_bins.keys()))
-        print('\t'+f'Done! {datetime.datetime.now()-t0} passed'.ljust(60)+'\n')
+        print("\t" + f"Done! {datetime.datetime.now()-t0} passed".ljust(60) + "\n")
     else:
-        logging.info(f'Run start: {datetime.datetime.now()}')
+        logging.info(f"Run start: {datetime.datetime.now()}")
         lexmapr.pipeline.run(run_args)

-    logging.info(f'Run end: {datetime.datetime.now()}\n')
+    logging.info(f"Run end: {datetime.datetime.now()}\n")
--- a/lexmapr_run.log	Thu Sep 15 14:50:34 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-INFO:root:Run start: 2022-09-15 14:05:40.236514
-INFO:root:Run start: 2022-09-15 14:07:13.815176
-INFO:root:Run start: 2022-09-15 14:08:11.894380
-INFO:root:Database build/confirm: 2022-09-15 14:08:11.908013
-WARNING:root:Did not retrieve URL for https://www.ebi.ac.uk/ols/api/ontologies/foodon/ancestors?id=FOODON:00002703&page=1&size=20 during API search
--- a/o.tsv	Thu Sep 15 14:50:34 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-Sample_Id	Sample_Desc	Processed_Sample	Annotated_Sample	Matched_Components	Match_Status (Macro Level)	Match_Status (Micro Level)	Sample_Transformations	ncbi_taxon
-small_simple1	Chicken Breast		Not annotated	chicken breast:FOODON_00002703	Full Term Match	[]	{}
\ No newline at end of file
--- a/t.csv	Thu Sep 15 14:50:34 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-SampleId,Sample
-small_simple1,Chicken Breast
-small_simple2,Baked Potato
-small_simple3,Canned Corn
-small_simple4,Frozen Yogurt
-small_simple5,Apple Pie
-small_simple5,Apple cider
-small_simple6,Chicken egg
\ No newline at end of file
--- a/t.json	Thu Sep 15 14:50:34 2022 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-{
-  "ncbi_taxon":{
-                "Actinopterygii":"NCBITaxon_7898",
-                "Ecdysozoa":"NCBITaxon_1206794",
-                "Echinodermata":"NCBITaxon_7586",
-                "Fungi":"NCBITaxon_4751",
-                "Mammalia":"NCBITaxon_40674",
-                "Sauropsida":"NCBITaxon_8457",
-                "Spiralia":"NCBITaxon_2697495",
-                "Viridiplantae":"NCBITaxon_33090"
-               }
-}
\ No newline at end of file