Mercurial > repos > jpayne > tableops
changeset 0:402b58f45844 draft default tip
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 15:03:06 +0000 |
| parents | |
| children | |
| files | Dockerfile README.MD job_conf.yml table-sort.py table-sort.xml table-summarize.py table-summary.xml table-union.py table-union.xml test_tables_ops.py |
| diffstat | 10 files changed, 493 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dockerfile Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,9 @@ +FROM python:3.10-slim-bullseye + +COPY . /tools +WORKDIR /tools + +RUN pip install --no-cache-dir click + +ENTRYPOINT [] +CMD ["python", "table-union.py"] \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.MD Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,37 @@ +# Table Ops + +A collection of simple command-line table manipulation tools written in Python. These tools are designed to be efficient and easy to use for common table operations. + +## Tools + +### `table-union` + +Merges multiple tabular data files (e.g., CSV, TSV) either by unioning rows with identical columns or by performing a join based on shared key columns. + +**Key Features:** + +* **Union Mode (Default):** Combines rows from all input files, assuming they have the same columns. Duplicate rows are retained. +* **Join Mode (`--unionize`):** Performs a join operation based on automatically detected shared key columns. It intelligently identifies potential key columns by looking for columns with unique, non-null values across all input files. This mode merges rows based on matching key values. +* **Automatic Key Detection:** Automatically identifies suitable columns for joining based on uniqueness and non-null constraints. +* **Tab-delimited input:** Operates on tab-separated (TSV) files. +* **Memory Efficient:** Optimized to handle large files without loading them entirely into memory (where possible). + +**Usage Example:** + +```bash +table-union file1.tsv file2.tsv file3.tsv > output.tsv +``` + +```bash +table-summarize data.tsv +``` + +```bash +table-sort Age Name < data.tsv > sorted_data.tsv +``` + +**Run Unit Tests:** + +```bash +python -m unittest test_tables_ops.py +```
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/job_conf.yml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,37 @@ +runners: + local: + load: galaxy.jobs.runners.local:LocalJobRunner + workers: 16 + +# handling: +# processes: +# handler0: + +execution: + default: local + environments: + local: + runner: local + docker_local: + runner: local + docker_enabled: true + # container: "auto" + docker_volumes: $defaults + # docker_set_user: null + docker_run_extra_arguments: "--entrypoint ''" + docker_set_user: root + +tools: +- id: table-sort + environment: docker_local +- id: table-union + environment: docker_local +- id: table-summary + environment: docker_local + +limits: +- + # Amount of time a job can run (in any environment) before it + # will be terminated by Galaxy. + type: walltime + value: '01:00:00' \ No newline at end of file
#!/usr/bin/env python3

"""Sort a TSV table read from stdin by one or more column headers."""

import csv
import sys


def main(headers):
    """Sort stdin's TSV rows ascending by *headers* and write TSV to stdout.

    headers: list of column names to sort by, in priority order.
    Raises ValueError if none of the requested headers exist in the table.
    """
    rows = csv.DictReader(sys.stdin, dialect="excel-tab")

    # Empty input (no header line at all): nothing to sort, emit nothing.
    if not rows.fieldnames:
        return

    if not set(headers).intersection(rows.fieldnames):
        raise ValueError(
            f"Couldn't find any of supplied headers ({', '.join(map(repr, headers))}) in the table."
        )

    # Short rows make DictReader store None for the missing trailing fields;
    # `d.get(h, "")` only defaults on *absent* keys, so coerce None to "" with
    # `or` — otherwise the tuple sort key compares None with str (TypeError).
    items = sorted(rows, key=lambda d: tuple(d.get(h) or "" for h in headers))

    wr = csv.DictWriter(sys.stdout, dialect="excel-tab", fieldnames=rows.fieldnames)
    wr.writeheader()
    wr.writerows(items)


if __name__ == "__main__":
    main(sys.argv[1:])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-sort.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,34 @@ +<tool id="table-sort" name="Table Sort: Sort tabular dataset" version="0.1.0"> + <description>on specified header or headers</description> + <requirements> + <container type="docker">cfsanbiostatistics/tableops:latest</container> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + cat $table | python $__tool_directory__/table-sort.py $headers > $sorted + ]]></command> + <inputs> + <param name="table" format="tsv" label="Table in TSV format to sort." type="data" /> + <param name="headers" type="text" label="Space-delimited list of headers on which to sort." /> + </inputs> + <outputs> + <data name="sorted" format="tsv,tabular" label="Sorted table" /> + </outputs> + <tests> + <test> + <param name="table" value="combined.tsv" ftype="tsv" /> + <param name="headers" value="flavor" /> + <output name="sorted" value="sorted.tsv"/> + </test> + <test> + <param name="table" value="sort_test_multiple.txt" ftype="tsv" /> + <param name="headers" value="A D" /> + <output name="sorted" value="sort_test_multiple_sorted.txt"/> + </test> + </tests> + <help><![CDATA[ + Sort a table, ascending, by one or more rows. + ]]></help> + + <citations> + </citations> +</tool> \ No newline at end of file
#! /usr/bin/env python

"""Summarize a TSV table by counting unique values in each non-key column."""

import csv
import sys
from collections import Counter, OrderedDict


def main(table):
    """Print, for every column after the first, how often each value occurs.

    table: path to a tab-delimited file whose first row is the header.
    The first column is treated as a row identifier and is not summarized.
    """
    with open(table, "r", newline="", encoding="utf-8") as table_f:
        rdr = csv.DictReader(table_f, delimiter="\t")

        # Need at least one data column beyond the leading key column.
        if not rdr.fieldnames or len(rdr.fieldnames) <= 1:
            print("No data columns found in the table.")
            return

        summary = OrderedDict()
        # Count rows directly: DictReader.line_num counts *physical* lines,
        # which diverges from the row count for quoted multi-line fields.
        total = 0
        for row in rdr:
            total += 1
            for name in rdr.fieldnames[1:]:
                summary.setdefault(name, Counter())[row[name]] += 1

    print("Summary:")
    for name, results in summary.items():
        print(f"{name}:")
        for result, num in results.items():
            if result:  # skip empty/None cells
                print(f"\t - {result}: {num} of {total}")


if __name__ == "__main__":
    main(sys.argv[1])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-summary.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,28 @@ +<tool id="table-summary" name="Table Summarize: Summarize a tabular dataset" version="0.1.0"> + <description>by counting unique values in columns</description> + <requirements> + <container type="docker">cfsanbiostatistics/tableops:latest</container> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + python $__tool_directory__/table-summarize.py $table > $summary + ]]></command> + <inputs> + <param name="table" type="data" format="tsv,tabular" /> + </inputs> + <outputs> + <data name="summary" format="txt" label="Summary" /> + </outputs> + + <tests> + <test> + <param name="table" value="summary_test.txt" /> + <output name="summary" value="summary.txt" /> + </test> + </tests> + <help><![CDATA[ + Summarize tabular results by counting up unique values in columns. + ]]></help> + + <citations> + </citations> +</tool> \ No newline at end of file
#! /usr/bin/env python

"""Merge tabular (TSV) files: concatenate rows, join on an autodetected key
column, or combine by column position for header-less consistency."""

import csv
import sys
from collections import defaultdict

import click


@click.command()
@click.option(
    "--unionize/--no-unionize",
    default=False,
    help="Attempt to unionize on an autodetected key column",
    is_flag=True,
)
@click.option(
    "--tuple/--no-tuple",
    "tuple_mode",
    default=False,
    help="For tables with inconsistent headers - unionize by column order instead of column label",
)
@click.argument("files", nargs=-1, type=click.Path(exists=True))
def cli(files, unionize=False, tuple_mode=False):
    """Combine FILES and write the merged TSV table to stdout."""
    header = []
    items = []
    # Columns that remain candidate join keys (unique, non-null in every
    # file seen so far); None until the first non-empty file is read.
    possible_identity_headers = None

    for fi in files:
        with open(fi, "r", newline="", encoding="utf-8") as table:
            if not tuple_mode:
                reader = csv.DictReader(table, delimiter="\t")

                # A zero-byte file has no header row (fieldnames is None);
                # skip it instead of crashing when iterating fieldnames.
                if not reader.fieldnames:
                    continue

                # Append columns not yet seen, preserving first-seen order.
                header_set = set(header)
                header.extend(
                    field for field in reader.fieldnames if field not in header_set
                )

                rows = list(reader)
                if not rows:  # header-only file contributes no data
                    continue

                if unionize:
                    if possible_identity_headers is None:
                        possible_identity_headers = set(reader.fieldnames)

                    # Keep only columns whose values are unique and non-null
                    # within this file as well.
                    possible_identity_headers.intersection_update(
                        f
                        for f in reader.fieldnames
                        if len({row[f] for row in rows if f in row}) == len(rows)
                        and all(row.get(f) is not None for row in rows)
                    )
                items.extend(rows)
            else:
                reader = csv.reader(table, delimiter="\t")
                # next(reader, None) instead of bare next(): an empty file
                # would otherwise raise StopIteration.
                first_row = next(reader, None)
                if first_row is None:
                    continue
                if not header:
                    header = first_row  # first file's header wins
                # For subsequent files first_row is their header: discard it.
                items.extend(reader)

    if possible_identity_headers and unionize and not tuple_mode:
        # Merge rows sharing a key value; later files overwrite earlier
        # fields on collision.
        key_column = possible_identity_headers.pop()
        merged_rows = defaultdict(dict)
        for row in items:
            key = row.get(key_column)
            if key is not None:  # rows with a null key cannot be joined
                merged_rows[key].update(row)
        items = list(merged_rows.values())

    if not tuple_mode:
        wr = csv.DictWriter(sys.stdout, delimiter="\t", fieldnames=header)
        wr.writeheader()
    else:
        wr = csv.writer(sys.stdout, delimiter="\t")
        wr.writerow(header)
    wr.writerows(items)


if __name__ == "__main__":
    cli()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-union.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,99 @@ +<tool id="table-union" name="Table Union: Merge tabular datasets" version="0.1.0"> + <description>by combining their rows and unionizing columns</description> + <requirements> + <container type="docker">cfsanbiostatistics/tableops:latest</container> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + #if $input_type.source_select == 'two' + #set tables=($table_1, $table_2) + #else + #set tables=$input_type.tables + #end if + python $__tool_directory__/table-union.py $adv.union $adv.tuple_mode + #for $table in $tables + $table + #end for + > $combined_table + ]]></command> + <inputs> + <conditional name="input_type"> + <param name="source_select" type="select" label="Mode"> + <option value="two">Combine two tables</option> + <option value="collection">Combine a collection of tables</option> + </param> + <when value="collection"> + <param name="tables" type="data_collection" format="tabular,tsv" collection_type="list" optional="true" /> + </when> + <when value="two"> + <param name="table_1" type="data" format="tabular,tsv" optional="true" /> + <param name="table_2" type="data" format="tabular,tsv" optional="true"/> + </when> + </conditional> + <!-- Advanced options, click to show --> + <section name="adv" title="Advanced options"> + <param name="union" type="boolean" label="Attempt to unionize on an autodetected key column" checked="false" truevalue="--unionize" falsevalue="" /> + <param name="tuple_mode" type="boolean" label="For tables with inconsistent headers - unionize by column order instead of column label" checked="false" truevalue="--tuple" falsevalue="" /> + </section> + </inputs> + <outputs> + <data name="combined_table" format="tsv" label="Combined table" /> + </outputs> + + <tests> + <test> + <conditional name="input_type"> + <param name="source_select" value="collection" /> + + <param name="tables"> + <collection type="list"> + 
<element name="table_1" value="dingbat.tsv" /> + <element name="table_2" value="loki.tsv" /> + </collection> + </param> + </conditional> + <section name="adv"> + <param name="union" value="true" /> + </section> + <output name="combined_table" value="combined.tsv" /> + </test> + <test> + <conditional name="input_type"> + <param name="source_select" value="two" /> + + <param name="table_1" value="dingbat.tsv" /> + <param name="table_2" value="loki.tsv" /> + </conditional> + <section name="adv"> + <param name="union" value="false" /> + </section> + <output name="combined_table" value="no_union.tsv" /> + </test> + <test> + <conditional name="input_type"> + <param name="source_select" value="two" /> + + <param name="table_1" value="dingbat.tsv" /> + <param name="table_2" value="loki.tsv" /> + </conditional> + <section name="adv"> + <param name="tuple_mode" value="true" /> + </section> + <output name="combined_table" value="tuple.tsv" /> + </test> + </tests> + <help><![CDATA[ + Concatenate a collection of TSV files while preserving and unionizing headers. + ]]></help> + + <citations> + <citation type="bibtex"> +@misc{githubsometool, + author = {Payne, Justin}, + year = {2022}, + title = {tableops}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/CFSAN-Biostatistics/table-ops}, +}</citation> + </citations> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_tables_ops.py Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,105 @@ +import unittest +import subprocess +import os +import csv + + +class TestTableOps(unittest.TestCase): + TEST_DATA_DIR = "test-data" + + def _run_command(self, command, input_data=None): + process = subprocess.Popen( + command, + stdin=subprocess.PIPE if input_data else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, # Important for handling text I/O + ) + stdout, stderr = process.communicate(input_data) + return process.returncode, stdout, stderr + + def _compare_tsv(self, expected_file, actual_output): + with open( + os.path.join(self.TEST_DATA_DIR, expected_file), "r", encoding="utf-8" + ) as f: + expected_lines = list(csv.reader(f, delimiter="\t")) + actual_lines = list(csv.reader(actual_output.splitlines(), delimiter="\t")) + self.assertEqual(expected_lines, actual_lines) + + def test_table_union_union(self): + returncode, stdout, stderr = self._run_command( + [ + "table-union", + os.path.join(self.TEST_DATA_DIR, "dingbat.tsv"), + os.path.join(self.TEST_DATA_DIR, "loki.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("combined.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_union_join(self): + returncode, stdout, stderr = self._run_command( + [ + "table-union", + "--no-union", + os.path.join(self.TEST_DATA_DIR, "users.tsv"), + os.path.join(self.TEST_DATA_DIR, "orders.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("merged_expected.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_summarize(self): + returncode, stdout, stderr = self._run_command( + ["table-summarize", os.path.join(self.TEST_DATA_DIR, "data_summarize.tsv")] + ) + self.assertEqual(returncode, 0) + + expected_summary = """Summary: +Category: +\t - A: 3 of 6 +\t - B: 2 of 6 +\t - C: 1 of 6 +Value: +\t - 10: 1 of 6 +\t - 12: 1 of 6 +\t - 15: 1 of 6 +\t - 20: 1 of 6 +\t - 25: 1 of 6 +\t - 
30: 1 of 6 +""" + self.assertEqual(stdout.strip(), expected_summary.strip()) + self.assertEqual(stderr, "") + + def test_table_sort(self): + returncode, stdout, stderr = self._run_command( + [ + "table-sort", + "-k", + "Age", + "-k", + "Name", + os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("sorted_data_expected.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_sort_pipe(self): + with open( + os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), "r", encoding="utf-8" + ) as infile: + input_data = infile.read() + returncode, stdout, stderr = self._run_command( + ["table-sort", "-k", "Age", "-k", "Name"], input_data + ) + self.assertEqual(returncode, 0) + self._compare_tsv("sorted_data_expected.tsv", stdout) + self.assertEqual(stderr, "") + + +if __name__ == "__main__": + unittest.main()
