# HG changeset patch
# User jpayne
# Date 1765206186 0
# Node ID 402b58f45844077d7650e31a651ee6cc80443ff4
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
diff -r 000000000000 -r 402b58f45844 Dockerfile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Dockerfile Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,11 @@
+FROM python:3.10-slim-bullseye
+
+COPY . /tools
+WORKDIR /tools
+
+RUN pip install --no-cache-dir click
+
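+# Leave the entrypoint empty so the job runner can supply its own command
+# (job_conf.yml passes --entrypoint '' to docker run); CMD runs the union tool.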
+ENTRYPOINT []
+CMD ["python", "table-union.py"]
\ No newline at end of file
diff -r 000000000000 -r 402b58f45844 README.MD
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.MD Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,57 @@
+# Table Ops
+
+A collection of simple command-line tools for manipulating tab-delimited tables, written in Python.
+
+## Tools
+
+### `table-union`
+
+Merges multiple tab-separated (TSV) files, either by unioning rows across shared columns or by joining rows on an automatically detected key column.
+
+**Key Features:**
+
+* **Union Mode (Default):** Concatenates rows from all input files; columns are unioned by label, and a row is left blank in any column its file lacks. Duplicate rows are retained.
+* **Join Mode (`--unionize`):** Joins rows on an automatically detected shared key column, merging rows that carry the same key value (see the worked example below).
+* **Automatic Key Detection:** A column qualifies as a join key if it appears in every input file and its values are unique and non-null within each file.
+* **Tab-delimited I/O:** Reads and writes tab-separated (TSV) tables.
+* **In-memory operation:** Rows are held in memory while columns are unioned and keys detected, so very large tables need correspondingly large RAM.
+
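+A minimal sketch of the two modes, assuming two hypothetical inputs `left.tsv` and `right.tsv` that share a unique, non-null `ID` column:
+
+```bash
+# Union mode (default): rows from both files are concatenated and their
+# columns are unioned by label
+table-union left.tsv right.tsv > combined.tsv
+
+# Join mode: ID is autodetected as the key column and rows that share an ID
+# are merged into one
+table-union --unionize left.tsv right.tsv > merged.tsv
+```
+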
+**Usage Example:**
+
+```bash
+table-union file1.tsv file2.tsv file3.tsv > output.tsv
+```
+
+### `table-summarize`
+
+Prints, for each column after the first, a count of every distinct value.
+
+```bash
+table-summarize data.tsv
+```
+
+### `table-sort`
+
+Sorts a table on one or more headers, passed with `-k` in order of precedence.
+
+```bash
+table-sort -k Age -k Name data.tsv > sorted_data.tsv
+```
+
+**Run Unit Tests:**
+
+```bash
+python -m unittest test_tables_ops.py
+```
diff -r 000000000000 -r 402b58f45844 job_conf.yml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/job_conf.yml Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,37 @@
+runners:
+ local:
+ load: galaxy.jobs.runners.local:LocalJobRunner
+ workers: 16
+
+# handling:
+# processes:
+# handler0:
+
+execution:
+ default: local
+ environments:
+ local:
+ runner: local
+ docker_local:
+ runner: local
+ docker_enabled: true
+ # container: "auto"
+ docker_volumes: $defaults
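+      # Passing --entrypoint '' to docker run clears the image's entrypoint
+      # so the command line Galaxy builds is executed directly.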
+ docker_run_extra_arguments: "--entrypoint ''"
+ docker_set_user: root
+
+tools:
+- id: table-sort
+ environment: docker_local
+- id: table-union
+ environment: docker_local
+- id: table-summary
+ environment: docker_local
+
+limits:
+  # Amount of time a job can run (in any environment) before it
+  # will be terminated by Galaxy.
+  - type: walltime
+    value: '01:00:00'
\ No newline at end of file
diff -r 000000000000 -r 402b58f45844 table-sort.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/table-sort.py Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import sys
+
+
+def main(headers, table):
+    rows = csv.DictReader(table, dialect="excel-tab")
+
+    # Fail early if none of the requested headers appear in the table.
+    if not rows.fieldnames or not set(headers).intersection(rows.fieldnames):
+        raise ValueError(
+            f"Couldn't find any of supplied headers ({', '.join(map(repr, headers))}) in the table."
+        )
+
+    # Sort on the headers in the order given; rows missing a header sort as "".
+    items = sorted(rows, key=lambda d: tuple(d.get(h) or "" for h in headers))
+
+    wr = csv.DictWriter(sys.stdout, dialect="excel-tab", fieldnames=rows.fieldnames)
+    wr.writeheader()
+    wr.writerows(items)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Sort a TSV table on one or more headers."
+    )
+    parser.add_argument(
+        "-k",
+        dest="headers",
+        action="append",
+        required=True,
+        help="header to sort on; repeat for secondary sort keys",
+    )
+    parser.add_argument(
+        "table",
+        nargs="?",
+        type=argparse.FileType("r", encoding="utf-8"),
+        default=sys.stdin,
+        help="input TSV file (defaults to stdin)",
+    )
+    args = parser.parse_args()
+    main(args.headers, args.table)
diff -r 000000000000 -r 402b58f45844 table-sort.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/table-sort.xml Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,16 @@
+<!-- NOTE: the XML tags of this Galaxy tool wrapper were stripped when the
+     patch was captured; the skeleton below is a hedged reconstruction. The
+     tool id comes from job_conf.yml; the name and version are assumptions. -->
+<tool id="table-sort" name="table-sort" version="1.0">
+    <description>on specified header or headers</description>
+    <requirements>
+        <container type="docker">cfsanbiostatistics/tableops:latest</container>
+    </requirements>
+    <!-- Only the tail of the original command survived ("$sorted" and the
+         closing CDATA); the elided body is left as "..." -->
+    <command><![CDATA[
+        ... > $sorted
+    ]]></command>
+    <!-- The inputs, outputs, tests, and help sections did not survive and
+         are left as a gap rather than invented. -->
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 402b58f45844 table-summarize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/table-summarize.py Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import csv
+import sys
+from collections import Counter, OrderedDict
+
+
+def main(table):
+    with open(table, "r", newline="", encoding="utf-8") as table_f:
+        rdr = csv.DictReader(table_f, dialect="excel-tab")
+
+        # Nothing to summarize unless there is at least one column after
+        # the first (the first column is skipped).
+        if not rdr.fieldnames or len(rdr.fieldnames) <= 1:
+            print("No data columns found in the table.")
+            return
+
+        # Tally distinct values per column, streaming rows one at a time
+        # rather than materializing the whole table.
+        summary = OrderedDict()
+        for row in rdr:
+            for name in rdr.fieldnames[1:]:
+                summary.setdefault(name, Counter()).update([row[name]])
+
+        total = rdr.line_num - 1  # rows read, excluding the header line
+
+    print("Summary:")
+    for name, results in summary.items():
+        print(f"{name}:")
+        for result, num in results.items():
+            if result:
+                print(f"\t - {result}: {num} of {total}")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1])
diff -r 000000000000 -r 402b58f45844 table-summary.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/table-summary.xml Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,16 @@
+<!-- NOTE: the XML tags of this Galaxy tool wrapper were stripped when the
+     patch was captured; the skeleton below is a hedged reconstruction. The
+     tool id comes from job_conf.yml; the name and version are assumptions. -->
+<tool id="table-summary" name="table-summary" version="1.0">
+    <description>by counting unique values in columns</description>
+    <requirements>
+        <container type="docker">cfsanbiostatistics/tableops:latest</container>
+    </requirements>
+    <!-- Only the tail of the original command survived ("$summary" and the
+         closing CDATA); the elided body is left as "..." -->
+    <command><![CDATA[
+        ... > $summary
+    ]]></command>
+    <!-- The inputs, outputs, tests, and help sections did not survive and
+         are left as a gap rather than invented. -->
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 402b58f45844 table-union.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/table-union.py Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+import csv
+import sys
+from collections import defaultdict
+
+import click
+
+
+@click.command()
+@click.option(
+    "--unionize/--no-unionize",
+    default=False,
+    help="Attempt to unionize on an autodetected key column",
+)
+@click.option(
+    "--tuple/--no-tuple",
+    "tuple_mode",
+    default=False,
+    help="For tables with inconsistent headers - unionize by column order instead of column label",
+)
+@click.argument("files", nargs=-1, type=click.Path(exists=True))
+def cli(files, unionize, tuple_mode):
+    header = []
+    items = []
+    possible_identity_headers = None
+
+    for fi in files:
+        with open(fi, "r", newline="", encoding="utf-8") as table:
+            if not tuple_mode:
+                reader = csv.DictReader(table, dialect="excel-tab")
+                if not reader.fieldnames:  # skip files with no header row
+                    continue
+
+                # Union the headers, preserving first-seen order.
+                header_set = set(header)
+                header.extend(
+                    field for field in reader.fieldnames if field not in header_set
+                )
+
+                rows = list(reader)
+                if not rows:  # skip files with a header but no data rows
+                    continue
+
+                if unionize:
+                    # A join-key candidate must appear in every file and be
+                    # unique and non-null within each file.
+                    if possible_identity_headers is None:
+                        possible_identity_headers = set(reader.fieldnames)
+                    possible_identity_headers.intersection_update(
+                        f
+                        for f in reader.fieldnames
+                        if len({row[f] for row in rows if f in row}) == len(rows)
+                        and all(row.get(f) is not None for row in rows)
+                    )
+                items.extend(rows)
+            else:
+                reader = csv.reader(table, dialect="excel-tab")
+                file_header = next(reader, None)
+                if file_header is None:  # skip empty files
+                    continue
+                if not header:
+                    header = file_header
+                items.extend(reader)
+
+    if possible_identity_headers and unionize and not tuple_mode:
+        # Sets are unordered, so pick the candidate key deterministically.
+        key_column = sorted(possible_identity_headers)[0]
+        # Merge rows that share a key value; later files override earlier ones.
+        merged_rows = defaultdict(dict)
+        for row in items:
+            key = row.get(key_column)
+            if key is not None:  # skip rows with null keys
+                merged_rows[key].update(row)
+        items = list(merged_rows.values())
+
+    if not tuple_mode:
+        wr = csv.DictWriter(sys.stdout, dialect="excel-tab", fieldnames=header)
+        wr.writeheader()
+    else:
+        wr = csv.writer(sys.stdout, dialect="excel-tab")
+        wr.writerow(header)
+    wr.writerows(items)
+
+
+if __name__ == "__main__":
+    cli()
diff -r 000000000000 -r 402b58f45844 table-union.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/table-union.xml Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,28 @@
+<!-- NOTE: the XML tags of this Galaxy tool wrapper were stripped when the
+     patch was captured; the skeleton below is a hedged reconstruction. The
+     tool id comes from job_conf.yml; the name and version are assumptions. -->
+<tool id="table-union" name="table-union" version="1.0">
+    <description>by combining their rows and unionizing columns</description>
+    <requirements>
+        <container type="docker">cfsanbiostatistics/tableops:latest</container>
+    </requirements>
+    <!-- Only the tail of the original command survived ("$combined_table"
+         and the closing CDATA); the elided body is left as "..." -->
+    <command><![CDATA[
+        ... > $combined_table
+    ]]></command>
+    <!-- The inputs, outputs, tests, and help sections did not survive and
+         are left as a gap rather than invented. -->
+    <citations>
+        <citation type="bibtex">
+@misc{githubtableops,
+    author    = {Payne, Justin},
+    year      = {2022},
+    title     = {tableops},
+    publisher = {GitHub},
+    journal   = {GitHub repository},
+    url       = {https://github.com/CFSAN-Biostatistics/table-ops},
+}
+        </citation>
+    </citations>
+</tool>
\ No newline at end of file
diff -r 000000000000 -r 402b58f45844 test_tables_ops.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test_tables_ops.py Mon Dec 08 15:03:06 2025 +0000
@@ -0,0 +1,113 @@
+import csv
+import os
+import subprocess
+import sys
+import unittest
+
+
+class TestTableOps(unittest.TestCase):
+ TEST_DATA_DIR = "test-data"
+
+ def _run_command(self, command, input_data=None):
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE if input_data else None,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+            text=True,  # decode the child process's stdout/stderr as text
+ )
+ stdout, stderr = process.communicate(input_data)
+ return process.returncode, stdout, stderr
+
+ def _compare_tsv(self, expected_file, actual_output):
+ with open(
+ os.path.join(self.TEST_DATA_DIR, expected_file), "r", encoding="utf-8"
+ ) as f:
+ expected_lines = list(csv.reader(f, delimiter="\t"))
+ actual_lines = list(csv.reader(actual_output.splitlines(), delimiter="\t"))
+ self.assertEqual(expected_lines, actual_lines)
+
+ def test_table_union_union(self):
+ returncode, stdout, stderr = self._run_command(
+ [
+                sys.executable,
+                "table-union.py",
+ os.path.join(self.TEST_DATA_DIR, "dingbat.tsv"),
+ os.path.join(self.TEST_DATA_DIR, "loki.tsv"),
+ ]
+ )
+ self.assertEqual(returncode, 0)
+ self._compare_tsv("combined.tsv", stdout)
+ self.assertEqual(stderr, "")
+
+ def test_table_union_join(self):
+ returncode, stdout, stderr = self._run_command(
+ [
+                sys.executable,
+                "table-union.py",
+                "--unionize",
+ os.path.join(self.TEST_DATA_DIR, "users.tsv"),
+ os.path.join(self.TEST_DATA_DIR, "orders.tsv"),
+ ]
+ )
+ self.assertEqual(returncode, 0)
+ self._compare_tsv("merged_expected.tsv", stdout)
+ self.assertEqual(stderr, "")
+
+ def test_table_summarize(self):
+ returncode, stdout, stderr = self._run_command(
+            [
+                sys.executable,
+                "table-summarize.py",
+                os.path.join(self.TEST_DATA_DIR, "data_summarize.tsv"),
+            ]
+ )
+ self.assertEqual(returncode, 0)
+
+ expected_summary = """Summary:
+Category:
+\t - A: 3 of 6
+\t - B: 2 of 6
+\t - C: 1 of 6
+Value:
+\t - 10: 1 of 6
+\t - 12: 1 of 6
+\t - 15: 1 of 6
+\t - 20: 1 of 6
+\t - 25: 1 of 6
+\t - 30: 1 of 6
+"""
+ self.assertEqual(stdout.strip(), expected_summary.strip())
+ self.assertEqual(stderr, "")
+
+ def test_table_sort(self):
+ returncode, stdout, stderr = self._run_command(
+ [
+                sys.executable,
+                "table-sort.py",
+ "-k",
+ "Age",
+ "-k",
+ "Name",
+ os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"),
+ ]
+ )
+ self.assertEqual(returncode, 0)
+ self._compare_tsv("sorted_data_expected.tsv", stdout)
+ self.assertEqual(stderr, "")
+
+ def test_table_sort_pipe(self):
+ with open(
+ os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), "r", encoding="utf-8"
+ ) as infile:
+ input_data = infile.read()
+ returncode, stdout, stderr = self._run_command(
+            [sys.executable, "table-sort.py", "-k", "Age", "-k", "Name"], input_data
+ )
+ self.assertEqual(returncode, 0)
+ self._compare_tsv("sorted_data_expected.tsv", stdout)
+ self.assertEqual(stderr, "")
+
+
+if __name__ == "__main__":
+ unittest.main()