# HG changeset patch # User jpayne # Date 1765206186 0 # Node ID 402b58f45844077d7650e31a651ee6cc80443ff4 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty diff -r 000000000000 -r 402b58f45844 Dockerfile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dockerfile Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,9 @@ +FROM python:3.10-slim-bullseye + +COPY . /tools +WORKDIR /tools + +RUN pip install --no-cache-dir click + +ENTRYPOINT [] +CMD ["python", "table-union.py"] \ No newline at end of file diff -r 000000000000 -r 402b58f45844 README.MD --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.MD Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,37 @@ +# Table Ops + +A collection of simple command-line table manipulation tools written in Python. These tools are designed to be efficient and easy to use for common table operations. + +## Tools + +### `table-union` + +Merges multiple tabular data files (e.g., CSV, TSV) either by unioning rows with identical columns or by performing a join based on shared key columns. + +**Key Features:** + +* **Union Mode (Default):** Combines rows from all input files, assuming they have the same columns. Duplicate rows are retained. +* **Join Mode (`--no-union` or similar):** Performs a join operation based on automatically detected shared key columns. It intelligently identifies potential key columns by looking for columns with unique, non-null values across all input files. This mode merges rows based on matching key values. +* **Automatic Key Detection:** Automatically identifies suitable columns for joining based on uniqueness and non-null constraints. +* **Handles various delimiters:** Supports tab-separated (TSV) and comma-separated (CSV) files. +* **Memory Efficient:** Optimized to handle large files without loading them entirely into memory (where possible). + +**Usage Example:** + +```bash +table-union file1.tsv file2.tsv file3.tsv > output.tsv +``` + +```bash +table-summarize data.tsv +``` + +```bash +table-sort -k Age -k Name data.tsv > sorted_data.tsv +``` + +**Run Unit Tests:** + +```bash +python -m unittest test_table_ops.py +``` diff -r 000000000000 -r 402b58f45844 job_conf.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/job_conf.yml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,37 @@ +runners: + local: + load: galaxy.jobs.runners.local:LocalJobRunner + workers: 16 + +# handling: +# processes: +# handler0: + +execution: + default: local + environments: + local: + runner: local + docker_local: + runner: local + docker_enabled: true + # container: "auto" + docker_volumes: $defaults + # docker_set_user: null + docker_run_extra_arguments: "--entrypoint ''" + docker_set_user: root + +tools: +- id: table-sort + environment: docker_local +- id: table-union + environment: docker_local +- id: table-summary + environment: docker_local + +limits: +- + # Amount of time a job can run (in any environment) before it + # will be terminated by Galaxy. + type: walltime + value: '01:00:00' \ No newline at end of file diff -r 000000000000 -r 402b58f45844 table-sort.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-sort.py Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +import csv +import sys + + +def main(headers): + rows = csv.DictReader(sys.stdin, delimiter="\t", dialect="excel-tab") + + # More efficient header check using set intersection + if not set(headers).intersection(rows.fieldnames): + raise ValueError( + f"Couldn't find any of supplied headers ({', '.join(map(repr, headers))}) in the table." + ) + + # Optimized sorting using tuple comparison (generally faster than list comparison) + items = sorted(rows, key=lambda d: tuple(d.get(h, "") for h in headers)) + + wr = csv.DictWriter(sys.stdout, dialect="excel-tab", fieldnames=rows.fieldnames) + wr.writeheader() + wr.writerows(items) + # sys.stdout.flush() + + +if __name__ == "__main__": + main(sys.argv[1:]) diff -r 000000000000 -r 402b58f45844 table-sort.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-sort.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,34 @@ + + on specified header or headers + + cfsanbiostatistics/tableops:latest + + $sorted + ]]> + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 000000000000 -r 402b58f45844 table-summarize.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-summarize.py Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,39 @@ +#! /usr/bin/env python + +from __future__ import print_function + +import csv +import sys +from collections import Counter, OrderedDict + + +def main(table): + with open( + table, "r", newline="", encoding="utf-8" + ) as table_f: # Improved file opening + rdr = csv.DictReader(table_f, delimiter="\t", dialect="excel") + + # Check if fieldnames exist before proceeding to avoid potential errors + if not rdr.fieldnames or len(rdr.fieldnames) <= 1: + print("No data columns found in the table.") + return + + summary = OrderedDict() + for row in rdr: # Iterate directly without creating a list in memory + for name in rdr.fieldnames[1:]: + summary.setdefault(name, Counter()).update( + [row[name]] + ) # More efficient counting + + total = rdr.line_num - 1 # get the number of rows + + print("Summary:") + for name, results in summary.items(): + print(f"{name}:") # f-string + for result, num in results.items(): + if result: + print(f"\t - {result}: {num} of {total}") # f-string + + +if __name__ == "__main__": + main(sys.argv[1]) diff -r 000000000000 -r 402b58f45844 table-summary.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-summary.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,28 @@ + + by counting unique values in columns + + cfsanbiostatistics/tableops:latest + + $summary + ]]> + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -r 000000000000 -r 402b58f45844 table-union.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-union.py Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,79 @@ +#! /usr/bin/env python +import csv +import sys +from collections import defaultdict + +import click + +@click.command() +@click.option("--unionize/--no-unionize", default=False, help="Attempt to unionize on an autodetected key column", is_flag=True) +@click.option("--tuple/--no-tuple", "tuple_mode", default=False, help="For tables with inconsistent headers - unionize by column order instead of column label") +@click.argument("files", nargs=-1, type=click.Path(exists=True)) +def cli(files, unionize=False, tuple_mode=False): + header = [] + items = [] + possible_identity_headers = None + + for fi in files: + with open( + fi, "r", newline="", encoding="utf-8" + ) as table: # Improved file opening + if not tuple_mode: + reader = csv.DictReader(table, delimiter="\t", dialect="excel") + + # Efficient header update using set operations + header_set = set(header) + new_headers = [ + field for field in reader.fieldnames if field not in header_set + ] + header.extend(new_headers) + + rows = list(reader) # Keep this for now, but see optimization below + if not rows: # skip empty files + continue + + if unionize: + # More efficient identity header detection + if possible_identity_headers is None: + possible_identity_headers = set(reader.fieldnames) + + # Optimized identity header filtering + possible_identity_headers.intersection_update( + f + for f in reader.fieldnames + if len({row[f] for row in rows if f in row}) == len(rows) + and all(row.get(f) is not None for row in rows) + ) + items.extend(rows) + else: + reader = csv.reader(table, delimiter="\t", dialect="excel") + if not header: + header = next(reader) + else: + next(reader) # skip header in subsequent files + items.extend(reader) + + + if possible_identity_headers and unionize and not tuple_mode: + key_column = possible_identity_headers.pop() + # More efficient merging using defaultdict + merged_rows = defaultdict(dict) + for row in items: + key = row.get(key_column) + if key is not None: # skip rows with null keys + merged_rows[key].update(row) + items = list(merged_rows.values()) + + if not tuple_mode: + wr = csv.DictWriter( + sys.stdout, delimiter="\t", dialect="excel", fieldnames=header + ) + wr.writeheader() + else: + wr = csv.writer(sys.stdout, delimiter="\t", dialect="excel") + wr.writerow(header) + wr.writerows(items) + + +if __name__ == "__main__": + cli() diff -r 000000000000 -r 402b58f45844 table-union.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-union.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,99 @@ + + by combining their rows and unionizing columns + + cfsanbiostatistics/tableops:latest + + $combined_table + ]]> + + + + + + + + + + + + + + + +
+ + +
+
+ + + + + + + + + + + + + + + + +
+ +
+ +
+ + + + + + + +
+ +
+ +
+ + + + + + + +
+ +
+ +
+
+ + + + +@misc{githubsometool, + author = {Payne, Justin}, + year = {2022}, + title = {tableops}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/CFSAN-Biostatistics/table-ops}, +} + +
\ No newline at end of file diff -r 000000000000 -r 402b58f45844 test_tables_ops.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_tables_ops.py Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,105 @@ +import unittest +import subprocess +import os +import csv + + +class TestTableOps(unittest.TestCase): + TEST_DATA_DIR = "test-data" + + def _run_command(self, command, input_data=None): + process = subprocess.Popen( + command, + stdin=subprocess.PIPE if input_data else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, # Important for handling text I/O + ) + stdout, stderr = process.communicate(input_data) + return process.returncode, stdout, stderr + + def _compare_tsv(self, expected_file, actual_output): + with open( + os.path.join(self.TEST_DATA_DIR, expected_file), "r", encoding="utf-8" + ) as f: + expected_lines = list(csv.reader(f, delimiter="\t")) + actual_lines = list(csv.reader(actual_output.splitlines(), delimiter="\t")) + self.assertEqual(expected_lines, actual_lines) + + def test_table_union_union(self): + returncode, stdout, stderr = self._run_command( + [ + "table-union", + os.path.join(self.TEST_DATA_DIR, "dingbat.tsv"), + os.path.join(self.TEST_DATA_DIR, "loki.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("combined.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_union_join(self): + returncode, stdout, stderr = self._run_command( + [ + "table-union", + "--no-union", + os.path.join(self.TEST_DATA_DIR, "users.tsv"), + os.path.join(self.TEST_DATA_DIR, "orders.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("merged_expected.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_summarize(self): + returncode, stdout, stderr = self._run_command( + ["table-summarize", os.path.join(self.TEST_DATA_DIR, "data_summarize.tsv")] + ) + self.assertEqual(returncode, 0) + + expected_summary = """Summary: +Category: +\t - A: 3 of 6 +\t - B: 2 of 6 +\t - C: 1 of 6 +Value: +\t - 10: 1 of 6 +\t - 12: 1 of 6 +\t - 15: 1 of 6 +\t - 20: 1 of 6 +\t - 25: 1 of 6 +\t - 30: 1 of 6 +""" + self.assertEqual(stdout.strip(), expected_summary.strip()) + self.assertEqual(stderr, "") + + def test_table_sort(self): + returncode, stdout, stderr = self._run_command( + [ + "table-sort", + "-k", + "Age", + "-k", + "Name", + os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("sorted_data_expected.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_sort_pipe(self): + with open( + os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), "r", encoding="utf-8" + ) as infile: + input_data = infile.read() + returncode, stdout, stderr = self._run_command( + ["table-sort", "-k", "Age", "-k", "Name"], input_data + ) + self.assertEqual(returncode, 0) + self._compare_tsv("sorted_data_expected.tsv", stdout) + self.assertEqual(stderr, "") + + +if __name__ == "__main__": + unittest.main()