Mercurial > repos > jpayne > tableops
changeset 0:402b58f45844 draft default tip
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 15:03:06 +0000 |
| parents | |
| children | |
| files | Dockerfile README.MD job_conf.yml table-sort.py table-sort.xml table-summarize.py table-summary.xml table-union.py table-union.xml test_tables_ops.py |
| diffstat | 10 files changed, 493 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Dockerfile Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,9 @@ +FROM python:3.10-slim-bullseye + +COPY . /tools +WORKDIR /tools + +RUN pip install --no-cache-dir click + +ENTRYPOINT [] +CMD ["python", "table-union.py"] \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.MD Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,37 @@ +# Table Ops + +A collection of simple command-line table manipulation tools written in Python. These tools are designed to be efficient and easy to use for common table operations. + +## Tools + +### `table-union` + +Merges multiple tabular data files (e.g., CSV, TSV) either by unioning rows with identical columns or by performing a join based on shared key columns. + +**Key Features:** + +* **Union Mode (Default):** Combines rows from all input files, assuming they have the same columns. Duplicate rows are retained. +* **Join Mode (`--unionize`):** Performs a join operation based on automatically detected shared key columns. It intelligently identifies potential key columns by looking for columns with unique, non-null values across all input files. This mode merges rows based on matching key values. +* **Automatic Key Detection:** Automatically identifies suitable columns for joining based on uniqueness and non-null constraints. +* **Tab-delimited input:** Operates on tab-separated (TSV) files. +* **Memory Efficient:** Optimized to handle large files without loading them entirely into memory (where possible). + +**Usage Example:** + +```bash +table-union file1.tsv file2.tsv file3.tsv > output.tsv +``` + +```bash +table-summarize data.tsv +``` + +```bash +table-sort Age Name < data.tsv > sorted_data.tsv +``` + +**Run Unit Tests:** + +```bash +python -m unittest test_tables_ops.py +```
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/job_conf.yml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,37 @@ +runners: + local: + load: galaxy.jobs.runners.local:LocalJobRunner + workers: 16 + +# handling: +# processes: +# handler0: + +execution: + default: local + environments: + local: + runner: local + docker_local: + runner: local + docker_enabled: true + # container: "auto" + docker_volumes: $defaults + # docker_set_user: null + docker_run_extra_arguments: "--entrypoint ''" + docker_set_user: root + +tools: +- id: table-sort + environment: docker_local +- id: table-union + environment: docker_local +- id: table-summary + environment: docker_local + +limits: +- + # Amount of time a job can run (in any environment) before it + # will be terminated by Galaxy. + type: walltime + value: '01:00:00' \ No newline at end of file
#!/usr/bin/env python3

"""Sort a TSV table read from stdin by one or more column headers."""

import csv
import sys


def main(headers):
    """Sort stdin's TSV rows ascending by *headers* and write TSV to stdout.

    headers: list of column names to sort by, in priority order.
    Raises ValueError if none of the requested headers exist in the table.
    """
    rows = csv.DictReader(sys.stdin, dialect="excel-tab")

    # Empty input (no header line at all): nothing to sort, emit nothing.
    if not rows.fieldnames:
        return

    if not set(headers).intersection(rows.fieldnames):
        raise ValueError(
            f"Couldn't find any of supplied headers ({', '.join(map(repr, headers))}) in the table."
        )

    # Short rows make DictReader store None for the missing trailing fields;
    # `d.get(h, "")` only defaults on *absent* keys, so coerce None to "" with
    # `or` — otherwise the tuple sort key compares None with str (TypeError).
    items = sorted(rows, key=lambda d: tuple(d.get(h) or "" for h in headers))

    wr = csv.DictWriter(sys.stdout, dialect="excel-tab", fieldnames=rows.fieldnames)
    wr.writeheader()
    wr.writerows(items)


if __name__ == "__main__":
    main(sys.argv[1:])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-sort.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,34 @@ +<tool id="table-sort" name="Table Sort: Sort tabular dataset" version="0.1.0"> + <description>on specified header or headers</description> + <requirements> + <container type="docker">cfsanbiostatistics/tableops:latest</container> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + cat $table | python $__tool_directory__/table-sort.py $headers > $sorted + ]]></command> + <inputs> + <param name="table" format="tsv" label="Table in TSV format to sort." type="data" /> + <param name="headers" type="text" label="Space-delimited list of headers on which to sort." /> + </inputs> + <outputs> + <data name="sorted" format="tsv,tabular" label="Sorted table" /> + </outputs> + <tests> + <test> + <param name="table" value="combined.tsv" ftype="tsv" /> + <param name="headers" value="flavor" /> + <output name="sorted" value="sorted.tsv"/> + </test> + <test> + <param name="table" value="sort_test_multiple.txt" ftype="tsv" /> + <param name="headers" value="A D" /> + <output name="sorted" value="sort_test_multiple_sorted.txt"/> + </test> + </tests> + <help><![CDATA[ + Sort a table, ascending, by one or more rows. + ]]></help> + + <citations> + </citations> +</tool> \ No newline at end of file
#! /usr/bin/env python

"""Summarize a TSV table by counting unique values in each non-key column."""

import csv
import sys
from collections import Counter, OrderedDict


def main(table):
    """Print, for every column after the first, how often each value occurs.

    table: path to a tab-delimited file whose first row is the header.
    The first column is treated as a row identifier and is not summarized.
    """
    with open(table, "r", newline="", encoding="utf-8") as table_f:
        rdr = csv.DictReader(table_f, delimiter="\t")

        # Need at least one data column beyond the leading key column.
        if not rdr.fieldnames or len(rdr.fieldnames) <= 1:
            print("No data columns found in the table.")
            return

        summary = OrderedDict()
        # Count rows directly: DictReader.line_num counts *physical* lines,
        # which diverges from the row count for quoted multi-line fields.
        total = 0
        for row in rdr:
            total += 1
            for name in rdr.fieldnames[1:]:
                summary.setdefault(name, Counter())[row[name]] += 1

    print("Summary:")
    for name, results in summary.items():
        print(f"{name}:")
        for result, num in results.items():
            if result:  # skip empty/None cells
                print(f"\t - {result}: {num} of {total}")


if __name__ == "__main__":
    main(sys.argv[1])
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-summary.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,28 @@ +<tool id="table-summary" name="Table Summarize: Summarize a tabular dataset" version="0.1.0"> + <description>by counting unique values in columns</description> + <requirements> + <container type="docker">cfsanbiostatistics/tableops:latest</container> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + python $__tool_directory__/table-summarize.py $table > $summary + ]]></command> + <inputs> + <param name="table" type="data" format="tsv,tabular" /> + </inputs> + <outputs> + <data name="summary" format="txt" label="Summary" /> + </outputs> + + <tests> + <test> + <param name="table" value="summary_test.txt" /> + <output name="summary" value="summary.txt" /> + </test> + </tests> + <help><![CDATA[ + Summarize tabular results by counting up unique values in columns. + ]]></help> + + <citations> + </citations> +</tool> \ No newline at end of file
#! /usr/bin/env python

"""Merge tabular (TSV) files: concatenate rows, join on an autodetected key
column, or combine by column position for header-less consistency."""

import csv
import sys
from collections import defaultdict

import click


@click.command()
@click.option(
    "--unionize/--no-unionize",
    default=False,
    help="Attempt to unionize on an autodetected key column",
    is_flag=True,
)
@click.option(
    "--tuple/--no-tuple",
    "tuple_mode",
    default=False,
    help="For tables with inconsistent headers - unionize by column order instead of column label",
)
@click.argument("files", nargs=-1, type=click.Path(exists=True))
def cli(files, unionize=False, tuple_mode=False):
    """Combine FILES and write the merged TSV table to stdout."""
    header = []
    items = []
    # Columns that remain candidate join keys (unique, non-null in every
    # file seen so far); None until the first non-empty file is read.
    possible_identity_headers = None

    for fi in files:
        with open(fi, "r", newline="", encoding="utf-8") as table:
            if not tuple_mode:
                reader = csv.DictReader(table, delimiter="\t")

                # A zero-byte file has no header row (fieldnames is None);
                # skip it instead of crashing when iterating fieldnames.
                if not reader.fieldnames:
                    continue

                # Append columns not yet seen, preserving first-seen order.
                header_set = set(header)
                header.extend(
                    field for field in reader.fieldnames if field not in header_set
                )

                rows = list(reader)
                if not rows:  # header-only file contributes no data
                    continue

                if unionize:
                    if possible_identity_headers is None:
                        possible_identity_headers = set(reader.fieldnames)

                    # Keep only columns whose values are unique and non-null
                    # within this file as well.
                    possible_identity_headers.intersection_update(
                        f
                        for f in reader.fieldnames
                        if len({row[f] for row in rows if f in row}) == len(rows)
                        and all(row.get(f) is not None for row in rows)
                    )
                items.extend(rows)
            else:
                reader = csv.reader(table, delimiter="\t")
                # next(reader, None) instead of bare next(): an empty file
                # would otherwise raise StopIteration.
                first_row = next(reader, None)
                if first_row is None:
                    continue
                if not header:
                    header = first_row  # first file's header wins
                # For subsequent files first_row is their header: discard it.
                items.extend(reader)

    if possible_identity_headers and unionize and not tuple_mode:
        # Merge rows sharing a key value; later files overwrite earlier
        # fields on collision.
        key_column = possible_identity_headers.pop()
        merged_rows = defaultdict(dict)
        for row in items:
            key = row.get(key_column)
            if key is not None:  # rows with a null key cannot be joined
                merged_rows[key].update(row)
        items = list(merged_rows.values())

    if not tuple_mode:
        wr = csv.DictWriter(sys.stdout, delimiter="\t", fieldnames=header)
        wr.writeheader()
    else:
        wr = csv.writer(sys.stdout, delimiter="\t")
        wr.writerow(header)
    wr.writerows(items)


if __name__ == "__main__":
    cli()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/table-union.xml Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,99 @@ +<tool id="table-union" name="Table Union: Merge tabular datasets" version="0.1.0"> + <description>by combining their rows and unionizing columns</description> + <requirements> + <container type="docker">cfsanbiostatistics/tableops:latest</container> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + #if $input_type.source_select == 'two' + #set tables=($table_1, $table_2) + #else + #set tables=$input_type.tables + #end if + python $__tool_directory__/table-union.py $adv.union $adv.tuple_mode + #for $table in $tables + $table + #end for + > $combined_table + ]]></command> + <inputs> + <conditional name="input_type"> + <param name="source_select" type="select" label="Mode"> + <option value="two">Combine two tables</option> + <option value="collection">Combine a collection of tables</option> + </param> + <when value="collection"> + <param name="tables" type="data_collection" format="tabular,tsv" collection_type="list" optional="true" /> + </when> + <when value="two"> + <param name="table_1" type="data" format="tabular,tsv" optional="true" /> + <param name="table_2" type="data" format="tabular,tsv" optional="true"/> + </when> + </conditional> + <!-- Advanced options, click to show --> + <section name="adv" title="Advanced options"> + <param name="union" type="boolean" label="Attempt to unionize on an autodetected key column" checked="false" truevalue="--unionize" falsevalue="" /> + <param name="tuple_mode" type="boolean" label="For tables with inconsistent headers - unionize by column order instead of column label" checked="false" truevalue="--tuple" falsevalue="" /> + </section> + </inputs> + <outputs> + <data name="combined_table" format="tsv" label="Combined table" /> + </outputs> + + <tests> + <test> + <conditional name="input_type"> + <param name="source_select" value="collection" /> + + <param name="tables"> + <collection type="list"> + 
<element name="table_1" value="dingbat.tsv" /> + <element name="table_2" value="loki.tsv" /> + </collection> + </param> + </conditional> + <section name="adv"> + <param name="union" value="true" /> + </section> + <output name="combined_table" value="combined.tsv" /> + </test> + <test> + <conditional name="input_type"> + <param name="source_select" value="two" /> + + <param name="table_1" value="dingbat.tsv" /> + <param name="table_2" value="loki.tsv" /> + </conditional> + <section name="adv"> + <param name="union" value="false" /> + </section> + <output name="combined_table" value="no_union.tsv" /> + </test> + <test> + <conditional name="input_type"> + <param name="source_select" value="two" /> + + <param name="table_1" value="dingbat.tsv" /> + <param name="table_2" value="loki.tsv" /> + </conditional> + <section name="adv"> + <param name="tuple_mode" value="true" /> + </section> + <output name="combined_table" value="tuple.tsv" /> + </test> + </tests> + <help><![CDATA[ + Concatenate a collection of TSV files while preserving and unionizing headers. + ]]></help> + + <citations> + <citation type="bibtex"> +@misc{githubsometool, + author = {Payne, Justin}, + year = {2022}, + title = {tableops}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/CFSAN-Biostatistics/table-ops}, +}</citation> + </citations> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test_tables_ops.py Mon Dec 08 15:03:06 2025 +0000 @@ -0,0 +1,105 @@ +import unittest +import subprocess +import os +import csv + + +class TestTableOps(unittest.TestCase): + TEST_DATA_DIR = "test-data" + + def _run_command(self, command, input_data=None): + process = subprocess.Popen( + command, + stdin=subprocess.PIPE if input_data else None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, # Important for handling text I/O + ) + stdout, stderr = process.communicate(input_data) + return process.returncode, stdout, stderr + + def _compare_tsv(self, expected_file, actual_output): + with open( + os.path.join(self.TEST_DATA_DIR, expected_file), "r", encoding="utf-8" + ) as f: + expected_lines = list(csv.reader(f, delimiter="\t")) + actual_lines = list(csv.reader(actual_output.splitlines(), delimiter="\t")) + self.assertEqual(expected_lines, actual_lines) + + def test_table_union_union(self): + returncode, stdout, stderr = self._run_command( + [ + "table-union", + os.path.join(self.TEST_DATA_DIR, "dingbat.tsv"), + os.path.join(self.TEST_DATA_DIR, "loki.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("combined.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_union_join(self): + returncode, stdout, stderr = self._run_command( + [ + "table-union", + "--no-union", + os.path.join(self.TEST_DATA_DIR, "users.tsv"), + os.path.join(self.TEST_DATA_DIR, "orders.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("merged_expected.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_summarize(self): + returncode, stdout, stderr = self._run_command( + ["table-summarize", os.path.join(self.TEST_DATA_DIR, "data_summarize.tsv")] + ) + self.assertEqual(returncode, 0) + + expected_summary = """Summary: +Category: +\t - A: 3 of 6 +\t - B: 2 of 6 +\t - C: 1 of 6 +Value: +\t - 10: 1 of 6 +\t - 12: 1 of 6 +\t - 15: 1 of 6 +\t - 20: 1 of 6 +\t - 25: 1 of 6 +\t - 
30: 1 of 6 +""" + self.assertEqual(stdout.strip(), expected_summary.strip()) + self.assertEqual(stderr, "") + + def test_table_sort(self): + returncode, stdout, stderr = self._run_command( + [ + "table-sort", + "-k", + "Age", + "-k", + "Name", + os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), + ] + ) + self.assertEqual(returncode, 0) + self._compare_tsv("sorted_data_expected.tsv", stdout) + self.assertEqual(stderr, "") + + def test_table_sort_pipe(self): + with open( + os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), "r", encoding="utf-8" + ) as infile: + input_data = infile.read() + returncode, stdout, stderr = self._run_command( + ["table-sort", "-k", "Age", "-k", "Name"], input_data + ) + self.assertEqual(returncode, 0) + self._compare_tsv("sorted_data_expected.tsv", stdout) + self.assertEqual(stderr, "") + + +if __name__ == "__main__": + unittest.main()
