Mercurial repository: jpayne / tableops
comparison view: table-union.py @ changeset 0:402b58f45844 (draft, default, tip)

| field | value |
|---|---|
| description | planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty |
| author | jpayne |
| date | Mon, 08 Dec 2025 15:03:06 +0000 |
| parents | |
| children | |
Comparison of -1:000000000000 (null revision; file added) with 0:402b58f45844:

```python
#! /usr/bin/env python
import csv
import sys
from collections import defaultdict

import click


@click.command()
@click.option("--unionize/--no-unionize", default=False, help="Attempt to unionize on an autodetected key column", is_flag=True)
@click.option("--tuple/--no-tuple", "tuple_mode", default=False, help="For tables with inconsistent headers - unionize by column order instead of column label")
@click.argument("files", nargs=-1, type=click.Path(exists=True))
def cli(files, unionize=False, tuple_mode=False):
    header = []
    items = []
    possible_identity_headers = None

    for fi in files:
        with open(fi, "r", newline="", encoding="utf-8") as table:
            if not tuple_mode:
                reader = csv.DictReader(table, delimiter="\t", dialect="excel")
                if reader.fieldnames is None:  # completely empty file
                    continue

                # Append any column labels not already seen, preserving order
                header_set = set(header)
                new_headers = [
                    field for field in reader.fieldnames if field not in header_set
                ]
                header.extend(new_headers)

                rows = list(reader)
                if not rows:  # header-only file; nothing to add
                    continue

                if unionize:
                    # Start with every column of the first file as a candidate key
                    if possible_identity_headers is None:
                        possible_identity_headers = set(reader.fieldnames)

                    # Keep only columns whose values are unique and non-null
                    # within this file
                    possible_identity_headers.intersection_update(
                        f
                        for f in reader.fieldnames
                        if len({row[f] for row in rows if f in row}) == len(rows)
                        and all(row.get(f) is not None for row in rows)
                    )
                items.extend(rows)
            else:
                reader = csv.reader(table, delimiter="\t", dialect="excel")
                if not header:
                    header = next(reader, [])
                else:
                    next(reader, None)  # skip the header row in subsequent files
                items.extend(reader)

    if possible_identity_headers and unionize and not tuple_mode:
        key_column = possible_identity_headers.pop()
        # Merge rows that share the same value in the detected key column
        merged_rows = defaultdict(dict)
        for row in items:
            key = row.get(key_column)
            if key is not None:  # skip rows with null keys
                merged_rows[key].update(row)
        items = list(merged_rows.values())

    if not tuple_mode:
        wr = csv.DictWriter(
            sys.stdout, delimiter="\t", dialect="excel", fieldnames=header
        )
        wr.writeheader()
    else:
        wr = csv.writer(sys.stdout, delimiter="\t", dialect="excel")
        wr.writerow(header)
    wr.writerows(items)


if __name__ == "__main__":
    cli()
```
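On the command line the script would be invoked as something like `table-union.py --unionize a.tsv b.tsv > merged.tsv` (filenames hypothetical; merged output is written to stdout). For reference, the sketch below walks through the unionize merge the script performs, using hypothetical in-memory tables instead of TSV files; the column names and values are purely illustrative, not taken from the repository:

```python
from collections import defaultdict

# Two hypothetical tab-separated tables, already parsed into dicts.
table_a = [{"sample": "s1", "reads": "100"}, {"sample": "s2", "reads": "250"}]
table_b = [{"sample": "s1", "qc": "pass"}, {"sample": "s2", "qc": "fail"}]

# A column is a candidate key if its values are unique and non-null in every table.
candidates = None
for rows in (table_a, table_b):
    fields = rows[0].keys()
    good = {
        f for f in fields
        if len({row[f] for row in rows if f in row}) == len(rows)
        and all(row.get(f) is not None for row in rows)
    }
    candidates = good if candidates is None else candidates & good

key = candidates.pop()  # "sample" is the only column unique in both tables

# Merge rows that share the same key value, mirroring the merge loop in cli().
merged = defaultdict(dict)
for row in table_a + table_b:
    merged[row[key]].update(row)

print(list(merged.values()))
# [{'sample': 's1', 'reads': '100', 'qc': 'pass'},
#  {'sample': 's2', 'reads': '250', 'qc': 'fail'}]
```

Note that when more than one column survives the uniqueness filter, the script takes an arbitrary survivor via `set.pop()`, so the detected key column is not guaranteed to be the same across runs with different column orderings.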
