comparison table-union.py @ 0:402b58f45844 draft default tip

planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
author jpayne
date Mon, 08 Dec 2025 15:03:06 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:402b58f45844
1 #! /usr/bin/env python
2 import csv
3 import sys
4 from collections import defaultdict
5
6 import click
7
@click.command()
@click.option(
    "--unionize/--no-unionize",
    default=False,
    help="Attempt to unionize on an autodetected key column",
    is_flag=True,
)
@click.option(
    "--tuple/--no-tuple",
    "tuple_mode",
    default=False,
    help="For tables with inconsistent headers - unionize by column order instead of column label",
)
@click.argument("files", nargs=-1, type=click.Path(exists=True))
def cli(files, unionize=False, tuple_mode=False):
    """Combine tab-separated tables and write the result to standard output.

    By default columns are matched by header label and the rows of all FILES
    are concatenated. With --unionize, rows that share a value in an
    autodetected key column are merged into a single row. With --tuple,
    columns are matched by position and the first header found is used.
    """
    # NOTE: click displays the docstring above as the command's --help text,
    # so implementation notes live in comments instead.
    #
    # files:      paths of the tab-separated input tables, in merge order.
    # unionize:   merge rows on a key column; ignored when tuple_mode is set.
    # tuple_mode: treat rows as positional lists rather than labelled dicts.
    header = []
    items = []
    # Columns that are fully populated and unique within every non-empty file
    # read so far; stays None until the first such file has been seen.
    possible_identity_headers = None

    for fi in files:
        with open(fi, "r", newline="", encoding="utf-8") as table:
            if tuple_mode:
                reader = csv.reader(table, delimiter="\t", dialect="excel")
                # A default for next() keeps a zero-byte file from raising
                # StopIteration; such a file contributes nothing.
                first_row = next(reader, None)
                if first_row is None:
                    continue
                if not header:
                    header = first_row
                # Otherwise first_row is a later file's header: drop it.
                items.extend(reader)
                continue

            reader = csv.DictReader(table, delimiter="\t", dialect="excel")
            fieldnames = reader.fieldnames
            if fieldnames is None:
                # Zero-byte file: there is no header row to read at all.
                continue

            # Append only labels not seen in earlier files, keeping the
            # first-seen column order.
            known_labels = set(header)
            header.extend(
                [field for field in fieldnames if field not in known_labels]
            )

            rows = list(reader)
            if not rows:  # header-only file: nothing to merge
                continue

            if unionize:
                if possible_identity_headers is None:
                    possible_identity_headers = set(fieldnames)
                row_count = len(rows)
                # A key candidate must have a value on every row of this file
                # and no value repeated within the file.
                possible_identity_headers.intersection_update(
                    field
                    for field in fieldnames
                    if all(row.get(field) is not None for row in rows)
                    and len({row.get(field) for row in rows}) == row_count
                )
            items.extend(rows)

    if unionize and not tuple_mode and possible_identity_headers:
        # set.pop() would return an arbitrary candidate, making the output
        # vary between runs; take the first candidate in header order.
        key_column = next(
            label for label in header if label in possible_identity_headers
        )
        merged_rows = defaultdict(dict)
        for row in items:
            key = row.get(key_column)
            if key is not None:  # skip rows with null keys
                # Later files overwrite earlier values for the same column.
                merged_rows[key].update(row)
        items = list(merged_rows.values())

    if tuple_mode:
        wr = csv.writer(sys.stdout, delimiter="\t", dialect="excel")
        wr.writerow(header)
    else:
        wr = csv.DictWriter(
            sys.stdout, delimiter="\t", dialect="excel", fieldnames=header
        )
        wr.writeheader()
    wr.writerows(items)
76
77
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    cli()