Mercurial > repos > jpayne > tableops
annotate table-union.py @ 0:402b58f45844 draft default tip
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 15:03:06 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
1 #! /usr/bin/env python |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
2 import csv |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
3 import sys |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
4 from collections import defaultdict |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
5 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
6 import click |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
7 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
8 @click.command() |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
9 @click.option("--unionize/--no-unionize", default=False, help="Attempt to unionize on an autodetected key column", is_flag=True) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
10 @click.option("--tuple/--no-tuple", "tuple_mode", default=False, help="For tables with inconsistent headers - unionize by column order instead of column label") |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
11 @click.argument("files", nargs=-1, type=click.Path(exists=True)) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
12 def cli(files, unionize=False, tuple_mode=False): |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
13 header = [] |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
14 items = [] |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
15 possible_identity_headers = None |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
16 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
17 for fi in files: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
18 with open( |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
19 fi, "r", newline="", encoding="utf-8" |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
20 ) as table: # Improved file opening |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
21 if not tuple_mode: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
22 reader = csv.DictReader(table, delimiter="\t", dialect="excel") |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
23 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
24 # Efficient header update using set operations |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
25 header_set = set(header) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
26 new_headers = [ |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
27 field for field in reader.fieldnames if field not in header_set |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
28 ] |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
29 header.extend(new_headers) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
30 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
31 rows = list(reader) # Keep this for now, but see optimization below |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
32 if not rows: # skip empty files |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
33 continue |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
34 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
35 if unionize: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
36 # More efficient identity header detection |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
37 if possible_identity_headers is None: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
38 possible_identity_headers = set(reader.fieldnames) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
39 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
40 # Optimized identity header filtering |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
41 possible_identity_headers.intersection_update( |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
42 f |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
43 for f in reader.fieldnames |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
44 if len({row[f] for row in rows if f in row}) == len(rows) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
45 and all(row.get(f) is not None for row in rows) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
46 ) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
47 items.extend(rows) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
48 else: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
49 reader = csv.reader(table, delimiter="\t", dialect="excel") |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
50 if not header: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
51 header = next(reader) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
52 else: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
53 next(reader) # skip header in subsequent files |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
54 items.extend(reader) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
55 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
56 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
57 if possible_identity_headers and unionize and not tuple_mode: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
58 key_column = possible_identity_headers.pop() |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
59 # More efficient merging using defaultdict |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
60 merged_rows = defaultdict(dict) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
61 for row in items: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
62 key = row.get(key_column) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
63 if key is not None: # skip rows with null keys |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
64 merged_rows[key].update(row) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
65 items = list(merged_rows.values()) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
66 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
67 if not tuple_mode: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
68 wr = csv.DictWriter( |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
69 sys.stdout, delimiter="\t", dialect="excel", fieldnames=header |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
70 ) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
71 wr.writeheader() |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
72 else: |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
73 wr = csv.writer(sys.stdout, delimiter="\t", dialect="excel") |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
74 wr.writerow(header) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
75 wr.writerows(items) |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
76 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
77 |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
78 if __name__ == "__main__": |
|
402b58f45844
planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff
changeset
|
79 cli() |
