annotate table-union.py @ 0:402b58f45844 draft default tip

planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
author jpayne
date Mon, 08 Dec 2025 15:03:06 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
1 #! /usr/bin/env python
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
2 import csv
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
3 import sys
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
4 from collections import defaultdict
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
5
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
6 import click
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
7
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
8 @click.command()
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
9 @click.option("--unionize/--no-unionize", default=False, help="Attempt to unionize on an autodetected key column", is_flag=True)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
10 @click.option("--tuple/--no-tuple", "tuple_mode", default=False, help="For tables with inconsistent headers - unionize by column order instead of column label")
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
11 @click.argument("files", nargs=-1, type=click.Path(exists=True))
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
12 def cli(files, unionize=False, tuple_mode=False):
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
13 header = []
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
14 items = []
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
15 possible_identity_headers = None
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
16
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
17 for fi in files:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
18 with open(
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
19 fi, "r", newline="", encoding="utf-8"
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
20 ) as table: # Improved file opening
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
21 if not tuple_mode:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
22 reader = csv.DictReader(table, delimiter="\t", dialect="excel")
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
23
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
24 # Efficient header update using set operations
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
25 header_set = set(header)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
26 new_headers = [
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
27 field for field in reader.fieldnames if field not in header_set
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
28 ]
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
29 header.extend(new_headers)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
30
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
31 rows = list(reader) # Keep this for now, but see optimization below
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
32 if not rows: # skip empty files
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
33 continue
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
34
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
35 if unionize:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
36 # More efficient identity header detection
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
37 if possible_identity_headers is None:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
38 possible_identity_headers = set(reader.fieldnames)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
39
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
40 # Optimized identity header filtering
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
41 possible_identity_headers.intersection_update(
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
42 f
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
43 for f in reader.fieldnames
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
44 if len({row[f] for row in rows if f in row}) == len(rows)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
45 and all(row.get(f) is not None for row in rows)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
46 )
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
47 items.extend(rows)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
48 else:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
49 reader = csv.reader(table, delimiter="\t", dialect="excel")
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
50 if not header:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
51 header = next(reader)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
52 else:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
53 next(reader) # skip header in subsequent files
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
54 items.extend(reader)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
55
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
56
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
57 if possible_identity_headers and unionize and not tuple_mode:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
58 key_column = possible_identity_headers.pop()
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
59 # More efficient merging using defaultdict
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
60 merged_rows = defaultdict(dict)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
61 for row in items:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
62 key = row.get(key_column)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
63 if key is not None: # skip rows with null keys
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
64 merged_rows[key].update(row)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
65 items = list(merged_rows.values())
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
66
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
67 if not tuple_mode:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
68 wr = csv.DictWriter(
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
69 sys.stdout, delimiter="\t", dialect="excel", fieldnames=header
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
70 )
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
71 wr.writeheader()
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
72 else:
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
73 wr = csv.writer(sys.stdout, delimiter="\t", dialect="excel")
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
74 wr.writerow(header)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
75 wr.writerows(items)
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
76
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
77
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
78 if __name__ == "__main__":
402b58f45844 planemo upload commit 9cc4dc1db55299bf92ec6bd359161ece4592bd16-dirty
jpayne
parents:
diff changeset
79 cli()