Mercurial > repos > jpayne > table_ops
comparison table-union.py @ 2:8c5fb0c5e560
planemo upload
author | jpayne |
---|---|
date | Fri, 26 Jan 2018 16:38:41 -0500 |
parents | f1f2497301d3 |
children | 4eaafbdfb8bf |
comparison
equal
deleted
inserted
replaced
1:9c8237621723 | 2:8c5fb0c5e560 |
---|---|
6 | 6 |
7 | 7 |
def main(files):
    """Union tab-separated tables read from *files* and write the result to stdout.

    All input rows are concatenated and the output header is the union of all
    input headers (first-seen order).  If a column can be identified that looks
    like a shared key — present in every file, with unique, non-null values
    within each file — rows sharing the same key value are merged into one.

    files -- iterable of paths to TSV files, each with a header row.
    Writes via csv.DictWriter to sys.stdout; returns None.
    """
    header = []   # union of all column names, in first-seen order
    items = []    # every row from every file, as dicts
    # Join-key candidates, winnowed file by file; None until the first
    # non-empty file is seen.
    possible_identity_headers = None
    for fi in files:
        # newline='' is what the csv module requires for input files; the old
        # 'rU' mode was deprecated and removed in Python 3.11.
        with open(fi, newline='') as table:
            reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
            rows = list(reader)
            if reader.fieldnames is None:
                continue  # empty file: no header, nothing to union
            for field in reader.fieldnames:
                # header stays small; plain list membership beats rebuilding
                # a set per field as the original did
                if field not in header:
                    header.append(field)
            # Try to find identity columns to join on: keep only columns whose
            # values are non-null and unique within this file, because these
            # are the most likely to be shared keys.
            unique_here = [
                f for f in reader.fieldnames
                if all(r[f] is not None for r in rows)
                and len({r[f] for r in rows}) == len(rows)
            ]
            if possible_identity_headers is None:
                possible_identity_headers = set(reader.fieldnames)
            possible_identity_headers = possible_identity_headers.intersection(unique_here)
            items.extend(rows)

    # If we found an identity column, merge rows that share a key value.
    if possible_identity_headers:
        # min() instead of set.pop(): deterministic choice when more than one
        # candidate survives (pop() returns an arbitrary element).
        key_column = min(possible_identity_headers)
        # Single grouping pass (the original re-scanned items once per key);
        # later rows with the same key overwrite earlier fields, as before.
        groups = {}
        for row in items:
            groups.setdefault(row[key_column], {}).update(row)
        items = [groups[key] for key in sorted(groups)]

    wr = csv.DictWriter(sys.stdout, delimiter='\t', dialect='excel-tab', fieldnames=header)
    wr.writeheader()
    wr.writerows(items)
21 | 48 |
22 | 49 |