comparison table-union.py @ 2:8c5fb0c5e560

planemo upload
author jpayne
date Fri, 26 Jan 2018 16:38:41 -0500
parents f1f2497301d3
children 4eaafbdfb8bf
comparison
equal deleted inserted replaced
1:9c8237621723 2:8c5fb0c5e560
6 6
7 7
def _identity_columns(rows, fieldnames):
    """Return the columns of *fieldnames* that could serve as a join key.

    A column qualifies when, across *rows*, none of its values is None and
    no value occurs more than once. With no rows every column qualifies.
    """
    row_count = len(rows)
    candidates = set()
    for field in fieldnames:
        values = [row[field] for row in rows]
        if len(set(values)) == row_count and all(v is not None for v in values):
            candidates.add(field)
    return candidates


def main(files):
    """Union several tab-delimited tables and write the result to stdout.

    The output header is the union of all input headers, in order of first
    appearance. If at least one column is present in every file and its
    values are unique and non-None within each file, the first such column
    (in header order) is used as a key: rows sharing a key value are merged
    into one row (later files overwrite earlier values for shared columns)
    and the merged rows are emitted sorted by key. Otherwise the rows of all
    files are simply concatenated.

    Args:
        files: iterable of paths to tab-delimited files with a header row.
            Completely empty files (no header line) are skipped.
    """
    header = []
    seen_fields = set()  # mirrors `header` for O(1) membership tests
    items = []
    possible_identity_headers = None
    for fi in files:
        # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11,
        # and text mode already applies universal newlines on Python 3.
        with open(fi, 'r') as table:
            reader = csv.DictReader(table, dialect='excel-tab')
            rows = list(reader)
            # Read while the file is still open; the property may touch it.
            fieldnames = reader.fieldnames
        if fieldnames is None:
            # Empty file: no header and no rows, nothing to contribute.
            continue
        for field in fieldnames:
            if field not in seen_fields:
                seen_fields.add(field)
                header.append(field)
        # Try to find identity columns in the files, to use for the join.
        if possible_identity_headers is None:
            possible_identity_headers = set(fieldnames)
        # Winnow the shared columns down to those present in every file whose
        # values are unique and not None in each file; these are the most
        # likely to be shared keys.
        possible_identity_headers &= _identity_columns(rows, fieldnames)
        items.extend(rows)

    # If we found an identity column, join the rows on it.
    if possible_identity_headers:
        # Pick the first candidate in header order so the output is
        # reproducible (set.pop() would return an arbitrary candidate).
        key_column = next(f for f in header if f in possible_identity_headers)
        merged_rows = {}
        for row in items:
            # Single pass: fold each row into the merged row for its key.
            merged_rows.setdefault(row[key_column], {}).update(row)
        items = [merged_rows[key] for key in sorted(merged_rows)]

    wr = csv.DictWriter(sys.stdout, delimiter='\t', dialect='excel-tab', fieldnames=header)
    wr.writeheader()
    wr.writerows(items)
21 48
22 49