comparison table-union.py @ 14:1af2524f48b7 tip

planemo upload
author jpayne
date Tue, 30 Oct 2018 16:18:02 -0400
parents 2949b8929037
children
comparison
Legend: equal | deleted | inserted | replaced
13:746091a78780 14:1af2524f48b7
2 import csv 2 import csv
3 import sys 3 import sys
4 4
5 5
6 6
7 def main(files): 7 def main(unionize=True, *files):
8 header = [] 8 header = []
9 items = [] 9 items = []
10 possible_identity_headers = None 10 possible_identity_headers = None
11 for fi in files: 11 for fi in files:
12 with open(fi, 'rU') as table: 12 with open(fi, 'rU') as table:
13 reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab') 13 reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
14 rows = list(reader) 14 rows = list(reader)
15 for field in reader.fieldnames: 15 for field in reader.fieldnames:
16 if field not in set(header): 16 if field not in set(header):
17 header.append(field) 17 header.append(field)
18
19
18 #try to find identity columns in the files, to use to join 20 #try to find identity columns in the files, to use to join
19 if possible_identity_headers is None: 21 if possible_identity_headers is None:
20 possible_identity_headers = set(reader.fieldnames) 22 possible_identity_headers = set(reader.fieldnames)
21 #winnow down the shared columns in each file by whether they're present in all, and all their values are unique in each file and not null 23 #winnow down the shared columns in each file by whether they're present in all, and all their values are unique in each file and not null
22 #because these are the most likely to be shared keys 24 #because these are the most likely to be shared keys
28 30
29 # #finally 31 # #finally
30 # possible_identity_headers = set((possible_identity_headers.pop(), )) 32 # possible_identity_headers = set((possible_identity_headers.pop(), ))
31 33
32 #if we found an identity column, then try to join rows 34 #if we found an identity column, then try to join rows
33 if possible_identity_headers: 35 if possible_identity_headers and unionize:
34 key_column = possible_identity_headers.pop() 36 key_column = possible_identity_headers.pop()
35 keys = set([r[key_column] for r in items]) 37 keys = set([r[key_column] for r in items])
36 merged_rows = [] 38 merged_rows = []
37 for key in sorted(keys): 39 for key in sorted(keys):
38 new_row = {} 40 new_row = {}
45 wr.writeheader() 47 wr.writeheader()
46 wr.writerows(items) 48 wr.writerows(items)
47 49
48 50
49 if __name__ == '__main__': 51 if __name__ == '__main__':
50 main(sys.argv[1:]) 52 main(*sys.argv[1:])