jpayne@11
|
1 #! /usr/bin/env python
|
jpayne@0
|
2 import csv
|
jpayne@0
|
3 import sys
|
jpayne@0
|
4
|
jpayne@0
|
5
|
jpayne@0
|
6
|
jpayne@14
|
def _open_table(path):
    """Open a tab-delimited table for reading with the csv module."""
    if sys.version_info[0] < 3:
        # Python 2 has no `newline` argument; universal-newline mode is how
        # \r, \n and \r\n line endings are all accepted there.
        return open(path, 'rU')
    # The 'U' mode flag was removed in Python 3.11 (ValueError). newline=''
    # passes line endings through untranslated so the csv module handles them.
    return open(path, 'r', newline='')


def _is_identity_column(field, rows):
    """Return True if every row has a non-None, distinct value for *field*."""
    values = [row[field] for row in rows]
    return (all(value is not None for value in values)
            and len(set(values)) == len(values))


def main(unionize=True, *files):
    """Combine tab-delimited tables and write the result to standard output.

    The output header is the union of all input columns, in first-seen order.
    Rows from every file are concatenated. If ``unionize`` is truthy and at
    least one column is present in every (non-empty) file with unique,
    non-missing values within each file, that column is used as a join key:
    rows sharing a key are merged into a single row and the merged rows are
    emitted sorted by key.

    Args:
        unionize: When truthy, attempt the key-based merge described above.
            Only truthiness is tested, so any non-empty string enables it.
        *files: Paths of the tab-delimited tables to combine. Files with no
            header line (i.e. empty files) are skipped.

    Output is written to ``sys.stdout`` using the csv ``excel-tab`` dialect.
    """
    header = []
    seen_fields = set()  # membership mirror of `header`, avoids rebuilding a set per field
    items = []
    possible_identity_headers = None
    for fi in files:
        with _open_table(fi) as table:
            reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
            rows = list(reader)
            # Read while the file is still open: for an empty file the
            # property would otherwise try to pull a line from a closed file.
            fieldnames = reader.fieldnames
        if not fieldnames:
            # Empty file: no columns and no rows to contribute.
            continue
        for field in fieldnames:
            if field not in seen_fields:
                seen_fields.add(field)
                header.append(field)

        # Try to find identity columns in the files, to use to join.
        if possible_identity_headers is None:
            possible_identity_headers = set(fieldnames)
        # Winnow down the shared columns by whether they're present in all
        # files and their values are unique and not null within each file,
        # because these are the most likely to be shared keys.
        possible_identity_headers &= {
            field for field in fieldnames if _is_identity_column(field, rows)
        }
        items.extend(rows)

    # If we found an identity column, then try to join rows.
    if possible_identity_headers and unionize:
        # Pick the candidate that appears first in the header rather than
        # set.pop(), whose choice is arbitrary when several columns qualify.
        key_column = next(
            field for field in header if field in possible_identity_headers
        )
        # Group in a single pass; later rows overwrite earlier values for the
        # same column, in input order.
        merged = {}
        for row in items:
            merged.setdefault(row[key_column], {}).update(row)
        items = [merged[key] for key in sorted(merged)]

    wr = csv.DictWriter(sys.stdout, delimiter='\t', dialect='excel-tab', fieldnames=header)
    wr.writeheader()
    wr.writerows(items)
|
jpayne@0
|
49
|
jpayne@0
|
50
|
jpayne@0
|
if __name__ == '__main__':
    # Script entry point: forward every command-line argument positionally.
    # main's first parameter is `unionize`, so the first argument binds to it
    # and only the remaining arguments are treated as input files.
    # NOTE(review): presumably callers pass a flag first -- confirm.
    cli_args = sys.argv[1:]
    main(*cli_args)