Mercurial > repos > jpayne > table_ops
comparison table-union.py @ 14:1af2524f48b7 tip
planemo upload
author | jpayne |
---|---|
date | Tue, 30 Oct 2018 16:18:02 -0400 |
parents | 2949b8929037 |
children |
comparison
equal
deleted
inserted
replaced
13:746091a78780 | 14:1af2524f48b7 |
---|---|
2 import csv | 2 import csv |
3 import sys | 3 import sys |
4 | 4 |
5 | 5 |
6 | 6 |
7 def main(files): | 7 def main(unionize=True, *files): |
8 header = [] | 8 header = [] |
9 items = [] | 9 items = [] |
10 possible_identity_headers = None | 10 possible_identity_headers = None |
11 for fi in files: | 11 for fi in files: |
12 with open(fi, 'rU') as table: | 12 with open(fi, 'rU') as table: |
13 reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab') | 13 reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab') |
14 rows = list(reader) | 14 rows = list(reader) |
15 for field in reader.fieldnames: | 15 for field in reader.fieldnames: |
16 if field not in set(header): | 16 if field not in set(header): |
17 header.append(field) | 17 header.append(field) |
18 | |
19 | |
18 #try to find identity columns in the files, to use to join | 20 #try to find identity columns in the files, to use to join |
19 if possible_identity_headers is None: | 21 if possible_identity_headers is None: |
20 possible_identity_headers = set(reader.fieldnames) | 22 possible_identity_headers = set(reader.fieldnames) |
21 #winnow down the shared columns in each file by whether they're present in all, and all their values are unique in each file and not null | 23 #winnow down the shared columns in each file by whether they're present in all, and all their values are unique in each file and not null |
22 #because these are the most likely to be shared keys | 24 #because these are the most likely to be shared keys |
28 | 30 |
29 # #finally | 31 # #finally |
30 # possible_identity_headers = set((possible_identity_headers.pop(), )) | 32 # possible_identity_headers = set((possible_identity_headers.pop(), )) |
31 | 33 |
32 #if we found an identity column, then try to join rows | 34 #if we found an identity column, then try to join rows |
33 if possible_identity_headers: | 35 if possible_identity_headers and unionize: |
34 key_column = possible_identity_headers.pop() | 36 key_column = possible_identity_headers.pop() |
35 keys = set([r[key_column] for r in items]) | 37 keys = set([r[key_column] for r in items]) |
36 merged_rows = [] | 38 merged_rows = [] |
37 for key in sorted(keys): | 39 for key in sorted(keys): |
38 new_row = {} | 40 new_row = {} |
45 wr.writeheader() | 47 wr.writeheader() |
46 wr.writerows(items) | 48 wr.writerows(items) |
47 | 49 |
48 | 50 |
49 if __name__ == '__main__': | 51 if __name__ == '__main__': |
50 main(sys.argv[1:]) | 52 main(*sys.argv[1:]) |