annotate table-union.py @ 11:2949b8929037

planemo upload
author jpayne
date Wed, 31 Jan 2018 09:47:10 -0500
parents 4eaafbdfb8bf
children 1af2524f48b7
rev   line source
jpayne@11 1 #! /usr/bin/env python
jpayne@0 2 import csv
jpayne@0 3 import sys
jpayne@0 4
jpayne@0 5
jpayne@0 6
def _identity_candidates(fieldnames, rows):
    """Return the columns of one table whose values could act as a row key.

    A column qualifies when no row lacks a value for it (``csv.DictReader``
    stores ``None`` for cells missing from a short row) and no value repeats
    within the table.
    """
    candidates = set()
    for field in fieldnames:
        values = [row[field] for row in rows]
        if None not in values and len(set(values)) == len(values):
            candidates.add(field)
    return candidates


def _merge_rows_by_key(rows, key_column):
    """Collapse rows sharing a *key_column* value into one row per key.

    Rows are folded in input order, so when several rows provide the same
    column the value from the latest row wins. The result is ordered by the
    sorted key values.
    """
    merged = {}
    for row in rows:
        merged.setdefault(row[key_column], {}).update(row)
    return [merged[key] for key in sorted(merged)]


def main(files):
    """Write the union of several tab-delimited tables to standard output.

    The output header is the union of all input columns, in order of first
    appearance. If at least one column is present in every table and holds
    unique, non-missing values within each of them, rows from different
    tables are joined on that column (one output row per key, sorted by key).
    Otherwise the rows are simply concatenated; cells a row has no value for
    are written empty.

    Args:
        files: iterable of paths to tab-delimited files with a header row.
            Completely empty files (no header line at all) are skipped.
    """
    header = []
    seen_fields = set()
    items = []
    possible_identity_headers = None
    for fi in files:
        # newline='' is the mode the csv module asks for; the former 'rU'
        # mode is rejected with ValueError on Python 3.11 and later.
        with open(fi, newline='') as table:
            reader = csv.DictReader(table, dialect='excel-tab')
            rows = list(reader)
            # read while the file is still open: the property may touch it
            fieldnames = reader.fieldnames
        if fieldnames is None:
            # a zero-byte file has no columns and no rows; it should neither
            # crash the run nor veto a join between the remaining tables
            continue
        for field in fieldnames:
            if field not in seen_fields:
                seen_fields.add(field)
                header.append(field)
        # try to find identity columns in the files, to use to join: keep only
        # the columns that are key-like in every table seen so far, because
        # these are the most likely to be shared keys
        candidates = _identity_candidates(fieldnames, rows)
        if possible_identity_headers is None:
            possible_identity_headers = candidates
        else:
            possible_identity_headers &= candidates
        items.extend(rows)

    # if we found an identity column, then try to join rows
    if possible_identity_headers:
        # NOTE: when several columns qualify, nothing checks that joining on
        # each of them gives the same result. Take the first one in header
        # order so the choice (and the output) is at least reproducible,
        # unlike an arbitrary set.pop().
        key_column = next(f for f in header if f in possible_identity_headers)
        items = _merge_rows_by_key(items, key_column)

    wr = csv.DictWriter(sys.stdout, dialect='excel-tab', fieldnames=header)
    wr.writeheader()
    wr.writerows(items)
jpayne@0 47
jpayne@0 48
if __name__ == '__main__':
    # Every command-line argument is passed through as an input table path.
    table_paths = sys.argv[1:]
    main(table_paths)