table_ops: table-union.py annotate

annotate table-union.py @ 4:4a5c79572664

planemo upload

author	jpayne
date	Mon, 29 Jan 2018 16:17:35 -0500
parents	8c5fb0c5e560
children	4eaafbdfb8bf

rev	line source
jpayne@0	1 #!/usr/bin/env python3
jpayne@0	2
jpayne@0	3 import csv
jpayne@0	4 import sys
jpayne@0	5
jpayne@0	6
jpayne@0	7
def _is_identity_column(field, rows):
    """Return True if *field* is non-null and unique across *rows*."""
    values = [row[field] for row in rows]
    return all(value is not None for value in values) and len(set(values)) == len(values)


def main(files):
    """Union several tab-separated tables and write the result to stdout.

    The output header is the union of all input headers, in order of first
    appearance. If at least one column is present in every (non-empty) input
    file and its values are unique and non-null within each file, rows that
    share a value in that column are merged into a single output row (later
    files overwrite earlier ones on conflicting fields), and the merged rows
    are emitted sorted by that key. Otherwise the rows are simply
    concatenated, with fields absent from a row left blank.

    Args:
        files: iterable of paths to tab-separated files that have a header row.

    Raises:
        OSError: if an input file cannot be opened.
        ValueError: raised by csv.DictWriter if a data row has more fields
            than its file's header.
    """
    header = []
    seen_fields = set()  # mirror of `header` for O(1) membership tests
    items = []
    possible_identity_headers = None
    for fi in files:
        # newline='' is what the csv module expects; the former 'rU' mode
        # was removed in Python 3.11 and makes open() raise ValueError.
        with open(fi, newline='') as table:
            reader = csv.DictReader(table, dialect='excel-tab')
            rows = list(reader)
            fieldnames = reader.fieldnames
        if fieldnames is None:
            # Completely empty file: no header and no rows, nothing to add.
            continue
        for field in fieldnames:
            if field not in seen_fields:
                seen_fields.add(field)
                header.append(field)
        # Try to find identity columns in the files, to use to join.
        if possible_identity_headers is None:
            possible_identity_headers = set(fieldnames)
        # Winnow down the shared columns by whether they are present in every
        # file and whether their values are unique and non-null in each file,
        # because these are the most likely to be shared keys.
        possible_identity_headers &= {
            field for field in fieldnames if _is_identity_column(field, rows)
        }
        items.extend(rows)

    # If we found an identity column, join rows on it.
    if possible_identity_headers:
        # Pick the first candidate in header order so the choice is
        # deterministic (set.pop() would pick an arbitrary one).
        # NOTE: when several candidates exist, nothing checks that joining on
        # each of them would give the same result.
        key_column = next(
            field for field in header if field in possible_identity_headers
        )
        merged_by_key = {}
        for row in items:
            merged_by_key.setdefault(row[key_column], {}).update(row)
        items = [merged_by_key[key] for key in sorted(merged_by_key)]

    wr = csv.DictWriter(sys.stdout, dialect='excel-tab', fieldnames=header)
    wr.writeheader()
    wr.writerows(items)
jpayne@0	48
jpayne@0	49
# Script entry point: every command-line argument is taken as the path of an
# input table, and the combined table is written to stdout.
if __name__ == '__main__':
    input_paths = sys.argv[1:]
    main(input_paths)

Mercurial > repos > jpayne > table_ops

annotate table-union.py @ 4:4a5c79572664