#!/usr/bin/env python3
"""Combine tab-delimited tables and write the result to standard output.

Tables are concatenated row-wise over the union of their columns.  If at
least one column is present in every table and holds unique, non-missing
values within each table, rows are instead joined on that column.
"""

import csv
import sys


def _identity_candidates(rows, fieldnames):
    """Return the columns of one table that could serve as a join key.

    A column qualifies when none of its values is ``None`` (which is what
    ``csv.DictReader`` stores for a short row) and no value repeats.
    """
    candidates = set()
    for field in fieldnames:
        values = [row[field] for row in rows]
        if all(value is not None for value in values) \
                and len(set(values)) == len(values):
            candidates.add(field)
    return candidates


def _merge_on_key(rows, key_column):
    """Fold rows sharing a ``key_column`` value into one row per key.

    Rows are applied in their original order, so for a column that appears
    in several tables the value from the last table wins.  The merged rows
    are returned sorted by key value.
    """
    merged = {}
    for row in rows:
        merged.setdefault(row[key_column], {}).update(row)
    return [merged[key] for key in sorted(merged)]


def main(files):
    """Read the tab-delimited ``files`` and print the combined table.

    Args:
        files: iterable of paths to tab-delimited files with a header row.

    The output header is the union of all input columns in first-seen
    order.  Cells a row has no value for are written empty.  Files with no
    header line at all are skipped.
    """
    header = []
    seen_fields = set()
    items = []
    possible_identity_headers = None
    for fi in files:
        # newline='' lets the csv module do its own line-ending handling;
        # the old 'rU' mode was removed in Python 3.11.
        with open(fi, newline='') as table:
            reader = csv.DictReader(table, dialect='excel-tab')
            rows = list(reader)
            fieldnames = reader.fieldnames
        if not fieldnames:
            # Completely empty file: it contributes neither columns nor rows.
            continue
        for field in fieldnames:
            if field not in seen_fields:
                seen_fields.add(field)
                header.append(field)
        # Winnow down the candidate join columns: they must be present in
        # every file, and unique and non-missing within each file, because
        # those are the most likely to be shared keys.
        candidates = _identity_candidates(rows, fieldnames)
        if possible_identity_headers is None:
            possible_identity_headers = candidates
        else:
            possible_identity_headers &= candidates
        items.extend(rows)

    # If we found an identity column, join the rows on it.
    if possible_identity_headers:
        # Several columns may qualify; take the first in header order so the
        # result does not depend on set iteration order.
        key_column = next(
            field for field in header if field in possible_identity_headers
        )
        items = _merge_on_key(items, key_column)

    wr = csv.DictWriter(sys.stdout, dialect='excel-tab', fieldnames=header)
    wr.writeheader()
    wr.writerows(items)


if __name__ == '__main__':
    main(sys.argv[1:])