Mercurial > repos > jpayne > table_ops
changeset 2:8c5fb0c5e560
planemo upload
author | jpayne |
---|---|
date | Fri, 26 Jan 2018 16:38:41 -0500 |
parents | 9c8237621723 |
children | 77fc9c4a7ef0 |
files | table-union.py test-data/combined.tsv test-data/dingbat.tsv test-data/loki.tsv test-data/sorted.tsv |
diffstat | 5 files changed, 39 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/table-union.py Mon Jan 08 11:33:27 2018 -0500
+++ b/table-union.py Fri Jan 26 16:38:41 2018 -0500
@@ -8,13 +8,40 @@
 def main(files):
     header = []
     items = []
+    possible_identity_headers = None
     for fi in files:
         with open(fi, 'rU') as table:
-            rows = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
-            for field in rows.fieldnames:
+            reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
+            rows = list(reader)
+            for field in reader.fieldnames:
                 if field not in set(header):
                     header.append(field)
+            #try to find identity columns in the files, to use to join
+            if possible_identity_headers is None:
+                possible_identity_headers = set(reader.fieldnames)
+            #winnow down the shared columns in each file by whether they're present in all, and all their values are unique in each file and not null
+            #because these are the most likely to be shared keys
+            possible_identity_headers = possible_identity_headers.intersection(filter(lambda f: len(set([r[f] for r in rows])) == len(rows) and all([r[f] is not None for r in rows]), reader.fieldnames))
             items.extend(rows)
+
+    # if len(possible_identity_headers) > 1:
+    # #if there's more than one, we need to check that joining on any one of them produces the same results
+
+    # #finally
+    # possible_identity_headers = set((possible_identity_headers.pop(), ))
+
+    #if we found an identity column, then try to join rows
+    if possible_identity_headers:
+        key_column = possible_identity_headers.pop()
+        keys = set([r[key_column] for r in items])
+        merged_rows = []
+        for key in sorted(keys):
+            new_row = {}
+            for row in filter(lambda r: r[key_column] == key, items):
+                new_row.update(row)
+            merged_rows.append(new_row)
+        items = merged_rows
+
     wr = csv.DictWriter(sys.stdout, delimiter='\t', dialect='excel-tab', fieldnames=header)
     wr.writeheader()
     wr.writerows(items)
--- a/test-data/combined.tsv Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/combined.tsv Fri Jan 26 16:38:41 2018 -0500
@@ -1,3 +1,4 @@
-name flavor color size
-Dingbat strawberry red
-Loki chocolate massive
+name flavor size color
+Dingbat strawberry red
+Dunston strawberry massive blue
+Loki chocolate massive
--- a/test-data/dingbat.tsv Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/dingbat.tsv Fri Jan 26 16:38:41 2018 -0500
@@ -1,2 +1,3 @@
 name flavor color
 Dingbat strawberry red
+Dunston strawberry blue
\ No newline at end of file
--- a/test-data/loki.tsv Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/loki.tsv Fri Jan 26 16:38:41 2018 -0500
@@ -1,2 +1,3 @@
 name flavor size
 Loki chocolate massive
+Dunston strawberry massive
--- a/test-data/sorted.tsv Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/sorted.tsv Fri Jan 26 16:38:41 2018 -0500
@@ -1,3 +1,4 @@
-name flavor color size
-Loki chocolate massive
-Dingbat strawberry red
+name flavor size color
+Loki chocolate massive
+Dingbat strawberry red
+Dunston strawberry massive blue