changeset 2:8c5fb0c5e560

planemo upload
author jpayne
date Fri, 26 Jan 2018 16:38:41 -0500
parents 9c8237621723
children 77fc9c4a7ef0
files table-union.py test-data/combined.tsv test-data/dingbat.tsv test-data/loki.tsv test-data/sorted.tsv
diffstat 5 files changed, 39 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/table-union.py	Mon Jan 08 11:33:27 2018 -0500
+++ b/table-union.py	Fri Jan 26 16:38:41 2018 -0500
@@ -8,13 +8,40 @@
 def main(files):
 	header = []
 	items = []
+	possible_identity_headers = None
 	for fi in files:
 		with open(fi, 'rU') as table:
-			rows = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
-			for field in rows.fieldnames:
+			reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
+			rows = list(reader)
+			for field in reader.fieldnames:
 				if field not in set(header):
 					header.append(field)
+				#try to find identity (key) columns shared by the files, so that rows can be joined on them
+				if possible_identity_headers is None:
+					possible_identity_headers = set(reader.fieldnames)
+				#narrow the candidates to columns that are present in every file and whose values are unique and non-null within each file,
+				#because such columns are the most likely to be shared keys
+				possible_identity_headers = possible_identity_headers.intersection(filter(lambda f: len(set([r[f] for r in rows])) == len(rows) and all([r[f] is not None for r in rows]), reader.fieldnames))
 			items.extend(rows)
+	
+	# if len(possible_identity_headers) > 1:
+	# 	#if there's more than one, we need to check that joining on any one of them produces the same results
+
+	# 	#finally
+	# 	possible_identity_headers = set((possible_identity_headers.pop(), ))
+
+	#if we found an identity column, then try to join rows
+	if possible_identity_headers:
+		key_column = possible_identity_headers.pop()
+		keys = set([r[key_column] for r in items])
+		merged_rows = []
+		for key in sorted(keys):
+			new_row = {}
+			for row in filter(lambda r: r[key_column] == key, items):
+				new_row.update(row)
+			merged_rows.append(new_row)
+		items = merged_rows
+
 	wr = csv.DictWriter(sys.stdout, delimiter='\t', dialect='excel-tab', fieldnames=header)
 	wr.writeheader()
 	wr.writerows(items)
--- a/test-data/combined.tsv	Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/combined.tsv	Fri Jan 26 16:38:41 2018 -0500
@@ -1,3 +1,4 @@
-name	flavor	color	size
-Dingbat	strawberry	red	
-Loki	chocolate		massive
+name	flavor	size	color
+Dingbat	strawberry		red
+Dunston	strawberry	massive	blue
+Loki	chocolate	massive	
--- a/test-data/dingbat.tsv	Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/dingbat.tsv	Fri Jan 26 16:38:41 2018 -0500
@@ -1,2 +1,3 @@
 name	flavor	color
 Dingbat	strawberry	red
+Dunston	strawberry	blue
\ No newline at end of file
--- a/test-data/loki.tsv	Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/loki.tsv	Fri Jan 26 16:38:41 2018 -0500
@@ -1,2 +1,3 @@
 name	flavor	size
 Loki	chocolate	massive
+Dunston	strawberry	massive
--- a/test-data/sorted.tsv	Mon Jan 08 11:33:27 2018 -0500
+++ b/test-data/sorted.tsv	Fri Jan 26 16:38:41 2018 -0500
@@ -1,3 +1,4 @@
-name	flavor	color	size
-Loki	chocolate		massive
-Dingbat	strawberry	red	
+name	flavor	size	color
+Loki	chocolate	massive	
+Dingbat	strawberry		red
+Dunston	strawberry	massive	blue