Mercurial > repos > jpayne > quast_select

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quast-select.xml	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,47 @@
+<tool id="quast-select" name="Select Best" version="0.1.0" profile="16.10">
+    <description>assembly based on a combined QUAST table</description>
+    <requirements>
+        <requirement type="package">python</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Every substituted value is quoted: criterion values contain spaces and shell
+        ## metacharacters, and element identifiers may contain spaces as well.
+        #for $asm in $coll
+        ln -s '$asm' './${asm.element_identifier}' &&
+        #end for
+        cp "./\$(python '$__tool_directory__/quast_select.py' '$table' '$criterion')" '$output'
+    ]]></command>
+    <inputs>
+        <param type="data" format="tsv" name="table" label="Combined QUAST output (from table_union)"/>
+        <param type="select" label="Select best assembly by..." name="criterion">
+            <option value="N50">Longest N50</option>
+            <option value="Largest contig">Longest single contig</option>
+            <option value="# contigs">Fewest contigs</option>
+            <option value="# contigs (>= 1000 bp)">Fewest contigs of length 1kbp or longer</option>
+            <option value="Total length">Total assembly length</option>
+            <option value="Total length (>= 1000 bp)">Total assembly length of contigs longer than 1kbp</option>
+        </param>
+        <param type="data_collection" collection_type="list" format="fasta" name="coll" label="Collection of FASTA assemblies" />
+    </inputs>
+    <outputs>
+        <data format="fasta" name="output" label="Best assembly by ${criterion}" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="table" value="combined_table.tsv" />
+            <param name="coll" >
+                <collection type="list">
+                    <element name="sample1" value="sample1.fasta" />
+                    <element name="sample2" value="sample2.fasta" />
+                    <element name="sample3" value="sample3.fasta" />
+                    <element name="sample4" value="sample4.fasta" />
+                </collection>
+            </param>
+            <param name="criterion" value="N50"/>
+            <output name="output" value="sample1.fasta" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        Pick the best assembly from a collection of assemblies and a combined QUAST report.
+    ]]></help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quast_select.py	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,63 @@
+from __future__ import print_function
+
+import csv
+from operator import lt, gt
+import sys
+
+def _as_number(value):
+	"""Return value as an int/float, or None when it is not numeric (e.g. '-')."""
+	if isinstance(value, (int, float)):
+		return value
+	try:
+		return float(value)
+	except (TypeError, ValueError):
+		return None
+
+def pick(rows, key, reverse=False):
+	"""Return the 'Assembly' name of the row holding the best value for key.
+
+	rows is a list of dicts, one per assembly. The smallest value wins unless
+	reverse is true, in which case the largest does; ties go to the earliest
+	row. Values that are not numeric are skipped so a placeholder is never
+	compared against a number. Raises KeyError when a row lacks key and
+	ValueError when no row has a numeric value for it.
+	"""
+	scored = [(_as_number(r[key]), r) for r in rows]
+	scored = [pair for pair in scored if pair[0] is not None]
+	if not scored:
+		raise ValueError("no numeric values found for %r" % key)
+	best = max if reverse else min
+	return best(scored, key=lambda pair: pair[0])[1]['Assembly']
+
+def int_or_str(token):
+	"""Return token as an int when it parses as one, otherwise as a str."""
+	try:
+		return int(token)
+	except ValueError:
+		return str(token)
+
+def read_assemblies(path):
+	"""Read the tab-separated table at path and return one dict per assembly column."""
+	#the "U" open flag is gone in Python 3.11, and "with" closes the handle
+	with open(path, "r") as handle:
+		#blank lines parse as empty records, which would make zip() drop every column
+		records = [rec for rec in csv.reader(handle, delimiter='\t', dialect='excel') if rec]
+	#QUAST tables have sample info as columns, so we need to transpose the table
+	rows = list(zip(*records))
+	if not rows:
+		return []
+	hed = rows.pop(0)
+	return [{h : int_or_str(r[i]) for i, h in enumerate(hed)} for r in rows]
+
+if __name__ == '__main__':
+	if len(sys.argv) != 3:
+		sys.exit("usage: quast_select.py <combined_quast_table.tsv> <criterion>")
+	path, compared = sys.argv[1:]
+	#if it's a count ("# ...") we want the fewest; otherwise it's a length and we want the longest
+	reverse = "#" not in compared
+	try:
+		print(pick(read_assemblies(path), compared, reverse))
+	except KeyError as err:
+		sys.exit("row %r not found in %s" % (err.args[0], path))
+	except ValueError as err:
+		sys.exit(str(err))
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/combined_table.tsv	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,14 @@
+Assembly	sample1	sample2	sample3	sample4
+# N's per 100 kbp	0.00	0.00	0.00	0.00
+# contigs	15	26	25	18
+# contigs (>= 0 bp)	15	26	25	18
+# contigs (>= 1000 bp)	12	17	20	13
+GC (%)	49.67	50.22	49.81	49.62
+L50	4	8	8	5
+L75	7	14	15	9
+Largest contig	9036	4811	5055	5138
+N50	4026	1934	1668	3114
+N75	3428	1371	1217	1833
+Total length	42889	42188	41537	41859
+Total length (>= 0 bp)	42889	42188	41537	41859
+Total length (>= 1000 bp)	40450	35624	37562	37621
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample1.fasta	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,2 @@
+>sample1
+AAAA
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample2.fasta	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,2 @@
+>sample2
+TTTT
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample3.fasta	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,2 @@
+>sample3
+GGGG
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample4.fasta	Wed Feb 07 16:37:42 2018 -0500
@@ -0,0 +1,2 @@
+>sample4
+CCCC
\ No newline at end of file