Mercurial > repos > jpayne > bioproject_to_srr_2

--- a/bio2srr.py	Thu Dec 20 11:04:53 2018 -0500
+++ b/bio2srr.py	Mon Aug 03 10:39:37 2020 -0400
@@ -4,8 +4,10 @@

 import requests
 import sys
+from xml.etree import ElementTree as xml
+import csv

-sra_exp_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
+sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

 sample = """{
     "hitCount": 2,
@@ -22,7 +24,11 @@
     "facets": []
 }"""

-sra_run_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={experiment}"
+data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"
+
+xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"
+
+sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"

 sample = """{
     "hitCount": 1,
@@ -36,23 +42,55 @@
 }"""

 if __name__ == "__main__":
-	try:
-		bioproject = sys.argv[1]
-		b_result = requests.get(sra_exp_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
-		b_result.raise_for_status()
-		if b_result.json()['entries']:
-			for experiment in [d['id'] for d in b_result.json()['entries']]:
-				r_result = requests.get(sra_run_query.format(experiment=experiment), headers=dict(Accept="application/json"))
-				r_result.raise_for_status()
-				for run in [d['id'] for d in r_result.json()['entries']]:
-					print(run)
-		else:
-			print(f"No results found for '{bioproject}'.", file=sys.stderr)
-			quit(1)
-	except IndexError:
-		raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
-	except KeyError as e:
-		raise ValueError() from e
+    try:  # resolve the accession given as argv[1] to the ids of its runs
+        bioproject = sys.argv[1]
+        b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
+        b_result.raise_for_status()
+        runs = [d['id'] for d in b_result.json()['entries']]
+        if not runs:
+            print(f"No results found for '{bioproject}'.", file=sys.stderr)
+            sys.exit(1)  # sys.exit, not the interactive-only quit() helper
+    except IndexError:
+        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
+    except KeyError as e:
+        raise ValueError() from e

+    try:  # optional argv[2]: TSV of caller-supplied records to merge in
+        with open(sys.argv[2], 'r', newline='') as f:
+            rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
+            rcds = list(rdr)


+    except IndexError:
+        rcds = []
+    # one xref request covers all runs; the first sra-sample reference is used
+    res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
+    res.raise_for_status()
+    bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
+
+    for run_id, sam_id in bsams:
+        record = {}
+        record['sample'] = run_id
+        record['biosample_accession'] = sam_id
+        res = requests.get(data_query.format(accession=sam_id))
+        res.raise_for_status()
+        root = xml.fromstring(res.text)
+        for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
+            key = attr.findtext('TAG')  # by name: child count varies (VALUE/UNITS optional)
+            record[key] = attr.findtext('VALUE', default='')
+        rcds.append(record)
+        print(run_id)
+
+    headers = {}
+    for record in rcds:
+        for key in record.keys():
+            headers[key] = None # use a dict to preserve header order
+
+    # newline='' leaves line endings to the csv writer, as the csv docs require
+    with open('./metadata.tsv', 'w', newline='') as f:
+        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
+        wtr.writeheader()
+        wtr.writerows(rcds)
+
+
+
--- a/bio2srr.xml	Thu Dec 20 11:04:53 2018 -0500
+++ b/bio2srr.xml	Mon Aug 03 10:39:37 2020 -0400
@@ -1,26 +1,35 @@
 <tool id="bio2srr" name="Bioproject to SRR" version="0.1.0">
 	<description>Retrieve SRR accessions from BioProject or BioSample.</description>
     <requirements>
-        <requirement type="package">package_python_3_4</requirement>
-    	<requirement type="package" version="2.18.4">requests</requirement>
+        <requirement type="package" version="3.8">python</requirement>
+    	<requirement type="package" version="2.24.0">requests</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
+        #if $src_metadata
+        $__tool_directory__/bio2srr.py "$input1" $src_metadata > "$output"
+        #else
         $__tool_directory__/bio2srr.py "$input1" > "$output"
+        #end if
     ]]></command>
     <inputs>
         <param type="text" name="input1" label="BioProject or BioSample" />
+        <param type="data" format="tabular" name="src_metadata" label="Table of your own metadata" optional="true" help="Optional metadata table to be folded in." />
     </inputs>
     <outputs>
-    	<data format="txt" name="output" />
+    	<data format="txt" name="output" label="SRR Accession List" />
+        <data format="tabular" name="metadata" from_work_dir="metadata.tsv" label="Sample Metadata Table" />
     </outputs>
     <tests>
     	<test>
     		<param name="input1" value="NOTHING" />
     		<output name="output" file="test.txt" />
+            <output name="metadata" file="metadata.tsv" />
     	</test>
     </tests>
     <help><![CDATA[
-        Retrieve SRR accessions by BioSample or BioProject.
+        Retrieve SRR accessions and metadata from a BioProject.
+
+        If you have metadata for your own samples (that aren't from the BioProject) you can provide a TSV table containing it, and the two tables will be folded together. (It's a union, not a join.) Put sample IDs under the header 'sample'.
     ]]></help>
     <citations>
     </citations>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/metadata.tsv	Mon Aug 03 10:39:37 2020 -0400
@@ -0,0 +1,3 @@
+sample	biosample_accession	isolation_source	collection_date	geo_loc_name	lat_lon	rel_to_oxygen	samp_collect_device	samp_mat_process	samp_size	source_material_id	BioSampleModel	ENA-SPOT-COUNT	ENA-BASE-COUNT	ENA-FIRST-PUBLIC	ENA-LAST-UPDATE
+SRR11671300	SAMN14820590	marine sediment	Not applicable	North Sea: German Bight (Helgoland Mud Area)	54.052300 N 7.580400 E	obligate anaerobe	anoxic sampling of sediment slurry incubations at defined time points	Previously collected sediments from Helgoland mud area, stored at 4ºC (near in-situ temperature) until use as starting material for incubation experiments. For the incubation experiments, anaerobic slurries were prepared in ratio 1:4 and incubated at 30ºC. Sample name identifies the unique information of each sample including the target 16S rRNA gene, incubation timepoint and state of the enrichment.	1 ml slurry in triplicates, Pooled DNA from triplicates were sequenced as 1 sample.	sediment incubation at 30ºC without any amendment sampled after 105 days sequenced with bacteria PCR primers	Metagenome or environmental	41855	6160440	2020-05-04	2020-05-04
+SRR11671283	SAMN14820597	marine sediment	Not applicable	North Sea: German Bight (Helgoland Mud Area)	54.052300 N 7.580400 E	obligate anaerobe	anoxic sampling of sediment slurry incubations at defined time points	Previously collected sediments from Helgoland mud area, stored at 4ºC (near in-situ temperature) until use as starting material for incubation experiments. For the incubation experiments, anaerobic slurries were prepared in ratio 1:4 and incubated at 30ºC. Sample name identifies the unique information of each sample including the target 16S rRNA gene, incubation timepoint and state of the enrichment.	1 ml slurry in triplicates, Pooled DNA from triplicates were sequenced as 1 sample.	sediment incubation at 30ºC without any amendment sampled after 105 days sequenced with archaea PCR primers	Metagenome or environmental	19833	2973256	2020-05-04	2020-05-04
--- a/test-data/test.txt	Thu Dec 20 11:04:53 2018 -0500
+++ b/test-data/test.txt	Mon Aug 03 10:39:37 2020 -0400
@@ -1,15 +1,2 @@
-ERR2130609
-ERR2130611
-ERR2130613
-ERR2130497
-ERR2130499
-ERR2130502
-ERR2130503
-ERR2130506
-ERR2130508
-ERR2130510
-ERR2130512
-ERR2130517
-ERR2130520
-ERR2130521
-ERR2130523
\ No newline at end of file
+SRR11671300
+SRR11671283