changeset 4:2d4a2159c74b

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Fri, 03 May 2024 01:17:43 -0400
parents 80f1001797c7
children 1a3038b6d1dd
files __pycache__/bio2srr.cpython-312.pyc __pycache__/tests.cpython-312-pytest-8.2.0.pyc bio2srr.py bio2srr.xml test-data/accessions.txt test-data/metadata.tsv test-data/metadata.tsv.bak test-data/test.txt tests.py
diffstat 9 files changed, 563 insertions(+), 147 deletions(-) [+]
line wrap: on
line diff
Binary file __pycache__/bio2srr.cpython-312.pyc has changed
Binary file __pycache__/tests.cpython-312-pytest-8.2.0.pyc has changed
--- a/bio2srr.py	Wed Oct 27 05:00:45 2021 -0400
+++ b/bio2srr.py	Fri May 03 01:17:43 2024 -0400
@@ -1,125 +1,233 @@
-#! /usr/bin/env python3
-
-"Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
-
-import requests
-import sys
-from xml.etree import ElementTree as xml
-import csv
-
-from time import sleep
-
-sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
-
-sample = """{
-    "hitCount": 2,
-    "entries": [
-        {
-            "id": "SRX377510",
-            "source": "sra-experiment"
-        },
-        {
-            "id": "SRX583279",
-            "source": "sra-experiment"
-        }
-    ],
-    "facets": []
-}"""
-
-data_query = "?display=xml"
-
-sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
-
-sample = """{
-    "hitCount": 1,
-    "entries": [
-        {
-            "id": "SRR1029665",
-            "source": "sra-run"
-        }
-    ],
-    "facets": []
-}"""
-
-def get_tag(root, tag):
-    val = root.find(tag)
-    if val:
-        return val.text
-
-if __name__ == "__main__":
-    try:
-        bioproject = sys.argv[1]
-
-        b_result = None
-
-        runs = []
-
-        while not b_result or len(runs) < b_result['hitCount']:
-            b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
-            b_result.raise_for_status()
-            b_result = b_result.json()
-            runs += [d['id'] for d in b_result['entries']]
-
-        if not runs:
-            print(f"No results found for '{bioproject}'.", file=sys.stderr)
-            quit(1)
-    except IndexError:
-        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
-
-    try:
-        with open(sys.argv[2], 'r') as f:
-            rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
-            rcds = list(rdr)
-
-
-    except IndexError:
-        rcds = []
-
-    bsams = []
-
-    for id in runs:
-        res = requests.get(
-            f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
-            headers=dict(Accept="application/json")
-        )
-        res.raise_for_status()
-        bsams.append(res.json()['entries'][0]['references'][0]['acc'])
-        sleep(.1)
-
-    # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
-    # res.raise_for_status()
-    # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
-
-    for run_id, sam_id in zip(runs, bsams):
-        print(run_id)
-        record = {}
-        record['sample'] = run_id
-        record['biosample_accession'] = sam_id
-        res = requests.get(
-            f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
-        )
-        res.raise_for_status()
-        root = xml.fromstring(res.text)
-        
-        record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
-        record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
-
-        for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
-            key, value = iter(attr)
-            record[key.text] = value.text
-        rcds.append(record)
-        sleep(.1)
-
-    headers = {}
-    for record in rcds:
-        for key in record.keys():
-            headers[key] = None # use a dict to preserve header order
-    
-        
-    with open('./metadata.tsv', 'w') as f:
-        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
-        wtr.writeheader()
-        wtr.writerows(rcds)
-
-
-
+"Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
+
+import requests
+import sys
+import csv
+
+
+from itertools import batched
+from functools import cmp_to_key
+from time import sleep
+from xml.etree import ElementTree as xml
+
+# NCBI E-utilities endpoints used below: esearch turns an accession into a UID,
+# esummary fetches the records for a set of UIDs, elink follows links between databases.
+esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
+
+
+import logging
+# INFO level so that the progress messages passed to log() are emitted
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger("bio2srr")
+
+def log(msg):
+    "Emit msg at INFO level on the 'bio2srr' logger."
+    logger.info(msg) # fix logging later
+
+def get_tag(root, tag):
+    "Return the text of the first element under root matching tag; log and return None when nothing matches."
+    node = root.find(tag)
+    if node is None:
+        log(f"No result for {tag}")
+        return None
+    return node.text
+
+
+
+def header_sort_override(a, b):
+    """Compare two column names for the output header.
+
+    Returns 0 when the names are equal, -1 when a sorts before b, 1 otherwise.
+    The pinned names below come first, in the order listed; every other name
+    follows in plain string order.
+    """
+    if a == b:
+        return 0
+    # "sra_run_accession" is the key flatten_runs actually emits; "srr_accession"
+    # is kept so a table that uses the older name still sorts the same way.
+    pinned = ("bioproject", "srr_accession", "sra_run_accession", "biosample_accession", "organism", "taxid", "package")
+    for name in pinned:
+        # whichever argument matches the earliest pinned name wins
+        if a == name:
+            return -1
+        if b == name:
+            return 1
+    return -1 if a < b else 1
+
+hso = cmp_to_key(header_sort_override)
+
+def resolve_bioproject_ids_and_links(bioproject_id_list):
+    """Follow bioproject-to-bioproject and bioproject-to-biosample links.
+
+    bioproject_id_list is a list of (accession, UID) tuples. It is also the
+    work queue: every sub-project found through elink is appended to it while
+    it is being iterated, so the caller's list grows during the walk.
+
+    Yields (bioproject accession, biosample UID, biosample XML) tuples, one per
+    linked biosample. Raises requests.HTTPError when an E-utilities call fails.
+    """
+    # UIDs already queued. The list holds (accession, UID) tuples, so a bare UID
+    # can never be found in it; without this set a project linked from several
+    # parents is queued (and its samples emitted) more than once.
+    queued = {uid for _, uid in bioproject_id_list}
+    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
+        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
+        #get bioproject to bioproject links
+        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json"))
+        response.raise_for_status()
+        reply = response.json()
+        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
+        if len(linksets) >= 3:
+            for sub_id in linksets[2].get("links", []): #third index is the up to down links
+                if sub_id in queued:
+                    continue # skip before spending an esummary request on it
+                queued.add(sub_id)
+                response = requests.get(esummary, params=dict(id=sub_id, db="bioproject", format="json"))
+                response.raise_for_status()
+                replyy = response.json()
+                biop = replyy["result"][sub_id]["project_acc"]
+                bioproject_id_list.append((biop, sub_id)) # recurse over bioproject links
+        # get bioproject to biosample links
+        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json"))
+        response.raise_for_status()
+        reply = response.json()
+        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
+        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
+        for ids in batched(links, 200):
+            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json"))
+            response.raise_for_status()
+            replyy = response.json()
+            for field, value in replyy.get("result", {}).items():
+                # "result" also carries a "uids" index entry next to the per-UID records
+                if "uids" not in field:
+                    yield bioproject, field, value["sampledata"] # sampledata is an XML string embedded in the JSON reply
+                    sleep(0.1)
+
+
+# Example of the BioSample XML that flatten_biosample_xml parses. Reference only: nothing in the visible code reads it.
+biosample_example = """
+<BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
+    <Ids>     
+        <Id db="BioSample" is_primary="1">SAMN17131268</Id>
+        <Id db_label="Sample name">CJP19-D996</Id>
+    </Ids>
+    <Description>     
+        <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>     
+        <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">       
+            <OrganismName>Campylobacter jejuni</OrganismName>
+        </Organism>   
+    </Description>   
+    <Owner>     
+        <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name> 
+    </Owner> 
+    <Models>   
+        <Model>Pathogen.env</Model>  
+    </Models>  
+    <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package> 
+    <Attributes>  
+        <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>  
+        <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>     
+        <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>     
+        <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>     
+        <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>     
+        <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>     
+        <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>     
+        <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>     
+        <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>   
+    </Attributes>   
+    <Links>     
+        <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>   
+    </Links>
+    <Status status="live" when="2020-12-21T15:08:05.693"/> 
+</BioSample>
+
+"""
+
+def flatten_biosample_xml(biosampxml):
+    """Flatten one BioSample XML document into a single-level dict.
+
+    The result always has the keys biosample_accession, organism, taxid and
+    package (each None when the element is absent), plus one entry per
+    <Attribute> keyed by its harmonized_name, or by attribute_name when the
+    attribute has no harmonized_name.
+
+    Raises KeyError for an <Attribute> that carries neither name, and
+    xml.etree.ElementTree.ParseError for malformed XML.
+    """
+    root = xml.fromstring(biosampxml)
+    accession = get_tag(root, r'.//Id[@db="BioSample"]')
+    # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
+    organism = get_tag(root, r".//OrganismName")
+    organism_node = root.find(r".//Organism")
+    # a record without an <Organism> element gets taxid None instead of an AttributeError
+    tax_id = organism_node.attrib.get("taxonomy_id") if organism_node is not None else None
+    package = get_tag(root, r".//Package")
+    sampledict = dict(
+        biosample_accession=accession,
+        # sample_name=sample_name,
+        organism=organism,
+        taxid=tax_id,
+        package=package,
+    )
+    for attribute in root.findall("Attributes/Attribute"):
+        attrs = attribute.attrib
+        # only fall back to attribute_name when harmonized_name is really missing, so an
+        # attribute that has harmonized_name alone no longer raises KeyError
+        key = attrs["harmonized_name"] if "harmonized_name" in attrs else attrs["attribute_name"]
+        sampledict[key] = attribute.text
+
+    return sampledict
+
+
+def yield_sra_runs_from_sample(biosampleids):
+    """Yield (SRA UID, runs XML) for every SRA record linked to the given biosamples.
+
+    biosampleids is either one biosample UID string or an iterable of UID
+    strings. The second item of each pair is the record's esummary "runs"
+    field, or "" when the record has none.
+
+    Raises requests.HTTPError when an E-utilities call fails.
+    """
+    if isinstance(biosampleids, str):
+        # joining a bare string puts a comma between its characters
+        # ("123" -> "1,2,3"), which would query unrelated biosamples
+        biosampleids = [biosampleids]
+    sleep(0.1)
+    response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json"))
+    response.raise_for_status()
+    reply = response.json()
+    for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
+        sleep(0.1)
+        response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json"))
+        response.raise_for_status()
+        replyy = response.json()
+        for field, value in replyy.get("result", {}).items():
+            # "result" also carries a "uids" index entry next to the per-UID records
+            if "uids" not in field:
+                yield field, value.get("runs") or ""
+
+
+# Example of the multi-rooted <Run/> markup that flatten_runs wraps and parses. Reference only: nothing in the visible code reads it.
+runs_example = """
+<Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>  
+<Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>   
+"""
+
+def flatten_runs(runxml):
+    "Yield one dict (sra_run_accession, total_spots, total_bases) per <Run> element found in runxml."
+    # the fragment has several sibling <Run/> elements, so give it a single root before parsing
+    wrapped = xml.fromstring(f"<data>{runxml}</data>")
+    for node in wrapped.iter("Run"):
+        attrs = node.attrib
+        record = {}
+        record["sra_run_accession"] = attrs["acc"]
+        record["total_spots"] = attrs["total_spots"]
+        record["total_bases"] = attrs["total_bases"]
+        yield record
+
+
+
+def main(starting_bioproject):
+    """Write metadata.tsv and accessions.txt for a bioproject and its sub-projects.
+
+    starting_bioproject is the accession looked up with esearch. Every SRA run
+    of every linked biosample becomes one row carrying a copy of that sample's
+    metadata; a sample without runs produces no row. Both files are written to
+    the current working directory.
+
+    Exits with status 1 when the accession matches nothing. requests.HTTPError
+    from any E-utilities call propagates to the caller.
+    """
+    rows = []
+    response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"))
+    response.raise_for_status()
+    result = response.json().get("esearchresult", {})
+    idlist = result.get("idlist") or []
+    if not idlist:
+        # read the warning with .get(): a reply without a warninglist must not
+        # turn this error message into a KeyError
+        messages = result.get("warninglist", {}).get("outputmessages", "no message returned")
+        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{messages}\"")
+        sys.exit(1)
+    bioproject_id = idlist[0]
+    log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
+    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
+        try:
+            sampledict = flatten_biosample_xml(biosample_xml)
+        except KeyError:
+            log(biosample_xml) # show the record that could not be flattened
+            raise
+        sampledict["bioproject"] = bioproject
+        # biosample is one UID string; it goes in as a one-element list because the
+        # callee comma-joins its argument and would otherwise split the UID into digits
+        for _sra, runs in yield_sra_runs_from_sample([biosample]):
+            # an SRA record without a "runs" field contributes no rows
+            for run in flatten_runs((runs or "").strip()):
+                run.update(sampledict)
+                rows.append(run)
+
+    log(f"Writing {len(rows)} rows to metadata.tsv")
+
+    header = sorted({key for row in rows for key in row}, key=hso)
+    logger.info(f"Header: {header}")
+
+    # a record whose accession could not be read sorts first instead of raising TypeError
+    rows.sort(key=lambda x: x["biosample_accession"] or "")
+
+    # newline="" is what the csv module asks for; utf-8 because sample metadata is free text
+    with open("metadata.tsv", "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
+        writer.writeheader()
+        writer.writerows(rows)
+
+    log(f"Writing {len(rows)} accessions to accessions.txt")
+
+    with open("accessions.txt", "w", encoding="utf-8") as f:
+        f.writelines(row["sra_run_accession"] + "\n" for row in rows)
+
+
+if __name__ == "__main__":
+    # a missing or blank accession gets a one-line error instead of an IndexError traceback
+    if len(sys.argv) < 2 or not sys.argv[1].strip():
+        logger.error("Please provide an NCBI BioProject accession, e.g. PRJNA681235.")
+        sys.exit(1)
+    b = sys.argv[1].strip()
+    log(f"Starting with {b}")
+    try:
+        main(b)
+    except requests.RequestException as e:
+        # RequestException is the base class of HTTPError and also covers
+        # connection failures and timeouts, which would otherwise escape as tracebacks
+        logger.error(e)
+        sys.exit(1)
+
+                
+
+
+
--- a/bio2srr.xml	Wed Oct 27 05:00:45 2021 -0400
+++ b/bio2srr.xml	Fri May 03 01:17:43 2024 -0400
@@ -1,35 +1,31 @@
-<tool id="bio2srr" name="Bioproject to SRR" version="1.0.0">
-	<description>Retrieve SRR accessions and sample metadata from BioProject or BioSample.</description>
+<tool id="bio2srr" name="Bioproject to SRR" version="3.0.0">
+	<description>Retrieve SRR accessions and sample metadata from BioProject. Recursively follows links to subprojects.</description>
     <requirements>
-        <requirement type="package" version="3.8">python</requirement>
-    	<requirement type="package" version="2.24.0">requests</requirement>
+        <requirement type="package" version="3.12">python</requirement>
+    	<requirement type="package" version="2.31.0">requests</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
-        #if $src_metadata
-        $__tool_directory__/bio2srr.py "$input1" $src_metadata > "$output"
-        #else
-        $__tool_directory__/bio2srr.py "$input1" > "$output"
-        #end if
+        python $__tool_directory__/bio2srr.py "$input1"
     ]]></command>
     <inputs>
-        <param type="text" name="input1" label="BioProject or BioSample" />
-        <param type="data" format="tabular" name="src_metadata" label="Table of your own metadata" optional="true" help="Optional metadata table to be folded in." />
+        <param type="text" name="input1" label="BioProject" />
     </inputs>
     <outputs>
-    	<data format="txt" name="output" label="SRR Accession List" />
+    	<data format="txt" name="output" from_work_dir="accessions.txt" label="SRR Accession List" />
         <data format="tabular" name="metadata" from_work_dir="metadata.tsv" label="Sample Metadata Table" />
     </outputs>
     <tests>
     	<test>
-    		<param name="input1" value="NOTHING" />
-    		<output name="output" file="test.txt" />
+    		<param name="input1" value="PRJNA681235" />
+    		<output name="output" file="accessions.txt" />
             <output name="metadata" file="metadata.tsv" />
     	</test>
+        <test expect_failure="1">
+            <param name="input1" value="NOTHING" />
+        </test>
     </tests>
     <help><![CDATA[
-        Retrieve SRR accessions and metadata from a BioProject.
-
-        If you have metadata for your own samples (that aren't from the BioProject) you can provide a TSV table with it and they'll be folded together. (It's a union, not a join.) Put sample ID's under the header 'sample'.
+        Retrieve SRR accessions and sample metadata from a BioProject, including subprojects. It's a JOIN, so if a sample is associated with multiple SRA runs, then each run has a copy of the sample's metadata.
     ]]></help>
     <citations>
     </citations>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/accessions.txt	Fri May 03 01:17:43 2024 -0400
@@ -0,0 +1,91 @@
+SRR288080
+SRR005375
+SRR005372
+SRR000090
+SRR000091
+SRR000092
+SRR000093
+SRR000094
+SRR000095
+SRR000078
+SRR000079
+SRR000072
+SRR000073
+SRR000074
+SRR000075
+SRR000076
+SRR000077
+SRR000070
+SRR000071
+SRR288080
+SRR000090
+SRR000091
+SRR000092
+SRR000093
+SRR000094
+SRR000095
+SRR000078
+SRR000079
+SRR000070
+SRR000071
+SRR288080
+SRR000090
+SRR000091
+SRR000092
+SRR000093
+SRR000094
+SRR000095
+SRR000080
+SRR000081
+SRR000078
+SRR000079
+SRR000070
+SRR000071
+SRR288080
+SRR005375
+SRR005372
+SRR000090
+SRR000091
+SRR000092
+SRR000093
+SRR000094
+SRR000095
+SRR000078
+SRR000079
+SRR000072
+SRR000073
+SRR000074
+SRR000075
+SRR000076
+SRR000077
+SRR000070
+SRR000071
+SRR000068
+SRR000069
+SRR288080
+SRR000080
+SRR000081
+SRR000078
+SRR000079
+SRR000068
+SRR000069
+SRR000066
+SRR000067
+SRR288080
+SRR287817
+SRR000089
+SRR000088
+SRR000087
+SRR000086
+SRR000085
+SRR000084
+SRR000083
+SRR000082
+SRR000080
+SRR000081
+SRR000078
+SRR000079
+SRR000068
+SRR000069
+SRR000066
+SRR000067
--- a/test-data/metadata.tsv	Wed Oct 27 05:00:45 2021 -0400
+++ b/test-data/metadata.tsv	Fri May 03 01:17:43 2024 -0400
@@ -1,3 +1,92 @@
-sample	biosample_accession	isolation_source	collection_date	geo_loc_name	lat_lon	rel_to_oxygen	samp_collect_device	samp_mat_process	samp_size	source_material_id	BioSampleModel	ENA-SPOT-COUNT	ENA-BASE-COUNT	ENA-FIRST-PUBLIC	ENA-LAST-UPDATE
-SRR11671300	SAMN14820590	marine sediment	Not applicable	North Sea: German Bight (Helgoland Mud Area)	54.052300 N 7.580400 E	obligate anaerobe	anoxic sampling of sediment slurry incubations at defined time points	Previously collected sediments from Helgoland mud area, stored at 4ºC (near in-situ temperature) until use as starting material for incubation experiments. For the incubation experiments, anaerobic slurries were prepared in ratio 1:4 and incubated at 30ºC. Sample name identifies the unique information of each sample including the target 16S rRNA gene, incubation timepoint and state of the enrichment.	1 ml slurry in triplicates, Pooled DNA from triplicates were sequenced as 1 sample.	sediment incubation at 30ºC without any amendment sampled after 105 days sequenced with bacteria PCR primers	Metagenome or environmental	41855	6160440	2020-05-04	2020-05-04
-SRR11671283	SAMN14820597	marine sediment	Not applicable	North Sea: German Bight (Helgoland Mud Area)	54.052300 N 7.580400 E	obligate anaerobe	anoxic sampling of sediment slurry incubations at defined time points	Previously collected sediments from Helgoland mud area, stored at 4ºC (near in-situ temperature) until use as starting material for incubation experiments. For the incubation experiments, anaerobic slurries were prepared in ratio 1:4 and incubated at 30ºC. Sample name identifies the unique information of each sample including the target 16S rRNA gene, incubation timepoint and state of the enrichment.	1 ml slurry in triplicates, Pooled DNA from triplicates were sequenced as 1 sample.	sediment incubation at 30ºC without any amendment sampled after 105 days sequenced with archaea PCR primers	Metagenome or environmental	19833	2973256	2020-05-04	2020-05-04
+bioproject	biosample_accession	organism	taxid	package	Genus	ProjectAccession	PublicAccession	Species	attribute_package	collected_by	collection_date	geo_loc_name	isolate	isolate_name_alias	isolation_source	lat_lon	project_name	sequenced_by	sra_run_accession	strain	total_bases	total_spots
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR288080	ECP19-2498	1008246	3835
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR005375	ECP19-2498	63477063	237172
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR005372	ECP19-2498	21805775	88278
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000090	ECP19-2498	59522375	222843
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000091	ECP19-2498	392964	1467
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000092	ECP19-2498	872292	3261
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000093	ECP19-2498	60878431	227850
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000094	ECP19-2498	1311175	4908
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000095	ECP19-2498	592711	2214
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000078	ECP19-2498	35726106	136244
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000079	ECP19-2498	33865731	128606
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000072	ECP19-2498	43110538	164772
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000073	ECP19-2498	834018	3206
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000074	ECP19-2498	1191933	4540
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000075	ECP19-2498	817514	3107
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000076	ECP19-2498	53028372	201721
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000077	ECP19-2498	322254	1226
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000070	ECP19-2498	69214301	262057
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000071	ECP19-2498	56794062	215192
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR288080	ECP19-598	1008246	3835
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000090	ECP19-598	59522375	222843
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000091	ECP19-598	392964	1467
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000092	ECP19-598	872292	3261
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000093	ECP19-598	60878431	227850
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000094	ECP19-598	1311175	4908
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000095	ECP19-598	592711	2214
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000078	ECP19-598	35726106	136244
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000079	ECP19-598	33865731	128606
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000070	ECP19-598	69214301	262057
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000071	ECP19-598	56794062	215192
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR288080	ECP19-798	1008246	3835
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000090	ECP19-798	59522375	222843
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000091	ECP19-798	392964	1467
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000092	ECP19-798	872292	3261
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000093	ECP19-798	60878431	227850
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000094	ECP19-798	1311175	4908
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000095	ECP19-798	592711	2214
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000080	ECP19-798	42230342	158320
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000081	ECP19-798	48201615	180220
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000078	ECP19-798	35726106	136244
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000079	ECP19-798	33865731	128606
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000070	ECP19-798	69214301	262057
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000071	ECP19-798	56794062	215192
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR288080	ECP19-198	1008246	3835
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR005375	ECP19-198	63477063	237172
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR005372	ECP19-198	21805775	88278
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000090	ECP19-198	59522375	222843
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000091	ECP19-198	392964	1467
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000092	ECP19-198	872292	3261
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000093	ECP19-198	60878431	227850
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000094	ECP19-198	1311175	4908
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000095	ECP19-198	592711	2214
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000078	ECP19-198	35726106	136244
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000079	ECP19-198	33865731	128606
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000072	ECP19-198	43110538	164772
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000073	ECP19-198	834018	3206
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000074	ECP19-198	1191933	4540
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000075	ECP19-198	817514	3107
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000076	ECP19-198	53028372	201721
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000077	ECP19-198	322254	1226
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000070	ECP19-198	69214301	262057
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000071	ECP19-198	56794062	215192
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000068	ECP19-198	63395546	247135
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000069	ECP19-198	57476129	224837
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR288080	CJP19-D445	1008246	3835
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000080	CJP19-D445	42230342	158320
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000081	CJP19-D445	48201615	180220
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000078	CJP19-D445	35726106	136244
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000079	CJP19-D445	33865731	128606
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000068	CJP19-D445	63395546	247135
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000069	CJP19-D445	57476129	224837
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000066	CJP19-D445	63790620	242673
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000067	CJP19-D445	66936400	255351
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR288080	CJP19-D996	1008246	3835
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR287817	CJP19-D996	155025677	1414888
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000089	CJP19-D996	64994909	250945
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000088	CJP19-D996	62912540	242861
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000087	CJP19-D996	1051130	4049
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000086	CJP19-D996	525756	2047
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000085	CJP19-D996	436118	1684
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000084	CJP19-D996	466139	1803
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000083	CJP19-D996	1251016	4841
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000082	CJP19-D996	1227889	4753
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000080	CJP19-D996	42230342	158320
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000081	CJP19-D996	48201615	180220
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000078	CJP19-D996	35726106	136244
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000079	CJP19-D996	33865731	128606
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000068	CJP19-D996	63395546	247135
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000069	CJP19-D996	57476129	224837
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000066	CJP19-D996	63790620	242673
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR000067	CJP19-D996	66936400	255351
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/metadata.tsv.bak	Fri May 03 01:17:43 2024 -0400
@@ -0,0 +1,92 @@
+bioproject	biosample_accession	organism	taxid	package	total_bases	lat_lon	geo_loc_name	collection_date	isolate_name_alias	isolate	sra_run_accession	Species	sequenced_by	strain	isolation_source	attribute_package	project_name	total_spots	ProjectAccession	collected_by	PublicAccession	Genus
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1008246	missing	USA	2019	CFSAN091029		SRR288080	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	3835	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	63477063	missing	USA	2019	CFSAN091029		SRR005375	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	237172	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	21805775	missing	USA	2019	CFSAN091029		SRR005372	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	88278	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	59522375	missing	USA	2019	CFSAN091029		SRR000090	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	222843	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	392964	missing	USA	2019	CFSAN091029		SRR000091	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	1467	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	872292	missing	USA	2019	CFSAN091029		SRR000092	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	3261	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	60878431	missing	USA	2019	CFSAN091029		SRR000093	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	227850	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1311175	missing	USA	2019	CFSAN091029		SRR000094	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	4908	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	592711	missing	USA	2019	CFSAN091029		SRR000095	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	2214	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	35726106	missing	USA	2019	CFSAN091029		SRR000078	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	136244	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	33865731	missing	USA	2019	CFSAN091029		SRR000079	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	128606	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	43110538	missing	USA	2019	CFSAN091029		SRR000072	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	164772	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	834018	missing	USA	2019	CFSAN091029		SRR000073	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	3206	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1191933	missing	USA	2019	CFSAN091029		SRR000074	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	4540	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	817514	missing	USA	2019	CFSAN091029		SRR000075	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	3107	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	53028372	missing	USA	2019	CFSAN091029		SRR000076	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	201721	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	322254	missing	USA	2019	CFSAN091029		SRR000077	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	1226	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	69214301	missing	USA	2019	CFSAN091029		SRR000070	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	262057	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	56794062	missing	USA	2019	CFSAN091029		SRR000071	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-2498	missing	environmental/food/other	GenomeTrakr	215192	PRJNA681235	CDC	CFSAN091029	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1008246	missing	USA	2019	CFSAN091027		SRR288080	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	3835	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	59522375	missing	USA	2019	CFSAN091027		SRR000090	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	222843	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	392964	missing	USA	2019	CFSAN091027		SRR000091	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	1467	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	872292	missing	USA	2019	CFSAN091027		SRR000092	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	3261	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	60878431	missing	USA	2019	CFSAN091027		SRR000093	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	227850	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1311175	missing	USA	2019	CFSAN091027		SRR000094	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	4908	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	592711	missing	USA	2019	CFSAN091027		SRR000095	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	2214	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	35726106	missing	USA	2019	CFSAN091027		SRR000078	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	136244	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	33865731	missing	USA	2019	CFSAN091027		SRR000079	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	128606	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	69214301	missing	USA	2019	CFSAN091027		SRR000070	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	262057	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	56794062	missing	USA	2019	CFSAN091027		SRR000071	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-598	missing	environmental/food/other	GenomeTrakr	215192	PRJNA681235	CDC	CFSAN091027	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1008246	missing	USA	2019	CFSAN091028		SRR288080	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	3835	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	59522375	missing	USA	2019	CFSAN091028		SRR000090	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	222843	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	392964	missing	USA	2019	CFSAN091028		SRR000091	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	1467	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	872292	missing	USA	2019	CFSAN091028		SRR000092	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	3261	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	60878431	missing	USA	2019	CFSAN091028		SRR000093	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	227850	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1311175	missing	USA	2019	CFSAN091028		SRR000094	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	4908	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	592711	missing	USA	2019	CFSAN091028		SRR000095	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	2214	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	42230342	missing	USA	2019	CFSAN091028		SRR000080	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	158320	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	48201615	missing	USA	2019	CFSAN091028		SRR000081	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	180220	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	35726106	missing	USA	2019	CFSAN091028		SRR000078	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	136244	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	33865731	missing	USA	2019	CFSAN091028		SRR000079	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	128606	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	69214301	missing	USA	2019	CFSAN091028		SRR000070	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	262057	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	56794062	missing	USA	2019	CFSAN091028		SRR000071	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-798	missing	environmental/food/other	GenomeTrakr	215192	PRJNA681235	CDC	CFSAN091028	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1008246	missing	USA	2019	CFSAN091030		SRR288080	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	3835	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	63477063	missing	USA	2019	CFSAN091030		SRR005375	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	237172	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	21805775	missing	USA	2019	CFSAN091030		SRR005372	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	88278	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	59522375	missing	USA	2019	CFSAN091030		SRR000090	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	222843	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	392964	missing	USA	2019	CFSAN091030		SRR000091	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	1467	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	872292	missing	USA	2019	CFSAN091030		SRR000092	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	3261	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	60878431	missing	USA	2019	CFSAN091030		SRR000093	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	227850	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1311175	missing	USA	2019	CFSAN091030		SRR000094	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	4908	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	592711	missing	USA	2019	CFSAN091030		SRR000095	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	2214	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	35726106	missing	USA	2019	CFSAN091030		SRR000078	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	136244	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	33865731	missing	USA	2019	CFSAN091030		SRR000079	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	128606	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	43110538	missing	USA	2019	CFSAN091030		SRR000072	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	164772	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	834018	missing	USA	2019	CFSAN091030		SRR000073	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	3206	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	1191933	missing	USA	2019	CFSAN091030		SRR000074	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	4540	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	817514	missing	USA	2019	CFSAN091030		SRR000075	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	3107	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	53028372	missing	USA	2019	CFSAN091030		SRR000076	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	201721	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	322254	missing	USA	2019	CFSAN091030		SRR000077	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	1226	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	69214301	missing	USA	2019	CFSAN091030		SRR000070	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	262057	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	56794062	missing	USA	2019	CFSAN091030		SRR000071	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	215192	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	63395546	missing	USA	2019	CFSAN091030		SRR000068	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	247135	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	57476129	missing	USA	2019	CFSAN091030		SRR000069	coli	FDA Center for Food Safety and Applied Nutrition	ECP19-198	missing	environmental/food/other	GenomeTrakr	224837	PRJNA681235	CDC	CFSAN091030	Escherichia
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	1008246	missing	missing	missing		CFSAN091031	SRR288080		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	3835		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	42230342	missing	missing	missing		CFSAN091031	SRR000080		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	158320		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	48201615	missing	missing	missing		CFSAN091031	SRR000081		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	180220		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	35726106	missing	missing	missing		CFSAN091031	SRR000078		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	136244		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	33865731	missing	missing	missing		CFSAN091031	SRR000079		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	128606		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	63395546	missing	missing	missing		CFSAN091031	SRR000068		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	247135		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	57476129	missing	missing	missing		CFSAN091031	SRR000069		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	224837		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	63790620	missing	missing	missing		CFSAN091031	SRR000066		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	242673		CDC		
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0	66936400	missing	missing	missing		CFSAN091031	SRR000067		FDA Center for Food Safety and Applied Nutrition	CJP19-D445	missing		GenomeTrakr	255351		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	1008246	missing	missing	missing		CFSAN091032	SRR288080		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	3835		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	155025677	missing	missing	missing		CFSAN091032	SRR287817		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	1414888		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	64994909	missing	missing	missing		CFSAN091032	SRR000089		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	250945		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	62912540	missing	missing	missing		CFSAN091032	SRR000088		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	242861		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	1051130	missing	missing	missing		CFSAN091032	SRR000087		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	4049		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	525756	missing	missing	missing		CFSAN091032	SRR000086		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	2047		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	436118	missing	missing	missing		CFSAN091032	SRR000085		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	1684		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	466139	missing	missing	missing		CFSAN091032	SRR000084		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	1803		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	1251016	missing	missing	missing		CFSAN091032	SRR000083		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	4841		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	1227889	missing	missing	missing		CFSAN091032	SRR000082		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	4753		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	42230342	missing	missing	missing		CFSAN091032	SRR000080		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	158320		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	48201615	missing	missing	missing		CFSAN091032	SRR000081		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	180220		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	35726106	missing	missing	missing		CFSAN091032	SRR000078		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	136244		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	33865731	missing	missing	missing		CFSAN091032	SRR000079		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	128606		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	63395546	missing	missing	missing		CFSAN091032	SRR000068		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	247135		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	57476129	missing	missing	missing		CFSAN091032	SRR000069		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	224837		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	63790620	missing	missing	missing		CFSAN091032	SRR000066		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	242673		CDC		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0	66936400	missing	missing	missing		CFSAN091032	SRR000067		FDA Center for Food Safety and Applied Nutrition	CJP19-D996	missing		GenomeTrakr	255351		CDC		
--- a/test-data/test.txt	Wed Oct 27 05:00:45 2021 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-SRR11671300
-SRR11671283
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests.py	Fri May 03 01:17:43 2024 -0400
@@ -0,0 +1,42 @@
+import pytest
+
+from bio2srr import *
+
+
+def test_element_tree_xpath():  # checks that ElementTree's find() locates an Id element with db='BioSample' in the fixture
+    from xml.etree import ElementTree as xml
+    root = xml.fromstring(biosample_example)  # biosample_example is not defined in this module; presumably provided by `from bio2srr import *` - confirm
+    assert root.find(".//Id[@db='BioSample']") is not None  # find() returns None when nothing matches, so this requires a matching descendant
+
+def test_flatten_biosample_xml():  # flatten_biosample_xml must return a mapping with the three keys checked below
+    d = flatten_biosample_xml(biosample_example)  # same fixture as test_element_tree_xpath; presumably a BioSample XML string - confirm
+    assert d['biosample_accession'] == 'SAMN17131268'
+    assert d['organism'] == 'Campylobacter jejuni'
+    assert d['isolate'] == 'CFSAN091032'
+
+def test_flatten_runs():  # the runs_example fixture is expected to flatten to exactly two records
+    d = list(flatten_runs(runs_example))  # list() materializes the result so it can be counted; flatten_runs is presumably a generator - confirm
+    assert len(d) == 2
+
+def test_header_sort_override_consistency():  # sorting with key=hso must give the same order whatever the input order was
+    import random
+    L = ["C", "B", "A", "taxid", "bioproject"]
+    L.sort(key=hso)  # reference ordering that every later re-sort is compared against
+    # NOTE(review): disabled check - assert L[0] == "bioproject" - confirm whether hso should guarantee "bioproject" sorts first
+    A = L.copy()  # work on a copy so L keeps the reference ordering
+    assert A == L  # holds trivially here: A was just copied from L
+    R = []
+    for _ in range(100):
+        random.shuffle(A)  # unseeded in-place shuffle, so the permutations tried differ between test runs
+        A.sort(key=hso)
+        R.append(A == L)  # record whether this re-sort reproduced the reference ordering
+    assert all(R)  # every one of the 100 shuffled inputs must sort back to the reference ordering
+
+def test_hso_override():  # header_sort_override is called as a two-argument comparator: negative, positive or zero result
+    assert header_sort_override("bioproject", "taxid") < 0  # NOTE(review): "bioproject" < "taxid" alphabetically too, so this would pass without any override - confirm intent
+    assert header_sort_override("taxid", "bioproject") > 0  # swapped arguments must flip the sign
+    assert header_sort_override("taxid", "taxid") == 0  # identical names compare equal
+
+def test_hso_regular():  # names outside any override list are presumably compared in plain string order - confirm against bio2srr
+    assert header_sort_override("A", "B") < 0
+    assert header_sort_override("B", "A") > 0  # swapped arguments must flip the sign
\ No newline at end of file