Mercurial > repos > rliterman > csp2
changeset 39:93393808f415
"planemo upload"
author | rliterman |
---|---|
date | Thu, 12 Dec 2024 13:53:15 -0500 |
parents | ee512a230a1e |
children | 4656cc17b03c |
files | CSP2/bin/chooseRefs.py CSP2/bin/compileSNPResults.py CSP2/bin/runSNPPipeline.py CSP2/bin/saveSNPDiffs.py CSP2/bin/screenSNPDiffs.py CSP2/docker/Dockerfile CSP2/docker/Makefile CSP2/img/SNP.jpg CSP2/img/Screen_Run.jpg CSP2/img/Temp_Logo.jpg CSP2/nextflow.config csp2_screen.xml |
diffstat | 12 files changed, 445 insertions(+), 30 deletions(-) [+] |
line wrap: on
line diff
--- a/CSP2/bin/chooseRefs.py Wed Dec 11 12:04:20 2024 -0500 +++ b/CSP2/bin/chooseRefs.py Thu Dec 12 13:53:15 2024 -0500 @@ -186,6 +186,8 @@ non_ref_df = cluster_df.loc[~cluster_df['Isolate_ID'].isin(refs_chosen)].sort_values('Base_Score', ascending=False) non_ref_df['Is_Ref'] = False final_ref_df['Is_Ref'] = True -pd.concat([final_ref_df, non_ref_df]).reset_index(drop=True).to_csv(ref_file, index=False, sep="\t") + +with open(ref_file, 'w') as f: + pd.concat([final_ref_df, non_ref_df]).reset_index(drop=True).to_csv(f, index=False, sep="\t") print(",".join(final_ref_df['Path'].tolist())) \ No newline at end of file
--- a/CSP2/bin/compileSNPResults.py Wed Dec 11 12:04:20 2024 -0500 +++ b/CSP2/bin/compileSNPResults.py Thu Dec 12 13:53:15 2024 -0500 @@ -467,17 +467,20 @@ # Output data # Mean assembly stats -isolate_mean_df.reset_index().to_csv(mean_isolate_file,sep='\t',index=False) +with open(mean_isolate_file, 'w') as f: + isolate_mean_df.reset_index().to_csv(f,sep='\t',index=False) # Isolate assembly stats isolate_assembly_stats = isolate_stats.loc[isolate_stats['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])].drop(['Min','Max','StdDev','Count'],axis=1).rename(columns = {'Mean':'Value'}) -isolate_assembly_stats.to_csv(isolate_assembly_stats_file,sep='\t',index=False) +with open(isolate_assembly_stats_file,'w') as f: + isolate_assembly_stats.to_csv(f,sep='\t',index=False) # Isolate alignment stats isolate_align_stats = pd.concat([align_stats,isolate_cocalled_stats,isolate_snp_stats,isolate_stdev_stats]).reset_index(drop=True) for col in ['Min', 'Mean', 'Max', 'StdDev', 'Zscore']: isolate_align_stats[col] = isolate_align_stats[col].astype("float").round(3) -isolate_align_stats.to_csv(align_stats_file,sep='\t',index=False) +with open(align_stats_file,'w') as f: + isolate_align_stats.to_csv(f,sep='\t',index=False) # Reference Assembly Stats ref_align_summary_df = ref_summary_df.loc[(~ref_summary_df['Measure'].isin(['Contig_Count','Assembly_Bases','L50','L90','N50','N90'])) & (~pd.isna(ref_summary_df['Zscore']))] @@ -492,7 +495,9 @@ ref_isolate_align_stats = align_stats.loc[(align_stats['Isolate_Type'] == "Reference") & (align_stats['Measure'].isin(['Self_Aligned','Compare_Aligned']))].drop(['Isolate_Type'],axis=1).rename(columns = {'Isolate_ID':'Reference_ID'})[['Reference_ID','Measure','Mean','StdDev','Min','Max','Count','Zscore','QC']] ref_mean_summary_stats = pd.concat([ref_mean_summary_df,ref_isolate_align_stats]) -ref_mean_summary_stats.to_csv(ref_mean_summary_file,sep='\t',index=False) + +with open(ref_mean_summary_file,'w') as f: + 
ref_mean_summary_stats.to_csv(f,sep='\t',index=False) end_time = time.time() @@ -505,10 +510,12 @@ # Comparisons if multiple refs if len(reference_ids) > 1: - comparison_df.to_csv(snp_comparison_file,sep="\t",index = False) + with open(snp_comparison_file,"w") as f: + comparison_df.to_csv(f,sep="\t",index = False) log.write(f"\t- Saved SNP distance comparisons across references to {snp_comparison_file}\n") # Failures/warnings if warn_fail_df.shape[0] > 0: - warn_fail_df.to_csv(qc_file,sep="\t",index=False) + with open(qc_file,"w") as f: + warn_fail_df.to_csv(f,sep="\t",index=False) log.write(f"\t- Saved QC warnings/failures to {qc_file}\n") \ No newline at end of file
--- a/CSP2/bin/runSNPPipeline.py Wed Dec 11 12:04:20 2024 -0500 +++ b/CSP2/bin/runSNPPipeline.py Thu Dec 12 13:53:15 2024 -0500 @@ -507,7 +507,8 @@ # Write filtered SNP data to file snp_file = log_file.replace(".log","_SNPs.tsv") - filtered_snp_df.to_csv(snp_file, sep="\t", index=False) + with open(snp_file,"w") as f: + filtered_snp_df.to_csv(f, sep="\t", index=False) filtered_snp_df.loc[:, 'Query_ID'] = query_id @@ -750,7 +751,8 @@ # Save reference screening results_df = pd.DataFrame([item.result()[0] for item in results], columns = output_columns) - results_df.to_csv(reference_screening_file, sep="\t", index=False) + with open(reference_screening_file,"w") as f: + results_df.to_csv(f, sep="\t", index=False) # Get reference bed dfs covered_df = pd.concat([item.result()[1] for item in results]) @@ -940,7 +942,8 @@ locus_coverage_df = snp_coverage_df.merge(ref_base_coverage_df, how='outer', on='Ref_Loc').merge(uncovered_count_df, how='outer', on='Ref_Loc').merge(purged_count_df, how='outer', on='Ref_Loc').fillna(0) locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']] = locus_coverage_df.loc[:, ['SNP_Count','Ref_Base_Count','Uncovered_Count','Purged_Count']].astype(int) locus_coverage_df['Missing_Ratio'] = ((locus_coverage_df['Uncovered_Count'] + locus_coverage_df['Purged_Count']) / (1+len(pass_qc_isolates))) * 100 - locus_coverage_df.to_csv(locus_category_file, sep="\t", index=False) + with open(locus_category_file,"w") as f: + locus_coverage_df.to_csv(f, sep="\t", index=False) # Get isolate coverage stats min_isolate_cols = ['Query_ID','SNP','Ref_Base','Percent_Missing','Purged','Uncovered','Rescued_SNP','Purged_Ref_Edge'] @@ -964,7 +967,8 @@ isolate_coverage_df.loc[isolate_coverage_df['Query_ID'] == reference_id, 'Purged_Ref_Edge'] = ref_edge_df['Ref_Loc'].nunique() isolate_coverage_df = isolate_coverage_df[min_isolate_cols + possible_purged_cols].sort_values(by = 'Percent_Missing',ascending = False).reset_index(drop=True) - 
isolate_coverage_df.to_csv(query_coverage_file, sep="\t", index=False) + with open(query_coverage_file,'w') as f: + isolate_coverage_df.to_csv(f, sep="\t", index=False) with open(log_file,"a+") as log: log.write(f"\t- SNP coverage information: {locus_category_file}\n") @@ -1039,42 +1043,52 @@ pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) preserved_pairwise_df = pairwise_df.copy() - pairwise_df.to_csv(raw_pairwise, sep="\t", index=False) - preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) + with open(raw_pairwise,"w") as f: + pairwise_df.to_csv(f, sep="\t", index=False) + with open(preserved_pairwise,"w") as f: + preserved_pairwise_df.to_csv(f, sep="\t", index=False) # Create matrix idx = sorted(set(pairwise_df['Query_1']).union(pairwise_df['Query_2'])) mirrored_distance_df = pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) mirrored_distance_df.index.name = '' - mirrored_distance_df.to_csv(raw_matrix,sep="\t") - mirrored_distance_df.to_csv(preserved_matrix,sep="\t") + with open(raw_matrix,"w") as f: + mirrored_distance_df.to_csv(f,sep="\t") + with open(preserved_matrix,"w") as f: + mirrored_distance_df.to_csv(f,sep="\t") else: raw_distance_results = parallelAlignment(alignment) raw_pairwise_df = pd.DataFrame(raw_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) - raw_pairwise_df.to_csv(raw_pairwise, sep="\t", index=False) + with open(raw_pairwise,"w") as f: + raw_pairwise_df.to_csv(f, sep="\t", index=False) if len(locs_pass_missing) == snp_count: preserved_pairwise_df = raw_pairwise_df.copy() - preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) + with open(preserved_pairwise,"w") as f: + preserved_pairwise_df.to_csv(f, 
sep="\t", index=False) elif len(locs_pass_missing) == 0: preserved_pairwise_df = pd.DataFrame([(pairwise[0], pairwise[1], 0,np.nan) for pairwise in pairwise_combinations],columns = ['Query_1','Query_2','SNP_Distance','SNPs_Cocalled']) - preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) + with open(preserved_pairwise,"w") as f: + preserved_pairwise_df.to_csv(f, sep="\t", index=False) else: preserved_distance_results = parallelAlignment(preserved_alignment) preserved_pairwise_df = pd.DataFrame(preserved_distance_results, columns=['Query_1', 'Query_2', 'SNP_Distance', 'SNPs_Cocalled']) - preserved_pairwise_df.to_csv(preserved_pairwise, sep="\t", index=False) - + with open(preserved_pairwise,"w") as f: + preserved_pairwise_df.to_csv(f, sep="\t", index=False) + # Create matrix idx = sorted(set(raw_pairwise_df['Query_1']).union(raw_pairwise_df['Query_2'])) mirrored_distance_df = raw_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) mirrored_distance_df.index.name = '' - mirrored_distance_df.to_csv(raw_matrix,sep="\t") + with open(raw_matrix,"w") as f: + mirrored_distance_df.to_csv(f,sep="\t") idx = sorted(set(preserved_pairwise_df['Query_1']).union(preserved_pairwise_df['Query_2'])) mirrored_distance_df = preserved_pairwise_df.pivot(index='Query_1', columns='Query_2', values='SNP_Distance').reindex(index=idx, columns=idx).fillna(0, downcast='infer').pipe(lambda x: x+x.values.T).applymap(lambda x: format(x, '.0f')) mirrored_distance_df.index.name = '' - mirrored_distance_df.to_csv(preserved_matrix,sep="\t") + with open(preserved_matrix,"w") as f: + mirrored_distance_df.to_csv(f,sep="\t") # Clean up pybedtools temp helpers.cleanup(verbose=False,remove_all = False)
--- a/CSP2/bin/saveSNPDiffs.py Wed Dec 11 12:04:20 2024 -0500 +++ b/CSP2/bin/saveSNPDiffs.py Thu Dec 12 13:53:15 2024 -0500 @@ -58,7 +58,8 @@ header_rows.append(processHeader(top_line,snpdiffs_file,trim_name)) output_data = pd.concat(header_rows, ignore_index=True) -output_data.to_csv(summary_file, sep='\t', index=False) +with open(summary_file,"w") as f: + output_data.to_csv(f, sep='\t', index=False) # If ref_ids is empty, save isolate data ref_header = ['Reference_ID','Reference_Assembly','Reference_Contig_Count','Reference_Assembly_Bases','Reference_N50','Reference_N90','Reference_L50','Reference_L90','Reference_SHA256'] @@ -79,5 +80,6 @@ cols = combined_df.columns.tolist() cols = cols[:1] + cols[-1:] + cols[1:-1] combined_df = combined_df[cols] -combined_df.to_csv(isolate_data_file, sep='\t', index=False) +with open(isolate_data_file,"w") as f: + combined_df.to_csv(f, sep='\t', index=False)
--- a/CSP2/bin/screenSNPDiffs.py Wed Dec 11 12:04:20 2024 -0500 +++ b/CSP2/bin/screenSNPDiffs.py Thu Dec 12 13:53:15 2024 -0500 @@ -502,7 +502,8 @@ # Write filtered SNP data to file snp_file = log_file.replace(".log","_SNPs.tsv") - filtered_snp_df.to_csv(snp_file, sep="\t", index=False) + with open(snp_file,"w") as f: + filtered_snp_df.to_csv(f, sep="\t", index=False) csp2_screen_snps = filtered_snp_df[filtered_snp_df.Cat == "SNP"].shape[0] @@ -630,7 +631,8 @@ 'MUMmer_gSNPs','MUMmer_gIndels'] results_df = pd.DataFrame([item.result() for item in results], columns = output_columns) - results_df.to_csv(output_file, sep="\t", index=False) + with open(output_file,"w") as f: + results_df.to_csv(f, sep="\t", index=False) except: run_failed = True print("Exception occurred:\n", traceback.format_exc())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/docker/Dockerfile Thu Dec 12 13:53:15 2024 -0500 @@ -0,0 +1,246 @@ +# CSP2 Dockerfile +# Based on StaPH-B's Dockerfile for BEDTools, MUmmer, and Skesa +# Thanks to Erin Young, Curtis Kapsak, John Arnn, and the StaPH-B team +# https://github.com/StaPH-B/docker-builds/blob/master/bedtools/2.31.1/Dockerfile +# https://github.com/StaPH-B/docker-builds/blob/master/mummer/4.0.0/Dockerfile +# https://github.com/StaPH-B/docker-builds/blob/master/skesa/2.4.0/Dockerfile + +ARG CSP2_VER="0.9.0" +ARG BEDTOOLS_VER="2.31.1" +ARG MUMMER_VER="4.0.0" +ARG SKESA_VER="2.4.0" +ARG MASH_VER="2.3" +ARG BBMAP_VER="38.90" +ARG PYTHON_VER="3.8" + +FROM ubuntu:focal AS build + +ARG BEDTOOLS_VER +ARG MUMMER_VER +ARG SKESA_VER +ARG MASH_VER +ARG BBMAP_VER +ARG PYTHON_VER + +WORKDIR /build + +# to prevent tzdata from asking for a region during apt updates; ARG so that variable only +# persists at buildtime +# from https://github.com/StaPH-B/docker-builds/blob/master/mummer/4.0.0/Dockerfile +ARG DEBIAN_FRONTEND=noninteractive + +# Install build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + tzdata \ + gpg-agent \ + software-properties-common \ + build-essential \ + zlib1g-dev \ + libghc-bzlib-dev \ + liblzma-dev \ + wget \ + ca-certificates + +RUN add-apt-repository 'ppa:deadsnakes/ppa' && apt-get update && apt-get install -y --no-install-recommends \ + python${PYTHON_VER} \ +# python${PYTHON_VER}-pip \ +# python${PYTHON_VER}-full \ + python${PYTHON_VER}-dev \ + python${PYTHON_VER}-venv && \ + python${PYTHON_VER} -m venv --copies /opt/venv + + +ENV PATH="/opt/venv/bin:$PATH" + +RUN pip install --no-cache-dir -U pandas~=1.2.0 pybedtools refchooser scikit-learn + +ADD https://github.com/arq5x/bedtools2/archive/refs/tags/v${BEDTOOLS_VER}.tar.gz . +ADD https://github.com/mummer4/mummer/releases/download/v${MUMMER_VER}rc1/mummer-${MUMMER_VER}rc1.tar.gz . 
+ADD https://github.com/ncbi/SKESA/releases/download/${SKESA_VER}/skesa.centos.7.7 . +ADD https://github.com/ncbi/SKESA/releases/download/${SKESA_VER}/gfa_connector.centos7.7 . +ADD https://github.com/ncbi/SKESA/releases/download/${SKESA_VER}/kmercounter.centos7.7 . +ADD https://github.com/marbl/Mash/releases/download/v${MASH_VER}/mash-Linux64-v${MASH_VER}.tar . + +# Install BEDTools +# per https://github.com/StaPH-B/docker-builds/blob/master/bedtools/2.31.1/Dockerfile +# python3 required when compiling via `make` command for creating old CLI executables +# dependencies listed here (albeit for v2.30.0, still should be identical): https://packages.ubuntu.com/jammy/bedtools +# requires libghc-bzlib-dev, build-essential, zlib1g-dev, and a few others +# 'make install' should place binary executable files in /usr/local/bin +RUN tar -xzf v${BEDTOOLS_VER}.tar.gz && \ + rm v${BEDTOOLS_VER}.tar.gz && \ + cd bedtools2-${BEDTOOLS_VER} && \ + make && \ + make install + + # Install mummer + # per https://github.com/StaPH-B/docker-builds/blob/master/mummer/4.0.0/Dockerfile +RUN tar -xvf mummer-${MUMMER_VER}rc1.tar.gz && \ + rm mummer-${MUMMER_VER}rc1.tar.gz && \ + cd mummer-${MUMMER_VER}rc1 && \ + ./configure --prefix=/usr/local && \ + make && \ + make install && \ + ldconfig + +# # Install Skesa +# # per https://github.com/StaPH-B/docker-builds/blob/master/skesa/2.4.0/Dockerfile +# # get skesa, gfa_connector, and kmercounter binaries, rename them +RUN mkdir skesa && \ + cd skesa && \ + mv /build/skesa.centos.7.7 skesa && \ + mv /build/gfa_connector.centos7.7 gfa_connector && \ + mv /build/kmercounter.centos7.7 kmercounter && \ + chmod +x skesa gfa_connector kmercounter && \ + mv skesa gfa_connector kmercounter /usr/local/bin + +# Install Mash +RUN tar -xvf mash-Linux64-v${MASH_VER}.tar && \ + mv mash-Linux64-v${MASH_VER}/mash /usr/local/bin + +# Install BBMap +RUN wget -O BBMap_${BBMAP_VER}.tar.gz https://sourceforge.net/projects/bbmap/files/BBMap_${BBMAP_VER}.tar.gz/download 
&& \ + tar -xvf BBMap_${BBMAP_VER}.tar.gz && \ + mv bbmap/* /usr/local/bin + + +FROM ubuntu:focal AS app + +ARG CSP2_VER +ARG CSP2_BRANCH="main" +ARG PYTHON_VER + +LABEL base.image="ubuntu:focal" +LABEL version=${CSP2_VER} +LABEL software="CSP2" +LABEL software.version=${CSP2_VER} +LABEL description="a Nextflow pipeline for rapid, accurate SNP distance estimation from assembly data" +LABEL website="https://github.com/CFSAN-Biostatistics/CSP2" +LABEL licence="https://github.com/CFSAN-Biostatistics/CSP2/blob/main/LICENSE" +LABEL maintainer="Robert Literman" +LABEL maintainer.email="Robert.Literman@fda.hhs.gov" +LABEL maintainer.organization="FDA/CFSAN/Biostatistics" +LABEL maintainer2="Justin Payne" +LABEL maintainer2.email="Justin.Payne@fda.hhs.gov" +LABEL maintainer2.organization="FDA/CFSAN/Biostatistics" + +WORKDIR /root/.nextflow +WORKDIR /app + +# copy in all executable files from builder stage to final app stage +COPY --from=build /usr/local/bin /usr/local/bin + +# Lots of perl nonsense +COPY --from=build /usr/local/lib /usr/local/lib +COPY --from=build /usr/local/libexec/mummer /usr/local/libexec/mummer +COPY --from=build /usr/lib/x86_64-linux-gnu/perl /usr/lib/x86_64-linux-gnu/perl +COPY --from=build /usr/local/share /usr/local/share +COPY --from=build /usr/share /usr/share +COPY --from=build /opt/venv /opt/venv +COPY --from=build /usr/bin/make /usr/local/bin/make + + +# Python stuff +COPY --from=build /usr/lib/python${PYTHON_VER} /usr/lib/python${PYTHON_VER} + + +#Install JRE +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + openjdk-17-jre-headless \ + curl + +# Install Nextflow +# per https://www.nextflow.io/docs/latest/getstarted.html +RUN export CAPSULE_LOG=debug && curl -s https://get.nextflow.io | bash && \ + chmod +x nextflow && \ + mv nextflow /usr/local/bin && \ + nextflow run hello + +ADD docker/Makefile . 
+ +# set PATH, set perl locale settings for singularity compatibility +ENV PATH="/opt/venv/bin:/usr/local/bin:/skesa:$PATH" \ + LC_ALL=C \ + NXF_OFFLINE='true' + +ADD bin ./bin +ADD conf ./conf +ADD subworkflows ./subworkflows +ADD CSP2.nf ./CSP2.nf +ADD nextflow.config ./nextflow.config + + +FROM app AS pretest + +# set PATH, set perl locale settings for singularity compatibility +ENV PATH="/opt/venv/bin:/usr/local/bin:/skesa:$PATH" \ + LC_ALL=C \ + NXF_OFFLINE='true' + +#Alternate test data directory +ADD https://github.com/CFSAN-Biostatistics/CSP2_TestData#main:assets assets/ + + +# Test MUmmer installation +# per https://github.com/StaPH-B/docker-builds/blob/master/mummer/4.0.0/Dockerfile + +ADD https://mummer4.github.io/tutorial/exampleFiles/2.1/in/H_pylori26695_Eslice.fasta . +ADD https://mummer4.github.io/tutorial/exampleFiles/2.1/in/H_pyloriJ99_Eslice.fasta . +ADD https://mummer4.github.io/tutorial/exampleFiles/2.2/in/B_anthracis_Mslice.fasta . +ADD https://mummer4.github.io/tutorial/exampleFiles/2.2/in/B_anthracis_contigs.fasta . +ADD http://mummer.sourceforge.net/examples/data/H_pylori26695_Eslice.fasta . +ADD http://mummer.sourceforge.net/examples/data/H_pyloriJ99_Eslice.fasta . 
+ADD https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V5.3.2/SARS-CoV-2.primer.bed ./V5.3.2.artic.bed +ADD https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V4.1/SARS-CoV-2.primer.bed ./V4.1.artic.bed + +FROM pretest AS test + +# Test MASH + +RUN nucmer -h && \ + promer -h && \ + mummer -mum -b -c H_pylori26695_Eslice.fasta H_pyloriJ99_Eslice.fasta > mummer.mums && \ + nucmer -c 100 -p nucmer B_anthracis_Mslice.fasta B_anthracis_contigs.fasta && \ + show-snps -C nucmer.delta > nucmer.snps && \ + promer -p promer_100 -c 100 H_pylori26695_Eslice.fasta H_pyloriJ99_Eslice.fasta + +# Test bedtools installation +# check help options and version +RUN bedtools --help && \ + bedtools --version + +# downloads two bedfiles for ARTIC SARS-CoV-2 artic schemes, fixes their formatting, uses bedtools sort, intersect, and merge +# per https://github.com/StaPH-B/docker-builds/blob/master/bedtools/2.31.1/Dockerfile +RUN awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' V5.3.2.artic.bed > V5.3.2.unsorted.bed && \ + bedtools sort -i V5.3.2.unsorted.bed > V5.3.2.bed && \ + awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' V4.1.artic.bed > V4.1.bed && \ + bedtools intersect -a V5.3.2.bed -b V4.1.bed > intersect_test.bed && \ + mergeBed -i V5.3.2.bed > merged_test.bed && \ + head intersect_test.bed merged_test.bed + +RUN /bin/bash -c 'make test' + +FROM app AS release + +ARG CSP2_VER +ARG BEDTOOLS_VER +ARG MUMMER_VER +ARG SKESA_VER +ARG MASH_VER +ARG BBMAP_VER +ARG PYTHON_VER +ENV CSP2_VER=${CSP2_VER} +ENV BEDTOOLS_VER=${BEDTOOLS_VER} +ENV MUMMER_VER=${MUMMER_VER} +ENV SKESA_VER=${SKESA_VER} +ENV MASH_VER=${MASH_VER} +ENV BBMAP_VER=${BBMAP_VER} +ENV PYTHON_VER=${PYTHON_VER} + +# set PATH, set perl locale settings for singularity compatibility +ENV PATH="/opt/venv/bin:/usr/local/bin:/skesa:$PATH" \ + LC_ALL=C \ + NXF_OFFLINE='true' + +ENTRYPOINT ["make", "--makefile=/app/Makefile"] \ No 
newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/docker/Makefile Thu Dec 12 13:53:15 2024 -0500 @@ -0,0 +1,143 @@ +.PHONY: + +.ONESHELL: + + + +usage: ## Show this menu + @grep -E '^[a-zA-Z_-]+:.*?##.*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?##"}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' + +version: ## Show version and branch + @echo "CSP2 v$${CSP2_VER}/$${CSP2_BRANCH}" + +# ENV CSP2_VER=${CSP2_VER} +# ENV BEDTOOLS_VER=${BEDTOOLS_VER} +# ENV MIUMMER_VER=${MUMMER_VER} +# ENV SKESA_VER=${SKESA_VER} +# ENV MASH_VER=${MASH_VER} +# ENV BBMAP_VER=${BBMAP_VER} +# ENV PYTHON_VER=${PYTHON_VER} + +versions: version ## Show versions of key installed depedencies + @echo `nextflow -v` + @echo `python3 --version` " (container says ${PYTHON_VER})" + @echo `bedtools --version` " (container says ${BEDTOOLS_VER})" + @echo "mummer " `mummer --version` " (container says ${MUMMER_VER})" + @echo `skesa --version 2>&1` " (container says ${SKESA_VER})" + @echo "mash " `mash --version` " (container says ${MASH_VER})" + @echo `bbmap.sh --version 2>&1` " (container says ${BBMAP_VER})" + +help: ## Show help + @echo "Citation: CFSAN SNP Pipeline 2, v$${CSP2_VER}, Literman et al. 2024" + @echo + @echo "CSP2 is a Nextflow pipeline for rapid, accurate SNP distance estimation" + @echo "from assembly data." + @echo + @echo "Please see: https://github.com/CFSAN-Biostatistics/CSP2" + @echo + @echo "CSP2 runs are managed via Nextflow, providing the user with an array of" + @echo "customizations while also facilitating module development and additions in" + @echo "future releases." + @echo + @echo "Important Note: The software continues to be focused on the analysis of" + @echo "groups of bacterial genomes with limited evolutionary differences (<1000" + @echo "SNPs). Testing is underway to determine how the underlying cluster" + @echo "diversity impacts distances estimates." 
+ @echo + @echo "CSP2 has two main run modes:" + @echo "1) "Screening Mode" (screen): Used to determine whether query isolates are" + @echo "close to a set of reference isolates (e.g., lab control strains, strains" + @echo "related to an outbreak, etc.) Given one or more user-provided reference" + @echo "isolates (--ref_reads; --ref_fasta), get alignment statistics and SNP" + @echo "distances between all reference and query isolates (--reads; --fasta)" + @echo + @echo "2) "SNP Pipeline Mode" (snp): Used to generate pairwise distances and" + @echo "alignments for a set of query isolates Generate pairwise SNP distances and" + @echo "alignments for 2+ isolates (--reads; --fasta) based on comparisons to:" + @echo + @echo "One or more user-provided references (--ref_reads; --ref_fasta), or One or" + @echo "more reference isolates selected by RefChooser (--n_ref)" + @echo + @echo "Usage: screen [options] {--fasta PATH {--reads=PATH | --forward=STR --reverse=STR} --out=PATH}" + @echo " or snp [options] {--fasta {--reads=PATH | --forward=STR --reverse=STR} --out=PATH}" + @echo + @echo "Options:" + @echo " --outroot=PATH\tBase directory to create output folder [default=$CWD] " + @echo " --out=PATH\t\tName of the output folder to create (must not exist)" + @echo "\t\t\t [default=CSP2_<current_datetime>]" + @echo " --forward=STR\t\tFull file extension for forward/left reads of query" + @echo "\t\t\t [default='_1.fastq.gz']" + @echo " --reverse=STR\t\tFull file extension for reverse/right reads of reference" + @echo "\t\t\t [default='_2.fastq.gz']" + @echo " --ref_forward=STR\tFull file extension for forward/left reads of reference" + @echo "\t\t\t [default='_1.fastq.gz']" + @echo " --ref_reverse=STR\tFull file extension for reverse/right reads of reference" + @echo "\t\t\t [default='_2.fastq.gz']" + @echo " --readext=STR\t\tExtension for single-end reads for query [default='fastq.gz']" + @echo " --ref_readext=STR\tExtension for single-end reads for reference" + @echo "\t\t\t 
[default='fastq.gz']" + @echo " --min_cov=NUM\t\tDo not analyze queries that cover less than <min_cov>% of the" + @echo "\t\t\treference assembly [default=85]" + @echo " --min_iden=NUM\tOnly consider alignments where the percent identity is at least" + @echo "\t\t\t <min_iden> [default=99]" + @echo " --min_len=NUM\t\tOnly consider alignments that span at least <min_len> in bp" + @echo "\t\t\t [default=500]" + @echo " --dwin=LIST\t\tA comma-separated list of windows to check SNP densities" + @echo "\t\t\t [default=1000,125,15]" + @echo " --wsnps=LIST\t\tThe maximum number of SNPs allowed in the corresponding window from" + @echo "\t\t\t --dwin [default=3,2,1]" + @echo " --query_edge=NUM\tOnly consider SNPs that occur within <query_edge>bp of the end" + @echo "\t\t\t of a query contig [default=250]" + @echo " --ref_edge=NUM\tOnly consider SNPs that occur within <query_edge>bp of the end" + @echo "\t\t\t of a reference contig [default=250]" + @echo " --n_ref=NUM\t\tThe number of RefChooser reference isolates to consider (only" + @echo "\t\t\t applied if using RefChooser) [default=3]" + @echo " --reads=PATH\t\tLocation of query read data (Path to directory, or path to file with" + @echo "\t\t\t multiple directories)" + @echo " --fasta=PATH\t\tLocation of query assembly data (Path to directory containing" + @echo "\t\t\t FASTAs, path to FASTA, path to multiple FASTAs)" + @echo " --ref_reads=PATH\tLocation of reference read data (Path to directory, or path to" + @echo "\t\t\t file with multiple directories)" + @echo " --ref_fasta=PATH\tLocation of reference assembly data (Path to directory" + @echo "\t\t\t containing FASTAs, path to FASTA, path to multiple FASTAs)" + @echo " --trim_name=STR\tA string in assembly file names that you want to remove from" + @echo "\t\t\t sample IDs (e.g., _contigs_skesa)" + +config: + @cat <<- EOF + profiles { + standard { + process.executor = 'local' + params.cores = `nproc --all` + } + } + EOF > ~/.nextflow/config + + +ifeq (screen, 
$(firstword $(MAKECMDGOALS))) + runargs := $(wordlist 2, $(words $(MAKECMDGOALS)), $(MAKECMDGOALS)) + $(eval $(runargs):;@true) +endif + +ifeq (snp, $(firstword $(MAKECMDGOALS))) + runargs := $(wordlist 2, $(words $(MAKECMDGOALS)), $(MAKECMDGOALS)) + $(eval $(runargs):;@true) +endif + +screen: config ## determine whether query isolates are close to a reference + nextflow run CSP2.nf -profile standard --runmode screen $(runargs) + +snp: config ## generate pairwise distances for a set of query isolates + nextflow run CSP2.nf -profile standard --runmode snp $(runargs) + +snpdiffs: config + +test_screen: + nextflow run CSP2.nf -profile standard --runmode screen --fasta assets/Screen/Assembly/Week_42_Assembly.fasta --reads assets/Screen/Reads/ --ref_fasta assets/Screen/Assembly/Lab_Control.fasta --out ./CSP2_Test_Screen --readext fq.gz --forward _1.fq.gz --reverse _2.fq.gz + +test_snp: + nextflow run CSP2.nf -profile standard --runmode snp --fasta assets/SNP/ --n_ref 3 --out ./CSP2_Test_SNP --max_missing 50 + +test: config test_screen test_snp + ls -lah assets/Screen/Output/Contamination_Screen/ + diff -bur ./CSP2_Test_SNP/snpdiffs assets/SNP/Output/Soil_Analysis/snpdiffs \ No newline at end of file
--- a/CSP2/nextflow.config Wed Dec 11 12:04:20 2024 -0500 +++ b/CSP2/nextflow.config Thu Dec 12 13:53:15 2024 -0500 @@ -17,11 +17,11 @@ withLabel: 'mummerMem' { task_name = 'CSP2-MUMmer' cpus = 1 - //memory = '4 GB' + // memory = '4 GB' } withLabel: 'skesaMem' { task_name = 'CSP2-SKESA' - //memory = '12 GB' + // memory = '12 GB' } }
--- a/csp2_screen.xml Wed Dec 11 12:04:20 2024 -0500 +++ b/csp2_screen.xml Thu Dec 12 13:53:15 2024 -0500 @@ -58,7 +58,7 @@ fi; nextflow run ${__tool_directory__}/CSP2/CSP2.nf -profile csp2_galaxy --runmode screen \$QUERY_FASTA_ARG \$REF_FASTA_ARG \$QUERY_READS_ARG \$REF_READS_ARG \$REF_ID_ARG \$TRIM_ARG --readext $readext --forward $forward --reverse $reverse --ref_readext $readext --ref_forward $forward --ref_reverse $reverse --min_cov $min_cov --min_iden $min_iden --min_len $min_len --ref_edge $ref_edge --query_edge $query_edge --dwin $dwin --wsnps $wsnps --out \$CSP2_DIR/CSP2_Screen_Output > Nextflow_Log.txt 2>&1; -sleep 15; +cat Nextflow_Log.txt; ]]> </command> <inputs> @@ -83,7 +83,6 @@ <data name="raw_mummer" format="tabular" label="Raw MUMmer Output" from_work_dir="CSP2_Screen_Output/Raw_MUMmer_Summary.tsv" /> <data name="isolate_data" format="tabular" label="Isolate Data" from_work_dir="CSP2_Screen_Output/Isolate_Data.tsv" /> <data name="screening_results" format="tabular" label="Screening Results" from_work_dir="CSP2_Screen_Output/Screening_Results.tsv" /> - <data name="nextflow_log" format="txt" label="Nextflow Log" from_work_dir="Nextflow_Log.txt" /> </outputs> <tests> <test>