view CSP2/subworkflows/alignData/main.nf @ 0:01431fa12065

"planemo upload"
author rliterman
date Mon, 02 Dec 2024 10:40:55 -0500
parents
children 0d775868ee62
line wrap: on
line source
// Subworkflow to run MUMmer for query/referece comparisons

// Set path variables
output_directory = file(params.output_directory)
mummer_directory = file(params.mummer_directory)
mummer_log_directory = file(params.mummer_log_directory)
snpdiffs_directory = file(params.snpdiffs_directory)
log_directory = file(params.log_directory)

if(params.tmp_dir == ""){
    temp_dir = ""
} else{
    temp_dir = file(params.temp_dir)
}

ref_mode = params.ref_mode
ref_id_file = file(params.ref_id_file)

// Set path to accessory scripts/files
all_snpdiffs_list = file("${log_directory}/All_SNPDiffs.txt")
isolate_data_file = file("${output_directory}/Isolate_Data.tsv")
snpdiffs_summary_file = file("${output_directory}/Raw_MUMmer_Summary.tsv")
mummerScript = file("$projectDir/bin/compileMUMmer.py")

workflow {
    main:
    // Align genomes
    snpdiffs = alignGenomes(to_align: read_data, snpdiffs_data: snpdiffs_data)
    publish:
    // Publish snpdiffs
    snpdiffs >> 'snpdiffs.tsv'
}

workflow alignGenomes{
    take:
    to_align
    snpdiffs_data

    emit:
    return_snpdiffs

    main:
    
    // Align anything that needs aligning
    sample_pairwise = to_align
    .filter{"${it[0]}" != "${it[2]}"} // Don't map things to themselves
    | runMUMmer 
    | splitCsv
    
    log_hold = sample_pairwise
    .concat(snpdiffs_data)
    .unique{it -> it[2]}
    .collect{it -> it[2]}

    snpdiff_files = saveMUMmerLog(log_hold)
    .collect().flatten().collate(1)

    return_snpdiffs = sample_pairwise
    .concat(snpdiffs_data)
    .map { it -> tuple([it[0], it[1]].sort().join(',').toString(),it[0], it[1], it[2]) }
    .unique{it -> it[0]}
    .map{it->tuple(it[3],it[1],it[2])}
    .join(snpdiff_files,by:0)
    .map{it->tuple(it[1],it[2],it[0])}
}

process runMUMmer{

    cpus = 1
    memory '4 GB'

    input:
    tuple val(query_name),val(query_fasta),val(ref_name),val(ref_fasta)

    output:
    stdout
    
    script:

    report_id = "${query_name}__vs__${ref_name}"
    mummer_log = file("${mummer_log_directory}/${report_id}.log")

    // Ensure MUmmer directories exist
    if(!mummer_directory.isDirectory()){
        error "$mummer_directory does not exist..."
    } else{
        """
        $params.load_mummer_module
        $params.load_python_module
        $params.load_bedtools_module
        $params.load_bbtools_module

        cd ${mummer_directory}
        dnadiff -p ${report_id} ${ref_fasta} ${query_fasta}
        
        rm -rf ${mummer_directory}/${report_id}.mdelta
        rm -rf ${mummer_directory}/${report_id}.mcoords
        rm -rf ${mummer_directory}/${report_id}.1delta
        rm -rf ${mummer_directory}/${report_id}.delta
        rm -rf ${mummer_directory}/${report_id}.qdiff
        rm -rf ${mummer_directory}/${report_id}.rdiff
        rm -rf ${mummer_directory}/${report_id}.unref
        rm -rf ${mummer_directory}/${report_id}.unqry

        python ${mummerScript} --query "${query_name}" --query_fasta "${query_fasta}" --reference "${ref_name}" --reference_fasta "${ref_fasta}" --mummer_dir "${mummer_directory}" --snpdiffs_dir "${snpdiffs_directory}" --temp_dir "${temp_dir}" --log_file "${mummer_log}"
        """
    }
}

process saveMUMmerLog{

    executor = 'local'
    cpus = 1
    maxForks = 1

    input:
    val(snpdiffs_paths)

    output:
    val(snpdiffs_paths)

    script:
    saveSNPDiffs = file("$projectDir/bin/saveSNPDiffs.py")
    all_snpdiffs_list.write(snpdiffs_paths.join('\n') + '\n')
    """
    $params.load_python_module
    python $saveSNPDiffs --snpdiffs_file "${all_snpdiffs_list}" --summary_file "${snpdiffs_summary_file}" --isolate_file "${isolate_data_file}" --trim_name "${params.trim_name}" --ref_id_file "${ref_id_file}"
    """
}