Mercurial > repos > estrain > microrunqc
comparison sum_mlst.py @ 0:4e629e82c5b1 draft default tip
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
| author | estrain |
|---|---|
| date | Fri, 13 Mar 2026 12:51:10 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4e629e82c5b1 |
|---|---|
| 1 #!/usr/bin/env | |
| 2 | |
| 3 ## Generate basic summary stats from SKESA, fastq-scan, and MLST output. | |
| 4 ## author: errol strain, estrain@gmail.com | |
| 5 | |
| 6 from argparse import (ArgumentParser, FileType) | |
| 7 import sys | |
| 8 import glob | |
| 9 import subprocess | |
| 10 from decimal import Decimal | |
| 11 | |
| 12 def parse_args(): | |
| 13 "Parse the input arguments, use '-h' for help." | |
| 14 | |
| 15 parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports') | |
| 16 | |
| 17 # Read inputs | |
| 18 parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly') | |
| 19 parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output') | |
| 20 parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output') | |
| 21 parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size') | |
| 22 parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') | |
| 23 | |
| 24 return parser.parse_args() | |
| 25 | |
| 26 args =parse_args() | |
| 27 | |
| 28 # FASTA file | |
| 29 fasta = args.fasta[0] | |
| 30 | |
| 31 # Get individual and total length of contigs | |
| 32 cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta] | |
| 33 seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] | |
| 34 intlen = list(map(int,seqlen.splitlines())) | |
| 35 totlen = sum(intlen) | |
| 36 # Count number of contigs | |
| 37 numtigs = len(intlen) | |
| 38 | |
| 39 # Get coverage information from skesa fasta header | |
| 40 cmd1 = ["grep",">",fasta] | |
| 41 cmd2 = ["cut","-f","3","-d","_"] | |
| 42 p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE) | |
| 43 p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0] | |
| 44 covdep = map(float,p2.splitlines()) | |
| 45 covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)] | |
| 46 covdep = round(sum(covlist)/totlen,1) | |
| 47 | |
| 48 # Calculate N50 | |
| 49 vals = [int(i) for i in intlen] | |
| 50 vals.sort(reverse=True) | |
| 51 n50=0 | |
| 52 for counter in range(0,len(vals)-1): | |
| 53 if sum(vals[0:counter]) > (totlen/2): | |
| 54 n50=vals[counter-1] | |
| 55 break | |
| 56 | |
| 57 # Read in MLST output | |
| 58 mlst = open(args.mlst[0],"r") | |
| 59 profile = mlst.readline() | |
| 60 els = profile.split("\t") | |
| 61 | |
| 62 # Read in median insert size | |
| 63 medfile = open(args.med[0],"r") | |
| 64 insert = medfile.readline() | |
| 65 insert = insert.rstrip() | |
| 66 | |
| 67 # Read in fastq-scan | |
| 68 fqfile = open(args.fqscan[0],"r") | |
| 69 fq = fqfile.readline() | |
| 70 fq = fq.rstrip() | |
| 71 | |
| 72 output = open(args.output[0],"w") | |
| 73 | |
| 74 filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n") | |
| 75 output.write(filehead) | |
| 76 | |
| 77 output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq)) | |
| 78 for counter in range(1,len(els)): | |
| 79 output.write("\t" + str(els[counter])) |
