Mercurial > repos > estrain > microrunqc
annotate sum_mlst.py @ 0:4e629e82c5b1 draft default tip
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
| author | estrain |
|---|---|
| date | Fri, 13 Mar 2026 12:51:10 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
1 #!/usr/bin/env |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
2 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
3 ## Generate basic summary stats from SKESA, fastq-scan, and MLST output. |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
4 ## author: errol strain, estrain@gmail.com |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
5 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
6 from argparse import (ArgumentParser, FileType) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
7 import sys |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
8 import glob |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
9 import subprocess |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
10 from decimal import Decimal |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
11 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
12 def parse_args(): |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
13 "Parse the input arguments, use '-h' for help." |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
14 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
15 parser = ArgumentParser(description='Generate Basic Summary Statistics from SKESA assemblies, fastq-scan output, and MLST reports') |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
16 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
17 # Read inputs |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
18 parser.add_argument('--fasta', type=str, required=True, nargs=1, help='SKESA FASTA assembly') |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
19 parser.add_argument('--mlst', type=str, required=True, nargs=1, help='MLST output') |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
20 parser.add_argument('--fqscan', type=str, required=True, nargs=1, help='fastq-scan output') |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
21 parser.add_argument('--med', type=str, required=True, nargs=1, help='Median Insert Size') |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
22 parser.add_argument('--output', type=str, required=True, nargs=1, help='Output File') |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
23 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
24 return parser.parse_args() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
25 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
26 args =parse_args() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
27 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
28 # FASTA file |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
29 fasta = args.fasta[0] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
30 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
31 # Get individual and total length of contigs |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
32 cmd = ["awk", "/^>/ {if (seqlen){print seqlen}; ;seqlen=0;next; } { seqlen = seqlen +length($0)}END{print seqlen}",fasta] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
33 seqlen = subprocess.Popen(cmd,stdout= subprocess.PIPE).communicate()[0] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
34 intlen = list(map(int,seqlen.splitlines())) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
35 totlen = sum(intlen) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
36 # Count number of contigs |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
37 numtigs = len(intlen) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
38 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
39 # Get coverage information from skesa fasta header |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
40 cmd1 = ["grep",">",fasta] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
41 cmd2 = ["cut","-f","3","-d","_"] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
42 p1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
43 p2 = subprocess.Popen(cmd2, stdin=p1.stdout, stdout=subprocess.PIPE).communicate()[0] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
44 covdep = map(float,p2.splitlines()) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
45 covlist = [a*b for a,b in zip([float(i) for i in intlen],covdep)] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
46 covdep = round(sum(covlist)/totlen,1) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
47 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
48 # Calculate N50 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
49 vals = [int(i) for i in intlen] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
50 vals.sort(reverse=True) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
51 n50=0 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
52 for counter in range(0,len(vals)-1): |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
53 if sum(vals[0:counter]) > (totlen/2): |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
54 n50=vals[counter-1] |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
55 break |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
56 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
57 # Read in MLST output |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
58 mlst = open(args.mlst[0],"r") |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
59 profile = mlst.readline() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
60 els = profile.split("\t") |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
61 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
62 # Read in median insert size |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
63 medfile = open(args.med[0],"r") |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
64 insert = medfile.readline() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
65 insert = insert.rstrip() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
66 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
67 # Read in fastq-scan |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
68 fqfile = open(args.fqscan[0],"r") |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
69 fq = fqfile.readline() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
70 fq = fq.rstrip() |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
71 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
72 output = open(args.output[0],"w") |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
73 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
74 filehead = str("File\tContigs\tLength\tEstCov\tN50\tMedianInsert\tMeanLength_R1\tMeanLength_R2\tMeanQ_R1\tMeanQ_R2\tScheme\tST\n") |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
75 output.write(filehead) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
76 |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
77 output.write(str(fasta) + "\t" + str(numtigs) + "\t" + str(totlen) + "\t" + str(covdep) + "\t" + str(n50) +"\t" + str(insert) + "\t" + str(fq)) |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
78 for counter in range(1,len(els)): |
|
4e629e82c5b1
planemo upload commit a820b38dea9a409c11e220ba904da232fdbc4c05
estrain
parents:
diff
changeset
|
79 output.write("\t" + str(els[counter])) |
