kkonganti@17
|
1 #!/usr/bin/env perl
|
kkonganti@17
|
2
|
kkonganti@17
|
3 # Kranti Konganti
|
kkonganti@17
|
4 # 01/02/2024
|
kkonganti@17
|
5
|
kkonganti@17
|
6 use strict;
|
kkonganti@17
|
7 use warnings;
|
kkonganti@17
|
8 use Getopt::Long;
|
kkonganti@17
|
9 use Data::Dumper;
|
kkonganti@17
|
10 use Pod::Usage;
|
kkonganti@17
|
11 use File::Basename;
|
kkonganti@17
|
12 use File::Spec::Functions;
|
kkonganti@17
|
13
|
kkonganti@17
|
14 my $tbl = {};
|
kkonganti@17
|
15 my $snp_2_serovar = {};
|
kkonganti@17
|
16 my $acc_2_serovar = {};
|
kkonganti@17
|
17 my $acc_2_target = {};
|
kkonganti@17
|
18 my $snp_count = {};
|
kkonganti@17
|
19 my $snp_2_acc = {};
|
kkonganti@17
|
20 my $acc_2_snp = {};
|
kkonganti@17
|
21 my $multi_cluster_acc = {};
|
kkonganti@17
|
22 my (
|
kkonganti@17
|
23 $serovar_limit, $serovar_or_type_col, $min_asm_size,
|
kkonganti@17
|
24 $complete_serotype_name, $PDG_file, $table_file,
|
kkonganti@17
|
25 $not_null_pdg_serovar, $snp_cluster, $help,
|
kkonganti@17
|
26 $out_prefix, $acc_col, $seronamecol,
|
kkonganti@17
|
27 $target_acc_col
|
kkonganti@17
|
28 );
|
kkonganti@17
|
29 my @custom_serovars;
|
kkonganti@17
|
30
|
kkonganti@17
|
31 GetOptions(
|
kkonganti@17
|
32 'help' => \$help,
|
kkonganti@17
|
33 'pdg=s' => \$PDG_file,
|
kkonganti@17
|
34 'tbl=s' => \$table_file,
|
kkonganti@17
|
35 'snp=s' => \$snp_cluster,
|
kkonganti@17
|
36 'min_contig_size=i' => \$min_asm_size,
|
kkonganti@17
|
37 'complete_serotype_name' => \$complete_serotype_name,
|
kkonganti@17
|
38 'serocol:i' => \$serovar_or_type_col,
|
kkonganti@17
|
39 'seronamecol:i' => \$seronamecol,
|
kkonganti@17
|
40 'target_acc_col:i' => \$target_acc_col,
|
kkonganti@17
|
41 'acc_col:i' => \$acc_col,
|
kkonganti@17
|
42 'not_null_pdg_serovar' => \$not_null_pdg_serovar,
|
kkonganti@17
|
43 'num_serotypes_per_serotype:i' => \$serovar_limit,
|
kkonganti@17
|
44 'include_serovar=s' => \@custom_serovars,
|
kkonganti@17
|
45 'op=s' => \$out_prefix
|
kkonganti@17
|
46 ) or pod2usage( -verbose => 2 );
|
kkonganti@17
|
47
|
kkonganti@17
|
48 if ( defined $help ) {
|
kkonganti@17
|
49 pod2usage( -verbose => 2 );
|
kkonganti@17
|
50 }
|
kkonganti@17
|
51
|
kkonganti@17
|
52 if ( !defined $serovar_limit ) {
|
kkonganti@17
|
53 $serovar_limit = 1;
|
kkonganti@17
|
54 }
|
kkonganti@17
|
55
|
kkonganti@17
|
56 if ( !defined $serovar_or_type_col ) {
|
kkonganti@17
|
57 $serovar_or_type_col = 50;
|
kkonganti@17
|
58 }
|
kkonganti@17
|
59
|
kkonganti@17
|
60 if ( !defined $seronamecol ) {
|
kkonganti@17
|
61 $seronamecol = 34;
|
kkonganti@17
|
62 }
|
kkonganti@17
|
63
|
kkonganti@17
|
64 if ( !defined $target_acc_col ) {
|
kkonganti@17
|
65 $target_acc_col = 43;
|
kkonganti@17
|
66 }
|
kkonganti@17
|
67
|
kkonganti@17
|
68 if ( !defined $acc_col ) {
|
kkonganti@17
|
69 $acc_col = 10;
|
kkonganti@17
|
70 }
|
kkonganti@17
|
71
|
kkonganti@17
|
72 if ( !defined $min_asm_size ) {
|
kkonganti@17
|
73 $min_asm_size = 0;
|
kkonganti@17
|
74 }
|
kkonganti@17
|
75
|
kkonganti@17
|
76 if ( defined $out_prefix ) {
|
kkonganti@17
|
77 $out_prefix .= '_';
|
kkonganti@17
|
78 }
|
kkonganti@17
|
79 else {
|
kkonganti@17
|
80 $out_prefix = '';
|
kkonganti@17
|
81 }
|
kkonganti@17
|
82
|
kkonganti@17
|
83 pod2usage( -verbose => 2 ) if ( !$PDG_file || !$table_file || !$snp_cluster );
|
kkonganti@17
|
84
|
kkonganti@17
|
85 open( my $pdg_file, '<', $PDG_file )
|
kkonganti@17
|
86 || die "\nCannot open PDG file $PDG_file: $!\n\n";
|
kkonganti@17
|
87 open( my $tbl_file, '<', $table_file )
|
kkonganti@17
|
88 || die "\nCannot open tbl file $table_file: $!\n\n";
|
kkonganti@17
|
89 open( my $snp_cluster_file, '<', $snp_cluster )
|
kkonganti@17
|
90 || die "\nCannot open $snp_cluster: $!\n\n";
|
kkonganti@17
|
91 open( my $acc_fh, '>', 'acc2serovar.txt' )
|
kkonganti@17
|
92 || die "\nCannot open acc2serovar.txt: $!\n\n";
|
kkonganti@17
|
93 open( my $Stdout, '>&', STDOUT ) || die "\nCannot pipe to STDOUT: $!\n\n";
|
kkonganti@17
|
94 open( my $Stderr, '>&', STDERR ) || die "\nCannot pipe to STDERR: $!\n\n";
|
kkonganti@17
|
95 open( my $accs_snp_fh, '>', $out_prefix . 'accs_snp.txt' )
|
kkonganti@17
|
96 || die "\nCannnot open " . $out_prefix . "accs_snp.txt for writing: $!\n\n";
|
kkonganti@17
|
97 open( my $genome_headers_fh, '>', $out_prefix . 'mash_snp_genome_list.txt' )
|
kkonganti@17
|
98 || die "\nCannnot open "
|
kkonganti@17
|
99 . $out_prefix
|
kkonganti@17
|
100 . "mash_snp_genome_list.txt for writing: $!\n\n";
|
kkonganti@17
|
101
|
kkonganti@17
|
102 my $pdg_release = basename( $PDG_file, ".metadata.tsv" );
|
kkonganti@17
|
103
|
kkonganti@17
|
104 while ( my $line = <$pdg_file> ) {
|
kkonganti@17
|
105 chomp $line;
|
kkonganti@17
|
106 next if ( $line =~ m/^\#/ );
|
kkonganti@17
|
107
|
kkonganti@17
|
108 # Relevent columns (Perl index):
|
kkonganti@17
|
109 # 10-1 = 9: asm_acc
|
kkonganti@17
|
110 # 34 -1 = 33: serovar
|
kkonganti@17
|
111 # 50 -1 = 49: computed serotype
|
kkonganti@17
|
112
|
kkonganti@17
|
113 my @cols = split( /\t/, $line );
|
kkonganti@17
|
114 my $serovar_or_type = $cols[ $serovar_or_type_col - 1 ];
|
kkonganti@17
|
115 my $acc = $cols[ $acc_col - 1 ];
|
kkonganti@17
|
116 my $serovar = $cols[ $seronamecol - 1 ];
|
kkonganti@17
|
117 my $target_acc = $cols[ $target_acc_col - 1 ];
|
kkonganti@17
|
118
|
kkonganti@17
|
119 $serovar_or_type =~ s/\"//g;
|
kkonganti@17
|
120
|
kkonganti@17
|
121 my $skip = 1;
|
kkonganti@17
|
122 foreach my $ser (@custom_serovars) {
|
kkonganti@17
|
123 $skip = 0, next if ( $serovar_or_type =~ qr/\Q$ser\E/ );
|
kkonganti@17
|
124 }
|
kkonganti@17
|
125
|
kkonganti@17
|
126 if ( defined $complete_serotype_name ) {
|
kkonganti@17
|
127 next
|
kkonganti@17
|
128 if ( $skip
|
kkonganti@17
|
129 && ( $serovar_or_type =~ m/serotype=.*?\-.*?\,antigen_formula.+/ )
|
kkonganti@17
|
130 );
|
kkonganti@17
|
131 }
|
kkonganti@17
|
132
|
kkonganti@17
|
133 next
|
kkonganti@17
|
134 if (
|
kkonganti@17
|
135 $skip
|
kkonganti@17
|
136 && ( $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
|
kkonganti@17
|
137 || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/ )
|
kkonganti@17
|
138 );
|
kkonganti@17
|
139
|
kkonganti@17
|
140 # next
|
kkonganti@17
|
141 # if (
|
kkonganti@17
|
142 # (
|
kkonganti@17
|
143 # $serovar_or_type =~ m/serotype=\-\s+\-\:\-\:\-/
|
kkonganti@17
|
144 # || $serovar_or_type =~ m/antigen_formula=\-\:\-\:\-/
|
kkonganti@17
|
145 # )
|
kkonganti@17
|
146 # );
|
kkonganti@17
|
147
|
kkonganti@17
|
148 if ( defined $not_null_pdg_serovar ) {
|
kkonganti@17
|
149 $acc_2_serovar->{$acc} = $serovar_or_type,
|
kkonganti@17
|
150 $acc_2_target->{$acc} = $target_acc,
|
kkonganti@17
|
151 print $acc_fh "$acc\t$serovar_or_type\n"
|
kkonganti@17
|
152 if ( $acc !~ m/NULL/
|
kkonganti@17
|
153 && $serovar !~ m/NULL/
|
kkonganti@17
|
154 && $serovar_or_type !~ m/NULL/ );
|
kkonganti@17
|
155 }
|
kkonganti@17
|
156 else {
|
kkonganti@17
|
157 $acc_2_serovar->{$acc} = $serovar_or_type,
|
kkonganti@17
|
158 $acc_2_target->{$acc} = $target_acc,
|
kkonganti@17
|
159 print $acc_fh "$acc\t$serovar_or_type\n"
|
kkonganti@17
|
160 if ( $acc !~ m/NULL/ && $serovar_or_type !~ m/NULL/ );
|
kkonganti@17
|
161 }
|
kkonganti@17
|
162
|
kkonganti@17
|
163 # $snp_count->{$serovar_or_type} = 0;
|
kkonganti@17
|
164 }
|
kkonganti@17
|
165
|
kkonganti@17
|
166 #
|
kkonganti@17
|
167 # SNP to ACC
|
kkonganti@17
|
168 #
|
kkonganti@17
|
169
|
kkonganti@17
|
170 while ( my $line = <$snp_cluster_file> ) {
|
kkonganti@17
|
171 chomp $line;
|
kkonganti@17
|
172 my @cols = split( /\t/, $line );
|
kkonganti@17
|
173
|
kkonganti@17
|
174 # Relevant columns
|
kkonganti@17
|
175 # 0: SNP Cluster ID
|
kkonganti@17
|
176 # 3: Genome Accession belonging to the cluster (RefSeq or GenBank)
|
kkonganti@17
|
177 my $snp_clus_id = $cols[0];
|
kkonganti@17
|
178 my $acc = $cols[3];
|
kkonganti@17
|
179
|
kkonganti@17
|
180 next if ( $acc =~ m/^NULL/ || $snp_clus_id =~ m/^PDS_acc/ );
|
kkonganti@17
|
181 next if ( !exists $acc_2_serovar->{$acc} );
|
kkonganti@17
|
182 push @{ $snp_2_acc->{$snp_clus_id} }, $acc;
|
kkonganti@17
|
183 if ( exists $acc_2_snp->{$acc} ) {
|
kkonganti@17
|
184 print $Stderr
|
kkonganti@17
|
185 "\nGot a duplicate assembly accession. Cannot proceed!\n\n$line\n\n";
|
kkonganti@17
|
186 exit 1;
|
kkonganti@17
|
187 }
|
kkonganti@17
|
188 $acc_2_snp->{$acc} = $snp_clus_id;
|
kkonganti@17
|
189 $snp_count->{$snp_clus_id} = 0;
|
kkonganti@17
|
190 }
|
kkonganti@17
|
191
|
kkonganti@17
|
192 while ( my $line = <$tbl_file> ) {
|
kkonganti@17
|
193 chomp $line;
|
kkonganti@17
|
194
|
kkonganti@17
|
195 my @cols = split( /\t/, $line );
|
kkonganti@17
|
196
|
kkonganti@17
|
197 # .tbl file columns (Perl index):
|
kkonganti@17
|
198 #
|
kkonganti@17
|
199 # 0: Accession
|
kkonganti@17
|
200 # 1: AssemblyLevel
|
kkonganti@17
|
201 # 2: ScaffoldN50
|
kkonganti@17
|
202 # 3: ContigN50
|
kkonganti@17
|
203
|
kkonganti@17
|
204 my $acc = $cols[0];
|
kkonganti@17
|
205 my $asm_lvl = $cols[1];
|
kkonganti@17
|
206 my $scaffold_n50 = $cols[2];
|
kkonganti@17
|
207 my $contig_n50 = $cols[3];
|
kkonganti@17
|
208
|
kkonganti@17
|
209 # my $idx0 = $acc_2_serovar->{$cols[0]};
|
kkonganti@17
|
210 my $idx0 = $acc_2_snp->{$acc} if ( exists $acc_2_snp->{ $cols[0] } );
|
kkonganti@17
|
211
|
kkonganti@17
|
212 if ( not_empty($acc) && defined $idx0 ) {
|
kkonganti@17
|
213 my $fna_rel_loc =
|
kkonganti@17
|
214 "$pdg_release/ncbi_dataset/data/$acc/"
|
kkonganti@17
|
215 . $acc
|
kkonganti@17
|
216 . '_scaffolded_genomic.fna.gz';
|
kkonganti@17
|
217
|
kkonganti@17
|
218 if ( not_empty($scaffold_n50) ) {
|
kkonganti@17
|
219 next if ( $scaffold_n50 <= $min_asm_size );
|
kkonganti@17
|
220 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
|
kkonganti@17
|
221 ->{$scaffold_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
|
kkonganti@17
|
222 }
|
kkonganti@17
|
223 elsif ( not_empty($contig_n50) ) {
|
kkonganti@17
|
224 next if ( $contig_n50 <= $min_asm_size );
|
kkonganti@17
|
225 push @{ $snp_2_serovar->{$idx0}->{ sort_asm_level($asm_lvl) }
|
kkonganti@17
|
226 ->{$contig_n50} }, "$acc_2_serovar->{$acc}|$fna_rel_loc";
|
kkonganti@17
|
227 }
|
kkonganti@17
|
228 }
|
kkonganti@17
|
229 }
|
kkonganti@17
|
230
|
kkonganti@17
|
231 foreach my $snp_cluster_id ( keys %$snp_2_acc ) {
|
kkonganti@17
|
232 my $count = $snp_count->{$snp_cluster_id};
|
kkonganti@17
|
233 foreach my $asm_lvl (
|
kkonganti@17
|
234 sort { $a cmp $b }
|
kkonganti@17
|
235 keys %{ $snp_2_serovar->{$snp_cluster_id} }
|
kkonganti@17
|
236 )
|
kkonganti@17
|
237 {
|
kkonganti@17
|
238 if ( $asm_lvl =~ m/Complete\s+Genome/i ) {
|
kkonganti@17
|
239 $count =
|
kkonganti@17
|
240 print_dl_metadata( $asm_lvl,
|
kkonganti@17
|
241 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
|
kkonganti@17
|
242 $count, $snp_cluster_id );
|
kkonganti@17
|
243 }
|
kkonganti@17
|
244 if ( $asm_lvl =~ m/Chromosome/i ) {
|
kkonganti@17
|
245 $count =
|
kkonganti@17
|
246 print_dl_metadata( $asm_lvl,
|
kkonganti@17
|
247 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
|
kkonganti@17
|
248 $count, $snp_cluster_id );
|
kkonganti@17
|
249 }
|
kkonganti@17
|
250 if ( $asm_lvl =~ m/Scaffold/i ) {
|
kkonganti@17
|
251 $count =
|
kkonganti@17
|
252 print_dl_metadata( $asm_lvl,
|
kkonganti@17
|
253 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
|
kkonganti@17
|
254 $count, $snp_cluster_id );
|
kkonganti@17
|
255 }
|
kkonganti@17
|
256 if ( $asm_lvl =~ m/Contig/i ) {
|
kkonganti@17
|
257 $count =
|
kkonganti@17
|
258 print_dl_metadata( $asm_lvl,
|
kkonganti@17
|
259 \$snp_2_serovar->{$snp_cluster_id}->{$asm_lvl},
|
kkonganti@17
|
260 $count, $snp_cluster_id );
|
kkonganti@17
|
261 }
|
kkonganti@17
|
262 printf $Stderr "%-17s | %s\n", $snp_cluster_id, $count
|
kkonganti@17
|
263 if ( $count > 0 );
|
kkonganti@17
|
264 last if ( $count >= $serovar_limit );
|
kkonganti@17
|
265 }
|
kkonganti@17
|
266 }
|
kkonganti@17
|
267
|
kkonganti@17
|
268 close $pdg_file;
|
kkonganti@17
|
269 close $tbl_file;
|
kkonganti@17
|
270 close $snp_cluster_file;
|
kkonganti@17
|
271 close $acc_fh;
|
kkonganti@17
|
272 close $accs_snp_fh;
|
kkonganti@17
|
273
|
kkonganti@17
|
274 #-------------------------------------------
|
kkonganti@17
|
275 # Main ends
|
kkonganti@17
|
276 #-------------------------------------------
|
kkonganti@17
|
277 # Routines begin
|
kkonganti@17
|
278 #-------------------------------------------
|
kkonganti@17
|
279
|
kkonganti@17
|
280 sub print_dl_metadata {
|
kkonganti@17
|
281 my $asm_lvl = shift;
|
kkonganti@17
|
282 my $acc_sizes = shift;
|
kkonganti@17
|
283 my $curr_count = shift;
|
kkonganti@17
|
284 my $snp_cluster_id = shift;
|
kkonganti@17
|
285
|
kkonganti@17
|
286 $asm_lvl =~ s/.+?\_(.+)/$1/;
|
kkonganti@17
|
287
|
kkonganti@17
|
288 foreach my $acc_size ( sort { $b <=> $a } keys %{$$acc_sizes} ) {
|
kkonganti@17
|
289 foreach my $serovar_url ( @{ $$acc_sizes->{$acc_size} } ) {
|
kkonganti@17
|
290 my ( $serovar, $url ) = split( /\|/, $serovar_url );
|
kkonganti@17
|
291 return $curr_count if ( exists $multi_cluster_acc->{$url} );
|
kkonganti@17
|
292 $multi_cluster_acc->{$url} = 1;
|
kkonganti@17
|
293 $curr_count++;
|
kkonganti@17
|
294 my ( $final_acc, $genome_header ) =
|
kkonganti@17
|
295 ( split( /\//, $url ) )[ 3 .. 4 ];
|
kkonganti@17
|
296 print $accs_snp_fh "$final_acc\n";
|
kkonganti@17
|
297 print $genome_headers_fh catfile( 'scaffold_genomes',
|
kkonganti@17
|
298 $genome_header )
|
kkonganti@17
|
299 . "\n";
|
kkonganti@17
|
300 print $Stdout "$serovar|$asm_lvl|$acc_size|$url|$snp_cluster_id\n"
|
kkonganti@17
|
301 if ( $curr_count > 0 );
|
kkonganti@17
|
302 }
|
kkonganti@17
|
303 last if ( $curr_count >= $serovar_limit );
|
kkonganti@17
|
304 }
|
kkonganti@17
|
305 return $curr_count;
|
kkonganti@17
|
306 }
|
kkonganti@17
|
307
|
kkonganti@17
|
308 sub sort_asm_level {
|
kkonganti@17
|
309 my $level = shift;
|
kkonganti@17
|
310
|
kkonganti@17
|
311 $level =~ s/(Complete\s+Genome)/a\_$1/
|
kkonganti@17
|
312 if ( $level =~ m/Complete\s+Genome/i );
|
kkonganti@17
|
313 $level =~ s/(Chromosome)/b\_$1/ if ( $level =~ m/Chromosome/i );
|
kkonganti@17
|
314 $level =~ s/(Scaffold)/c\_$1/ if ( $level =~ m/Scaffold/i );
|
kkonganti@17
|
315 $level =~ s/(Contig)/d\_$1/ if ( $level =~ m/Contig/i );
|
kkonganti@17
|
316
|
kkonganti@17
|
317 return $level;
|
kkonganti@17
|
318 }
|
kkonganti@17
|
319
|
kkonganti@17
|
320 sub not_empty {
|
kkonganti@17
|
321 my $col = shift;
|
kkonganti@17
|
322
|
kkonganti@17
|
323 if ( $col !~ m/^$/ ) {
|
kkonganti@17
|
324 return 1;
|
kkonganti@17
|
325 }
|
kkonganti@17
|
326 else {
|
kkonganti@17
|
327 return 0;
|
kkonganti@17
|
328 }
|
kkonganti@17
|
329 }
|
kkonganti@17
|
330
|
kkonganti@17
|
331 __END__
|
kkonganti@17
|
332
|
kkonganti@17
|
333 =head1 SYNOPSIS
|
kkonganti@17
|
334
|
kkonganti@17
|
335 This script will take in a PDG metadata file, a C<.tbl> file and generate
|
kkonganti@17
|
336 the final list by B<I<waterfall>> priority.
|
kkonganti@17
|
337
|
kkonganti@17
|
338 See complete description:
|
kkonganti@17
|
339
|
kkonganti@17
|
340 perldoc waterfall_per_snp_cluster.pl
|
kkonganti@17
|
341
|
kkonganti@17
|
342 or
|
kkonganti@17
|
343
|
kkonganti@17
|
344 waterfall_per_snp_cluster.pl --help
|
kkonganti@17
|
345
|
kkonganti@17
|
346 Examples:
|
kkonganti@17
|
347
|
kkonganti@17
|
348 waterfall_per_snp_cluster.pl
|
kkonganti@17
|
349
|
kkonganti@17
|
350 =head1 DESCRIPTION
|
kkonganti@17
|
351
|
kkonganti@17
|
352 We will retain up to N number of genome accessions per SNP cluster.
|
kkonganti@17
|
353 It prioritizes SNP Cluster participation over serotype coverage.
|
kkonganti@17
|
354 Which N genomes are selected depends on (in order):
|
kkonganti@17
|
355
|
kkonganti@17
|
356 1. Genome assembly level, whose priority is
|
kkonganti@17
|
357
|
kkonganti@17
|
358 a: Complete Genome
|
kkonganti@17
|
359 b: Chromosome
|
kkonganti@17
|
360 c: Scaffold
|
kkonganti@17
|
361 d: Contig
|
kkonganti@17
|
362
|
kkonganti@17
|
363 2. If the genomes are of same assembly level, then
|
kkonganti@17
|
364 scaffold N50 followed by contig N50 is chosen.
|
kkonganti@17
|
365
|
kkonganti@17
|
366 3. If the scaffold or contig N50 is same, then all
|
kkonganti@17
|
367 of them are included
|
kkonganti@17
|
368
|
kkonganti@17
|
369 =head1 OPTIONS
|
kkonganti@17
|
370
|
kkonganti@17
|
371 =over 3
|
kkonganti@17
|
372
|
kkonganti@17
|
373 =item -p PDGXXXXX.XXXX.metadata.tsv
|
kkonganti@17
|
374
|
kkonganti@17
|
375 Absolute UNIX path pointing to the PDG metadata file.
|
kkonganti@17
|
376 Example: PDG000000002.2505.metadata.tsv
|
kkonganti@17
|
377
|
kkonganti@17
|
378 =item -t asm.tbl
|
kkonganti@17
|
379
|
kkonganti@17
|
380 Absolute UNIX path pointing to the file from the result
|
kkonganti@17
|
381 of the C<dl_pdg_data.py> script, which is the C<asm.tbl>
|
kkonganti@17
|
382 file.
|
kkonganti@17
|
383
|
kkonganti@17
|
384 =item -snp PDGXXXXXXX.XXXX.reference_target.cluster_list.tsv
|
kkonganti@17
|
385
|
kkonganti@17
|
386 Absolute UNIX path pointing to the SNP Cluster metadata file.
|
kkonganti@17
|
387 Examples: PDG000000002.2505.reference_target.cluster_list.tsv
|
kkonganti@17
|
388
|
kkonganti@17
|
389 =item --serocol <int> (Optional)
|
kkonganti@17
|
390
|
kkonganti@17
|
391 Column number (non 0-based index) of the PDG metadata file
|
kkonganti@17
|
392 by which the serotypes are collected
|
kkonganti@17
|
393 (column name: "computed_types"). Default: 50
|
kkonganti@17
|
394
|
kkonganti@17
|
395 =item --seronamecol <int> (Optional)
|
kkonganti@17
|
396
|
kkonganti@17
|
397 Column number (non 0-based index) of the PDG metadata file
|
kkonganti@17
|
398 whose column name is "serovar". Default: 34
|
kkonganti@17
|
399
|
kkonganti@17
|
400 =item --acc_col <int> (Optional)
|
kkonganti@17
|
401
|
kkonganti@17
|
402 Column number (non 0-based index) of the PDG metadata file
|
kkonganti@17
|
403 whose column name is "acc". Default: 10
|
kkonganti@17
|
404
|
kkonganti@17
|
405 =item --target_acc_col <int> (Optional)
|
kkonganti@17
|
406
|
kkonganti@17
|
407 Column number (non 0-based index) of the PDG metadata file
|
kkonganti@17
|
408 whose column name is "target_acc". Default: 43
|
kkonganti@17
|
409
|
kkonganti@17
|
410 =item --complete_serotype_name (Optional)
|
kkonganti@17
|
411
|
kkonganti@17
|
412 Skip indexing serotypes when the serotype name in the column
|
kkonganti@17
|
413 number 49 (non 0-based) of PDG metadata file consists a "-". For example, if
|
kkonganti@17
|
414 an accession has a I<B<serotype=>> string as such in column
|
kkonganti@17
|
415 number 49 (non 0-based): C<"serotype=- 13:z4,z23:-","antigen_formula=13:z4,z23:-">
|
kkonganti@17
|
416 then, the indexing of that accession is skipped.
|
kkonganti@17
|
417 Default: False
|
kkonganti@17
|
418
|
kkonganti@17
|
419 =item --not_null_pdg_serovar (Optional)
|
kkonganti@17
|
420
|
kkonganti@17
|
421 Only index the B<I<computed_serotype>> column i.e. column number 49 (non 0-based)
|
kkonganti@17
|
422 if the B<I<serovar>> column is not C<NULL>.
|
kkonganti@17
|
423
|
kkonganti@17
|
424 =item -i <serotype name> (Optional)
|
kkonganti@17
|
425
|
kkonganti@17
|
426 Make sure the following serotype is included. Mention C<-i> multiple
|
kkonganti@17
|
427 times to include multiple serotypes.
|
kkonganti@17
|
428
|
kkonganti@17
|
429 =item -num <int> (Optional)
|
kkonganti@17
|
430
|
kkonganti@17
|
431 Number of genome accessions per SNP Cluster. Default: 1
|
kkonganti@17
|
432
|
kkonganti@17
|
433 =back
|
kkonganti@17
|
434
|
kkonganti@17
|
435 =head1 AUTHOR
|
kkonganti@17
|
436
|
kkonganti@17
|
437 Kranti Konganti
|
kkonganti@17
|
438
|
kkonganti@17
|
439 =cut
|