Commit 2020496d authored by Keith Jolley

Type alleles for use with new allele discovery.

Closes #285.
parent 22ca540b
......@@ -40,6 +40,7 @@ my $TBLASTX = 'off';
my $HUNT = 'off';
my $RESCAN_ALLELES = 'off';
my $RESCAN_SEQS = 'off';
my $TYPE_ALLELES = 'off';
my $MARK_MISSING = 'off';
sub get_javascript {
......@@ -67,6 +68,7 @@ function use_defaults() {
\$("#partial_when_exact").prop(\"checked\",$check_values{$PARTIAL_WHEN_EXACT});
\$("#rescan_alleles").prop(\"checked\",$check_values{$RESCAN_ALLELES});
\$("#rescan_seqs").prop(\"checked\",$check_values{$RESCAN_SEQS});
\$("#type_alleles").prop(\"checked\",$check_values{$TYPE_ALLELES});
\$("#mark_missing").prop(\"checked\",$check_values{$MARK_MISSING});
}
......@@ -396,6 +398,20 @@ sub _print_parameter_fieldset {
: q()
);
say q(</li><li>);
say $q->checkbox(
-name => 'type_alleles',
-id => 'type_alleles',
-label => 'Use only type alleles to identify locus',
-checked => ( $general_prefs->{'scan_type_alleles'} && $general_prefs->{'scan_type_alleles'} eq 'on' )
? 'checked'
: q()
);
say q( <a class="tooltip" title="Type alleles - Constrain the search space to contain only type alleles. These )
. q(are defined in the sequence record. This can prevent more variable alleles being defined over time. )
. q(If the locus region is identified then a full database lookup will be used to identify a known allele. )
. q(<br /><br />)
. q(Note that if no type alleles have been defined then you will not find any matches!">)
. q(<span class="fa fa-info-circle"></span></a></li><li>);
say $q->checkbox(
-name => 'mark_missing',
-id => 'mark_missing',
......@@ -448,7 +464,7 @@ sub _scan {
my $dbname = $self->{'system'}->{'db'};
foreach (
qw (identity alignment word_size partial_matches limit_matches limit_time
tblastx hunt rescan_alleles rescan_seqs mark_missing)
tblastx hunt rescan_alleles rescan_seqs type_alleles mark_missing)
)
{
my $value = ( defined $q->param($_) && $q->param($_) ne '' ) ? $q->param($_) : 'off';
......
......@@ -136,7 +136,10 @@ sub get_all_sequences {
$qry = "SELECT $self->{'dbase_id_field'},$self->{'dbase_seq_field'} FROM $self->{'dbase_table'} WHERE "
. "$self->{'dbase_id2_field'}=?";
$qry .= ' AND exemplar' if $options->{'exemplar'};
$qry .= ' AND type_allele' if $options->{'type_alleles'};
} else {
#TODO Remove support for non-BIGSdb seqdef databases
$logger->logwarn('Use of non-BIGSdb sequence definition databases is deprecated.');
$qry = "SELECT $self->{'dbase_id_field'},$self->{'dbase_seq_field'} FROM $self->{'dbase_table'}";
$qry .= ' WHERE exemplar' if $options->{'exemplar'};
}
......
......@@ -168,7 +168,8 @@ sub blast {
$temp_fastafile =~ s/\\/\\\\/gx;
$temp_fastafile =~ s/'/__prime__/gx;
my $outfile_url = "$file_prefix\_$$\_outfile.txt";
$self->_create_fasta_index( [$locus], $temp_fastafile, { exemplar => $params->{'exemplar'} } );
$self->_create_fasta_index( [$locus], $temp_fastafile,
{ exemplar => $params->{'exemplar'}, type_alleles => $params->{'type_alleles'} } );
$self->_create_query_fasta_file( $isolate_id, $temp_infile, $params );
my ( $probe_matches, $pcr_products );
my $continue = 1;
......@@ -182,7 +183,7 @@ sub blast {
return if !$continue;
$self->{'db'}->commit; #prevent idle in transaction table locks
return if !-e $temp_fastafile || -z $temp_fastafile;
$params->{'exact_matches_only'} = 1 if ( $self->{'no_exemplars'} && !$params->{'scannew'} );
$params->{'exact_matches_only'} = $self->exact_matches_only($params);
my $word_size = $self->_get_word_size( $program, $locus, $params );
my $blast_threads = $self->{'config'}->{'blast_threads'} || 1;
my $filter = $program eq 'blastn' ? 'dust' : 'seg';
......@@ -221,7 +222,11 @@ sub blast {
}
);
if (
( !@{ $exact_matches->{$locus} } || $params->{'partial_when_exact'} || $params->{'exemplar'} )
(
!@{ $exact_matches->{$locus} }
|| $params->{'partial_when_exact'}
|| $self->_always_lookup_partials($params)
)
|| ( $locus_info->{'pcr_filter'}
&& !$params->{'pcr_filter'}
&& $locus_info->{'probe_filter'}
......@@ -241,7 +246,7 @@ sub blast {
}
);
}
if ( $params->{'exemplar'} ) {
if ( $self->_always_lookup_partials($params) ) {
$self->_lookup_partial_matches( $locus, $exact_matches, $partial_matches );
}
return ( $exact_matches->{$locus}, $partial_matches->{$locus} );
......@@ -252,19 +257,38 @@ sub blast {
return;
}
#Decide whether partial BLAST matches should always be looked up against the complete set
#of alleles. This applies whenever the search is restricted to a subset of alleles, i.e.
#when either the 'exemplar' or the 'type_alleles' parameter is set.
#Returns 1 if so, otherwise an empty return (undef in scalar context).
sub _always_lookup_partials {
    my ( $self, $params ) = @_;
    foreach my $subset_option (qw(exemplar type_alleles)) {
        return 1 if $params->{$subset_option};
    }
    return;
}
#Determine whether the search should be limited to exact matches.
#Returns 1 only when the object's 'no_exemplars' attribute is set and neither the
#'scannew' nor the 'type_alleles' parameter is set; otherwise an empty return
#(undef in scalar context).
sub exact_matches_only {
    my ( $self, $params ) = @_;
    return if !$self->{'no_exemplars'};
    return if $params->{'scannew'};
    return if $params->{'type_alleles'};
    return 1;
}
#If we are BLASTing against a subset of the database, look up partial matches against the
#complete set of alleles.
sub _lookup_partial_matches {
my ( $self, $locus, $exact_matches, $partial_matches ) = @_;
$partial_matches->{$locus} //= [];
return if !@{ $partial_matches->{$locus} };
my %already_matched_alleles = map { $_->{'allele'} => 1 } @{ $exact_matches->{$locus} };
foreach my $match ( @{ $partial_matches->{$locus} } ) {
my $seq = $self->extract_seq_from_match($match);
my $allele_id = $self->{'datastore'}->get_locus($locus)->get_allele_id_from_sequence( \$seq );
if ( defined $allele_id && !$already_matched_alleles{$allele_id} ) {
$match->{'identity'} = 100;
$match->{'allele'} = $allele_id;
$match->{'from_partial'} = 1;
$match->{'partial_match_allele'} = $match->{'allele'};
$match->{'identity'} = 100;
$match->{'allele'} = $allele_id;
push @{ $exact_matches->{$locus} }, $match;
}
}
......@@ -288,9 +312,13 @@ sub _create_fasta_index {
if ( $locus_info->{'dbase_name'} ) {
my $ok = 1;
try {
my $seqs_ref =
$self->{'datastore'}->get_locus($locus)
->get_all_sequences( { exemplar => $options->{'exemplar'}, no_temp_table => 1 } );
my $seqs_ref = $self->{'datastore'}->get_locus($locus)->get_all_sequences(
{
exemplar => $options->{'exemplar'},
type_alleles => $options->{'type_alleles'},
no_temp_table => 1
}
);
if ( $options->{'exemplar'} && !keys %$seqs_ref ) {
$logger->info("Locus $locus has no exemplars set - using all alleles.");
$seqs_ref = $self->{'datastore'}->get_locus($locus)->get_all_sequences( { no_temp_table => 1 } );
......@@ -751,10 +779,15 @@ sub _get_row {
$buffer .= q(</td>);
$tooltip //= q();
$buffer .= qq(<td$class>$match->{'allele'}$tooltip</td>);
$buffer .= qq(<td>$match->{'identity'}</td>);
$buffer .= qq(<td>$match->{'alignment'}</td>);
$buffer .= qq(<td>$match->{'length'}</td>);
$buffer .= qq(<td>$match->{'e-value'}</td>);
if ( $match->{'from_partial'} ) {
$buffer .= q(<td>100.00</td>);
$buffer .= qq(<td colspan="3">Initial partial BLAST match to allele $match->{'partial_match_allele'}</td>);
} else {
$buffer .= qq(<td>$match->{'identity'}</td>);
$buffer .= qq(<td>$match->{'alignment'}</td>);
$buffer .= qq(<td>$match->{'length'}</td>);
$buffer .= qq(<td>$match->{'e-value'}</td>);
}
$buffer .= qq(<td>$match->{'seqbin_id'}</td>);
$buffer .= qq(<td>$match->{'start'}</td>);
$buffer .= qq(<td>$match->{'end'} </td>);
......@@ -1079,7 +1112,7 @@ sub _parse_blast_partial {
$match->{'start'} = $record->[7];
$match->{'end'} = $record->[6];
}
if ( $length > $match->{'alignment'} ) {
if ( $length != $match->{'alignment'} ) {
if ( $match->{'reverse'} ) {
if ( $record->[8] < $record->[9] ) {
$match->{'predicted_start'} = $match->{'start'} - $length + $record->[9];
......
#Written by Keith Jolley
#Copyright (c) 2014-2015, University of Oxford
#Copyright (c) 2014-2016, University of Oxford
#E-mail: keith.jolley@zoo.ox.ac.uk
#
#This file is part of Bacterial Isolate Genome Sequence Database (BIGSdb).
......@@ -27,8 +27,8 @@ use Error qw(:try);
use constant DEFAULT_ALIGNMENT => 100;
use constant DEFAULT_IDENTITY => 99;
use constant DEFAULT_WORD_SIZE => 30;
use constant DEFINER_USER => -1; #User id for tagger (there needs to be a record in the users table)
use constant DEFINER_USERNAME => 'autodefiner';
use constant DEFINER_USER => -1; #User id for tagger (there needs to be a record in the users table)
use constant DEFINER_USERNAME => 'autodefiner';
sub run_script {
my ($self) = @_;
......@@ -38,15 +38,8 @@ sub run_script {
die "No connection to database (check logs).\n" if !defined $self->{'db'};
die "This script can only be run against an isolate database.\n"
if ( $self->{'system'}->{'dbtype'} // '' ) ne 'isolates';
my $params;
$params->{$_} = 1 foreach qw(pcr_filter probe_filter scannew);
$params->{'alignment'} =
BIGSdb::Utils::is_int( $self->{'options'}->{'A'} ) ? $self->{'options'}->{'A'} : DEFAULT_ALIGNMENT;
$params->{'identity'} =
BIGSdb::Utils::is_int( $self->{'options'}->{'B'} ) ? $self->{'options'}->{'B'} : DEFAULT_IDENTITY;
$params->{'word_size'} =
BIGSdb::Utils::is_int( $self->{'options'}->{'w'} ) ? $self->{'options'}->{'w'} : DEFAULT_WORD_SIZE;
my $loci = $self->get_loci_with_ref_db;
my $params = $self->_get_params;
my $loci = $self->get_loci_with_ref_db;
if ( $self->{'options'}->{'a'} && !$self->_can_define_alleles($loci) ) {
exit(1);
......@@ -69,7 +62,6 @@ sub run_script {
my $locus_info = $self->{'datastore'}->get_locus_info($locus);
my %seqs;
foreach my $isolate_id (@$isolate_list) {
my $allele_ids = $self->{'datastore'}->get_allele_ids( $isolate_id, $locus );
next if @$allele_ids;
if ( !$self->{'options'}->{'T'} ) {
......@@ -115,7 +107,6 @@ sub run_script {
}
}
last if $EXIT || $self->_is_time_up;
last if $isolate_id==306;
}
$self->{'datastore'}->finish_with_locus($locus);
......@@ -132,6 +123,20 @@ sub run_script {
return;
}
#Build the parameter hashref from the script's command line options.
#The pcr_filter, probe_filter and scannew flags are always switched on. The alignment,
#identity and word_size values are taken from options A, B and w respectively when
#BIGSdb::Utils::is_int accepts them, otherwise the DEFAULT_* constants are used.
#The type_alleles option is passed through unchanged.
#Returns a hashref.
sub _get_params {
    my ($self) = @_;

    #Each entry: parameter name, command line option key, fallback value.
    my @numeric_settings = (
        [ 'alignment', 'A', DEFAULT_ALIGNMENT ],
        [ 'identity',  'B', DEFAULT_IDENTITY ],
        [ 'word_size', 'w', DEFAULT_WORD_SIZE ],
    );
    my $params = { map { ( $_ => 1 ) } qw(pcr_filter probe_filter scannew) };
    foreach my $setting (@numeric_settings) {
        my ( $name, $option_key, $fallback ) = @$setting;
        my $supplied = $self->{'options'}->{$option_key};
        $params->{$name} = BIGSdb::Utils::is_int($supplied) ? $supplied : $fallback;
    }
    $params->{'type_alleles'} = $self->{'options'}->{'type_alleles'};
    return $params;
}
sub _define_allele {
my ( $self, $locus, $seq, $flag ) = @_;
my $locus_info = $self->{'datastore'}->get_locus_info($locus);
......
......@@ -1462,13 +1462,18 @@ sub get_sequences_table_attributes {
foreign_key => 'loci',
dropdown_query => 'yes'
},
{ name => 'allele_id', type => 'text', required => 'yes', primary_key => 'yes' },
{ name => 'sequence', type => 'text', required => 'yes', length => 32768, user_update => 'no' },
{ name => 'status', type => 'text', required => 'yes', optlist => "@optlist", hide_public => 'yes' },
{ name => 'sender', type => 'int', required => 'yes', dropdown_query => 'yes', hide_public => 'yes' },
{ name => 'curator', type => 'int', required => 'yes', dropdown_query => 'yes', hide_public => 'yes' },
{ name => 'date_entered', type => 'date', required => 'yes', hide_public => 'yes' },
{ name => 'datestamp', type => 'date', required => 'yes', hide_public => 'yes' }
{ name => 'allele_id', type => 'text', required => 'yes', primary_key => 'yes' },
{ name => 'sequence', type => 'text', required => 'yes', length => 32768, user_update => 'no' },
{ name => 'status', type => 'text', required => 'yes', optlist => "@optlist", hide_public => 'yes' },
{
name => 'type_allele',
type => 'bool',
comments => 'New allele searches can be constrained to use just type alleles in comparisons',
},
{ name => 'sender', type => 'int', required => 'yes', dropdown_query => 'yes', hide_public => 'yes' },
{ name => 'curator', type => 'int', required => 'yes', dropdown_query => 'yes', hide_public => 'yes' },
{ name => 'date_entered', type => 'date', required => 'yes', hide_public => 'yes' },
{ name => 'datestamp', type => 'date', required => 'yes', hide_public => 'yes' }
];
if ( ( $self->{'system'}->{'allele_comments'} // '' ) eq 'yes' ) {
push @$attributes, ( { name => 'comments', type => 'text', required => 'no', length => 120 } );
......@@ -1973,7 +1978,8 @@ sub get_classification_schemes_table_attributes {
{ name => 'display_order', type => 'int' }
);
}
push @$attributes, (
push @$attributes,
(
{
name => 'status',
type => 'text',
......@@ -1983,7 +1989,7 @@ sub get_classification_schemes_table_attributes {
},
{ name => 'curator', type => 'int', required => 'yes', dropdown_query => 'yes' },
{ name => 'datestamp', type => 'date', required => 'yes' }
);
);
return $attributes;
}
......
......@@ -65,6 +65,7 @@ GetOptions(
'n|new_only' => \$opts{'n'},
'o|order' => \$opts{'o'},
'r|random' => \$opts{'r'},
'type_alleles' => \$opts{'type_alleles'},
'T|already_tagged' => \$opts{'T'},
'v|view=s' => \$opts{'v'}
) or die("Error in command line arguments\n");
......@@ -238,6 +239,14 @@ ${bold}-t, --time$norm ${under}MINS$norm
${bold}--threads$norm ${under}THREADS$norm
Maximum number of threads to use.
${bold}--type_alleles$norm
Only use alleles with the 'type_allele' flag set to identify locus.
If a partial match is found then a full database lookup will be performed
to identify any known alleles. Using this option will constrain the search
space so that allele definitions don't become more variable over time. Note
that you must have at least one allele defined as a type allele for a locus
if you use this option otherwise you will not find any matches!
${bold}-T, --already_tagged$norm
Scan even when sequence tagged (no designation).
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment