Bug 142119 - Sphinx: indexing code, searching code to be done

git-svn-id: svn://svn.office.custis.ru/3rdparty/bugzilla.org/trunk@1886 6955db30-a419-402b-8a0d-67ecbb4d7f56
master
vfilippov 2013-12-23 15:06:20 +00:00
parent d55b895c2f
commit b9653028c0
6 changed files with 149 additions and 40 deletions

View File

@ -610,6 +610,16 @@ sub dbh {
return $class->request_cache->{dbh};
}
# Return the Sphinx database handle kept in the request cache, creating it
# on first use via Bugzilla::DB::connect_sphinx().
#
# The result is stored under the 'dbh_sphinx' key even when it is undef,
# so connect_sphinx() is called at most once for a given request cache.
# NOTE(review): assumes request_cache returns the same hashref on every
# call within a request -- confirm against its definition.
sub dbh_sphinx
{
    my $class = shift;
    my $cache = $class->request_cache;
    unless (exists $cache->{dbh_sphinx})
    {
        $cache->{dbh_sphinx} = Bugzilla::DB::connect_sphinx();
    }
    return $cache->{dbh_sphinx};
}
sub dbh_main {
my $class = shift;
$class->request_cache->{dbh_main} ||= Bugzilla::DB::connect_main();

View File

@ -1197,7 +1197,6 @@ sub _extract_multi_selects {
# Should be called any time you update short_desc or change a comment.
sub _sync_fulltext
{
use utf8;
my ($self, $new_bug) = @_;
my $dbh = Bugzilla->dbh;
my ($short_desc) = $dbh->selectrow_array(
@ -1214,21 +1213,34 @@ sub _sync_fulltext
$nopriv = join "\n", @$nopriv;
$priv = join "\n", @$priv;
my $row = [ $short_desc, $nopriv, $priv ];
$_ = $dbh->quote_fulltext($_) for @$row;
## O_o Not sure how this can be tainted here, but sometimes it was. Checking whether it goes away.
#trick_taint($row);
# Determine if we are using Sphinx or MySQL fulltext search
my ($sph, $id_field);
my $index = Bugzilla->localconfig->{sphinx_index};
if ($index)
{
$sph = Bugzilla->dbh_sphinx;
$id_field = 'id';
$_ = $sph->quote($_) for @$row;
}
else
{
$index = 'bugs_fulltext';
$sph = $dbh;
$id_field = 'bug_id';
$_ = $dbh->quote_fulltext($_) for @$row;
}
my $sql;
if ($new_bug)
{
$sql = "INSERT INTO bugs_fulltext (bug_id, short_desc, comments, comments_private)".
$sql = "INSERT INTO $index ($id_field, short_desc, comments, comments_private)".
" VALUES (".join(',', $self->id, @$row).")";
}
else
{
$sql = "UPDATE bugs_fulltext SET short_desc=$row->[0],".
" comments=$row->[1], comments_private=$row->[2] WHERE bug_id=".$self->id;
$sql = "UPDATE $index SET short_desc=$row->[0],".
" comments=$row->[1], comments_private=$row->[2] WHERE $id_field=".$self->id;
}
return $dbh->do($sql);
return $sph->do($sql);
}
# This is the correct way to delete bugs from the DB.

View File

@ -96,6 +96,33 @@ sub connect_main {
$lc->{db_sock}, $lc->{db_user}, $lc->{db_pass});
}
# Open a connection to the Sphinx search server through its MySQL-protocol
# listener, using DBI's mysql driver.
#
# Settings are read from localconfig:
#   sphinx_index - index name; when empty/false, Sphinx support is disabled
#   sphinx_host  - host to connect to
#   sphinx_port  - TCP port (added to the DSN only when set)
#   sphinx_sock  - UNIX socket path (added to the DSN only when set)
#
# Returns undef when sphinx_index is not configured, otherwise a DBI handle
# on which "SET NAMES utf8" has already been issued.
# Dies with the DSN and the DBI error text if the connection fails.
sub connect_sphinx
{
    my $lc = Bugzilla->localconfig;
    if (!$lc->{sphinx_index})
    {
        return undef;
    }

    my $host = $lc->{sphinx_host};
    my $port = $lc->{sphinx_port};
    my $sock = $lc->{sphinx_sock};

    # NOTE(review): the database name and the 'nobody' credentials look like
    # placeholders that the Sphinx listener does not check -- confirm.
    my $dsn = "dbi:mysql:host=$host;database=none";
    $dsn .= ";port=$port" if $port;
    $dsn .= ";mysql_socket=$sock" if $sock;

    my $sphinx = DBI->connect($dsn, 'nobody', 'nobody', {
        mysql_enable_utf8 => 1,
        # Needs to be explicitly specified for command-line processes.
        mysql_auto_reconnect => 1,
    });

    # DBI->connect returns undef on failure when RaiseError is not set;
    # report the real reason instead of failing on the method call below.
    if (!$sphinx)
    {
        my $err = $DBI::errstr || 'unknown error';
        die "Can't connect to Sphinx using '$dsn': $err\n";
    }

    $sphinx->do("SET NAMES utf8");
    return $sphinx;
}
sub _connect {
my ($driver, $host, $dbname, $port, $sock, $user, $pass) = @_;

View File

@ -3286,8 +3286,17 @@ sub _populate_bugs_fulltext
my $bug_ids = shift;
$bug_ids = undef if $bug_ids && !@$bug_ids;
my $dbh = Bugzilla->dbh;
my ($table, $limit1, $id) = ('bugs_fulltext', $dbh->sql_limit(1), 'bug_id');
my $fulltext = $dbh->selectrow_array("SELECT $id FROM $table $limit1");
# These vary between different fulltext search engines (MySQL, Sphinx)
my ($table, $limit1, $id_field, $quote, $sph) = ('bugs_fulltext', $dbh->sql_limit(1), 'bug_id', 'quote_fulltext', $dbh);
if (Bugzilla->localconfig->{sphinx_index})
{
$sph = Bugzilla->dbh_sphinx;
$limit1 = 'LIMIT 1';
$table = Bugzilla->localconfig->{sphinx_index};
$id_field = 'id';
$quote = 'quote';
}
my $fulltext = $sph->selectrow_array("SELECT $id_field FROM $table $limit1");
my ($datasize, $time) = (0, time);
my ($lastdata, $lasttime) = ($datasize, $time);
# We only populate the table if it's empty or if we've been given a
@ -3298,52 +3307,66 @@ sub _populate_bugs_fulltext
$bug_ids ||= $dbh->selectcol_arrayref("SELECT bug_id FROM bugs");
return if !$bug_ids;
# There could be tons of bugs, so we'll use 256-bug portions
# There could be tons of bugs, so we use 256-bug portions,
# and limit each single query to 4MB
print "Populating full-text index... (this can take a long time.)\n";
my ($portion, $done, $total) = (256, 0, scalar @$bug_ids);
my ($short, $all, $nopriv, $wh, $rows);
while (my @ids = splice @$bug_ids, 0, $portion)
my ($portion, $done, $max_packet, $total) = (256, 0, 4*1024*1024, scalar @$bug_ids);
my ($short, $all, $nopriv, $wh);
# Count lengths in bytes
use bytes;
my $rows = {};
while (@$bug_ids || %$rows)
{
$rows = {};
$wh = "bug_id IN (" . join(",", ("?") x @ids) . ")";
($short) = $dbh->selectall_arrayref(
"SELECT bug_id, short_desc FROM bugs WHERE $wh", undef, @ids
);
$all = $dbh->selectall_arrayref(
"SELECT bug_id, thetext, isprivate FROM longdescs WHERE $wh",
undef, @ids
);
# Local block with 'use bytes' for counting data size in MB
if (scalar keys %$rows < $portion)
{
use bytes;
# Read more data
my @ids = splice @$bug_ids, 0, $portion;
$wh = "bug_id IN (" . join(",", ("?") x @ids) . ")";
# Get bug titles
($short) = $dbh->selectall_arrayref(
"SELECT bug_id, short_desc FROM bugs WHERE $wh", undef, @ids
);
for (@$short)
{
$rows->{$_->[0]} = [ $_->[1], '', '' ];
$datasize += length $_->[1];
}
# Comments divide into non-private and private
# Get comments; they can be private and non-private
$all = $dbh->selectall_arrayref(
"SELECT bug_id, thetext, isprivate FROM longdescs WHERE $wh",
undef, @ids
);
for (@$all)
{
$rows->{$_->[0]}->[$_->[2] ? 2 : 1] .= $_->[1] . "\n";
$datasize += length($_->[1])+1;
}
}
my $query = "INSERT INTO $table ($id_field, short_desc, comments, comments_private) VALUES ";
my $len = 0;
my @ids;
for (keys %$rows)
{
Encode::_utf8_off($_) for @{$rows->{$_}};
for (@{$rows->{$_}})
{
Encode::_utf8_off($_);
$datasize += length $_;
}
my $s = "($_, ".join(', ', map { $sph->$quote($_) } @{$rows->{$_}})."), ";
if ($len + length $s >= $max_packet)
{
last;
}
delete $rows->{$_};
push @ids, $_;
$query .= $s;
$len += length $s;
}
for (keys %$rows)
{
# CustIS Bug 46221 - Snowball stemmers in Bugzilla fulltext search
$rows->{$_} = join ', ', map { $dbh->quote_fulltext($_) } @{$rows->{$_}};
}
$dbh->do("DELETE FROM $table WHERE $id IN (".join(',', @ids).')');
$dbh->do(
"INSERT INTO $table ($id, short_desc, comments, comments_private) VALUES ".
join(", ", map { "($_, $rows->{$_})" } @ids)
);
substr $query, -2, 2, '';
$sph->do("DELETE FROM $table WHERE $id_field IN (".join(',', @ids).')');
$sph->do($query);
$done += @ids;
print "\r$done / $total, ".sprintf("%.2f MB, %d KB/s", $datasize/1048576, ($datasize-$lastdata)/1024/(time-$lasttime));
print "\r$done / $total, ";
printf("%.2f MB, %d KB/s", $datasize/1048576, ($datasize-$lastdata)/1024/(time-$lasttime));
print " ...";
($lastdata, $lasttime) = ($datasize, time);
}

View File

@ -150,6 +150,26 @@ EOT
# want that.
EOT
},
{
name => 'sphinx_host',
default => 'localhost',
desc => "# The DNS name of the host that the Sphinx server runs on.\n"
},
{
name => 'sphinx_index',
default => '',
desc => "# The name of Sphinx Search index to use for fulltext search ('' to disable Sphinx support)\n"
},
{
name => 'sphinx_port',
default => 0,
desc => "# Sphinx port (listening MySQL protocol)\n",
},
{
name => 'sphinx_sock',
default => '',
desc => "# Sphinx UNIX socket (listening MySQL protocol)\n",
},
{
name => 'db_check',
default => 1,

17
data/sphinx.conf Normal file
View File

@ -0,0 +1,17 @@
# Add this to your sphinx.conf to use Sphinx search
index bugs
{
type = rt
path = /var/lib/sphinxsearch/data/bugs
rt_field = short_desc
rt_field = comments
rt_field = comments_private
docinfo = extern
enable_star = 1
charset_type = utf-8
charset_table = 0..9, A..Z->a..z, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
blend_chars = _, -, &, +, @, $
morphology = stem_enru
min_word_len = 2
}