Bug 46221 - Russian Stemming in Bugzilla 3.2 Full-Text Search

[Bug 40933 - Bugzilla 3.2]


git-svn-id: svn://svn.office.custis.ru/3rdparty/bugzilla.org/trunk@114 6955db30-a419-402b-8a0d-67ecbb4d7f56
custis
vfilippov 2009-01-30 17:10:49 +00:00
parent 182f38c7e3
commit 4ca9210f7b
7 changed files with 398 additions and 114 deletions

View File

@ -750,29 +750,34 @@ sub _extract_multi_selects {
}
# Should be called any time you update short_desc or change a comment.
sub _sync_fulltext {
sub _sync_fulltext
{
my ($self, $new_bug) = @_;
my $dbh = Bugzilla->dbh;
if ($new_bug) {
$dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
SELECT bug_id, short_desc FROM bugs WHERE bug_id = ?',
undef, $self->id);
my ($short_desc) = $dbh->selectrow_array(
"SELECT short_desc FROM bugs WHERE bug_id=?", undef, $self->id
);
my @comments = @{ $dbh->selectall_arrayref(
"SELECT thetext, isprivate FROM longdescs WHERE bug_id=?",
undef, $self->id
) || [] };
my @no_private = grep { !$_->[1] } @comments;
my $all = join "\n", map { $_->[0] } @comments;
my $nopriv = join "\n", map { $_->[0] } @no_private;
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
{
use utf8;
$short_desc = stem_text($short_desc);
$all = stem_text($all);
$nopriv = stem_text($nopriv);
}
else {
$dbh->do('UPDATE bugs_fulltext SET short_desc = ? WHERE bug_id = ?',
undef, $self->short_desc, $self->id);
}
my $comments = $dbh->selectall_arrayref(
'SELECT thetext, isprivate FROM longdescs WHERE bug_id = ?',
undef, $self->id);
my $all = join("\n", map { $_->[0] } @$comments);
my @no_private = grep { !$_->[1] } @$comments;
my $nopriv_string = join("\n", map { $_->[0] } @no_private);
$dbh->do('UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
WHERE bug_id = ?', undef, $all, $nopriv_string, $self->id);
my $sql = "bugs_fulltext SET short_desc=?, comments=?, comments_noprivate=?";
my @bind = ($short_desc, $all, $nopriv, $self->id);
$sql = "INSERT INTO $sql, bug_id=?" if $new_bug;
$sql = "UPDATE $sql WHERE bug_id=?" unless $new_bug;
return $dbh->do($sql, undef, @bind);
}
# This is the correct way to delete bugs from the DB.
# No bug should be deleted from anywhere else except from here.
#

View File

@ -158,7 +158,7 @@ sub sql_limit {
sub sql_string_concat {
my ($self, @params) = @_;
return 'CONCAT(' . join(', ', @params) . ')';
}
@ -167,7 +167,7 @@ sub sql_fulltext_search {
# Add the boolean mode modifier if the search string contains
# boolean operators.
my $mode = ($text =~ /[+\-<>()~*"]/ ? "IN BOOLEAN MODE" : "");
my $mode = ($text =~ /[+\-<>()~*\"]/ ? "IN BOOLEAN MODE" : "");
# quote the text for use in the MATCH AGAINST expression
$text = $self->quote($text);
@ -180,7 +180,7 @@ sub sql_fulltext_search {
sub sql_istring {
my ($self, $string) = @_;
return $string;
}

View File

@ -2965,7 +2965,7 @@ sub _change_text_types {
$dbh->bz_alter_column('namedqueries', 'query',
{ TYPE => 'LONGTEXT', NOTNULL => 1 });
}
}
sub _check_content_length {
my ($table_name, $field_name, $max_length, $id_field) = @_;
@ -2993,67 +2993,58 @@ sub _check_content_length {
}
}
sub _populate_bugs_fulltext {
sub _populate_bugs_fulltext
{
my $dbh = Bugzilla->dbh;
my $fulltext = $dbh->selectrow_array('SELECT 1 FROM bugs_fulltext '
. $dbh->sql_limit(1));
my $fulltext = $dbh->selectrow_array
("SELECT 1 FROM bugs_fulltext ".$dbh->sql_limit(1));
# We only populate the table if it's empty...
if (!$fulltext) {
if (!$fulltext)
{
# ... and if there are bugs in the bugs table.
my $bug_ids = $dbh->selectcol_arrayref('SELECT bug_id FROM bugs');
return if !@$bug_ids;
my @bug_ids = @{ $dbh->selectcol_arrayref("SELECT bug_id FROM bugs") };
return if !@bug_ids;
# Populating bugs_fulltext can be very slow for large installs,
# so we special-case any DB that supports GROUP_CONCAT, which is
# a much faster way to do things.
if (UNIVERSAL::can($dbh, 'sql_group_concat')) {
print "Populating bugs_fulltext...";
print " (this can take a long time.)\n";
# XXX This hack should probably be moved elsewhere.
if ($dbh->isa('Bugzilla::DB::Mysql')) {
$dbh->do('SET SESSION group_concat_max_len = 128000000');
$dbh->do('SET SESSION max_allowed_packet = 128000000');
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
# We can't use GROUP_CONCAT because we need to stem each word
# And there could be tons of bugs, so we'll use N-bug portions
print "Populating bugs_fulltext... (this can take a long time.)\n";
my ($portion, $done, $total) = (256, 0, scalar @bug_ids);
my ($short, $all, $nopriv, $wh, $rows);
my ($sth, $sthn) = (undef, 0);
while (my @ids = splice @bug_ids, 0, $portion)
{
$rows = {};
$wh = "bug_id IN (" . join(",", ("?") x @ids) . ")";
($short) = $dbh->selectall_arrayref(
"SELECT bug_id, short_desc FROM bugs WHERE $wh", undef, @ids
);
$all = $dbh->selectall_arrayref(
"SELECT bug_id, thetext, isprivate FROM longdescs WHERE $wh",
undef, @ids
);
$rows->{$_->[0]} = [ stem_text($_->[1]), '', '' ] for @$short;
for (@$all)
{
$_->[1] = stem_text($_->[1]);
$rows->{$_->[0]}->[1] .= $_->[1] . "\n";
$rows->{$_->[0]}->[2] .= $_->[1] . "\n"
unless $_->[2];
}
$dbh->do(
q{INSERT INTO bugs_fulltext (bug_id, short_desc, comments,
comments_noprivate)
SELECT bugs.bug_id, bugs.short_desc, }
. $dbh->sql_group_concat('longdescs.thetext', '\'\n\'')
. ', ' . $dbh->sql_group_concat('nopriv.thetext', '\'\n\'') .
q{ FROM bugs
LEFT JOIN longdescs
ON bugs.bug_id = longdescs.bug_id
LEFT JOIN longdescs AS nopriv
ON longdescs.comment_id = nopriv.comment_id
AND nopriv.isprivate = 0 }
. $dbh->sql_group_by('bugs.bug_id', 'bugs.short_desc'));
}
# The slow way, without group_concat.
else {
print "Populating bugs_fulltext.short_desc...\n";
$dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
SELECT bug_id, short_desc FROM bugs');
my $count = 1;
my $sth_all = $dbh->prepare('SELECT thetext FROM longdescs
WHERE bug_id = ?');
my $sth_nopriv = $dbh->prepare(
'SELECT thetext FROM longdescs
WHERE bug_id = ? AND isprivate = 0');
my $sth_update = $dbh->prepare(
'UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
WHERE bug_id = ?');
print "Populating bugs_fulltext comment fields...\n";
foreach my $id (@$bug_ids) {
my $all = $dbh->selectcol_arrayref($sth_all, undef, $id);
my $nopriv = $dbh->selectcol_arrayref($sth_nopriv, undef, $id);
$sth_update->execute(join("\n", @$all), join("\n", @$nopriv), $id);
indicate_progress({ total => scalar @$bug_ids, every => 100,
current => $count++ });
if ($sthn != @ids)
{
# Optimization: cache prepared statements
$sthn = @ids;
$sth = $dbh->prepare(
"INSERT INTO bugs_fulltext (bug_id, short_desc, comments, comments_noprivate)" .
" VALUES " . join(",", ("(?,?,?,?)") x @ids)
);
}
print "\n";
$sth->execute(map { ($_, @{$rows->{$_}}) } @ids);
$done += @ids;
print "\r$done / $total ...";
}
print "\n";
}
}

View File

@ -751,8 +751,8 @@ sub init {
my @supplist = (" ");
foreach my $str (@supptables) {
if ($str =~ /^(LEFT|INNER|RIGHT)\s+JOIN/i) {
$str =~ /^(.*?)\s+ON\s+(.*)$/i;
if ($str =~ /^(LEFT|INNER|RIGHT)\s+JOIN/iso) {
$str =~ /^(.*?)\s+ON\s+(.*)$/iso;
my ($leftside, $rightside) = ($1, $2);
if (defined $suppseen{$leftside}) {
$supplist[$suppseen{$leftside}] .= " AND ($rightside)";
@ -1208,7 +1208,7 @@ sub _long_desc_changedbefore_after {
my ($chartid, $t, $v, $supptables, $term) =
@func_args{qw(chartid t v supptables term)};
my $dbh = Bugzilla->dbh;
my $operator = ($$t =~ /before/) ? '<' : '>';
my $table = "longdescs_$$chartid";
push(@$supptables, "LEFT JOIN longdescs AS $table " .
@ -1218,53 +1218,53 @@ sub _long_desc_changedbefore_after {
$$term = "($table.bug_when IS NOT NULL)";
}
sub _content_matches {
sub _content_matches
{
my $self = shift;
my %func_args = @_;
my ($chartid, $supptables, $term, $groupby, $fields, $v) =
@func_args{qw(chartid supptables term groupby fields v)};
my $dbh = Bugzilla->dbh;
# "content" is an alias for columns containing text for which we
# can search a full-text index and retrieve results by relevance,
# can search a full-text index and retrieve results by relevance,
# currently just bug comments (and summaries to some degree).
# There's only one way to search a full-text index, so we only
# accept the "matches" operator, which is specific to full-text
# index searches.
# Add the fulltext table to the query so we can search on it.
my $l = FULLTEXT_BUGLIST_LIMIT;
my $table = "bugs_fulltext_$$chartid";
my $comments_col = "comments";
$comments_col = "comments_noprivate" unless $self->{'user'}->is_insider;
push(@$supptables, "LEFT JOIN bugs_fulltext AS $table " .
"ON bugs.bug_id = $table.bug_id");
$comments_col = "comments_noprivate" unless $self->{user}->is_insider;
# Create search terms to add to the SELECT and WHERE clauses.
my ($term1, $rterm1) = $dbh->sql_fulltext_search("$table.$comments_col",
$$v, 1);
my ($term2, $rterm2) = $dbh->sql_fulltext_search("$table.short_desc",
$$v, 2);
$rterm1 = $term1 if !$rterm1;
$rterm2 = $term2 if !$rterm2;
my $text = stem_text($$v, 1);
my ($term1, $rterm1) =
$dbh->sql_fulltext_search("bugs_fulltext.$comments_col", $text, 1);
$rterm1 ||= $term1;
my ($term2, $rterm2) =
$dbh->sql_fulltext_search("bugs_fulltext.short_desc", $text, 2);
$rterm2 ||= $term2;
# The term to use in the WHERE clause.
$$term = "$term1 > 0 OR $term2 > 0";
# In order to sort by relevance (in case the user requests it),
# we SELECT the relevance value and give it an alias so we can
# add it to the SORT BY clause when we build it in buglist.cgi.
my $select_term = "($rterm1 + $rterm2) AS relevance";
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
# Bugzilla's fulltext search mechanism is bad because
# MATCH(...) OR MATCH(...) is very slow in MySQL - it doesn't do
# fulltext index merge optimization. So we'll use a derived UNION table.
push @$supptables,
"INNER JOIN (
SELECT bug_id, SUM(relevance) AS relevance FROM (
(SELECT bug_id, $rterm1 AS relevance
FROM bugs_fulltext WHERE $term1 > 0 LIMIT $l)
UNION ALL
(SELECT bug_id, $rterm2 AS relevance
FROM bugs_fulltext WHERE $term2 > 0 LIMIT $l)
) AS ${table}_0
GROUP BY bug_id
) AS $table ON bugs.bug_id = $table.bug_id";
# Users can specify to display the relevance field, in which case
# it'll show up in the list of fields being selected, and we need
# to replace that occurrence with our select term. Otherwise
# we can just add the term to the list of fields being selected.
if (grep($_ eq "relevance", @$fields)) {
@$fields = map($_ eq "relevance" ? $select_term : $_ , @$fields);
}
else {
push(@$fields, $select_term);
}
# All work done by INNER JOIN
$$term = "1";
}
sub _timestamp_compare {
@ -1272,7 +1272,7 @@ sub _timestamp_compare {
my %func_args = @_;
my ($v, $q) = @func_args{qw(v q)};
my $dbh = Bugzilla->dbh;
$$v = SqlifyDate($$v);
$$q = $dbh->quote($$v);
}
@ -1283,7 +1283,7 @@ sub _commenter_exact {
my ($chartid, $sequence, $supptables, $term, $v) =
@func_args{qw(chartid sequence supptables term v)};
my $user = $self->{'user'};
$$v =~ m/(%\\w+%)/;
my $match = pronoun($1, $user);
my $chartseq = $$chartid;

View File

@ -44,13 +44,14 @@ use base qw(Exporter);
file_mod_time is_7bit_clean
bz_crypt generate_random_password
validate_email_syntax clean_text
get_text disable_utf8);
get_text disable_utf8 stem_text);
use Bugzilla::Constants;
use Date::Parse;
use Date::Format;
use Text::Wrap;
use Lingua::Stem::RuUTF8;
# This is from the perlsec page, slightly modified to remove a warning
# From that page:
@ -495,7 +496,7 @@ sub generate_random_password {
sub validate_email_syntax {
my ($addr) = @_;
my $match = Bugzilla->params->{'emailregexp'};
my $ret = ($addr =~ /$match/ && $addr !~ /[\\\(\)<>&,;:"\[\] \t\r\n]/);
my $ret = ($addr =~ /$match/ && $addr !~ /[\\\(\)<>&,;:\"\[\] \t\r\n]/);
if ($ret) {
# We assume these checks to suffice to consider the address untainted.
trick_taint($_[0]);
@ -586,6 +587,27 @@ sub disable_utf8 {
}
}
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
sub stem_text
{
my ($text, $allow_verbatim) = @_;
$text = [ split /(?<=\w)(?=\W)|(?<=\W)(?=\w)/, $text ];
my $q = 0;
for (@$text)
{
unless (/\W/)
{
$_ = Lingua::Stem::RuUTF8::stem_word($_) unless $q;
}
elsif ($allow_verbatim)
{
# If $allow_verbatim is TRUE then text in "double quotes" doesn't stem
$q = ($q + tr/\"/\"/) % 2;
}
}
return join '', @$text;
}
1;
__END__

252
Lingua/Stem/RuUTF8.pm Normal file
View File

@ -0,0 +1,252 @@
#!/usr/bin/perl
# Lingua::Stem::Ru - UTF-8 ×ÅÒÓÉÑ
package Lingua::Stem::RuUTF8;
use utf8;
use strict;
use Exporter;
use Carp;
use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
BEGIN {
@ISA = qw (Exporter);
@EXPORT = ();
@EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching);
%EXPORT_TAGS = ();
}
$VERSION = "0.01";
my $Stem_Caching = 0;
my $Stem_Cache = {};
my $VOWEL = qr/аеиоуÑÑ<E280B9>ÑŽÑ<C5BD>/;
my $PERFECTIVEGROUND = qr/((ив|ивши|ившиÑ<C2B8>ÑŒ|ыв|ывши|ÑвшиÑ<C2B8>ÑŒ)|((?<=[аÑ<C2B0>])(в|вши|вшиÑ<C2B8>ÑŒ)))$/;
my $REFLEXIVE = qr/(Ñ<><>ÑŒ])$/;
my $ADJECTIVE = qr/(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|Ñ‹Ñ…|ую|ÑŽÑŽ|аÑ<C2B0><>Ñ<EFBFBD>|ою|ею)$/;
my $PARTICIPLE = qr/((ивш|ывш|ующ)|((?<=[аÑ<C2B0>])(ем|нн|вш|ющ|щ)))$/;
my $VERB = qr/((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|Ñ<>Ñ|ует|уют|ит|Ñ‹Ñ‚|ены|ить|Ñ‹Ñ‚ÑŒ|ишь|ую|ÑŽ)|((?<=[аÑ<C2B0>])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ÑŽÑ‚|ны|Ñ‚ÑŒ|ешь|нно)))$/;
my $NOUN = qr/(а|ев|ов|ие|ье|е|иÑ<C2B8>ми<>ми|ами|еи|ии|и|ией|ей|ой|ий|й|иÑ<C2B8>м|Ñ<>м|ием|ем|ам|ом|о|у|ах|иÑ<C2B8>Ñ…|Ñ<>Ñ…|Ñ‹|ÑŒ|ию|ью|ÑŽ|иÑ<C2B8>|ÑŒÑ<C592><>)$/;
my $RVRE = qr/^(.*?[$VOWEL])(.*)$/;
my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=о)Ñ<>ÑÑŒ?$/;
sub stem {
return [] if ($#_ == -1);
my $parm_ref;
if (ref $_[0]) {
$parm_ref = shift;
} else {
$parm_ref = { @_ };
}
my $words = [];
my $locale = 'ru';
my $exceptions = {};
foreach (keys %$parm_ref) {
my $key = lc ($_);
if ($key eq '-words') {
@$words = @{$parm_ref->{$key}};
} elsif ($key eq '-exceptions') {
$exceptions = $parm_ref->{$key};
} elsif ($key eq '-locale') {
$locale = $parm_ref->{$key};
} else {
croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
}
}
local( $_ );
foreach (@$words) {
# Flatten case
$_ = lc $_;
# Check against exceptions list
if (exists $exceptions->{$_}) {
$_ = $exceptions->{$_};
next;
}
# Check against cache of stemmed words
my $original_word = $_;
if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
$_ = $Stem_Cache->{$original_word};
next;
}
$_ = stem_word($_);
$Stem_Cache->{$original_word} = $_ if $Stem_Caching;
}
$Stem_Cache = {} if ($Stem_Caching < 2);
return $words;
}
sub stem_word {
my $word = lc shift;
# Check against cache of stemmed words
if ($Stem_Caching && exists $Stem_Cache->{$word}) {
return $Stem_Cache->{$word};
}
my ($start, $RV) = $word =~ /$RVRE/;
return $word unless $RV;
# Step 1
unless ($RV =~ s/$PERFECTIVEGROUND//) {
$RV =~ s/$REFLEXIVE//;
if ($RV =~ s/$ADJECTIVE//) {
$RV =~ s/$PARTICIPLE//;
} else {
$RV =~ s/$NOUN// unless $RV =~ s/$VERB//;
}
}
# Step 2
$RV =~ s/и$//;
# Step 3
$RV =~ s/оÑ<C2BE>ÑÑŒ?$// if $RV =~ /$DERIVATIONAL/;
# Step 4
unless ($RV =~ s/ь$//) {
$RV =~ s/ейше?//;
$RV =~ s/нн$/н/;
}
return $start.$RV;
}
sub stem_caching {
my $parm_ref;
if (ref $_[0]) {
$parm_ref = shift;
} else {
$parm_ref = { @_ };
}
my $caching_level = $parm_ref->{-level};
if (defined $caching_level) {
if ($caching_level !~ m/^[012]$/) {
croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
}
$Stem_Caching = $caching_level;
}
return $Stem_Caching;
}
sub clear_stem_cache {
$Stem_Cache = {};
}
1;
__END__
=head1 NAME
Lingua::Stem::RuUTF8 - Porter's stemming algorithm for Russian (UTF-8 only)
=head1 SYNOPSIS
use Lingua::Stem::RuUTF8;
my $stems = Lingua::Stem::RuUTF8::stem({
-words => $word_list_reference,
-locale => 'ru',
-exceptions => $exceptions_hash,
});
my $stem = Lingua::Stem::RuUTF8::stem_word( $word );
=head1 DESCRIPTION
This module applies the Porter Stemming Algorithm to its parameters,
returning the stemmed words.
The algorithm is implemented exactly as described in:
http://snowball.tartarus.org/russian/stemmer.html
The code is carefully crafted to work in conjunction with the L<Lingua::Stem>
module by Benjamin Franz. This stemmer is also based
on the work of Aldo Capini, see L<Lingua::Stem::It>.
=head1 METHODS
=over 4
=item stem({ -words => \@words, -locale => 'ru', -exceptions => \%exceptions });
Stems a list of passed words. Returns an anonymous list reference to the stemmed
words.
Example:
my $stemmed_words = Lingua::Stem::RuUTF8::stem({
-words => \@words,
-locale => 'ru',
-exceptions => \%exceptions,
});
=item stem_word( $word );
Stems a single word and returns the stem directly.
Example:
my $stem = Lingua::Stem::RuUTF8::stem_word( $word );
=item stem_caching({ -level => 0|1|2 });
Sets the level of stem caching.
'0' means 'no caching'. This is the default level.
'1' means 'cache per run'. This caches stemming results during a single
call to 'stem'.
'2' means 'cache indefinitely'. This caches stemming results until
either the process exits or the 'clear_stem_cache' method is called.
=item clear_stem_cache;
Clears the cache of stemmed words
=back
=cut
=head2 EXPORT
None by default.
=head1 HISTORY
=over 8
=item *
0.01 (2004-05-21)
=back
=head1 AUTHOR
Aleksandr Guidrevitch <pillgrim@mail.ru>
=head1 SEE ALSO
Lingua::Stem
=head1 COPYRIGHT
Copyright (C) 2003 by Aldo Calpini <dada@perl.it>
Copyright (C) 2004 by Aleksandr Guidrevitch <pillgrim@mail.ru>
This software may be freely copied and distributed under the same
terms and conditions as Perl itself, either Perl version 5.8.3
or, at your option, any later version of Perl 5 you may
have available..
=cut

View File

@ -125,6 +125,20 @@ function doOnSelectProduct(selectmode) {
[%# *** Summary *** %]
<table>
<tr>
<th align="right" colspan="2">
<label for="content">Words (full-text search):</label>
</th>
<td>
<input name="content" size="40" id="content"
value="[% default.content.0 FILTER html %]">
<script type="text/javascript"> <!--
document.forms['queryform'].content.focus();
// -->
</script>
</td>
</tr>
<tr>
<th align="right">
<label for="short_desc" accesskey="s"><u>S</u>ummary</label>: