Bug 46221 - Russian Stemming in Bugzilla 3.2 Full-Text Search

[Bug 40933 - Bugzilla 3.2] git-svn-id: svn://svn.office.custis.ru/3rdparty/bugzilla.org/trunk@114 6955db30-a419-402b-8a0d-67ecbb4d7f56
2009-01-30 17:10:49 +00:00 · 2009-01-30 17:10:49 +00:00 · 4ca9210f7b
parent 182f38c7e3
commit 4ca9210f7b
7 changed files with 398 additions and 114 deletions
--- a/Bugzilla/Bug.pm
+++ b/Bugzilla/Bug.pm
@ -750,29 +750,34 @@ sub _extract_multi_selects {
 }

 # Should be called any time you update short_desc or change a comment.
-sub _sync_fulltext {
+sub _sync_fulltext
+{
    my ($self, $new_bug) = @_;
    my $dbh = Bugzilla->dbh;
-    if ($new_bug) {
-        $dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
-                  SELECT bug_id, short_desc FROM bugs WHERE bug_id = ?',
-                 undef, $self->id);
+    my ($short_desc) = $dbh->selectrow_array(
+        "SELECT short_desc FROM bugs WHERE bug_id=?", undef, $self->id
+    );
+    my @comments = @{ $dbh->selectall_arrayref(
+        "SELECT thetext, isprivate FROM longdescs WHERE bug_id=?",
+        undef, $self->id
+    ) || [] };
+    my @no_private = grep { !$_->[1] } @comments;
+    my $all = join "\n", map { $_->[0] } @comments;
+    my $nopriv = join "\n", map { $_->[0] } @no_private;
+    # Bug 46221 - Russian Stemming in Bugzilla fulltext search
+    {
+        use utf8;
+        $short_desc = stem_text($short_desc);
+        $all = stem_text($all);
+        $nopriv = stem_text($nopriv);
    }
-    else {
-        $dbh->do('UPDATE bugs_fulltext SET short_desc = ? WHERE bug_id = ?',
-                 undef, $self->short_desc, $self->id);
-    }
-    my $comments = $dbh->selectall_arrayref(
-        'SELECT thetext, isprivate FROM longdescs WHERE bug_id = ?',
-        undef, $self->id);
-    my $all = join("\n", map { $_->[0] } @$comments);
-    my @no_private = grep { !$_->[1] } @$comments;
-    my $nopriv_string = join("\n", map { $_->[0] } @no_private);
-    $dbh->do('UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
-               WHERE bug_id = ?', undef, $all, $nopriv_string, $self->id);
+    my $sql = "bugs_fulltext SET short_desc=?, comments=?, comments_noprivate=?";
+    my @bind = ($short_desc, $all, $nopriv, $self->id);
+    $sql = "INSERT INTO $sql, bug_id=?" if $new_bug;
+    $sql = "UPDATE $sql WHERE bug_id=?" unless $new_bug;
+    return $dbh->do($sql, undef, @bind);
 }

-
 # This is the correct way to delete bugs from the DB.
 # No bug should be deleted from anywhere else except from here.
 #
--- a/Bugzilla/DB/Mysql.pm
+++ b/Bugzilla/DB/Mysql.pm
@ -158,7 +158,7 @@ sub sql_limit {

 sub sql_string_concat {
    my ($self, @params) = @_;
-    
+
    return 'CONCAT(' . join(', ', @params) . ')';
 }

@ -167,7 +167,7 @@ sub sql_fulltext_search {

    # Add the boolean mode modifier if the search string contains
    # boolean operators.
-    my $mode = ($text =~ /[+\-<>()~*"]/ ? "IN BOOLEAN MODE" : "");
+    my $mode = ($text =~ /[+\-<>()~*\"]/ ? "IN BOOLEAN MODE" : "");

    # quote the text for use in the MATCH AGAINST expression
    $text = $self->quote($text);
@ -180,7 +180,7 @@ sub sql_fulltext_search {

 sub sql_istring {
    my ($self, $string) = @_;
-    
+
    return $string;
 }

--- a/Bugzilla/Install/DB.pm
+++ b/Bugzilla/Install/DB.pm
@ -2965,7 +2965,7 @@ sub _change_text_types {
    $dbh->bz_alter_column('namedqueries', 'query',
        { TYPE => 'LONGTEXT', NOTNULL => 1 });

-} 
+}

 sub _check_content_length {
    my ($table_name, $field_name, $max_length, $id_field) = @_;
@ -2993,67 +2993,58 @@ sub _check_content_length {
    }
 }

-sub _populate_bugs_fulltext {
+sub _populate_bugs_fulltext
+{
    my $dbh = Bugzilla->dbh;
-    my $fulltext = $dbh->selectrow_array('SELECT 1 FROM bugs_fulltext '
-                                         . $dbh->sql_limit(1));
+    my $fulltext = $dbh->selectrow_array
+        ("SELECT 1 FROM bugs_fulltext ".$dbh->sql_limit(1));
    # We only populate the table if it's empty...
-    if (!$fulltext) {
+    if (!$fulltext)
+    {
        # ... and if there are bugs in the bugs table.
-        my $bug_ids = $dbh->selectcol_arrayref('SELECT bug_id FROM bugs');
-        return if !@$bug_ids;
+        my @bug_ids = @{ $dbh->selectcol_arrayref("SELECT bug_id FROM bugs") };
+        return if !@bug_ids;

-        # Populating bugs_fulltext can be very slow for large installs,
-        # so we special-case any DB that supports GROUP_CONCAT, which is
-        # a much faster way to do things.
-        if (UNIVERSAL::can($dbh, 'sql_group_concat')) {
-            print "Populating bugs_fulltext...";
-            print " (this can take a long time.)\n";
-            # XXX This hack should probably be moved elsewhere.
-            if ($dbh->isa('Bugzilla::DB::Mysql')) {
-                $dbh->do('SET SESSION group_concat_max_len = 128000000');
-                $dbh->do('SET SESSION max_allowed_packet =   128000000');
+        # Bug 46221 - Russian Stemming in Bugzilla fulltext search
+        # We can't use GROUP_CONCAT because we need to stem each word
+        # And there could be tons of bugs, so we'll use N-bug portions
+        print "Populating bugs_fulltext... (this can take a long time.)\n";
+        my ($portion, $done, $total) = (256, 0, scalar @bug_ids);
+        my ($short, $all, $nopriv, $wh, $rows);
+        my ($sth, $sthn) = (undef, 0);
+        while (my @ids = splice @bug_ids, 0, $portion)
+        {
+            $rows = {};
+            $wh = "bug_id IN (" . join(",", ("?") x @ids) . ")";
+            ($short) = $dbh->selectall_arrayref(
+                "SELECT bug_id, short_desc FROM bugs WHERE $wh", undef, @ids
+            );
+            $all = $dbh->selectall_arrayref(
+                "SELECT bug_id, thetext, isprivate FROM longdescs WHERE $wh",
+                undef, @ids
+            );
+            $rows->{$_->[0]} = [ stem_text($_->[1]), '', '' ] for @$short;
+            for (@$all)
+            {
+                $_->[1] = stem_text($_->[1]);
+                $rows->{$_->[0]}->[1] .= $_->[1] . "\n";
+                $rows->{$_->[0]}->[2] .= $_->[1] . "\n"
+                    unless $_->[2];
            }
-            $dbh->do(
-                q{INSERT INTO bugs_fulltext (bug_id, short_desc, comments, 
-                                             comments_noprivate)
-                       SELECT bugs.bug_id, bugs.short_desc, }
-                     . $dbh->sql_group_concat('longdescs.thetext', '\'\n\'')
-              . ', ' . $dbh->sql_group_concat('nopriv.thetext',    '\'\n\'') .
-                      q{ FROM bugs 
-                              LEFT JOIN longdescs
-                                     ON bugs.bug_id = longdescs.bug_id
-                              LEFT JOIN longdescs AS nopriv
-                                     ON longdescs.comment_id = nopriv.comment_id
-                                        AND nopriv.isprivate = 0 }
-                     . $dbh->sql_group_by('bugs.bug_id', 'bugs.short_desc'));
-        }
-        # The slow way, without group_concat.
-        else {
-            print "Populating bugs_fulltext.short_desc...\n";
-            $dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
-                           SELECT bug_id, short_desc FROM bugs');
-
-            my $count = 1;
-            my $sth_all = $dbh->prepare('SELECT thetext FROM longdescs 
-                                          WHERE bug_id = ?');
-            my $sth_nopriv = $dbh->prepare(
-                'SELECT thetext FROM longdescs
-                  WHERE bug_id = ? AND isprivate = 0');
-            my $sth_update = $dbh->prepare(
-                'UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
-                  WHERE bug_id = ?');
-
-            print "Populating bugs_fulltext comment fields...\n";
-            foreach my $id (@$bug_ids) { 
-                my $all = $dbh->selectcol_arrayref($sth_all, undef, $id);
-                my $nopriv = $dbh->selectcol_arrayref($sth_nopriv, undef, $id);
-                $sth_update->execute(join("\n", @$all), join("\n", @$nopriv), $id);
-                indicate_progress({ total   => scalar @$bug_ids, every => 100,
-                                    current => $count++ });
+            if ($sthn != @ids)
+            {
+                # Optimization: cache prepared statements
+                $sthn = @ids;
+                $sth = $dbh->prepare(
+                    "INSERT INTO bugs_fulltext (bug_id, short_desc, comments, comments_noprivate)" .
+                    " VALUES " . join(",", ("(?,?,?,?)") x @ids)
+                );
            }
-            print "\n";
+            $sth->execute(map { ($_, @{$rows->{$_}}) } @ids);
+            $done += @ids;
+            print "\r$done / $total ...";
        }
+        print "\n";
    }
 }

--- a/Bugzilla/Search.pm
+++ b/Bugzilla/Search.pm
@ -751,8 +751,8 @@ sub init {
    my @supplist = (" ");
    foreach my $str (@supptables) {

-        if ($str =~ /^(LEFT|INNER|RIGHT)\s+JOIN/i) {
-            $str =~ /^(.*?)\s+ON\s+(.*)$/i;
+        if ($str =~ /^(LEFT|INNER|RIGHT)\s+JOIN/iso) {
+            $str =~ /^(.*?)\s+ON\s+(.*)$/iso;
            my ($leftside, $rightside) = ($1, $2);
            if (defined $suppseen{$leftside}) {
                $supplist[$suppseen{$leftside}] .= " AND ($rightside)";
@ -1208,7 +1208,7 @@ sub _long_desc_changedbefore_after {
    my ($chartid, $t, $v, $supptables, $term) =
        @func_args{qw(chartid t v supptables term)};
    my $dbh = Bugzilla->dbh;
-    
+
    my $operator = ($$t =~ /before/) ? '<' : '>';
    my $table = "longdescs_$$chartid";
    push(@$supptables, "LEFT JOIN longdescs AS $table " .
@ -1218,53 +1218,53 @@ sub _long_desc_changedbefore_after {
    $$term = "($table.bug_when IS NOT NULL)";
 }

-sub _content_matches {
+sub _content_matches
+{
    my $self = shift;
    my %func_args = @_;
    my ($chartid, $supptables, $term, $groupby, $fields, $v) =
        @func_args{qw(chartid supptables term groupby fields v)};
    my $dbh = Bugzilla->dbh;
-    
+
    # "content" is an alias for columns containing text for which we
-    # can search a full-text index and retrieve results by relevance, 
+    # can search a full-text index and retrieve results by relevance,
    # currently just bug comments (and summaries to some degree).
    # There's only one way to search a full-text index, so we only
    # accept the "matches" operator, which is specific to full-text
    # index searches.

-    # Add the fulltext table to the query so we can search on it.
+    my $l = FULLTEXT_BUGLIST_LIMIT;
    my $table = "bugs_fulltext_$$chartid";
    my $comments_col = "comments";
-    $comments_col = "comments_noprivate" unless $self->{'user'}->is_insider;
-    push(@$supptables, "LEFT JOIN bugs_fulltext AS $table " .
-                       "ON bugs.bug_id = $table.bug_id");
-    
+    $comments_col = "comments_noprivate" unless $self->{user}->is_insider;
+
    # Create search terms to add to the SELECT and WHERE clauses.
-    my ($term1, $rterm1) = $dbh->sql_fulltext_search("$table.$comments_col", 
-                                                        $$v, 1);
-    my ($term2, $rterm2) = $dbh->sql_fulltext_search("$table.short_desc", 
-                                                        $$v, 2);
-    $rterm1 = $term1 if !$rterm1;
-    $rterm2 = $term2 if !$rterm2;
+    my $text = stem_text($$v, 1);
+    my ($term1, $rterm1) =
+        $dbh->sql_fulltext_search("bugs_fulltext.$comments_col", $text, 1);
+    $rterm1 ||= $term1;
+    my ($term2, $rterm2) =
+        $dbh->sql_fulltext_search("bugs_fulltext.short_desc", $text, 2);
+    $rterm2 ||= $term2;

-    # The term to use in the WHERE clause.
-    $$term = "$term1 > 0 OR $term2 > 0";
-    
-    # In order to sort by relevance (in case the user requests it),
-    # we SELECT the relevance value and give it an alias so we can
-    # add it to the SORT BY clause when we build it in buglist.cgi.
-    my $select_term = "($rterm1 + $rterm2) AS relevance";
+    # Bug 46221 - Russian Stemming in Bugzilla fulltext search
+    # Bugzilla's fulltext search mechanism is bad because
+    # MATCH(...) OR MATCH(...) is very slow in MySQL - it doesn't do
+    # fulltext index merge optimization. So we'll use a derived UNION table.
+    push @$supptables,
+        "INNER JOIN (
+            SELECT bug_id, SUM(relevance) AS relevance FROM (
+                (SELECT bug_id, $rterm1 AS relevance
+                FROM bugs_fulltext WHERE $term1 > 0 LIMIT $l)
+                UNION ALL
+                (SELECT bug_id, $rterm2 AS relevance
+                FROM bugs_fulltext WHERE $term2 > 0 LIMIT $l)
+            ) AS ${table}_0
+            GROUP BY bug_id
+        ) AS $table ON bugs.bug_id = $table.bug_id";

-    # Users can specify to display the relevance field, in which case
-    # it'll show up in the list of fields being selected, and we need
-    # to replace that occurrence with our select term.  Otherwise
-    # we can just add the term to the list of fields being selected.
-    if (grep($_ eq "relevance", @$fields)) {
-        @$fields = map($_ eq "relevance" ? $select_term : $_ , @$fields);
-    }
-    else {
-        push(@$fields, $select_term);
-    }
+    # All work done by INNER JOIN
+    $$term = "1";
 }

 sub _timestamp_compare {
@ -1272,7 +1272,7 @@ sub _timestamp_compare {
    my %func_args = @_;
    my ($v, $q) = @func_args{qw(v q)};
    my $dbh = Bugzilla->dbh;
-    
+
    $$v = SqlifyDate($$v);
    $$q = $dbh->quote($$v);
 }
@ -1283,7 +1283,7 @@ sub _commenter_exact {
    my ($chartid, $sequence, $supptables, $term, $v) =
        @func_args{qw(chartid sequence supptables term v)};
    my $user = $self->{'user'};
-    
+
    $$v =~ m/(%\\w+%)/;
    my $match = pronoun($1, $user);
    my $chartseq = $$chartid;
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@ -44,13 +44,14 @@ use base qw(Exporter);
                             file_mod_time is_7bit_clean
                             bz_crypt generate_random_password
                             validate_email_syntax clean_text
-                             get_text disable_utf8);
+                             get_text disable_utf8 stem_text);

 use Bugzilla::Constants;

 use Date::Parse;
 use Date::Format;
 use Text::Wrap;
+use Lingua::Stem::RuUTF8;

 # This is from the perlsec page, slightly modified to remove a warning
 # From that page:
@ -495,7 +496,7 @@ sub generate_random_password {
 sub validate_email_syntax {
    my ($addr) = @_;
    my $match = Bugzilla->params->{'emailregexp'};
-    my $ret = ($addr =~ /$match/ && $addr !~ /[\\\(\)<>&,;:"\[\] \t\r\n]/);
+    my $ret = ($addr =~ /$match/ && $addr !~ /[\\\(\)<>&,;:\"\[\] \t\r\n]/);
    if ($ret) {
        # We assume these checks to suffice to consider the address untainted.
        trick_taint($_[0]);
@ -586,6 +587,27 @@ sub disable_utf8 {
    }
 }

+# Bug 46221 - Russian Stemming in Bugzilla fulltext search
+sub stem_text
+{
+    my ($text, $allow_verbatim) = @_;
+    $text = [ split /(?<=\w)(?=\W)|(?<=\W)(?=\w)/, $text ];
+    my $q = 0;
+    for (@$text)
+    {
+        unless (/\W/)
+        {
+            $_ = Lingua::Stem::RuUTF8::stem_word($_) unless $q;
+        }
+        elsif ($allow_verbatim)
+        {
+            # If $allow_verbatim is TRUE then text in "double quotes" doesn't stem
+            $q = ($q + tr/\"/\"/) % 2;
+        }
+    }
+    return join '', @$text;
+}
+
 1;

 __END__
--- a/Lingua/Stem/RuUTF8.pm
+++ b/Lingua/Stem/RuUTF8.pm
@ -0,0 +1,252 @@
+#!/usr/bin/perl
+# Lingua::Stem::Ru - UTF-8 ×ÅÒÓÉÑ
+
+package Lingua::Stem::RuUTF8;
+
+use utf8;
+use strict;
+use Exporter;
+use Carp;
+use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
+
+BEGIN {
+    @ISA         = qw (Exporter);
+    @EXPORT      = ();
+    @EXPORT_OK   = qw (stem stem_word clear_stem_cache stem_caching);
+    %EXPORT_TAGS = ();
+}
+$VERSION = "0.01";
+
+my $Stem_Caching  = 0;
+my $Stem_Cache    = {};
+
+my $VOWEL        = qr/Ð°ÐµÐ¸Ð¾ÑƒÑ‹Ñ<E280B9>ÑŽÑ<C5BD>/;
+my $PERFECTIVEGROUND = qr/((Ð¸Ð²|Ð¸Ð²ÑˆÐ¸|Ð¸Ð²ÑˆÐ¸Ñ<C2B8>ÑŒ|Ñ‹Ð²|Ñ‹Ð²ÑˆÐ¸|Ñ‹Ð²ÑˆÐ¸Ñ<C2B8>ÑŒ)|((?<=[Ð°Ñ<C2B0>])(Ð²|Ð²ÑˆÐ¸|Ð²ÑˆÐ¸Ñ<C2B8>ÑŒ)))$/;
+my $REFLEXIVE    = qr/(Ñ<>[Ñ<>ÑŒ])$/;
+my $ADJECTIVE    = qr/(ÐµÐµ|Ð¸Ðµ|Ñ‹Ðµ|Ð¾Ðµ|Ð¸Ð¼Ð¸|Ñ‹Ð¼Ð¸|ÐµÐ¹|Ð¸Ð¹|Ñ‹Ð¹|Ð¾Ð¹|ÐµÐ¼|Ð¸Ð¼|Ñ‹Ð¼|Ð¾Ð¼|ÐµÐ³Ð¾|Ð¾Ð³Ð¾|ÐµÐ¼Ñƒ|Ð¾Ð¼Ñƒ|Ð¸Ñ…|Ñ‹Ñ…|ÑƒÑŽ|ÑŽÑŽ|Ð°Ñ<C2B0>|Ñ<>Ñ<EFBFBD>|Ð¾ÑŽ|ÐµÑŽ)$/;
+my $PARTICIPLE   = qr/((Ð¸Ð²Ñˆ|Ñ‹Ð²Ñˆ|ÑƒÑŽÑ‰)|((?<=[Ð°Ñ<C2B0>])(ÐµÐ¼|Ð½Ð½|Ð²Ñˆ|ÑŽÑ‰|Ñ‰)))$/;
+my $VERB         = qr/((Ð¸Ð»Ð°|Ñ‹Ð»Ð°|ÐµÐ½Ð°|ÐµÐ¹Ñ‚Ðµ|ÑƒÐ¹Ñ‚Ðµ|Ð¸Ñ‚Ðµ|Ð¸Ð»Ð¸|Ñ‹Ð»Ð¸|ÐµÐ¹|ÑƒÐ¹|Ð¸Ð»|Ñ‹Ð»|Ð¸Ð¼|Ñ‹Ð¼|ÐµÐ½|Ð¸Ð»Ð¾|Ñ‹Ð»Ð¾|ÐµÐ½Ð¾|Ñ<>Ñ‚|ÑƒÐµÑ‚|ÑƒÑŽÑ‚|Ð¸Ñ‚|Ñ‹Ñ‚|ÐµÐ½Ñ‹|Ð¸Ñ‚ÑŒ|Ñ‹Ñ‚ÑŒ|Ð¸ÑˆÑŒ|ÑƒÑŽ|ÑŽ)|((?<=[Ð°Ñ<C2B0>])(Ð»Ð°|Ð½Ð°|ÐµÑ‚Ðµ|Ð¹Ñ‚Ðµ|Ð»Ð¸|Ð¹|Ð»|ÐµÐ¼|Ð½|Ð»Ð¾|Ð½Ð¾|ÐµÑ‚|ÑŽÑ‚|Ð½Ñ‹|Ñ‚ÑŒ|ÐµÑˆÑŒ|Ð½Ð½Ð¾)))$/;
+my $NOUN         = qr/(Ð°|ÐµÐ²|Ð¾Ð²|Ð¸Ðµ|ÑŒÐµ|Ðµ|Ð¸Ñ<C2B8>Ð¼Ð¸|Ñ<>Ð¼Ð¸|Ð°Ð¼Ð¸|ÐµÐ¸|Ð¸Ð¸|Ð¸|Ð¸ÐµÐ¹|ÐµÐ¹|Ð¾Ð¹|Ð¸Ð¹|Ð¹|Ð¸Ñ<C2B8>Ð¼|Ñ<>Ð¼|Ð¸ÐµÐ¼|ÐµÐ¼|Ð°Ð¼|Ð¾Ð¼|Ð¾|Ñƒ|Ð°Ñ…|Ð¸Ñ<C2B8>Ñ…|Ñ<>Ñ…|Ñ‹|ÑŒ|Ð¸ÑŽ|ÑŒÑŽ|ÑŽ|Ð¸Ñ<C2B8>|ÑŒÑ<C592>|Ñ<>)$/;
+my $RVRE         = qr/^(.*?[$VOWEL])(.*)$/;
+my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=Ð¾)Ñ<>Ñ‚ÑŒ?$/;
+
+sub stem {
+    return [] if ($#_ == -1);
+    my $parm_ref;
+    if (ref $_[0]) {
+        $parm_ref = shift;
+    } else {
+        $parm_ref = { @_ };
+    }
+
+    my $words      = [];
+    my $locale     = 'ru';
+    my $exceptions = {};
+    foreach (keys %$parm_ref) {
+        my $key = lc ($_);
+        if ($key eq '-words') {
+            @$words = @{$parm_ref->{$key}};
+        } elsif ($key eq '-exceptions') {
+            $exceptions = $parm_ref->{$key};
+        } elsif ($key eq '-locale') {
+            $locale = $parm_ref->{$key};
+        } else {
+            croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
+        }
+    }
+
+    local( $_ );
+    foreach (@$words) {
+        # Flatten case
+        $_ = lc $_;
+
+        # Check against exceptions list
+        if (exists $exceptions->{$_}) {
+            $_ = $exceptions->{$_};
+            next;
+        }
+
+        # Check against cache of stemmed words
+        my $original_word = $_;
+        if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
+            $_ = $Stem_Cache->{$original_word};
+            next;
+        }
+
+        $_ = stem_word($_);
+
+        $Stem_Cache->{$original_word} = $_ if $Stem_Caching;
+    }
+    $Stem_Cache = {} if ($Stem_Caching < 2);
+
+    return $words;
+}
+
+sub stem_word {
+    my $word = lc shift;
+
+    # Check against cache of stemmed words
+    if ($Stem_Caching && exists $Stem_Cache->{$word}) {
+        return $Stem_Cache->{$word};
+    }
+
+     my ($start, $RV) = $word =~ /$RVRE/;
+     return $word unless $RV;
+
+     # Step 1
+     unless ($RV =~ s/$PERFECTIVEGROUND//) {
+         $RV =~ s/$REFLEXIVE//;
+
+         if ($RV =~ s/$ADJECTIVE//) {
+             $RV =~ s/$PARTICIPLE//;
+         } else {
+             $RV =~ s/$NOUN// unless $RV =~ s/$VERB//;
+         }
+     }
+
+     # Step 2
+     $RV =~ s/Ð¸$//;
+
+     # Step 3
+     $RV =~ s/Ð¾Ñ<C2BE>Ñ‚ÑŒ?$// if $RV =~ /$DERIVATIONAL/;
+
+     # Step 4
+     unless ($RV =~ s/ÑŒ$//) {
+         $RV =~ s/ÐµÐ¹ÑˆÐµ?//;
+         $RV =~ s/Ð½Ð½$/Ð½/;	
+     }
+
+     return $start.$RV;
+}
+
+sub stem_caching {
+    my $parm_ref;
+    if (ref $_[0]) {
+        $parm_ref = shift;
+    } else {
+        $parm_ref = { @_ };
+    }
+    my $caching_level = $parm_ref->{-level};
+    if (defined $caching_level) {
+        if ($caching_level !~ m/^[012]$/) {
+            croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
+        }
+        $Stem_Caching = $caching_level;
+    }
+    return $Stem_Caching;
+}
+
+sub clear_stem_cache {
+    $Stem_Cache = {};
+}
+
+1;
+__END__
+
+=head1 NAME
+
+Lingua::Stem::RuUTF8 - Porter's stemming algorithm for Russian (UTF-8 only)
+
+=head1 SYNOPSIS
+
+    use Lingua::Stem::RuUTF8;
+    my $stems = Lingua::Stem::RuUTF8::stem({
+        -words => $word_list_reference,
+        -locale => 'ru',
+        -exceptions => $exceptions_hash,
+    });
+
+    my $stem = Lingua::Stem::RuUTF8::stem_word( $word );
+
+=head1 DESCRIPTION
+
+This module applies the Porter Stemming Algorithm to its parameters,
+returning the stemmed words.
+
+The algorithm is implemented exactly as described in:
+
+    http://snowball.tartarus.org/russian/stemmer.html
+
+The code is carefully crafted to work in conjunction with the L<Lingua::Stem>
+module by Benjamin Franz. This stemmer is also based 
+on the work of Aldo Capini, see L<Lingua::Stem::It>.
+
+=head1 METHODS
+
+=over 4
+
+=item stem({ -words => \@words, -locale => 'ru', -exceptions => \%exceptions });
+
+Stems a list of passed words. Returns an anonymous list reference to the stemmed
+words.
+
+Example:
+
+  my $stemmed_words = Lingua::Stem::RuUTF8::stem({
+      -words => \@words,
+      -locale => 'ru',
+      -exceptions => \%exceptions,
+  });
+
+=item stem_word( $word );
+
+Stems a single word and returns the stem directly.
+
+Example:
+
+  my $stem = Lingua::Stem::RuUTF8::stem_word( $word );
+
+=item stem_caching({ -level => 0|1|2 });
+
+Sets the level of stem caching.
+
+'0' means 'no caching'. This is the default level.
+
+'1' means 'cache per run'. This caches stemming results during a single
+    call to 'stem'.
+
+'2' means 'cache indefinitely'. This caches stemming results until
+    either the process exits or the 'clear_stem_cache' method is called.
+
+=item clear_stem_cache;
+
+Clears the cache of stemmed words
+
+=back
+
+=cut
+
+=head2 EXPORT
+
+None by default.
+
+=head1 HISTORY
+
+=over 8
+
+=item *
+
+0.01 (2004-05-21)
+
+=back
+
+=head1 AUTHOR
+
+Aleksandr Guidrevitch <pillgrim@mail.ru>
+
+=head1 SEE ALSO
+
+ Lingua::Stem
+
+=head1 COPYRIGHT
+
+Copyright (C) 2003 by Aldo Calpini <dada@perl.it>
+
+Copyright (C) 2004 by Aleksandr Guidrevitch <pillgrim@mail.ru>
+
+This software may be freely copied and distributed under the same
+terms and conditions as Perl itself, either Perl version 5.8.3
+or, at your option, any later version of Perl 5 you may
+have available..
+
+=cut
--- a/template/en/default/search/form.html.tmpl
+++ b/template/en/default/search/form.html.tmpl
@ -125,6 +125,20 @@ function doOnSelectProduct(selectmode) {
 [%# *** Summary *** %]

 <table>
+  <tr>
+    <th align="right" colspan="2">
+      <label for="content">Words (full-text search):</label>
+    </th>
+    <td>
+      <input name="content" size="40" id="content"
+             value="[% default.content.0 FILTER html %]">
+      <script type="text/javascript"> <!--
+          document.forms['queryform'].content.focus(); 
+      // -->
+      </script>
+    </td>
+  </tr>
+
  <tr>
    <th align="right">
      <label for="short_desc" accesskey="s"><u>S</u>ummary</label>: