Bug 46221 - Russian Stemming in Bugzilla 3.2 Full-Text Search
[Bug 40933 - Bugzilla 3.2] git-svn-id: svn://svn.office.custis.ru/3rdparty/bugzilla.org/trunk@114 6955db30-a419-402b-8a0d-67ecbb4d7f56custis
parent
182f38c7e3
commit
4ca9210f7b
|
@ -750,29 +750,34 @@ sub _extract_multi_selects {
|
|||
}
|
||||
|
||||
# Should be called any time you update short_desc or change a comment.
|
||||
sub _sync_fulltext {
|
||||
sub _sync_fulltext
|
||||
{
|
||||
my ($self, $new_bug) = @_;
|
||||
my $dbh = Bugzilla->dbh;
|
||||
if ($new_bug) {
|
||||
$dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
|
||||
SELECT bug_id, short_desc FROM bugs WHERE bug_id = ?',
|
||||
undef, $self->id);
|
||||
my ($short_desc) = $dbh->selectrow_array(
|
||||
"SELECT short_desc FROM bugs WHERE bug_id=?", undef, $self->id
|
||||
);
|
||||
my @comments = @{ $dbh->selectall_arrayref(
|
||||
"SELECT thetext, isprivate FROM longdescs WHERE bug_id=?",
|
||||
undef, $self->id
|
||||
) || [] };
|
||||
my @no_private = grep { !$_->[1] } @comments;
|
||||
my $all = join "\n", map { $_->[0] } @comments;
|
||||
my $nopriv = join "\n", map { $_->[0] } @no_private;
|
||||
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
|
||||
{
|
||||
use utf8;
|
||||
$short_desc = stem_text($short_desc);
|
||||
$all = stem_text($all);
|
||||
$nopriv = stem_text($nopriv);
|
||||
}
|
||||
else {
|
||||
$dbh->do('UPDATE bugs_fulltext SET short_desc = ? WHERE bug_id = ?',
|
||||
undef, $self->short_desc, $self->id);
|
||||
}
|
||||
my $comments = $dbh->selectall_arrayref(
|
||||
'SELECT thetext, isprivate FROM longdescs WHERE bug_id = ?',
|
||||
undef, $self->id);
|
||||
my $all = join("\n", map { $_->[0] } @$comments);
|
||||
my @no_private = grep { !$_->[1] } @$comments;
|
||||
my $nopriv_string = join("\n", map { $_->[0] } @no_private);
|
||||
$dbh->do('UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
|
||||
WHERE bug_id = ?', undef, $all, $nopriv_string, $self->id);
|
||||
my $sql = "bugs_fulltext SET short_desc=?, comments=?, comments_noprivate=?";
|
||||
my @bind = ($short_desc, $all, $nopriv, $self->id);
|
||||
$sql = "INSERT INTO $sql, bug_id=?" if $new_bug;
|
||||
$sql = "UPDATE $sql WHERE bug_id=?" unless $new_bug;
|
||||
return $dbh->do($sql, undef, @bind);
|
||||
}
|
||||
|
||||
|
||||
# This is the correct way to delete bugs from the DB.
|
||||
# No bug should be deleted from anywhere else except from here.
|
||||
#
|
||||
|
|
|
@ -158,7 +158,7 @@ sub sql_limit {
|
|||
|
||||
sub sql_string_concat {
|
||||
my ($self, @params) = @_;
|
||||
|
||||
|
||||
return 'CONCAT(' . join(', ', @params) . ')';
|
||||
}
|
||||
|
||||
|
@ -167,7 +167,7 @@ sub sql_fulltext_search {
|
|||
|
||||
# Add the boolean mode modifier if the search string contains
|
||||
# boolean operators.
|
||||
my $mode = ($text =~ /[+\-<>()~*"]/ ? "IN BOOLEAN MODE" : "");
|
||||
my $mode = ($text =~ /[+\-<>()~*\"]/ ? "IN BOOLEAN MODE" : "");
|
||||
|
||||
# quote the text for use in the MATCH AGAINST expression
|
||||
$text = $self->quote($text);
|
||||
|
@ -180,7 +180,7 @@ sub sql_fulltext_search {
|
|||
|
||||
sub sql_istring {
|
||||
my ($self, $string) = @_;
|
||||
|
||||
|
||||
return $string;
|
||||
}
|
||||
|
||||
|
|
|
@ -2965,7 +2965,7 @@ sub _change_text_types {
|
|||
$dbh->bz_alter_column('namedqueries', 'query',
|
||||
{ TYPE => 'LONGTEXT', NOTNULL => 1 });
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
sub _check_content_length {
|
||||
my ($table_name, $field_name, $max_length, $id_field) = @_;
|
||||
|
@ -2993,67 +2993,58 @@ sub _check_content_length {
|
|||
}
|
||||
}
|
||||
|
||||
sub _populate_bugs_fulltext {
|
||||
sub _populate_bugs_fulltext
|
||||
{
|
||||
my $dbh = Bugzilla->dbh;
|
||||
my $fulltext = $dbh->selectrow_array('SELECT 1 FROM bugs_fulltext '
|
||||
. $dbh->sql_limit(1));
|
||||
my $fulltext = $dbh->selectrow_array
|
||||
("SELECT 1 FROM bugs_fulltext ".$dbh->sql_limit(1));
|
||||
# We only populate the table if it's empty...
|
||||
if (!$fulltext) {
|
||||
if (!$fulltext)
|
||||
{
|
||||
# ... and if there are bugs in the bugs table.
|
||||
my $bug_ids = $dbh->selectcol_arrayref('SELECT bug_id FROM bugs');
|
||||
return if !@$bug_ids;
|
||||
my @bug_ids = @{ $dbh->selectcol_arrayref("SELECT bug_id FROM bugs") };
|
||||
return if !@bug_ids;
|
||||
|
||||
# Populating bugs_fulltext can be very slow for large installs,
|
||||
# so we special-case any DB that supports GROUP_CONCAT, which is
|
||||
# a much faster way to do things.
|
||||
if (UNIVERSAL::can($dbh, 'sql_group_concat')) {
|
||||
print "Populating bugs_fulltext...";
|
||||
print " (this can take a long time.)\n";
|
||||
# XXX This hack should probably be moved elsewhere.
|
||||
if ($dbh->isa('Bugzilla::DB::Mysql')) {
|
||||
$dbh->do('SET SESSION group_concat_max_len = 128000000');
|
||||
$dbh->do('SET SESSION max_allowed_packet = 128000000');
|
||||
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
|
||||
# We can't use GROUP_CONCAT because we need to stem each word
|
||||
# And there could be tons of bugs, so we'll use N-bug portions
|
||||
print "Populating bugs_fulltext... (this can take a long time.)\n";
|
||||
my ($portion, $done, $total) = (256, 0, scalar @bug_ids);
|
||||
my ($short, $all, $nopriv, $wh, $rows);
|
||||
my ($sth, $sthn) = (undef, 0);
|
||||
while (my @ids = splice @bug_ids, 0, $portion)
|
||||
{
|
||||
$rows = {};
|
||||
$wh = "bug_id IN (" . join(",", ("?") x @ids) . ")";
|
||||
($short) = $dbh->selectall_arrayref(
|
||||
"SELECT bug_id, short_desc FROM bugs WHERE $wh", undef, @ids
|
||||
);
|
||||
$all = $dbh->selectall_arrayref(
|
||||
"SELECT bug_id, thetext, isprivate FROM longdescs WHERE $wh",
|
||||
undef, @ids
|
||||
);
|
||||
$rows->{$_->[0]} = [ stem_text($_->[1]), '', '' ] for @$short;
|
||||
for (@$all)
|
||||
{
|
||||
$_->[1] = stem_text($_->[1]);
|
||||
$rows->{$_->[0]}->[1] .= $_->[1] . "\n";
|
||||
$rows->{$_->[0]}->[2] .= $_->[1] . "\n"
|
||||
unless $_->[2];
|
||||
}
|
||||
$dbh->do(
|
||||
q{INSERT INTO bugs_fulltext (bug_id, short_desc, comments,
|
||||
comments_noprivate)
|
||||
SELECT bugs.bug_id, bugs.short_desc, }
|
||||
. $dbh->sql_group_concat('longdescs.thetext', '\'\n\'')
|
||||
. ', ' . $dbh->sql_group_concat('nopriv.thetext', '\'\n\'') .
|
||||
q{ FROM bugs
|
||||
LEFT JOIN longdescs
|
||||
ON bugs.bug_id = longdescs.bug_id
|
||||
LEFT JOIN longdescs AS nopriv
|
||||
ON longdescs.comment_id = nopriv.comment_id
|
||||
AND nopriv.isprivate = 0 }
|
||||
. $dbh->sql_group_by('bugs.bug_id', 'bugs.short_desc'));
|
||||
}
|
||||
# The slow way, without group_concat.
|
||||
else {
|
||||
print "Populating bugs_fulltext.short_desc...\n";
|
||||
$dbh->do('INSERT INTO bugs_fulltext (bug_id, short_desc)
|
||||
SELECT bug_id, short_desc FROM bugs');
|
||||
|
||||
my $count = 1;
|
||||
my $sth_all = $dbh->prepare('SELECT thetext FROM longdescs
|
||||
WHERE bug_id = ?');
|
||||
my $sth_nopriv = $dbh->prepare(
|
||||
'SELECT thetext FROM longdescs
|
||||
WHERE bug_id = ? AND isprivate = 0');
|
||||
my $sth_update = $dbh->prepare(
|
||||
'UPDATE bugs_fulltext SET comments = ?, comments_noprivate = ?
|
||||
WHERE bug_id = ?');
|
||||
|
||||
print "Populating bugs_fulltext comment fields...\n";
|
||||
foreach my $id (@$bug_ids) {
|
||||
my $all = $dbh->selectcol_arrayref($sth_all, undef, $id);
|
||||
my $nopriv = $dbh->selectcol_arrayref($sth_nopriv, undef, $id);
|
||||
$sth_update->execute(join("\n", @$all), join("\n", @$nopriv), $id);
|
||||
indicate_progress({ total => scalar @$bug_ids, every => 100,
|
||||
current => $count++ });
|
||||
if ($sthn != @ids)
|
||||
{
|
||||
# Optimization: cache prepared statements
|
||||
$sthn = @ids;
|
||||
$sth = $dbh->prepare(
|
||||
"INSERT INTO bugs_fulltext (bug_id, short_desc, comments, comments_noprivate)" .
|
||||
" VALUES " . join(",", ("(?,?,?,?)") x @ids)
|
||||
);
|
||||
}
|
||||
print "\n";
|
||||
$sth->execute(map { ($_, @{$rows->{$_}}) } @ids);
|
||||
$done += @ids;
|
||||
print "\r$done / $total ...";
|
||||
}
|
||||
print "\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -751,8 +751,8 @@ sub init {
|
|||
my @supplist = (" ");
|
||||
foreach my $str (@supptables) {
|
||||
|
||||
if ($str =~ /^(LEFT|INNER|RIGHT)\s+JOIN/i) {
|
||||
$str =~ /^(.*?)\s+ON\s+(.*)$/i;
|
||||
if ($str =~ /^(LEFT|INNER|RIGHT)\s+JOIN/iso) {
|
||||
$str =~ /^(.*?)\s+ON\s+(.*)$/iso;
|
||||
my ($leftside, $rightside) = ($1, $2);
|
||||
if (defined $suppseen{$leftside}) {
|
||||
$supplist[$suppseen{$leftside}] .= " AND ($rightside)";
|
||||
|
@ -1208,7 +1208,7 @@ sub _long_desc_changedbefore_after {
|
|||
my ($chartid, $t, $v, $supptables, $term) =
|
||||
@func_args{qw(chartid t v supptables term)};
|
||||
my $dbh = Bugzilla->dbh;
|
||||
|
||||
|
||||
my $operator = ($$t =~ /before/) ? '<' : '>';
|
||||
my $table = "longdescs_$$chartid";
|
||||
push(@$supptables, "LEFT JOIN longdescs AS $table " .
|
||||
|
@ -1218,53 +1218,53 @@ sub _long_desc_changedbefore_after {
|
|||
$$term = "($table.bug_when IS NOT NULL)";
|
||||
}
|
||||
|
||||
sub _content_matches {
|
||||
sub _content_matches
|
||||
{
|
||||
my $self = shift;
|
||||
my %func_args = @_;
|
||||
my ($chartid, $supptables, $term, $groupby, $fields, $v) =
|
||||
@func_args{qw(chartid supptables term groupby fields v)};
|
||||
my $dbh = Bugzilla->dbh;
|
||||
|
||||
|
||||
# "content" is an alias for columns containing text for which we
|
||||
# can search a full-text index and retrieve results by relevance,
|
||||
# can search a full-text index and retrieve results by relevance,
|
||||
# currently just bug comments (and summaries to some degree).
|
||||
# There's only one way to search a full-text index, so we only
|
||||
# accept the "matches" operator, which is specific to full-text
|
||||
# index searches.
|
||||
|
||||
# Add the fulltext table to the query so we can search on it.
|
||||
my $l = FULLTEXT_BUGLIST_LIMIT;
|
||||
my $table = "bugs_fulltext_$$chartid";
|
||||
my $comments_col = "comments";
|
||||
$comments_col = "comments_noprivate" unless $self->{'user'}->is_insider;
|
||||
push(@$supptables, "LEFT JOIN bugs_fulltext AS $table " .
|
||||
"ON bugs.bug_id = $table.bug_id");
|
||||
|
||||
$comments_col = "comments_noprivate" unless $self->{user}->is_insider;
|
||||
|
||||
# Create search terms to add to the SELECT and WHERE clauses.
|
||||
my ($term1, $rterm1) = $dbh->sql_fulltext_search("$table.$comments_col",
|
||||
$$v, 1);
|
||||
my ($term2, $rterm2) = $dbh->sql_fulltext_search("$table.short_desc",
|
||||
$$v, 2);
|
||||
$rterm1 = $term1 if !$rterm1;
|
||||
$rterm2 = $term2 if !$rterm2;
|
||||
my $text = stem_text($$v, 1);
|
||||
my ($term1, $rterm1) =
|
||||
$dbh->sql_fulltext_search("bugs_fulltext.$comments_col", $text, 1);
|
||||
$rterm1 ||= $term1;
|
||||
my ($term2, $rterm2) =
|
||||
$dbh->sql_fulltext_search("bugs_fulltext.short_desc", $text, 2);
|
||||
$rterm2 ||= $term2;
|
||||
|
||||
# The term to use in the WHERE clause.
|
||||
$$term = "$term1 > 0 OR $term2 > 0";
|
||||
|
||||
# In order to sort by relevance (in case the user requests it),
|
||||
# we SELECT the relevance value and give it an alias so we can
|
||||
# add it to the SORT BY clause when we build it in buglist.cgi.
|
||||
my $select_term = "($rterm1 + $rterm2) AS relevance";
|
||||
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
|
||||
# Bugzilla's fulltext search mechanism is bad because
|
||||
# MATCH(...) OR MATCH(...) is very slow in MySQL - it doesn't do
|
||||
# fulltext index merge optimization. So we'll use a derived UNION table.
|
||||
push @$supptables,
|
||||
"INNER JOIN (
|
||||
SELECT bug_id, SUM(relevance) AS relevance FROM (
|
||||
(SELECT bug_id, $rterm1 AS relevance
|
||||
FROM bugs_fulltext WHERE $term1 > 0 LIMIT $l)
|
||||
UNION ALL
|
||||
(SELECT bug_id, $rterm2 AS relevance
|
||||
FROM bugs_fulltext WHERE $term2 > 0 LIMIT $l)
|
||||
) AS ${table}_0
|
||||
GROUP BY bug_id
|
||||
) AS $table ON bugs.bug_id = $table.bug_id";
|
||||
|
||||
# Users can specify to display the relevance field, in which case
|
||||
# it'll show up in the list of fields being selected, and we need
|
||||
# to replace that occurrence with our select term. Otherwise
|
||||
# we can just add the term to the list of fields being selected.
|
||||
if (grep($_ eq "relevance", @$fields)) {
|
||||
@$fields = map($_ eq "relevance" ? $select_term : $_ , @$fields);
|
||||
}
|
||||
else {
|
||||
push(@$fields, $select_term);
|
||||
}
|
||||
# All work done by INNER JOIN
|
||||
$$term = "1";
|
||||
}
|
||||
|
||||
sub _timestamp_compare {
|
||||
|
@ -1272,7 +1272,7 @@ sub _timestamp_compare {
|
|||
my %func_args = @_;
|
||||
my ($v, $q) = @func_args{qw(v q)};
|
||||
my $dbh = Bugzilla->dbh;
|
||||
|
||||
|
||||
$$v = SqlifyDate($$v);
|
||||
$$q = $dbh->quote($$v);
|
||||
}
|
||||
|
@ -1283,7 +1283,7 @@ sub _commenter_exact {
|
|||
my ($chartid, $sequence, $supptables, $term, $v) =
|
||||
@func_args{qw(chartid sequence supptables term v)};
|
||||
my $user = $self->{'user'};
|
||||
|
||||
|
||||
$$v =~ m/(%\\w+%)/;
|
||||
my $match = pronoun($1, $user);
|
||||
my $chartseq = $$chartid;
|
||||
|
|
|
@ -44,13 +44,14 @@ use base qw(Exporter);
|
|||
file_mod_time is_7bit_clean
|
||||
bz_crypt generate_random_password
|
||||
validate_email_syntax clean_text
|
||||
get_text disable_utf8);
|
||||
get_text disable_utf8 stem_text);
|
||||
|
||||
use Bugzilla::Constants;
|
||||
|
||||
use Date::Parse;
|
||||
use Date::Format;
|
||||
use Text::Wrap;
|
||||
use Lingua::Stem::RuUTF8;
|
||||
|
||||
# This is from the perlsec page, slightly modified to remove a warning
|
||||
# From that page:
|
||||
|
@ -495,7 +496,7 @@ sub generate_random_password {
|
|||
sub validate_email_syntax {
|
||||
my ($addr) = @_;
|
||||
my $match = Bugzilla->params->{'emailregexp'};
|
||||
my $ret = ($addr =~ /$match/ && $addr !~ /[\\\(\)<>&,;:"\[\] \t\r\n]/);
|
||||
my $ret = ($addr =~ /$match/ && $addr !~ /[\\\(\)<>&,;:\"\[\] \t\r\n]/);
|
||||
if ($ret) {
|
||||
# We assume these checks to suffice to consider the address untainted.
|
||||
trick_taint($_[0]);
|
||||
|
@ -586,6 +587,27 @@ sub disable_utf8 {
|
|||
}
|
||||
}
|
||||
|
||||
# Bug 46221 - Russian Stemming in Bugzilla fulltext search
|
||||
sub stem_text
|
||||
{
|
||||
my ($text, $allow_verbatim) = @_;
|
||||
$text = [ split /(?<=\w)(?=\W)|(?<=\W)(?=\w)/, $text ];
|
||||
my $q = 0;
|
||||
for (@$text)
|
||||
{
|
||||
unless (/\W/)
|
||||
{
|
||||
$_ = Lingua::Stem::RuUTF8::stem_word($_) unless $q;
|
||||
}
|
||||
elsif ($allow_verbatim)
|
||||
{
|
||||
# If $allow_verbatim is TRUE then text in "double quotes" doesn't stem
|
||||
$q = ($q + tr/\"/\"/) % 2;
|
||||
}
|
||||
}
|
||||
return join '', @$text;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
|
|
@ -0,0 +1,252 @@
|
|||
#!/usr/bin/perl
|
||||
# Lingua::Stem::Ru - UTF-8 ×ÅÒÓÉÑ
|
||||
|
||||
package Lingua::Stem::RuUTF8;
|
||||
|
||||
use utf8;
|
||||
use strict;
|
||||
use Exporter;
|
||||
use Carp;
|
||||
use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
|
||||
|
||||
BEGIN {
|
||||
@ISA = qw (Exporter);
|
||||
@EXPORT = ();
|
||||
@EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching);
|
||||
%EXPORT_TAGS = ();
|
||||
}
|
||||
$VERSION = "0.01";
|
||||
|
||||
my $Stem_Caching = 0;
|
||||
my $Stem_Cache = {};
|
||||
|
||||
my $VOWEL = qr/аеиоуыÑ<E280B9>ÑŽÑ<C5BD>/;
|
||||
my $PERFECTIVEGROUND = qr/((ив|ивши|ившиÑ<C2B8>ÑŒ|ыв|ывши|ывшиÑ<C2B8>ÑŒ)|((?<=[аÑ<C2B0>])(в|вши|вшиÑ<C2B8>ÑŒ)))$/;
|
||||
my $REFLEXIVE = qr/(Ñ<>[Ñ<>ÑŒ])$/;
|
||||
my $ADJECTIVE = qr/(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|Ñ‹Ñ…|ую|ÑŽÑŽ|аÑ<C2B0>|Ñ<>Ñ<EFBFBD>|ою|ею)$/;
|
||||
my $PARTICIPLE = qr/((ивш|ывш|ующ)|((?<=[аÑ<C2B0>])(ем|нн|вш|ющ|щ)))$/;
|
||||
my $VERB = qr/((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|Ñ<>Ñ‚|ует|уют|ит|Ñ‹Ñ‚|ены|ить|Ñ‹Ñ‚ÑŒ|ишь|ую|ÑŽ)|((?<=[аÑ<C2B0>])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ÑŽÑ‚|ны|Ñ‚ÑŒ|ешь|нно)))$/;
|
||||
my $NOUN = qr/(а|ев|ов|ие|ье|е|иÑ<C2B8>ми|Ñ<>ми|ами|еи|ии|и|ией|ей|ой|ий|й|иÑ<C2B8>м|Ñ<>м|ием|ем|ам|ом|о|у|ах|иÑ<C2B8>Ñ…|Ñ<>Ñ…|Ñ‹|ÑŒ|ию|ью|ÑŽ|иÑ<C2B8>|ÑŒÑ<C592>|Ñ<>)$/;
|
||||
my $RVRE = qr/^(.*?[$VOWEL])(.*)$/;
|
||||
my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=о)Ñ<>Ñ‚ÑŒ?$/;
|
||||
|
||||
sub stem {
|
||||
return [] if ($#_ == -1);
|
||||
my $parm_ref;
|
||||
if (ref $_[0]) {
|
||||
$parm_ref = shift;
|
||||
} else {
|
||||
$parm_ref = { @_ };
|
||||
}
|
||||
|
||||
my $words = [];
|
||||
my $locale = 'ru';
|
||||
my $exceptions = {};
|
||||
foreach (keys %$parm_ref) {
|
||||
my $key = lc ($_);
|
||||
if ($key eq '-words') {
|
||||
@$words = @{$parm_ref->{$key}};
|
||||
} elsif ($key eq '-exceptions') {
|
||||
$exceptions = $parm_ref->{$key};
|
||||
} elsif ($key eq '-locale') {
|
||||
$locale = $parm_ref->{$key};
|
||||
} else {
|
||||
croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
|
||||
}
|
||||
}
|
||||
|
||||
local( $_ );
|
||||
foreach (@$words) {
|
||||
# Flatten case
|
||||
$_ = lc $_;
|
||||
|
||||
# Check against exceptions list
|
||||
if (exists $exceptions->{$_}) {
|
||||
$_ = $exceptions->{$_};
|
||||
next;
|
||||
}
|
||||
|
||||
# Check against cache of stemmed words
|
||||
my $original_word = $_;
|
||||
if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
|
||||
$_ = $Stem_Cache->{$original_word};
|
||||
next;
|
||||
}
|
||||
|
||||
$_ = stem_word($_);
|
||||
|
||||
$Stem_Cache->{$original_word} = $_ if $Stem_Caching;
|
||||
}
|
||||
$Stem_Cache = {} if ($Stem_Caching < 2);
|
||||
|
||||
return $words;
|
||||
}
|
||||
|
||||
sub stem_word {
|
||||
my $word = lc shift;
|
||||
|
||||
# Check against cache of stemmed words
|
||||
if ($Stem_Caching && exists $Stem_Cache->{$word}) {
|
||||
return $Stem_Cache->{$word};
|
||||
}
|
||||
|
||||
my ($start, $RV) = $word =~ /$RVRE/;
|
||||
return $word unless $RV;
|
||||
|
||||
# Step 1
|
||||
unless ($RV =~ s/$PERFECTIVEGROUND//) {
|
||||
$RV =~ s/$REFLEXIVE//;
|
||||
|
||||
if ($RV =~ s/$ADJECTIVE//) {
|
||||
$RV =~ s/$PARTICIPLE//;
|
||||
} else {
|
||||
$RV =~ s/$NOUN// unless $RV =~ s/$VERB//;
|
||||
}
|
||||
}
|
||||
|
||||
# Step 2
|
||||
$RV =~ s/и$//;
|
||||
|
||||
# Step 3
|
||||
$RV =~ s/оÑ<C2BE>Ñ‚ÑŒ?$// if $RV =~ /$DERIVATIONAL/;
|
||||
|
||||
# Step 4
|
||||
unless ($RV =~ s/ь$//) {
|
||||
$RV =~ s/ейше?//;
|
||||
$RV =~ s/нн$/н/;
|
||||
}
|
||||
|
||||
return $start.$RV;
|
||||
}
|
||||
|
||||
sub stem_caching {
|
||||
my $parm_ref;
|
||||
if (ref $_[0]) {
|
||||
$parm_ref = shift;
|
||||
} else {
|
||||
$parm_ref = { @_ };
|
||||
}
|
||||
my $caching_level = $parm_ref->{-level};
|
||||
if (defined $caching_level) {
|
||||
if ($caching_level !~ m/^[012]$/) {
|
||||
croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
|
||||
}
|
||||
$Stem_Caching = $caching_level;
|
||||
}
|
||||
return $Stem_Caching;
|
||||
}
|
||||
|
||||
sub clear_stem_cache {
|
||||
$Stem_Cache = {};
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Lingua::Stem::RuUTF8 - Porter's stemming algorithm for Russian (UTF-8 only)
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Lingua::Stem::RuUTF8;
|
||||
my $stems = Lingua::Stem::RuUTF8::stem({
|
||||
-words => $word_list_reference,
|
||||
-locale => 'ru',
|
||||
-exceptions => $exceptions_hash,
|
||||
});
|
||||
|
||||
my $stem = Lingua::Stem::RuUTF8::stem_word( $word );
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module applies the Porter Stemming Algorithm to its parameters,
|
||||
returning the stemmed words.
|
||||
|
||||
The algorithm is implemented exactly as described in:
|
||||
|
||||
http://snowball.tartarus.org/russian/stemmer.html
|
||||
|
||||
The code is carefully crafted to work in conjunction with the L<Lingua::Stem>
|
||||
module by Benjamin Franz. This stemmer is also based
|
||||
on the work of Aldo Capini, see L<Lingua::Stem::It>.
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=over 4
|
||||
|
||||
=item stem({ -words => \@words, -locale => 'ru', -exceptions => \%exceptions });
|
||||
|
||||
Stems a list of passed words. Returns an anonymous list reference to the stemmed
|
||||
words.
|
||||
|
||||
Example:
|
||||
|
||||
my $stemmed_words = Lingua::Stem::RuUTF8::stem({
|
||||
-words => \@words,
|
||||
-locale => 'ru',
|
||||
-exceptions => \%exceptions,
|
||||
});
|
||||
|
||||
=item stem_word( $word );
|
||||
|
||||
Stems a single word and returns the stem directly.
|
||||
|
||||
Example:
|
||||
|
||||
my $stem = Lingua::Stem::RuUTF8::stem_word( $word );
|
||||
|
||||
=item stem_caching({ -level => 0|1|2 });
|
||||
|
||||
Sets the level of stem caching.
|
||||
|
||||
'0' means 'no caching'. This is the default level.
|
||||
|
||||
'1' means 'cache per run'. This caches stemming results during a single
|
||||
call to 'stem'.
|
||||
|
||||
'2' means 'cache indefinitely'. This caches stemming results until
|
||||
either the process exits or the 'clear_stem_cache' method is called.
|
||||
|
||||
=item clear_stem_cache;
|
||||
|
||||
Clears the cache of stemmed words
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
||||
|
||||
=head2 EXPORT
|
||||
|
||||
None by default.
|
||||
|
||||
=head1 HISTORY
|
||||
|
||||
=over 8
|
||||
|
||||
=item *
|
||||
|
||||
0.01 (2004-05-21)
|
||||
|
||||
=back
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Aleksandr Guidrevitch <pillgrim@mail.ru>
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
Lingua::Stem
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (C) 2003 by Aldo Calpini <dada@perl.it>
|
||||
|
||||
Copyright (C) 2004 by Aleksandr Guidrevitch <pillgrim@mail.ru>
|
||||
|
||||
This software may be freely copied and distributed under the same
|
||||
terms and conditions as Perl itself, either Perl version 5.8.3
|
||||
or, at your option, any later version of Perl 5 you may
|
||||
have available..
|
||||
|
||||
=cut
|
|
@ -125,6 +125,20 @@ function doOnSelectProduct(selectmode) {
|
|||
[%# *** Summary *** %]
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="right" colspan="2">
|
||||
<label for="content">Words (full-text search):</label>
|
||||
</th>
|
||||
<td>
|
||||
<input name="content" size="40" id="content"
|
||||
value="[% default.content.0 FILTER html %]">
|
||||
<script type="text/javascript"> <!--
|
||||
document.forms['queryform'].content.focus();
|
||||
// -->
|
||||
</script>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<th align="right">
|
||||
<label for="short_desc" accesskey="s"><u>S</u>ummary</label>:
|
||||
|
|
Loading…
Reference in New Issue