Bug 82651 - Content size limit (4MB Sphinx), enable snippets setting

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1400 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-09-29 18:23:59 +00:00 committed by Vitaliy Filippov
parent a564b02d18
commit 72934bf6cd
3 changed files with 57 additions and 39 deletions

View File

@ -634,9 +634,13 @@ enabled = 0
# Set to 1 to enable indexing of file contents using Sphinx and Tika
index_content = 0
# Set to limit stored text file content size (4 MB default, 0 = unlimited, -1 = don't store content, index only)
# Set to limit stored text file content size (max. 4MB - Sphinx limit)
#content_max_size = 4194304
# Do store indexed content for snippet display?
# Do display snippets when searching on content?
enable_snippets = 1
# Database hostname, port, and socket
#host = localhost
#port = 3306

View File

@ -325,6 +325,7 @@ class Config:
self.cvsdb.enabled = 0
self.cvsdb.index_content = 0
self.cvsdb.enable_snippets = 1
self.cvsdb.content_max_size = 0
self.cvsdb.host = ''
self.cvsdb.port = 3306

View File

@ -40,7 +40,7 @@ error = "cvsdb error"
class CheckinDatabase:
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
sphinx_socket = None, sphinx_index = None, content_max_size = 0):
sphinx_socket = None, sphinx_index = None, content_max_size = 0, enable_snippets = 1):
self.cfg = cfg
self._host = host
@ -56,6 +56,9 @@ class CheckinDatabase:
# Sphinx settings
self.index_content = index_content
if content_max_size > 4*1024*1024 or content_max_size <= 0:
content_max_size = 4*1024*1024
self.enable_snippets = enable_snippets
self.content_max_size = content_max_size
self.sphinx_host = sphinx_host
self.sphinx_port = sphinx_port
@ -376,7 +379,9 @@ class CheckinDatabase:
content = commit.GetContent()
props['ci_when'] = str(int(commit.GetTime() or 0))
if len(content):
props['content'] = content
# Maximum field size limit for Sphinx is 4MB
if len(content) > self.content_max_size:
props['content'] = content[0:self.content_max_size]
# Now, stored MIME type is only needed while searching
# It is guessed again when the file is displayed
props['mimetype'] = commit.GetMimeType()
@ -392,10 +397,7 @@ class CheckinDatabase:
)
# Sphinx (at least 2.0.1) still caches all string attributes inside RAM,
# so we'll store them in MySQL (used only for snippet display)
if self.content_max_size >= 0:
# Limit content size:
if self.content_max_size != 0 and len(content) >= self.content_max_size:
content = content[0:self.content_max_size]
if self.enable_snippets:
cursor.execute('INSERT INTO contents SET id=%s, content=%s', (commit_id, content))
except Exception, e:
print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+
@ -612,6 +614,39 @@ class CheckinDatabase:
return self.authorizer.check_path_access(rootname, path_parts, vclib.FILE, rev)
return True
def fetch_snippets(self, query, sphinx_rows, cursor, sphinx_cursor):
    """Build highlighted HTML snippets for documents matched by Sphinx.

    Arguments:
      query         -- query object; its ``content_query`` attribute is
                       passed to Sphinx as the words to highlight.
      sphinx_rows   -- iterable of ``(docid, rel, mimetype)`` tuples.
      cursor        -- cursor used to read the stored text from the
                       ``contents`` table.
      sphinx_cursor -- cursor used to execute ``CALL SNIPPETS``.

    Returns a dict mapping docid to an HTML snippet string.  The dict is
    empty when ``self.enable_snippets`` is false.  Documents that have no
    row in ``contents``, or for which Sphinx returns no snippet, are left
    out of the dict, so callers must treat a missing key as "no snippet".
    """
    snippets = {}
    if not self.enable_snippets:
        return snippets

    # FIXME remove hardcode
    snippet_options = {
        'around': 15,
        'limit': 200,
        'before_match': '<span style="color:red">',
        'after_match': '</span>',
        'chunk_separator': ' ...\n',
    }
    # Take the (name, value) pairs once so that the option names placed in
    # the SQL text and the values bound to them are paired explicitly.
    option_items = list(snippet_options.items())
    snippets_sql = (
        'CALL SNIPPETS(%s, %s, %s'
        + ''.join(', %s AS ' + name for name, _ in option_items)
        + ')'
    )
    option_values = tuple(value for _, value in option_items)

    # Loop-invariant: compile the MIME pattern a single time.
    preformatted_mime = re.compile('text/(?!html|xml).*')
    before_match = snippet_options['before_match']
    after_match = snippet_options['after_match']
    # Escaped forms of the highlight markers, as they appear after the
    # whole snippet has been HTML-escaped.
    bm_html = cgi.escape(before_match)
    am_html = cgi.escape(after_match)

    # Build snippets using Sphinx (content is stored in MySQL)
    for docid, rel, mimetype in sphinx_rows:
        cursor.execute('SELECT content FROM contents WHERE id=%s', (docid, ))
        content_row = cursor.fetchone()
        if not content_row:
            # No stored content for this document
            continue
        sphinx_cursor.execute(
            snippets_sql,
            (content_row[0], self.sphinx_index, query.content_query) + option_values
        )
        snippet_row = sphinx_cursor.fetchone()
        if not snippet_row or snippet_row[0] is None:
            # Sphinx produced nothing; skip instead of failing on unpack
            continue
        snippet = cgi.escape(snippet_row[0])
        if mimetype and preformatted_mime.match(mimetype):
            snippet = snippet.replace('\n', '<br />')
        # Escaping also escaped the highlight markers inserted by Sphinx;
        # turn them back into real markup.
        snippet = snippet.replace(bm_html, before_match)
        snippet = snippet.replace(am_html, after_match)
        snippets[docid] = snippet
    return snippets
def RunQuery(self, query):
if len(query.content_query) and self.sphinx:
# Use Sphinx to search on document content
@ -621,44 +656,21 @@ class CheckinDatabase:
sphcur.execute(sql)
sphinx_rows = list((str(docid), rel, mimetype) for docid, rel, mimetype in sphcur)
if len(sphinx_rows):
# FIXME remove hardcode
snippet_options = {
'around': 15,
'limit': 200,
'before_match': '<span style="color:red">',
'after_match': '</span>',
'chunk_separator': ' ...\n',
}
preformatted_mime = 'text/(?!html|xml).*'
snippets = {}
bm_html = cgi.escape(snippet_options['before_match'])
am_html = cgi.escape(snippet_options['after_match'])
# Build snippets using Sphinx (content is stored in MySQL)
for docid, rel, mimetype in sphinx_rows:
cursor.execute('SELECT content FROM contents WHERE id=%s', (docid, ))
s = cursor.fetchone()
if s:
s = s[0]
sphcur.execute(
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
(s, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
)
s, = sphcur.fetchone()
s = cgi.escape(s)
if re.match(preformatted_mime, mimetype):
s = s.replace('\n', '<br />')
s = s.replace(bm_html, snippet_options['before_match'])
s = s.replace(am_html, snippet_options['after_match'])
snippets[docid] = s
else:
snippets[docid] = ''
snippets = self.fetch_snippets(query, sphinx_rows, cursor, sphcur)
# Fetch commit attributes from MySQL
sql = self.CreateIdQueryString((docid for (docid, _, _) in sphinx_rows))
cursor.execute(sql)
byid = {}
for row in cursor:
byid[str(row[0])] = row
rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _) in sphinx_rows if docid in byid)
nrows = []
for docid, rel, _ in sphinx_rows:
if docid in byid:
if docid in snippets:
nrows.append(byid[docid] + (rel, snippets[docid]))
else:
nrows.append(byid[docid] + (rel, ''))
rows = nrows
else:
rows = []
else:
@ -1105,6 +1117,7 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0):
min_relevance = cfg.cvsdb.fulltext_min_relevance,
authorizer = authorizer,
index_content = cfg.cvsdb.index_content,
enable_snippets = cfg.cvsdb.enable_snippets,
sphinx_host = cfg.cvsdb.sphinx_host,
sphinx_port = int(cfg.cvsdb.sphinx_port),
sphinx_socket = cfg.cvsdb.sphinx_socket,