Bug 82651 - Store contents for snippet retrieval in MySQL

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1398 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-09-29 18:08:16 +00:00 committed by Vitaliy Filippov
parent 1b75ada880
commit 80ccb26b20
4 changed files with 70 additions and 41 deletions

View File

@ -40,7 +40,7 @@ CREATE TABLE branches (
branch varchar(64) binary DEFAULT '' NOT NULL, branch varchar(64) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE branch (branch) UNIQUE branch (branch)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS checkins; DROP TABLE IF EXISTS checkins;
CREATE TABLE checkins ( CREATE TABLE checkins (
@ -65,7 +65,7 @@ CREATE TABLE checkins (
KEY fileid (fileid), KEY fileid (fileid),
KEY branchid (branchid), KEY branchid (branchid),
KEY descid (descid) KEY descid (descid)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS descs; DROP TABLE IF EXISTS descs;
CREATE TABLE descs ( CREATE TABLE descs (
@ -83,7 +83,7 @@ CREATE TABLE dirs (
dir varchar(255) binary DEFAULT '' NOT NULL, dir varchar(255) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE dir (dir) UNIQUE dir (dir)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS files; DROP TABLE IF EXISTS files;
CREATE TABLE files ( CREATE TABLE files (
@ -91,7 +91,7 @@ CREATE TABLE files (
file varchar(255) binary DEFAULT '' NOT NULL, file varchar(255) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE file (file) UNIQUE file (file)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS people; DROP TABLE IF EXISTS people;
CREATE TABLE people ( CREATE TABLE people (
@ -99,7 +99,7 @@ CREATE TABLE people (
who varchar(128) binary DEFAULT '' NOT NULL, who varchar(128) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE who (who) UNIQUE who (who)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS repositories; DROP TABLE IF EXISTS repositories;
CREATE TABLE repositories ( CREATE TABLE repositories (
@ -107,7 +107,7 @@ CREATE TABLE repositories (
repository varchar(64) binary DEFAULT '' NOT NULL, repository varchar(64) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE repository (repository) UNIQUE repository (repository)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS tags; DROP TABLE IF EXISTS tags;
CREATE TABLE tags ( CREATE TABLE tags (
@ -121,6 +121,12 @@ CREATE TABLE tags (
KEY dirid (dirid), KEY dirid (dirid),
KEY fileid (fileid), KEY fileid (fileid),
KEY branchid (branchid) KEY branchid (branchid)
) TYPE=InnoDB;
DROP TABLE IF EXISTS contents;
-- Stored text of committed files, keyed by commit id. It is read back only
-- to build search-result snippets.
-- No DEFAULT on content: MySQL does not accept a default value on TEXT
-- columns, and rows are inserted with an explicit content value.
CREATE TABLE contents (
  id int NOT NULL PRIMARY KEY,
  content MEDIUMTEXT NOT NULL
) TYPE=MyISAM; ) TYPE=MyISAM;
""" """
@ -135,7 +141,7 @@ CREATE TABLE branches (
branch varchar(64) binary DEFAULT '' NOT NULL, branch varchar(64) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE branch (branch) UNIQUE branch (branch)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS commits; DROP TABLE IF EXISTS commits;
CREATE TABLE commits ( CREATE TABLE commits (
@ -160,7 +166,7 @@ CREATE TABLE commits (
KEY fileid (fileid), KEY fileid (fileid),
KEY branchid (branchid), KEY branchid (branchid),
KEY descid (descid) KEY descid (descid)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS descs; DROP TABLE IF EXISTS descs;
CREATE TABLE descs ( CREATE TABLE descs (
@ -168,7 +174,8 @@ CREATE TABLE descs (
description text, description text,
hash bigint(20) DEFAULT '0' NOT NULL, hash bigint(20) DEFAULT '0' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
KEY hash (hash) KEY hash (hash),
FULLTEXT KEY description (description)
) TYPE=MyISAM; ) TYPE=MyISAM;
DROP TABLE IF EXISTS dirs; DROP TABLE IF EXISTS dirs;
@ -177,7 +184,7 @@ CREATE TABLE dirs (
dir varchar(255) binary DEFAULT '' NOT NULL, dir varchar(255) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE dir (dir) UNIQUE dir (dir)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS files; DROP TABLE IF EXISTS files;
CREATE TABLE files ( CREATE TABLE files (
@ -185,7 +192,7 @@ CREATE TABLE files (
file varchar(255) binary DEFAULT '' NOT NULL, file varchar(255) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE file (file) UNIQUE file (file)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS people; DROP TABLE IF EXISTS people;
CREATE TABLE people ( CREATE TABLE people (
@ -193,7 +200,7 @@ CREATE TABLE people (
who varchar(128) binary DEFAULT '' NOT NULL, who varchar(128) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE who (who) UNIQUE who (who)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS repositories; DROP TABLE IF EXISTS repositories;
CREATE TABLE repositories ( CREATE TABLE repositories (
@ -201,7 +208,7 @@ CREATE TABLE repositories (
repository varchar(64) binary DEFAULT '' NOT NULL, repository varchar(64) binary DEFAULT '' NOT NULL,
PRIMARY KEY (id), PRIMARY KEY (id),
UNIQUE repository (repository) UNIQUE repository (repository)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS tags; DROP TABLE IF EXISTS tags;
CREATE TABLE tags ( CREATE TABLE tags (
@ -215,7 +222,7 @@ CREATE TABLE tags (
KEY dirid (dirid), KEY dirid (dirid),
KEY fileid (fileid), KEY fileid (fileid),
KEY branchid (branchid) KEY branchid (branchid)
) TYPE=MyISAM; ) TYPE=InnoDB;
DROP TABLE IF EXISTS metadata; DROP TABLE IF EXISTS metadata;
CREATE TABLE metadata ( CREATE TABLE metadata (
@ -223,8 +230,14 @@ CREATE TABLE metadata (
value text, value text,
PRIMARY KEY (name), PRIMARY KEY (name),
UNIQUE name (name) UNIQUE name (name)
) TYPE=MyISAM; ) TYPE=InnoDB;
INSERT INTO metadata (name, value) VALUES ('version', '1'); INSERT INTO metadata (name, value) VALUES ('version', '1');
DROP TABLE IF EXISTS contents;
-- Stored text of committed files, keyed by commit id. It is read back only
-- to build search-result snippets.
-- No DEFAULT on content: MySQL does not accept a default value on TEXT
-- columns, and rows are inserted with an explicit content value.
CREATE TABLE contents (
  id int NOT NULL PRIMARY KEY,
  content MEDIUMTEXT NOT NULL
) TYPE=MyISAM;
""" """
BONSAI_COMPAT=""" BONSAI_COMPAT="""

View File

@ -634,6 +634,9 @@ enabled = 0
# Set to 1 to enable indexing of file contents using Sphinx and Tika # Set to 1 to enable indexing of file contents using Sphinx and Tika
index_content = 0 index_content = 0
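# Maximum length of text file content stored in MySQL for snippet display;
# longer content is truncated. 0 (the built-in default) means unlimited.
# The example value below is 4 MB.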
#content_max_size = 4194304
# Database hostname, port, and socket # Database hostname, port, and socket
#host = localhost #host = localhost
#port = 3306 #port = 3306
@ -661,7 +664,6 @@ index_content = 0
# Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index. # Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index.
# Index must be created in sphinx.conf by hand and have the following fields: # Index must be created in sphinx.conf by hand and have the following fields:
# rt_field = content # rt_field = content
# rt_attr_string = content
# rt_attr_string = mimetype # rt_attr_string = mimetype
# rt_attr_timestamp = ci_when # rt_attr_timestamp = ci_when
# rt_attr_uint = whoid # rt_attr_uint = whoid

View File

@ -325,6 +325,7 @@ class Config:
self.cvsdb.enabled = 0 self.cvsdb.enabled = 0
self.cvsdb.index_content = 0 self.cvsdb.index_content = 0
self.cvsdb.content_max_size = 0
self.cvsdb.host = '' self.cvsdb.host = ''
self.cvsdb.port = 3306 self.cvsdb.port = 3306
self.cvsdb.socket = '' self.cvsdb.socket = ''

View File

@ -40,7 +40,7 @@ error = "cvsdb error"
class CheckinDatabase: class CheckinDatabase:
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg, def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None, authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
sphinx_socket = None, sphinx_index = None): sphinx_socket = None, sphinx_index = None, content_max_size = 0):
self.cfg = cfg self.cfg = cfg
self._host = host self._host = host
@ -56,6 +56,7 @@ class CheckinDatabase:
# Sphinx settings # Sphinx settings
self.index_content = index_content self.index_content = index_content
self.content_max_size = content_max_size
self.sphinx_host = sphinx_host self.sphinx_host = sphinx_host
self.sphinx_port = sphinx_port self.sphinx_port = sphinx_port
self.sphinx_socket = sphinx_socket self.sphinx_socket = sphinx_socket
@ -373,9 +374,6 @@ class CheckinDatabase:
if self.index_content: if self.index_content:
sphcur = self.sphinx.cursor() sphcur = self.sphinx.cursor()
content = commit.GetContent() content = commit.GetContent()
# Sphinx has 4 MB text field limit
if len(content) >= 4*1024*1024:
content = content[0:4*1024*1024]
props['ci_when'] = str(int(commit.GetTime() or 0)) props['ci_when'] = str(int(commit.GetTime() or 0))
if len(content): if len(content):
props['content'] = content props['content'] = content
@ -392,6 +390,12 @@ class CheckinDatabase:
','.join('%s' for i in props)+')', ','.join('%s' for i in props)+')',
tuple(props[i] for i in props) tuple(props[i] for i in props)
) )
# Sphinx (at least 2.0.1) still caches all string attributes inside RAM,
# so we'll store them in MySQL (used only for snippet display)
# Limit content size:
if self.content_max_size and len(content) >= self.content_max_size:
content = content[0:self.content_max_size]
cursor.execute('INSERT INTO contents SET id=%s, content=%s', (commit_id, content))
except Exception, e: except Exception, e:
print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+ print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+
"\n".join(i+'='+str(props[i]) for i in props)) "\n".join(i+'='+str(props[i]) for i in props))
@ -477,7 +481,7 @@ class CheckinDatabase:
elif self._row_limit: elif self._row_limit:
limit = "LIMIT %s" % (str(self._row_limit)) limit = "LIMIT %s" % (str(self._row_limit))
fields = "id `id`, WEIGHT() `relevance`, `content`, `mimetype`" fields = "id `id`, WEIGHT() `relevance`, `mimetype`"
return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit) return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit)
@ -611,11 +615,12 @@ class CheckinDatabase:
if len(query.content_query) and self.sphinx: if len(query.content_query) and self.sphinx:
# Use Sphinx to search on document content # Use Sphinx to search on document content
sql = self.CreateSphinxQueryString(query) sql = self.CreateSphinxQueryString(query)
cursor = self.sphinx.cursor() cursor = self.db.cursor()
cursor.execute(sql) sphcur = self.sphinx.cursor()
sphinx_rows = list((str(docid), rel, content, mimetype) for docid, rel, content, mimetype in cursor) sphcur.execute(sql)
sphinx_rows = list((str(docid), rel, mimetype) for docid, rel, mimetype in sphcur)
if len(sphinx_rows): if len(sphinx_rows):
# Fetch snippets # FIXME remove hardcode
snippet_options = { snippet_options = {
'around': 15, 'around': 15,
'limit': 200, 'limit': 200,
@ -627,26 +632,32 @@ class CheckinDatabase:
snippets = {} snippets = {}
bm_html = cgi.escape(snippet_options['before_match']) bm_html = cgi.escape(snippet_options['before_match'])
am_html = cgi.escape(snippet_options['after_match']) am_html = cgi.escape(snippet_options['after_match'])
for docid, rel, content, mimetype in sphinx_rows: # Build snippets using Sphinx (content is stored in MySQL)
cursor.execute( for docid, rel, mimetype in sphinx_rows:
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')', cursor.execute('SELECT content FROM contents WHERE id=%s', (docid, ))
(content, self.sphinx_index, query.content_query) + tuple(snippet_options.values()) s = cursor.fetchone()
) if s:
s, = cursor.fetchone() s = s[0]
s = cgi.escape(s) sphcur.execute(
if re.match(preformatted_mime, mimetype): 'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
s = s.replace('\n', '<br />') (s, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
s = s.replace(bm_html, snippet_options['before_match']) )
s = s.replace(am_html, snippet_options['after_match']) s, = sphcur.fetchone()
snippets[docid] = s s = cgi.escape(s)
# Fetch all fields from MySQL if re.match(preformatted_mime, mimetype):
sql = self.CreateIdQueryString((docid for (docid, _, _, _) in sphinx_rows)) s = s.replace('\n', '<br />')
cursor = self.db.cursor() s = s.replace(bm_html, snippet_options['before_match'])
s = s.replace(am_html, snippet_options['after_match'])
snippets[docid] = s
else:
snippets[docid] = ''
# Fetch commit attributes from MySQL
sql = self.CreateIdQueryString((docid for (docid, _, _) in sphinx_rows))
cursor.execute(sql) cursor.execute(sql)
byid = {} byid = {}
for row in cursor: for row in cursor:
byid[str(row[0])] = row byid[str(row[0])] = row
rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _, _) in sphinx_rows if docid in byid) rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _) in sphinx_rows if docid in byid)
else: else:
rows = [] rows = []
else: else:
@ -751,6 +762,7 @@ class CheckinDatabase:
self.sql_purge('branches', 'id', 'branchid', checkins_table) self.sql_purge('branches', 'id', 'branchid', checkins_table)
self.sql_purge('descs', 'id', 'descid', checkins_table) self.sql_purge('descs', 'id', 'descid', checkins_table)
self.sql_purge('people', 'id', 'whoid', checkins_table) self.sql_purge('people', 'id', 'whoid', checkins_table)
self.sql_purge('contents', 'id', 'id', checkins_table)
# Reset all internal id caches. We could be choosier here, # Reset all internal id caches. We could be choosier here,
# but let's just be as safe as possible. # but let's just be as safe as possible.
@ -1096,6 +1108,7 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0):
sphinx_port = int(cfg.cvsdb.sphinx_port), sphinx_port = int(cfg.cvsdb.sphinx_port),
sphinx_socket = cfg.cvsdb.sphinx_socket, sphinx_socket = cfg.cvsdb.sphinx_socket,
sphinx_index = cfg.cvsdb.sphinx_index, sphinx_index = cfg.cvsdb.sphinx_index,
content_max_size = cfg.cvsdb.content_max_size,
cfg = cfg, cfg = cfg,
) )
db.Connect() db.Connect()