Bug 82651 - Retrieve text file contents from SVN when searching

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1443 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-10-25 13:43:12 +00:00 committed by Vitaliy Filippov
parent 4cd52560ac
commit 928bf6f1a6
5 changed files with 228 additions and 190 deletions

View File

@ -322,24 +322,10 @@ class SvnRev:
os.path.basename(change.path),
diffobj.tempfile2
)
# Read and guess charset by ourselves for text files
if mime and mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
try:
fd = open(diffobj.tempfile2, 'rb')
content = fd.read()
fd.close()
except: pass
# Guess charset
if content:
content, charset = repo.guesser.guess_charset(content)
if charset:
content = content.encode('utf-8')
if repo.verbose:
print 'Guessed %s for %s' % (charset, change.path)
elif repo.verbose:
print 'Failed to guess charset for %s, not indexing' % (change.path, )
# Try to extract content using Tika from binary documents
elif repo.tika_client:
# Do not index contents of text files - it can be easily retrieved later
if mime and not mime.startswith('text/') and not
(mime.startswith('application/') and mime.endswith('xml')):
content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
self.changes.append((path, action, plus, minus, content, mime))

View File

@ -637,10 +637,6 @@ index_content = 0
# Set to limit stored text file content size (max. 4MB - Sphinx limit)
#content_max_size = 4194304
# Do store indexed content for snippet display?
# Do display snippets when searching on content?
enable_snippets = 1
# Database hostname, port, and socket
#host = localhost
#port = 3306
@ -683,6 +679,22 @@ enable_snippets = 1
#sphinx_socket = /var/run/sphinxql.sock
#sphinx_index = viewvc
## Parameters for snippet (excerpt) display
# Enable snippets? ViewVC indexes binary file contents in MySQL when enabled.
enable_snippets = 1
# Regexp for MIME types in which \n means "newline"
sphinx_preformatted_mime = text/(?!html|xml).*
# Sphinx snippet options - see http://sphinxsearch.com/docs/current.html#api-func-buildexcerpts
sphinx_snippet_options =
around: 15
limit: 200
before_match: <span style="color:red">
after_match: </span>
chunk_separator: ... \n
# Limit the number of rows returned by a given query to this number.
#row_limit = 1000

View File

@ -344,6 +344,13 @@ class Config:
self.cvsdb.sphinx_port = 3307
self.cvsdb.sphinx_socket = ''
self.cvsdb.sphinx_index = ''
self.cvsdb.sphinx_preformatted_mime = 'text/(?!html|xml).*'
self.cvsdb.sphinx_snippet_options =
'around: 15\n'\
'limit: 200\n'\
'before_match: <span style="color:red">\n'\
'after_match: </span>\n'\
'chunk_separator: ... \n\n'
def _startswith(somestr, substr):
return somestr[:len(substr)] == substr

View File

@ -1,4 +1,3 @@
# -*-python-*-
#
# Copyright (C) 1999-2009 The ViewCVS Group. All Rights Reserved.
#
@ -38,32 +37,45 @@ error = "cvsdb error"
## complient database interface
class CheckinDatabase:
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
sphinx_socket = None, sphinx_index = None, content_max_size = 0, enable_snippets = 1):
def __init__(self, cfg, guesser, readonly, request = None):
self.cfg = cfg
self.guesser = guesser
self.readonly = readonly
self.request = request
self._host = host
self._port = port
self._socket = socket
self._user = user
self._passwd = passwd
self._database = database
self._row_limit = row_limit
self._version = None
self._min_relevance = min_relevance
self.authorizer = authorizer
self._host = cfg.host
self._port = cfg.port
self._socket = cfg.socket
self._user = readonly and cfg.user or cfg.readonly_user
self._passwd = readonly and cfg.passwd or cfg.readonly_passwd
self._database = cfg.database
self._row_limit = cfg.row_limit
self._version = None
self._min_relevance = cfg.fulltext_min_relevance
# Sphinx settings
self.index_content = index_content
if content_max_size > 4*1024*1024 or content_max_size <= 0:
content_max_size = 4*1024*1024
self.enable_snippets = enable_snippets
self.content_max_size = content_max_size
self.sphinx_host = sphinx_host
self.sphinx_port = sphinx_port
self.sphinx_socket = sphinx_socket
self.sphinx_index = sphinx_index
self.index_content = cfg.index_content
self.content_max_size = cfg.context_max_size
if self.content_max_size > 4*1024*1024 or self.content_max_size <= 0:
self.content_max_size = 4*1024*1024
self.enable_snippets = cfg.enable_snippets
self.sphinx_host = cfg.sphinx_host
self.sphinx_port = cfg.sphinx_port
self.sphinx_socket = cfg.sphinx_socket
self.sphinx_index = cfg.sphinx_index
# Snippet settings
for i in cfg.sphinx_snippet_options.split('\n'):
(a, b) = i.split(':')
if b[0] == ' ':
b = b[1:]
b = b.replace('\\n', '\n')
self.snippet_options[a] = b
self.preformatted_mime = cfg.sphinx_preformatted_mime
if 'before_match' in self.snippet_options:
self.snippet_beforematch_html = cgi.escape(self.snippet_options['before_match'])
if 'after_match' in self.snippet_options:
self.snippet_aftermatch_html = cgi.escape(self.snippet_options['after_match'])
## database lookup caches
self._get_cache = {}
@ -383,8 +395,8 @@ class CheckinDatabase:
if len(content) > self.content_max_size:
content = content[0:self.content_max_size]
props['content'] = content
# Now, stored MIME type is only needed while searching
# It is guessed again when the file is displayed
# Stored MIME type is only needed for snippet display
# It is re-guessed when the file is displayed
props['mimetype'] = commit.GetMimeType()
props['id'] = str(commit_id)
del props['addedlines']
@ -396,8 +408,8 @@ class CheckinDatabase:
','.join('%s' for i in props)+')',
tuple(props[i] for i in props)
)
# Sphinx (at least 2.0.1) still caches all string attributes inside RAM,
# so we'll store them in MySQL (used only for snippet display)
# Sphinx (at least 2.0.1) still caches all string attributes
# inside RAM, so we'll store contents in MySQL
if self.enable_snippets:
cursor.execute('INSERT INTO contents SET id=%s, content=%s', (commit_id, content))
except Exception, e:
@ -485,7 +497,7 @@ class CheckinDatabase:
elif self._row_limit:
limit = "LIMIT %s" % (str(self._row_limit))
fields = "id `id`, WEIGHT() `relevance`, `mimetype`"
fields = "id, `mimetype`, WEIGHT() `relevance`"
return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit)
@ -493,7 +505,7 @@ class CheckinDatabase:
def CreateIdQueryString(self, ids):
commits_table = self._version >= 1 and 'commits' or 'checkins'
return (
'SELECT %s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name'
'SELECT %s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name, "" AS snippet'
' FROM %s, repositories, dirs, files'
' WHERE %s.id IN (%s) AND repositoryid=repositories.id'
' AND dirid=dirs.id AND fileid=files.id' % (commits_table, commits_table, commits_table, ','.join(ids))
@ -508,7 +520,7 @@ class CheckinDatabase:
"files.file AS file_name"]
tableList = [
(commits_table, None),
("repositories","(%s.repositoryid=repositories.id)" % (commits_table)),
("repositories", "(%s.repositoryid=repositories.id)" % (commits_table)),
("dirs", "(%s.dirid=dirs.id)" % (commits_table)),
("files", "(%s.fileid=files.id)" % (commits_table))]
condList = []
@ -607,110 +619,146 @@ class CheckinDatabase:
return sql
def check_commit_access(self, repos, dir, file, rev):
if self.authorizer:
r = self.request.get_repo(repos)
if r.auth:
rootname = repos.split('/')
rootname = rootname.pop()
path_parts = dir.split('/')
path_parts.append(file)
return self.authorizer.check_path_access(rootname, path_parts, vclib.FILE, rev)
return r.auth.check_path_access(rootname, path_parts, vclib.FILE, rev)
return True
def fetch_snippets(self, query, sphinx_rows, cursor, sphinx_cursor):
snippets = {}
if self.enable_snippets:
# FIXME remove hardcode
snippet_options = {
'around': 15,
'limit': 200,
'before_match': '<span style="color:red">',
'after_match': '</span>',
'chunk_separator': ' ...\n',
}
preformatted_mime = 'text/(?!html|xml).*'
bm_html = cgi.escape(snippet_options['before_match'])
am_html = cgi.escape(snippet_options['after_match'])
# Build snippets using Sphinx (content is stored in MySQL)
for docid, rel, mimetype in sphinx_rows:
cursor.execute('SELECT content FROM contents WHERE id=%s', (docid, ))
s = cursor.fetchone()
if s:
s = s[0]
sphinx_cursor.execute(
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
(s, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
)
s, = sphinx_cursor.fetchone()
s = cgi.escape(s)
if re.match(preformatted_mime, mimetype):
s = s.replace('\n', '<br />')
s = s.replace(bm_html, snippet_options['before_match'])
s = s.replace(am_html, snippet_options['after_match'])
snippets[docid] = s
return snippets
# Build a snippet using Sphinx
def get_snippet(self, sph, content, query):
sph.execute(
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
(content, self.sphinx_index, query) + tuple(snippet_options.values())
)
s, = sph.fetchone()
s = cgi.escape(s)
if re.match(self.preformatted_mime, mimetype):
s = s.replace('\n', '<br />')
if 'before_match' in self.snippet_options:
s = s.replace(self.sphinx_beforematch_html, self.snippet_options['before_match'])
if 'after_match' in self.snippet_options:
s = s.replace(self.sphinx_aftermatch_html, self.snippet_options['after_match'])
return s
# Fetch snippets for a query result
def fetch_snippets(self, query, rows):
if not len(rows):
return
cursor = self.db.cursor()
sph = self.sphinx.cursor()
# Fetch binary file contents, stored in MySQL
cursor.execute(
'SELECT id, content FROM contents WHERE id IN (' +
','.join(sphinx_rows.keys()) + ')'
)
# Build snippets
for (docid, content) in cursor:
rows[docid]['snippet'] = self.get_snippet(sph, content, query.content_query)
for docid in rows:
mime = rows[docid]['mimetype']
if not rows[docid]['snippet'] and mime and
(mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
# Fetch text file contents directly from SVN
repo = rows[docid]['repository_name']
path = rows[docid]['dir_name']+'/'+rows[docid]['file_name']
revision = rows[docid]['revision']
fp = None
try:
fp, = self.request.get_repo(repo).repo.openfile(path, revision)
content = fp.read()
fp.close()
content = repo.guesser.utf8(content)
except:
if fp: fp.close()
content = None
# Build snippet
if content:
rows[docid]['snippet'] = self.get_snippet(sph, content, query.content_query)
# Run query and return all rows as dictionaries
def selectall(db, sql, args = (), key = None):
cursor = db.cursor()
cursor.execute(sql, args)
if key:
rows = {}
for i in cursor:
r = dict(zip(cursor.description, i))
rows[r[key]] = r
else:
rows = []
for i in cursor:
rows.append(dict(zip(cursor.description, i)))
return rows
# Run content query
def RunSphinxQuery(self, query):
cursor = self.db.cursor()
rows = selectall(self.sphinx, self.CreateSphinxQueryString(query))
if len(rows):
m_rows = selectall(self.db, self.CreateIdQueryString(sphinx_rows.keys()), (), 'id')
new_rows = []
# Check rights BEFORE fetching snippets
for i in rows:
if i['id'] in m_rows:
if not self.check_commit_access(
m_rows[i['id']]['repository_name'],
m_rows[i['id']]['dir_name'],
m_rows[i['id']]['file_name'],
m_rows[i['id']]['revision']):
del m_rows[i['id']]
else:
m_rows[i['id']].update(i)
# Fetch snippets
if self.enable_snippets:
self.fetch_snippets(query, m_rows)
for i in rows:
if i['id'] in m_rows:
new_rows.push(m_rows[i['id']])
rows = new_rows
else:
rows = []
return rows
def RunQuery(self, query):
if len(query.content_query) and self.sphinx:
# Use Sphinx to search on document content
sql = self.CreateSphinxQueryString(query)
cursor = self.db.cursor()
sphcur = self.sphinx.cursor()
sphcur.execute(sql)
sphinx_rows = list((str(docid), rel, mimetype) for docid, rel, mimetype in sphcur)
if len(sphinx_rows):
snippets = self.fetch_snippets(query, sphinx_rows, cursor, sphcur)
# Fetch commit attributes from MySQL
sql = self.CreateIdQueryString((docid for (docid, _, _) in sphinx_rows))
cursor.execute(sql)
byid = {}
for row in cursor:
byid[str(row[0])] = row
nrows = []
for docid, rel, _ in sphinx_rows:
if docid in byid:
if docid in snippets:
nrows.append(byid[docid] + (rel, snippets[docid]))
else:
nrows.append(byid[docid] + (rel, ''))
rows = nrows
else:
rows = []
rows = self.RunSphinxQuery(query)
else:
# Use regular queries when document content is not searched
sql = self.CreateSQLQueryString(query)
cursor = self.db.cursor()
cursor.execute(sql)
rows = list(cursor)
rows = selectall(self.db, self.CreateSQLQueryString(query))
# Check rights
rows = r for r in rows if self.check_commit_access(
r['repository_name'],
r['dir_name'],
r['file_name'],
r['revision'])
# Convert rows to commit objects
for row in rows:
(dbId, dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
dbFileID, dbRevision, dbStickyTag, dbBranchID, dbAddedLines,
dbRemovedLines, dbDescID, dbRepositoryName, dbDirName,
dbFileName, dbRelevance, dbSnippet) = row
if not self.check_commit_access(dbRepositoryName, dbDirName, dbFileName, dbRevision):
continue
commit = LazyCommit(self)
if dbType == 'Add':
if row['type'] == 'Add':
commit.SetTypeAdd()
elif dbType == 'Remove':
elif row['type'] == 'Remove':
commit.SetTypeRemove()
else:
commit.SetTypeChange()
commit.SetTime(dbi.TicksFromDateTime(dbCI_When))
commit.SetFileID(dbFileID)
commit.SetDirectoryID(dbDirID)
commit.SetRevision(dbRevision)
commit.SetRepositoryID(dbRepositoryID)
commit.SetAuthorID(dbAuthorID)
commit.SetBranchID(dbBranchID)
commit.SetPlusCount(dbAddedLines)
commit.SetMinusCount(dbRemovedLines)
commit.SetDescriptionID(dbDescID)
commit.SetRelevance(dbRelevance)
commit.SetSnippet(dbSnippet)
commit.SetTime(dbi.TicksFromDateTime(row['ci_when']))
commit.SetFileID(row['fileid'])
commit.SetDirectoryID(row['dirid'])
commit.SetRevision(row['revision'])
commit.SetRepositoryID(row['repositoryid'])
commit.SetAuthorID(row['authorid'])
commit.SetBranchID(row['branchid'])
commit.SetPlusCount(row['addedlines'])
commit.SetMinusCount(row['removedlines'])
commit.SetDescriptionID(row['descid'])
commit.SetRelevance(row['relevance'])
commit.SetSnippet(row['snippet'])
query.AddCommit(commit)
@ -745,6 +793,7 @@ class CheckinDatabase:
return commit
# Now unused
def sql_delete(self, table, key, value, keep_fkey = None):
sql = "DELETE FROM %s WHERE %s=%%s" % (table, key)
sql_args = (value, )
@ -1114,30 +1163,11 @@ def CreateCheckinQuery():
return CheckinDatabaseQuery()
def ConnectDatabase(cfg, authorizer=None, readonly=0):
if readonly:
user = cfg.cvsdb.readonly_user
passwd = cfg.cvsdb.readonly_passwd
else:
user = cfg.cvsdb.user
passwd = cfg.cvsdb.passwd
db = CheckinDatabase(
host = cfg.cvsdb.host,
port = cfg.cvsdb.port,
socket = cfg.cvsdb.socket,
user = user,
passwd = passwd,
database = cfg.cvsdb.database_name,
row_limit = cfg.cvsdb.row_limit,
min_relevance = cfg.cvsdb.fulltext_min_relevance,
readonly = readonly,
authorizer = authorizer,
index_content = cfg.cvsdb.index_content,
enable_snippets = cfg.cvsdb.enable_snippets,
sphinx_host = cfg.cvsdb.sphinx_host,
sphinx_port = int(cfg.cvsdb.sphinx_port),
sphinx_socket = cfg.cvsdb.sphinx_socket,
sphinx_index = cfg.cvsdb.sphinx_index,
content_max_size = cfg.cvsdb.content_max_size,
cfg = cfg,
cfg = cfg.cvsdb,
guesser = cfg.guesser(),
)
db.Connect()
return db

View File

@ -96,6 +96,13 @@ _RCSDIFF_ERROR = 'error'
# special characters that don't need to be URL encoded
_URL_SAFE_CHARS = "/*~"
class Repo:
def __init__(self, repos, rootname, auth, rootpath, roottype):
self.repos = repos
self.rootname = rootname
self.auth = auth
self.rootpath = rootpath
self.roottype = roottype
class Request:
def __init__(self, server, cfg):
@ -134,6 +141,15 @@ class Request:
def utf8(self, value):
return self.cfg.guesser().utf8(value)
def get_repo(self, rootname):
if rootname in self.all_repos:
return self.all_repos[rootname]
try:
r = self.all_repos[rootname] = self.create_repos(rootname)
except:
r = None
return r
def create_repos(self, rootname):
if not rootname:
return None
@ -163,13 +179,7 @@ class Request:
return None
repos.open()
return {
'repos' : repos,
'rootname' : rootname,
'auth' : authorizer,
'rootpath' : rootpath,
'roottype' : roottype,
}
return Repo(repos, rootname, authorizer, rootpath, roottype)
return None
@ -278,9 +288,9 @@ class Request:
if self.rootname:
rcr = self.create_repos(self.rootname)
if rcr:
self.repos = rcr['repos']
self.rootpath = rcr['rootpath']
self.auth = rcr['auth']
self.repos = rcr.repos
self.rootpath = rcr.rootpath
self.auth = rcr.auth
# Overlay root-specific options.
cfg.overlay_root_options(self.rootname)
if self.repos.roottype() == vclib.CVS:
@ -3778,8 +3788,6 @@ def build_commit(request, files, max_files, dir_strip, format):
plus_count = 0
minus_count = 0
found_unreadable = 0
if not request.all_repos:
request.all_repos = {}
for f in files:
dirname = f.GetDirectory()
@ -3798,19 +3806,14 @@ def build_commit(request, files, max_files, dir_strip, format):
# Check path access (since the commits database logic bypasses the
# vclib layer and, thus, the vcauth stuff that layer uses).
my_repos = request.all_repos.get(f.GetRepository(), '')
if not my_repos:
try:
my_repos = request.all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
except:
my_repos = None
my_repos = request.get_repo(f.GetRepository())
if not my_repos:
return None
if my_repos['roottype'] == 'cvs':
# we store UTF-8 in the DB
if my_repos.roottype == 'cvs':
# we store UTF8 in the DB
try: where = where.decode('utf-8')
except: pass
# FIXME maybe store "real" filesystem path in the DB instead of having such setting?
# FIXME maybe also store "real" non-UTF8 filesystem path in the DB instead of having such setting?
try: where = where.encode(cfg.options.cvs_ondisk_charset)
except: pass
path_parts = _path_parts(where)
@ -3818,13 +3821,13 @@ def build_commit(request, files, max_files, dir_strip, format):
# In CVS, we can actually look at deleted revisions; in Subversion
# we can't -- we'll look at the previous revision instead.
exam_rev = rev
if my_repos['roottype'] == 'svn' and change_type == 'Remove':
if my_repos.roottype == 'svn' and change_type == 'Remove':
exam_rev = rev_prev
if path_parts:
# Skip files in CVSROOT if asked to hide such.
if cfg.options.hide_cvsroot \
and is_cvsroot_path(my_repos['roottype'], path_parts):
and is_cvsroot_path(my_repos.roottype, path_parts):
found_unreadable = 1
continue
@ -3838,7 +3841,7 @@ def build_commit(request, files, max_files, dir_strip, format):
# but to omit as unauthorized paths the authorization logic
# can't find.
try:
readable = vclib.check_path_access(my_repos['repos'], path_parts,
readable = vclib.check_path_access(my_repos.repos, path_parts,
None, exam_rev)
except vclib.ItemNotFound:
readable = 0
@ -3846,30 +3849,30 @@ def build_commit(request, files, max_files, dir_strip, format):
found_unreadable = 1
continue
if my_repos['roottype'] == 'svn':
if my_repos.roottype == 'svn':
params = { 'pathrev': exam_rev }
else:
params = { 'revision': exam_rev, 'pathrev': f.GetBranch() or None }
dir_href = request.get_url(root=my_repos['rootname'], view_func=view_directory,
dir_href = request.get_url(root=my_repos.rootname, view_func=view_directory,
where=dirname, pathtype=vclib.DIR,
params=params, escape=1)
log_href = request.get_url(root=my_repos['rootname'], view_func=view_log,
log_href = request.get_url(root=my_repos.rootname, view_func=view_log,
where=where, pathtype=vclib.FILE,
params=params, escape=1)
diff_href = view_href = download_href = None
if 'markup' in cfg.options.allowed_views:
view_href = request.get_url(root=my_repos['rootname'], view_func=view_markup,
view_href = request.get_url(root=my_repos.rootname, view_func=view_markup,
where=where, pathtype=vclib.FILE,
params=params, escape=1)
if 'co' in cfg.options.allowed_views:
download_href = request.get_url(root=my_repos['rootname'], view_func=view_checkout,
download_href = request.get_url(root=my_repos.rootname, view_func=view_checkout,
where=where, pathtype=vclib.FILE,
params=params, escape=1)
path_prev = None
if change_type == 'Add' and my_repos['roottype'] == 'svn':
if change_type == 'Add' and my_repos.roottype == 'svn':
try:
rev_prev, path_prev = my_repos['repos'].last_rev(where, int(rev), int(rev_prev))
rev_prev, path_prev = my_repos.repos.last_rev(where, int(rev), int(rev_prev))
if int(rev_prev) == int(rev):
path_prev = None
except:
@ -3881,7 +3884,7 @@ def build_commit(request, files, max_files, dir_strip, format):
'r2': rev,
'diff_format': None
})
diff_href = request.get_url(root=my_repos['rootname'], view_func=view_diff,
diff_href = request.get_url(root=my_repos.rootname, view_func=view_diff,
where=where, pathtype=vclib.FILE,
params=diff_href_params, escape=1)
mime_type = calculate_mime_type(request, path_parts, exam_rev)
@ -3946,12 +3949,12 @@ def build_commit(request, files, max_files, dir_strip, format):
commit.short_log = format_log(desc, cfg, format != 'rss')
commit.author = request.server.escape(author)
commit.rss_date = make_rss_time_string(date, request.cfg)
if my_repos['roottype'] == 'svn':
if my_repos.roottype == 'svn':
commit.rev = commit_rev
commit.rss_url = '%s://%s%s' % \
(request.server.getenv("HTTPS") == "on" and "https" or "http",
request.server.getenv("HTTP_HOST"),
request.get_url(root=my_repos['rootname'], view_func=view_revision,
request.get_url(root=my_repos.rootname, view_func=view_revision,
params={'revision': commit.rev},
escape=1))
else: