Bug 82651 - Retrieve text file contents from SVN when searching (debug)

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1444 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-10-25 14:15:38 +00:00 committed by Vitaliy Filippov
parent 928bf6f1a6
commit 76987f0851
4 changed files with 56 additions and 46 deletions

View File

@ -324,8 +324,8 @@ class SvnRev:
)
# Try to extract content using Tika from binary documents
# Do not index contents of text files - it can be easily retrieved later
if mime and not mime.startswith('text/') and not
(mime.startswith('application/') and mime.endswith('xml')):
if (mime and not mime.startswith('text/') and not
(mime.startswith('application/') and mime.endswith('xml'))):
content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
self.changes.append((path, action, plus, minus, content, mime))

View File

@ -345,7 +345,7 @@ class Config:
self.cvsdb.sphinx_socket = ''
self.cvsdb.sphinx_index = ''
self.cvsdb.sphinx_preformatted_mime = 'text/(?!html|xml).*'
self.cvsdb.sphinx_snippet_options =
self.cvsdb.sphinx_snippet_options = \
'around: 15\n'\
'limit: 200\n'\
'before_match: <span style="color:red">\n'\

View File

@ -48,14 +48,14 @@ class CheckinDatabase:
self._socket = cfg.socket
self._user = readonly and cfg.user or cfg.readonly_user
self._passwd = readonly and cfg.passwd or cfg.readonly_passwd
self._database = cfg.database
self._database = cfg.database_name
self._row_limit = cfg.row_limit
self._version = None
self._min_relevance = cfg.fulltext_min_relevance
# Sphinx settings
self.index_content = cfg.index_content
self.content_max_size = cfg.context_max_size
self.content_max_size = cfg.content_max_size
if self.content_max_size > 4*1024*1024 or self.content_max_size <= 0:
self.content_max_size = 4*1024*1024
self.enable_snippets = cfg.enable_snippets
@ -65,12 +65,18 @@ class CheckinDatabase:
self.sphinx_index = cfg.sphinx_index
# Snippet settings
self.snippet_options = {}
for i in cfg.sphinx_snippet_options.split('\n'):
(a, b) = i.split(':')
if b[0] == ' ':
b = b[1:]
b = b.replace('\\n', '\n')
self.snippet_options[a] = b
i = i.split(':', 1)
if len(i) == 2:
(a, b) = i
if b[0] == ' ':
b = b[1:]
b = b.replace('\\n', '\n')
if re.match('\d+', b):
b = int(b)
self.snippet_options[a] = b
self.snippet_options_str = ''.join(', %s AS '+i for i in self.snippet_options)
self.preformatted_mime = cfg.sphinx_preformatted_mime
if 'before_match' in self.snippet_options:
self.snippet_beforematch_html = cgi.escape(self.snippet_options['before_match'])
@ -107,7 +113,7 @@ class CheckinDatabase:
self.sphinx = dbi.connect(self.sphinx_host, self.sphinx_port, self.sphinx_socket, '', '', '')
def utf8(self, value):
return self.cfg.guesser().utf8(value)
return self.guesser.utf8(value)
def sql_get_id(self, table, column, value, auto_set):
value = self.utf8(value)
@ -629,19 +635,19 @@ class CheckinDatabase:
return True
# Build a snippet using Sphinx
def get_snippet(self, sph, content, query):
def get_snippet(self, sph, content, query, mimetype):
sph.execute(
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
(content, self.sphinx_index, query) + tuple(snippet_options.values())
'CALL SNIPPETS(%s, %s, %s'+self.snippet_options_str+')',
(content, self.sphinx_index, query) + tuple(self.snippet_options.values())
)
s, = sph.fetchone()
s = cgi.escape(s)
if re.match(self.preformatted_mime, mimetype):
s = s.replace('\n', '<br />')
if 'before_match' in self.snippet_options:
s = s.replace(self.sphinx_beforematch_html, self.snippet_options['before_match'])
s = s.replace(self.snippet_beforematch_html, self.snippet_options['before_match'])
if 'after_match' in self.snippet_options:
s = s.replace(self.sphinx_aftermatch_html, self.snippet_options['after_match'])
s = s.replace(self.snippet_aftermatch_html, self.snippet_options['after_match'])
return s
# Fetch snippets for a query result
@ -653,53 +659,57 @@ class CheckinDatabase:
# Fetch binary file contents, stored in MySQL
cursor.execute(
'SELECT id, content FROM contents WHERE id IN (' +
','.join(sphinx_rows.keys()) + ')'
','.join(rows.keys()) + ')'
)
# Build snippets
for (docid, content) in cursor:
rows[docid]['snippet'] = self.get_snippet(sph, content, query.content_query)
rows[str(docid)]['snippet'] = self.get_snippet(sph, content, query.content_query, rows[str(docid)]['mimetype'])
for docid in rows:
mime = rows[docid]['mimetype']
if not rows[docid]['snippet'] and mime and
(mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
if (not rows[docid]['snippet'] and mime and
(mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')))):
# Fetch text file contents directly from SVN
repo = rows[docid]['repository_name']
path = rows[docid]['dir_name']+'/'+rows[docid]['file_name']
path = rows[docid]['dir_name'].split('/') + [rows[docid]['file_name']]
revision = rows[docid]['revision']
fp = None
try:
fp, = self.request.get_repo(repo).repo.openfile(path, revision)
fp, _ = self.request.get_repo(repo).repos.openfile(path, revision)
content = fp.read()
fp.close()
content = repo.guesser.utf8(content)
content = self.guesser.utf8(content)
except:
if fp: fp.close()
content = None
raise
# Build snippet
if content:
rows[docid]['snippet'] = self.get_snippet(sph, content, query.content_query)
rows[docid]['snippet'] = self.get_snippet(sph, content, query.content_query, rows[docid]['mimetype'])
# Run query and return all rows as dictionaries
def selectall(db, sql, args = (), key = None):
def selectall(self, db, sql, args = (), key = None):
cursor = db.cursor()
cursor.execute(sql, args)
desc = list(r[0] for r in cursor.description)
if key:
rows = {}
for i in cursor:
r = dict(zip(cursor.description, i))
rows[r[key]] = r
r = dict(zip(desc, i))
rows[str(r[key])] = r
else:
rows = []
for i in cursor:
rows.append(dict(zip(cursor.description, i)))
rows.append(dict(zip(desc, i)))
return rows
# Run content query
def RunSphinxQuery(self, query):
cursor = self.db.cursor()
rows = selectall(self.sphinx, self.CreateSphinxQueryString(query))
rows = self.selectall(self.sphinx, self.CreateSphinxQueryString(query))
if len(rows):
m_rows = selectall(self.db, self.CreateIdQueryString(sphinx_rows.keys()), (), 'id')
for r in rows:
r['id'] = str(r['id'])
m_rows = self.selectall(self.db, self.CreateIdQueryString((r['id'] for r in rows)), (), 'id')
new_rows = []
# Check rights BEFORE fetching snippets
for i in rows:
@ -717,7 +727,7 @@ class CheckinDatabase:
self.fetch_snippets(query, m_rows)
for i in rows:
if i['id'] in m_rows:
new_rows.push(m_rows[i['id']])
new_rows.append(m_rows[i['id']])
rows = new_rows
else:
rows = []
@ -729,13 +739,13 @@ class CheckinDatabase:
rows = self.RunSphinxQuery(query)
else:
# Use regular queries when document content is not searched
rows = selectall(self.db, self.CreateSQLQueryString(query))
rows = self.selectall(self.db, self.CreateSQLQueryString(query))
# Check rights
rows = r for r in rows if self.check_commit_access(
rows = (r for r in rows if self.check_commit_access(
r['repository_name'],
r['dir_name'],
r['file_name'],
r['revision'])
r['revision']))
# Convert rows to commit objects
for row in rows:
@ -752,7 +762,7 @@ class CheckinDatabase:
commit.SetDirectoryID(row['dirid'])
commit.SetRevision(row['revision'])
commit.SetRepositoryID(row['repositoryid'])
commit.SetAuthorID(row['authorid'])
commit.SetAuthorID(row['whoid'])
commit.SetBranchID(row['branchid'])
commit.SetPlusCount(row['addedlines'])
commit.SetMinusCount(row['removedlines'])
@ -1162,18 +1172,18 @@ def CreateCommit():
def CreateCheckinQuery():
return CheckinDatabaseQuery()
def ConnectDatabase(cfg, authorizer=None, readonly=0):
def ConnectDatabase(cfg, request=None, readonly=0):
db = CheckinDatabase(
readonly = readonly,
authorizer = authorizer,
request = request,
cfg = cfg.cvsdb,
guesser = cfg.guesser(),
)
db.Connect()
return db
def ConnectDatabaseReadOnly(cfg, authorizer):
return ConnectDatabase(cfg, authorizer, 1)
def ConnectDatabaseReadOnly(cfg, request):
return ConnectDatabase(cfg, request, 1)
def GetCommitListFromRCSFile(repository, path_parts, revision=None):
commit_list = []

View File

@ -3593,7 +3593,7 @@ def is_querydb_nonempty_for_root(request):
if request.cfg.cvsdb.check_database_for_root:
global cvsdb
import cvsdb
db = cvsdb.ConnectDatabaseReadOnly(request.cfg, request.auth)
db = cvsdb.ConnectDatabaseReadOnly(request.cfg, request)
repos_root, repos_dir = cvsdb.FindRepository(db, request.rootpath)
if repos_root:
return 1
@ -3992,16 +3992,16 @@ def query_is_unsecure_patch(request, commits):
lr = {}
for commit in commits:
for fileinfo in commit.files:
fn = _path_join([fileinfo.root['rootname'], fileinfo.dir, fileinfo.file])
fn = _path_join([fileinfo.root.rootname, fileinfo.dir, fileinfo.file])
if mr.get(fn, ''):
pr = mr[fn]
if fileinfo.root['roottype'] == 'svn':
pr = lr[fileinfo.root['rootname']]
if fileinfo.root.roottype == 'svn':
pr = lr[fileinfo.root.rootname]
pr = prev_rev(pr)
if rev_cmp(pr, fileinfo.rev) > 0:
return True
if fileinfo.root['roottype'] == 'svn':
lr[fileinfo.root['rootname']] = fileinfo.rev
if fileinfo.root.roottype == 'svn':
lr[fileinfo.root.rootname] = fileinfo.rev
mr[fn] = fileinfo.rev
return None
@ -4145,7 +4145,7 @@ def view_query(request):
global cvsdb
import cvsdb
db = cvsdb.ConnectDatabaseReadOnly(cfg, request.auth)
db = cvsdb.ConnectDatabaseReadOnly(cfg, request)
repos_dir = []
if not repos_root and request.rootpath:
repos_root, repos_dir = cvsdb.FindRepository(db, request.rootpath)