Bug 82651 - Tika&Sphinx&chardet content indexing (done!)

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1388 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-09-27 16:13:53 +00:00 committed by Vitaliy Filippov
parent 83c7e6fe49
commit e363cf19b1
10 changed files with 761 additions and 332 deletions

View File

@ -44,6 +44,7 @@ CREATE TABLE branches (
DROP TABLE IF EXISTS checkins;
CREATE TABLE checkins (
id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
type enum('Change','Add','Remove'),
ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
whoid mediumint(9) DEFAULT '0' NOT NULL,
@ -57,7 +58,7 @@ CREATE TABLE checkins (
removedlines int(11) DEFAULT '0' NOT NULL,
descid mediumint(9),
UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
KEY repository_when (repositoryid,ci_when),
KEY repositoryid_when (repositoryid,ci_when),
KEY ci_when (ci_when),
KEY whoid (whoid,ci_when),
KEY dirid (dirid),
@ -138,6 +139,7 @@ CREATE TABLE branches (
DROP TABLE IF EXISTS commits;
CREATE TABLE commits (
id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
type enum('Change','Add','Remove'),
ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
whoid mediumint(9) DEFAULT '0' NOT NULL,
@ -151,9 +153,9 @@ CREATE TABLE commits (
removedlines int(11) DEFAULT '0' NOT NULL,
descid mediumint(9),
UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
KEY repositoryid_when (repositoryid,ci_when),
KEY ci_when (ci_when),
KEY whoid (whoid),
KEY repositoryid_2 (repositoryid),
KEY whoid (whoid,ci_when),
KEY dirid (dirid),
KEY fileid (fileid),
KEY branchid (branchid),

View File

@ -58,7 +58,11 @@ else:
import os
import string
import socket
import select
import re
import mimetypes
import time
import svn.core
import svn.repos
@ -68,14 +72,20 @@ import svn.delta
import cvsdb
import viewvc
import vclib
from viewvcmagic import ContentMagic
class SvnRepo:
"""Class used to manage a connection to a SVN repository."""
def __init__(self, path):
# Open the Subversion repository at `path` and record the collaborators
# used for content indexing:
#   index_content      - flag enabling file-content extraction for cvsdb
#   tika_client        - optional TikaClient used to extract text from binary files
#   guesser            - optional ContentMagic used to guess MIME types / charsets
#   svn_ignore_mimetype - when true, the svn:mime-type property is ignored
def __init__(self, path, index_content = None, tika_client = None, guesser = None,
svn_ignore_mimetype = False):
self.path = path
# Open repository and filesystem handles via the SVN bindings
self.repo = svn.repos.svn_repos_open(path)
self.fs = svn.repos.svn_repos_fs(self.repo)
# Youngest (highest) revision number currently in the repository
self.rev_max = svn.fs.youngest_rev(self.fs)
self.index_content = index_content
self.tika_client = tika_client
self.guesser = guesser
self.svn_ignore_mimetype = svn_ignore_mimetype
def __getitem__(self, rev):
if rev is None:
rev = self.rev_max
@ -128,6 +138,74 @@ def _get_diff_counts(diff_fp):
line = diff_fp.readline()
return plus, minus
class TikaClient:
    """TCP client for an Apache Tika server running in text-extraction mode.

    The server reads a document on its socket input and writes the extracted
    plain text back (start it with
    'java -jar tika-app-0.9.jar -p PORT -t -eutf-8', see the config sample).
    """

    def __init__(self, tika_server, mime_types):
        """tika_server: 'host:port' string of the running Tika server.
        mime_types: whitespace-separated MIME type patterns Tika can handle;
        '*' acts as a wildcard (e.g. 'application/vnd.ms-*')."""
        self.tika_server = tika_server
        self.mime_types = mime_types
        # Split address into (host, port)
        addr = tika_server.split(':')
        if len(addr) != 2:
            raise Exception('tika_server value is incorrect: \''+tika_server+'\', please use \'host:port\' format')
        self.addr = (addr[0], int(addr[1]))
        # Build one anchored regexp out of all patterns ('*' -> '.*')
        patterns = re.split(r'\s+', mime_types.strip())
        self.mime_regexp = re.compile(
            '|'.join('^'+re.escape(p).replace('\\*', '.*')+'$' for p in patterns))

    def get_text(self, filename, mime_type, log_filename):
        """Extract text content from `filename` (of type `mime_type`) using
        the Tika server. Returns '' when the MIME type is unsupported, the
        file is empty, or on any error (errors are logged, never raised).
        `log_filename` is only used in log messages."""
        if not self.mime_regexp.match(mime_type):
            # Tika can't handle this mime type, return nothing
            return ''
        fd = None
        s = None
        text = ''
        fsize = 0
        try:
            # Read original file
            fd = open(filename, 'rb')
            data = fd.read()
            fsize = len(data)
            if not fsize:
                return ''
            # Connect to Tika and switch the socket to non-blocking mode
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.connect(self.addr)
            s.setblocking(0)
            sockfd = s.fileno()
            # Tika is somewhat delicate about network IO, so:
            # read and write concurrently using the poll(2) system call
            p = select.poll()
            p.register(sockfd)
            while 1:
                fds = p.poll()
                if not fds:
                    break
                (pollfd, event) = fds[0]
                if event & select.POLLIN:
                    # An OS error or empty data means EOF
                    try:
                        part = os.read(sockfd, 65536)
                    except OSError:
                        break
                    if not part:
                        break
                    text += part
                if event & select.POLLOUT:
                    if not len(data):
                        # All input sent: shutdown output and forget about POLLOUT
                        s.shutdown(socket.SHUT_WR)
                        p.modify(sockfd, select.POLLIN)
                    else:
                        # Write and consume some data
                        written = os.write(sockfd, data)
                        data = data[written:]
            if len(text) == 0:
                raise Exception('Empty response from Tika server')
            print("Extracted %d bytes from %s (%s) of size %d" % (len(text), log_filename, mime_type, fsize))
        except Exception as e:
            # Best-effort: log and return whatever was extracted (usually '')
            print("Error extracting text from %s (%s) of size %d: %s" % (log_filename, mime_type, fsize, str(e)))
        finally:
            if fd:
                fd.close()
            if s:
                s.close()
        return text
class SvnRev:
"""Class used to hold information about a particular revision of
@ -184,19 +262,53 @@ class SvnRev:
['-b', '-B'])
diff_fp = diffobj.get_pipe()
plus, minus = _get_diff_counts(diff_fp)
# TODO Indexing file contents
# For binary files: svn.fs.contents_changed(root1, path1, root2, path2)
# Temp file with contents is at: diffobj.tempfile2
# Apache Tika server may even be at another host!
# CustIS Bug 50473: a workaround for svnlib behaviour in file movements (FILE1 -> FILE2 + FILE1 -> null)
if change.base_path:
if not change.path and changes_hash.get(change.base_path, '') != '':
if not change.path and change.base_path in changes_hash:
minus = 0
elif change.path:
changes_hash[change.base_path] = change.path
self.changes.append((path, action, plus, minus))
content = ''
mime = ''
# need to check if binary file's content changed when copying,
# if not, don't extract it, just get it from previous revision later
if repo.index_content and change.path and (not change.base_path
or svn.fs.contents_changed(
base_root and base_root or None,
base_root and change.base_path or None,
fsroot, change.path
)):
props = svn.fs.node_proplist(fsroot, change.path)
if not repo.svn_ignore_mimetype:
mime = props.get('svn:mime-type', None)
else:
mime = None
mime = repo.guesser.guess_mime(
mime,
os.path.basename(change.path),
diffobj.tempfile2
)
# Read and guess charset by ourselves for text files
if mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
try:
fd = open(diffobj.tempfile2, 'rb')
content = fd.read()
fd.close()
except: pass
# Guess charset
if content:
content, charset = repo.guesser.guess_charset(content)
if charset:
content = content.encode('utf-8')
print 'Guessed %s for %s' % (charset, change.path)
else:
print 'Failed to guess charset for %s, not indexing' % (change.path, )
# Try to extract content using Tika from binary documents
elif repo.tika_client:
content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
self.changes.append((path, action, plus, minus, content, mime))
def _get_root_for_rev(self, rev):
"""Fetch a revision root from a cache of such, or a fresh root
@ -217,7 +329,7 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
if verbose: print "skipped (no changes)."
return
for (path, action, plus, minus) in revision.changes:
for (path, action, plus, minus, content, mime) in revision.changes:
directory, file = os.path.split(path)
commit = cvsdb.CreateCommit()
commit.SetRepository(repo.path)
@ -230,6 +342,8 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
commit.SetPlusCount(plus)
commit.SetMinusCount(minus)
commit.SetBranch(None)
commit.SetContent(content)
commit.SetMimeType(mime)
if action == 'add':
commit.SetTypeAdd()
@ -268,7 +382,16 @@ def main(command, repository, revs=[], verbose=0, force=0):
sys.stderr.write("ERROR: " + str(e) + "\n")
sys.exit(1)
repo = SvnRepo(repository)
tika_client = None
if cfg.utilities.tika_server:
tika_client = TikaClient(cfg.utilities.tika_server, cfg.utilities.tika_mime_types)
repo = SvnRepo(
path = repository,
index_content = cfg.cvsdb.index_content,
tika_client = tika_client,
guesser = cfg.guesser(),
svn_ignore_mimetype = cfg.options.svn_ignore_mimetype,
)
if command == 'rebuild' or (command == 'update' and not revs):
for rev in range(repo.rev_max+1):
handle_revision(db, command, repo, rev, verbose)

View File

@ -1,6 +1,6 @@
#---------------------------------------------------------------------------
#
# Configuration file for ViewVC
# Configuration file for ViewVC (4IntraNet patched version)
#
# Information on ViewVC is located at the following web site:
# http://viewvc.org/
@ -244,8 +244,8 @@ cvsnt =
# See also bin/cvsnt-rcsfile-inetd.pl
#rcsfile_socket = 'host:port'
# Example: rcsfile_socket = '127.0.0.1:8071'
#rcsfile_socket = host:port
# Example: rcsfile_socket = 127.0.0.1:8071
# Subversion command-line client, used for viewing Subversion repositories
svn =
@ -259,6 +259,32 @@ diff =
cvsgraph =
# cvsgraph = /usr/local/bin/cvsgraph
# Apache Tika TCP server host and port, used to extract text from binary documents
# Note that as of 2011-09-12, Tika 0.9 has a bug which leads to hangups on processing
# M$Word documents in server mode. So you must use the fixed version, downloaded from:
# http://wiki.4intra.net/public/tika-app-0.9-fix-TIKA709.jar
# (mirror) http://code.google.com/p/mediawiki4intranet/downloads/detail?name=tika-app-0.9-fix-TIKA709.jar
# Or apply the patch by yourself and rebuild Tika from source, see patch here:
# https://issues.apache.org/jira/browse/TIKA-709
# Tika server should be started with command 'java -jar tika-app-0.9.jar -p PORT -t -eutf-8'
#tika_server = host:port
# Example: tika_server = 127.0.0.1:8072
# This lists MIME types that can be processed by Tika
# You may change it if your Tika is newer than 0.9 and supports more formats
# (note) *+xml examples: xhtml+xml, rss+xml, atom+xml, docbook+xml, rdf+xml
tika_mime_types =
text/*
application/*+xml
application/xml
application/vnd.oasis.opendocument.*
application/vnd.openxmlformats*
application/vnd.ms-*
application/msaccess
application/msword
application/pdf
application/rtf
#---------------------------------------------------------------------------
[options]
@ -494,6 +520,12 @@ short_log_len = 80
# should we colorize known file content syntaxes? (requires Pygments module)
enable_syntax_coloration = 1
# detect_encoding: Should we attempt to detect versioned file
# character encodings? [Requires 'chardet' module]
# Used in file list, file content display and indexing
# See also options.encodings for naive guessing.
detect_encoding = 1
# Use CvsGraph. See http://www.akhphd.au.dk/~bertho/cvsgraph/ for
# documentation and download.
use_cvsgraph = 0
@ -544,6 +576,17 @@ use_pagesize = 0
# Set to 0 to disable the limit.
limit_changes = 100
# You can also use primitive charset guessing instead of chardet (options.detect_encoding)
# Just set this to the list of possible charsets in your repository.
# ViewVC will simply try to decode content using each of them, and pick
# the first which succeeds. UTF-8 is always tried automatically.
#encodings = cp1251:iso-8859-1
# Sadly this is also required - for back-links from query results to files
# in CVS, because it doesn't recode file names to UTF-8 as Subversion does.
# Just set to cp1251 if you work with your CVS from Windowz.
#cvs_ondisk_charset = cp1251
#---------------------------------------------------------------------------
[templates]
@ -588,9 +631,14 @@ limit_changes = 100
# Set to 1 to enable the database integration feature, 0 otherwise.
enabled = 0
# Database hostname and port.
# Set to 1 to enable indexing of file contents using Sphinx and Tika
index_content = 0
# Database hostname, port, and socket
#host = localhost
#port = 3306
# On Debian Linux, enable this:
#socket = /var/run/mysqld/mysqld.sock
# ViewVC database name.
#database_name = ViewVC
@ -605,6 +653,30 @@ enabled = 0
#readonly_user =
#readonly_passwd =
# ViewVC can use Sphinx (http://sphinxsearch.com) full-text search engine
# to index file contents with full history and then search over them.
# Also, Apache Tika console application can be used in TCP server mode to
# add support for indexing binary documents (M$Word, PDF, etc.).
# See tika_server in [utilities].
# Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index.
# Index must be created in sphinx.conf by hand and have the following fields:
# rt_field = content
# rt_attr_string = content
# rt_attr_string = mimetype
# rt_attr_timestamp = ci_when
# rt_attr_uint = whoid
# rt_attr_uint = repositoryid
# rt_attr_uint = dirid
# rt_attr_uint = fileid
# rt_attr_uint = revision
# rt_attr_uint = branchid
# Sphinx connection parameters:
#sphinx_host =
#sphinx_port =
#sphinx_socket = /var/run/sphinxql.sock
#sphinx_index = viewvc
# Limit the number of rows returned by a given query to this number.
#row_limit = 1000

View File

@ -24,6 +24,7 @@ import vclib.ccvs
import vclib.svn
import cvsdb
import viewvc
from viewvcmagic import ContentMagic
#########################################################################
#
@ -47,6 +48,7 @@ class Config:
'root_parents', 'allowed_views', 'mime_types_files')
def __init__(self):
self.__guesser = None
for section in self._sections:
setattr(self, section, _sub_config())
@ -66,7 +68,6 @@ class Config:
if rootname:
self._process_root_options(self.parser, rootname)
self.expand_root_parents()
cvsdb.setencs(self.options.encodings.split(':'))
r = {}
for i in self.rewritehtml.__dict__.keys():
if i[-8:] == '.replace':
@ -237,6 +238,11 @@ class Config:
params['__config'] = self
return params
# Return a lazily-created, cached ContentMagic instance configured with
# the options.encodings charset list; shared by cvsdb and the indexers
# for MIME-type and charset guessing.
def guesser(self):
if not self.__guesser:
self.__guesser = ContentMagic(self.options.encodings)
return self.__guesser
def set_defaults(self):
"Set some default values in the configuration."
@ -258,6 +264,8 @@ class Config:
self.utilities.svn = ''
self.utilities.diff = ''
self.utilities.cvsgraph = ''
self.utilities.tika_server = ''
self.utilities.tika_mime_types = ''
self.options.root_as_url_component = 1
self.options.checkout_magic = 0
@ -302,7 +310,7 @@ class Config:
self.options.limit_changes = 100
self.options.cvs_ondisk_charset = 'cp1251'
self.options.binary_mime_re = '^(?!text/|.*\Wxml)'
self.options.encodings = 'utf-8:cp1251:iso-8859-1'
self.options.encodings = 'cp1251:iso-8859-1'
self.templates.diff = None
self.templates.directory = None
@ -316,6 +324,7 @@ class Config:
self.templates.roots = None
self.cvsdb.enabled = 0
self.cvsdb.index_content = 0
self.cvsdb.host = ''
self.cvsdb.port = 3306
self.cvsdb.socket = ''
@ -329,6 +338,11 @@ class Config:
self.cvsdb.check_database_for_root = 0
self.cvsdb.fulltext_min_relevance = 0.2
self.cvsdb.sphinx_host = ''
self.cvsdb.sphinx_port = 3307
self.cvsdb.sphinx_socket = ''
self.cvsdb.sphinx_index = ''
def _startswith(somestr, substr):
return somestr[:len(substr)] == substr

View File

@ -15,6 +15,7 @@ import sys
import string
import time
import re
import cgi
import vclib
import dbi
@ -36,22 +37,12 @@ error = "cvsdb error"
## defined to actually be complete; it should run well off of any DBI 2.0
## compliant database interface
encs = [ "utf-8", "cp1251", "iso-8859-1" ]
def utf8string(value):
for e in encs:
try:
value = value.decode(e)
break
except: pass
return value.encode("utf-8")
def setencs(e):
global encs
encs = e
class CheckinDatabase:
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, authorizer = None):
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
sphinx_socket = None, sphinx_index = None):
self.cfg = cfg
self._host = host
self._port = port
self._socket = socket
@ -63,11 +54,21 @@ class CheckinDatabase:
self._min_relevance = min_relevance
self.authorizer = authorizer
# Sphinx settings
self.index_content = index_content
self.sphinx_host = sphinx_host
self.sphinx_port = sphinx_port
self.sphinx_socket = sphinx_socket
self.sphinx_index = sphinx_index
## database lookup caches
self._get_cache = {}
self._get_id_cache = {}
self._desc_id_cache = {}
# Sphinx connection None by default
self.sphinx = None
def Connect(self):
self.db = dbi.connect(
self._host, self._port, self._socket, self._user, self._passwd, self._database)
@ -83,12 +84,17 @@ class CheckinDatabase:
else:
self._version = 0
if self._version > CURRENT_SCHEMA_VERSION:
raise DatabaseVersionError("Database version %d is newer than the "
"last version supported by this "
"software." % (self._version))
raise DatabaseVersionError("Database version %d is newer than the "
"last version supported by this "
"software." % (self._version))
if self.index_content:
self.sphinx = dbi.connect(self.sphinx_host, self.sphinx_port, self.sphinx_socket, '', '', '')
# Re-encode `value` to UTF-8 via the configured ContentMagic guesser
# (replaces the old module-level utf8string() helper).
def utf8(self, value):
return self.cfg.guesser().utf8(value)
def sql_get_id(self, table, column, value, auto_set):
value = utf8string(value)
value = self.utf8(value)
sql = "SELECT id FROM %s WHERE %s=%%s" % (table, column)
sql_args = (value, )
@ -257,7 +263,7 @@ class CheckinDatabase:
return self.get_list("repositories", repository)
def SQLGetDescriptionID(self, description, auto_set = 1):
description = utf8string(description)
description = self.utf8(description)
## lame string hash, blame Netscape -JMP
hash = len(description)
@ -338,48 +344,55 @@ class CheckinDatabase:
self.AddCommit(commit)
def AddCommit(self, commit):
# Insert (or update) one commit row in MySQL and, when content indexing is
# enabled, mirror the commit (with extracted text content and MIME type)
# into the Sphinx realtime index.
# NOTE(review): this span is a rendered diff without +/- markers. The plain
# assignments below (ci_when .. description_id) together with the
# REPLACE INTO / sql_args statements are the REMOVED pre-change code; the
# `props` dict and the INSERT ... ON DUPLICATE KEY UPDATE path are the
# ADDED replacement. Verify against the real file before relying on order.
ci_when = dbi.DateTimeFromTicks(commit.GetTime() or 0.0)
ci_type = commit.GetTypeString()
who_id = self.GetAuthorID(commit.GetAuthor())
repository_id = self.GetRepositoryID(commit.GetRepository())
directory_id = self.GetDirectoryID(commit.GetDirectory())
file_id = self.GetFileID(commit.GetFile())
revision = commit.GetRevision()
sticky_tag = "NULL"
branch_id = self.GetBranchID(commit.GetBranch())
plus_count = commit.GetPlusCount() or '0'
minus_count = commit.GetMinusCount() or '0'
description_id = self.GetDescriptionID(commit.GetDescription())
# Column name -> value map; keys double as the SQL column list below
props = {
'type' : commit.GetTypeString(),
'ci_when' : dbi.DateTimeFromTicks(commit.GetTime() or 0.0),
'whoid' : self.GetAuthorID(commit.GetAuthor()),
'repositoryid' : self.GetRepositoryID(commit.GetRepository()),
'dirid' : self.GetDirectoryID(commit.GetDirectory()),
'fileid' : self.GetFileID(commit.GetFile()),
'revision' : commit.GetRevision(),
'branchid' : self.GetBranchID(commit.GetBranch()),
'addedlines' : commit.GetPlusCount() or '0',
'removedlines' : commit.GetMinusCount() or '0',
'descid' : self.GetDescriptionID(commit.GetDescription()),
}
# Schema version 1 renamed the table from 'checkins' to 'commits'
commits_table = self._version >= 1 and 'commits' or 'checkins'
sql = "REPLACE INTO %s" % (commits_table)
sql = sql + \
" (type,ci_when,whoid,repositoryid,dirid,fileid,revision,"\
" stickytag,branchid,addedlines,removedlines,descid)"\
"VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
sql_args = (ci_type, ci_when, who_id, repository_id,
directory_id, file_id, revision, sticky_tag, branch_id,
plus_count, minus_count, description_id)
cursor = self.db.cursor()
try:
cursor.execute(sql, sql_args)
# MySQL-specific INSERT-or-UPDATE with ID retrieval
cursor.execute(
'INSERT INTO '+commits_table+'('+','.join(i for i in props)+') VALUES ('+
', '.join('%s' for i in props)+') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), '+
', '.join(i+'=VALUES('+i+')' for i in props),
tuple(props[i] for i in props)
)
commit_id = cursor.lastrowid
if self.index_content:
sphcur = self.sphinx.cursor()
content = commit.GetContent()
# Sphinx stores ci_when as a unix-timestamp attribute, not DATETIME
props['ci_when'] = str(int(commit.GetTime() or 0))
if len(content):
props['content'] = content
# Now, stored MIME type is only needed while searching
# It is guessed again when the file is displayed
props['mimetype'] = commit.GetMimeType()
props['id'] = str(commit_id)
# Drop MySQL-only columns that have no attribute in the Sphinx index
del props['addedlines']
del props['removedlines']
del props['descid']
del props['type']
sphcur.execute(
'INSERT INTO '+self.sphinx_index+'('+','.join(i for i in props)+') VALUES ('+
','.join('%s' for i in props)+')',
tuple(props[i] for i in props)
)
except Exception, e:
# NOTE(review): the multi-line `raise Exception(... % sql_args)` below is
# the REMOVED error report; the print+bare-raise after it replaced it.
raise Exception("Error adding commit: '%s'\n"
"Values were:\n"
"\ttype = %s\n"
"\tci_when = %s\n"
"\twhoid = %s\n"
"\trepositoryid = %s\n"
"\tdirid = %s\n"
"\tfileid = %s\n"
"\trevision = %s\n"
"\tstickytag = %s\n"
"\tbranchid = %s\n"
"\taddedlines = %s\n"
"\tremovedlines = %s\n"
"\tdescid = %s\n"
% ((str(e), ) + sql_args))
print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+
"\n".join(i+'='+str(props[i]) for i in props))
raise
def SQLQueryListString(self, field, query_entry_list):
sqlList = []
@ -414,6 +427,67 @@ class CheckinDatabase:
return "(%s)" % (string.join(sqlList, " OR "))
# Build an SQL "in_field IN (...)" condition by resolving a list of
# name-match entries (`lst`, QueryEntry objects) against the given lookup
# table: SELECT id_field FROM table WHERE <name_field matches lst>.
# Returns None when the list is empty or nothing matched, so callers can
# simply drop the condition.
def query_ids(self, in_field, table, id_field, name_field, lst):
if not len(lst):
return None
cond = self.SQLQueryListString(name_field, lst)
cursor = self.db.cursor()
cursor.execute('SELECT %s FROM %s WHERE %s' % (id_field, table, cond))
# Collect matched ids as strings for direct interpolation below
ids = list(str(row[0]) for row in cursor)
if not len(ids):
return None
return "%s IN (%s)" % (in_field, ','.join(ids))
def CreateSphinxQueryString(self, query):
    """Build a SphinxQL SELECT for full-text search over indexed file content.

    Name-based filters (repository, branch, dir, file, author, comment) are
    first resolved to numeric ids in MySQL via query_ids(), then applied as
    attribute conditions on the Sphinx realtime index. Returns the complete
    SELECT statement as a string.
    """
    condList = [
        'MATCH(%s)' % (self.db.literal(query.content_query), ),
        self.query_ids('repositoryid', 'repositories', 'id', 'repository', query.repository_list),
        self.query_ids('branchid', 'branches', 'id', 'branch', query.branch_list),
        self.query_ids('dirid', 'dirs', 'id', 'dir', query.directory_list),
        self.query_ids('fileid', 'files', 'id', 'file', query.file_list),
        # BUG FIX: the Sphinx index stores the author id in attribute 'whoid'
        # (see AddCommit and the rt_attr_uint list in the [cvsdb] sample
        # config), not 'authorid' -- the old name made author filters fail.
        self.query_ids('whoid', 'people', 'id', 'who', query.author_list),
        # NOTE(review): 'descid' is not among the documented rt_attr_uint
        # fields and AddCommit deletes it before the Sphinx INSERT -- verify
        # this filter actually works against your index.
        self.query_ids('descid', 'descs', 'id', 'description', query.comment_list),
    ]
    if len(query.revision_list):
        condList.append("revision IN ("+','.join(self.db.literal(s) for s in query.revision_list)+")")
    # Date range filters use unix timestamps (rt_attr_timestamp ci_when)
    if query.from_date:
        condList.append('ci_when>='+str(dbi.TicksFromDateTime(query.from_date)))
    if query.to_date:
        condList.append('ci_when<='+str(dbi.TicksFromDateTime(query.to_date)))
    # Date sorts keep relevance as the secondary key
    if query.sort == 'date':
        order_by = 'ORDER BY `ci_when` DESC, `relevance` DESC'
    elif query.sort == 'date_rev':
        order_by = 'ORDER BY `ci_when` ASC, `relevance` DESC'
    else: # /* if query.sort == 'relevance' */
        order_by = 'ORDER BY `relevance` DESC'
    # Unmatched/empty filters resolve to None and are dropped here
    conditions = string.join((i for i in condList if i), " AND ")
    conditions = conditions and "WHERE %s" % conditions
    ## limit the number of rows requested or we could really slam
    ## a server with a large database
    limit = ""
    if query.limit:
        limit = "LIMIT %s" % (str(query.limit))
    elif self._row_limit:
        limit = "LIMIT %s" % (str(self._row_limit))
    fields = "id `id`, WEIGHT() `relevance`, `content`, `mimetype`"
    return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit)
# Get commits by their IDs
def CreateIdQueryString(self, ids):
    # Build the MySQL query that fetches full commit rows (plus the resolved
    # repository / directory / file names) for an iterable of string ids.
    table = 'commits' if self._version >= 1 else 'checkins'
    id_list = ','.join(ids)
    sql = (
        'SELECT %(t)s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name'
        ' FROM %(t)s, repositories, dirs, files'
        ' WHERE %(t)s.id IN (%(ids)s) AND repositoryid=repositories.id'
        ' AND dirid=dirs.id AND fileid=files.id'
    ) % {'t': table, 'ids': id_list}
    return sql
def CreateSQLQueryString(self, query):
commits_table = self._version >= 1 and 'commits' or 'checkins'
fields = [
@ -435,6 +509,7 @@ class CheckinDatabase:
fields.append("%s) AS relevance" % temp)
else:
fields.append("'' AS relevance")
fields.append("'' AS snippet")
if len(query.repository_list):
temp = self.SQLQueryListString("repositories.repository",
@ -478,16 +553,18 @@ class CheckinDatabase:
temp = "(%s.ci_when<=\"%s\")" % (commits_table, str(query.to_date))
condList.append(temp)
if query.sort == "date":
order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
if query.sort == "relevance" and len(query.text_query):
order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
elif query.sort == "date_rev":
order_by = "ORDER BY %s.ci_when ASC,descid,%s.repositoryid" % (commits_table, commits_table)
elif query.sort == "author":
tableList.append(("people", "(%s.whoid=people.id)" % (commits_table)))
order_by = "ORDER BY people.who,descid,%s.repositoryid" % (commits_table)
elif query.sort == "file":
tableList.append(("files", "(%s.fileid=files.id)" % (commits_table)))
order_by = "ORDER BY files.file,descid,%s.repositoryid" % (commits_table)
elif query.sort == "relevance" and len(query.text_query):
order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
else: # /* if query.sort == "date": */
order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
## exclude duplicates from the table list, and split out join
## conditions from table names. In future, the join conditions
@ -528,19 +605,60 @@ class CheckinDatabase:
return True
def RunQuery(self, query):
sql = self.CreateSQLQueryString(query)
cursor = self.db.cursor()
cursor.execute(sql)
if len(query.content_query) and self.sphinx:
# Use Sphinx to search on document content
sql = self.CreateSphinxQueryString(query)
cursor = self.sphinx.cursor()
cursor.execute(sql)
sphinx_rows = list((str(docid), rel, content, mimetype) for docid, rel, content, mimetype in cursor)
if len(sphinx_rows):
# Fetch snippets
snippet_options = {
'around': 15,
'limit': 200,
'before_match': '<span style="color:red">',
'after_match': '</span>',
'chunk_separator': ' ... ',
}
preformatted_mime = 'text/(?!html|xml).*'
snippets = {}
bm_html = cgi.escape(snippet_options['before_match'])
am_html = cgi.escape(snippet_options['after_match'])
for docid, rel, content, mimetype in sphinx_rows:
cursor.execute(
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
(content, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
)
s, = cursor.fetchone()
s = cgi.escape(s)
if re.match(preformatted_mime, mimetype):
s = s.replace('\n', '<br />')
s = s.replace(bm_html, snippet_options['before_match'])
s = s.replace(am_html, snippet_options['after_match'])
snippets[docid] = s
# Fetch all fields from MySQL
sql = self.CreateIdQueryString((docid for (docid, _, _, _) in sphinx_rows))
cursor = self.db.cursor()
cursor.execute(sql)
byid = {}
for row in cursor:
byid[str(row[0])] = row
rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _, _) in sphinx_rows if docid in byid)
else:
rows = []
else:
# Use regular queries when document content is not searched
sql = self.CreateSQLQueryString(query)
cursor = self.db.cursor()
cursor.execute(sql)
rows = list(cursor)
while 1:
row = cursor.fetchone()
if not row:
break
(dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
# Convert rows to commit objects
for row in rows:
(dbId, dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
dbFileID, dbRevision, dbStickyTag, dbBranchID, dbAddedLines,
dbRemovedLines, dbDescID, dbRepositoryName, dbDirName,
dbFileName, dbRelevance) = row
dbFileName, dbRelevance, dbSnippet) = row
if not self.check_commit_access(dbRepositoryName, dbDirName, dbFileName, dbRevision):
continue
@ -564,6 +682,7 @@ class CheckinDatabase:
commit.SetMinusCount(dbRemovedLines)
commit.SetDescriptionID(dbDescID)
commit.SetRelevance(dbRelevance)
commit.SetSnippet(dbSnippet)
query.AddCommit(commit)
@ -623,39 +742,14 @@ class CheckinDatabase:
raise UnknownRepositoryError("Unknown repository '%s'"
% (repository))
if (self._version >= 1):
self.sql_delete('repositories', 'id', rep_id)
self.sql_purge('commits', 'repositoryid', 'id', 'repositories')
self.sql_purge('files', 'id', 'fileid', 'commits')
self.sql_purge('dirs', 'id', 'dirid', 'commits')
self.sql_purge('branches', 'id', 'branchid', 'commits')
self.sql_purge('descs', 'id', 'descid', 'commits')
self.sql_purge('people', 'id', 'whoid', 'commits')
else:
sql = "SELECT * FROM checkins WHERE repositoryid=%s"
sql_args = (rep_id, )
cursor = self.db.cursor()
cursor.execute(sql, sql_args)
checkins = []
while 1:
try:
(ci_type, ci_when, who_id, repository_id,
dir_id, file_id, revision, sticky_tag, branch_id,
plus_count, minus_count, description_id) = \
cursor.fetchone()
except TypeError:
break
checkins.append([file_id, dir_id, branch_id,
description_id, who_id])
#self.sql_delete('repositories', 'id', rep_id)
self.sql_delete('checkins', 'repositoryid', rep_id)
for checkin in checkins:
self.sql_delete('files', 'id', checkin[0], 'fileid')
self.sql_delete('dirs', 'id', checkin[1], 'dirid')
self.sql_delete('branches', 'id', checkin[2], 'branchid')
self.sql_delete('descs', 'id', checkin[3], 'descid')
self.sql_delete('people', 'id', checkin[4], 'whoid')
checkins_table = self._version >= 1 and 'commits' or 'checkins'
self.sql_delete('repositories', 'id', rep_id)
self.sql_purge(checkins_table, 'repositoryid', 'id', 'repositories')
self.sql_purge('files', 'id', 'fileid', checkins_table)
self.sql_purge('dirs', 'id', 'dirid', checkins_table)
self.sql_purge('branches', 'id', 'branchid', checkins_table)
self.sql_purge('descs', 'id', 'descid', checkins_table)
self.sql_purge('people', 'id', 'whoid', checkins_table)
# Reset all internal id caches. We could be choosier here,
# but let's just be as safe as possible.
@ -690,8 +784,13 @@ class Commit:
self.__minuscount = ''
self.__description = ''
self.__relevance = ''
self.__snippet = ''
self.__gmt_time = 0.0
self.__type = Commit.CHANGE
self.__content = ''
self.__mimetype = ''
self.__base_path = ''
self.__base_rev = ''
def SetRepository(self, repository):
self.__repository = repository
@ -758,12 +857,19 @@ class Commit:
def GetDescription(self):
return self.__description
# Relevance and snippet are used when querying commit database
# relevance: search-engine ranking value; snippet: HTML excerpt of the
# matched content (built in RunQuery for Sphinx content searches)
def SetRelevance(self, relevance):
self.__relevance = relevance
def GetRelevance(self):
return self.__relevance
def SetSnippet(self, snippet):
self.__snippet = snippet
def GetSnippet(self):
return self.__snippet
def SetTypeChange(self):
self.__type = Commit.CHANGE
@ -784,66 +890,80 @@ class Commit:
elif self.__type == Commit.REMOVE:
return 'Remove'
# File content (extracted text), optional, indexed with Sphinx
def SetContent(self, content):
self.__content = content
def GetContent(self):
return self.__content
# MIME type, optional, now only stored in Sphinx
# (it is guessed again when the file is displayed)
def SetMimeType(self, mimetype):
self.__mimetype = mimetype
def GetMimeType(self):
return self.__mimetype
## LazyCommit overrides a few methods of Commit to only retrieve
## it's properties as they are needed
class LazyCommit(Commit):
def __init__(self, db):
Commit.__init__(self)
self.__db = db
def __init__(self, db):
Commit.__init__(self)
self.__db = db
def SetFileID(self, dbFileID):
self.__dbFileID = dbFileID
def SetFileID(self, dbFileID):
self.__dbFileID = dbFileID
def GetFileID(self):
return self.__dbFileID
def GetFileID(self):
return self.__dbFileID
def GetFile(self):
return self.__db.GetFile(self.__dbFileID)
def GetFile(self):
return self.__db.GetFile(self.__dbFileID)
def SetDirectoryID(self, dbDirID):
self.__dbDirID = dbDirID
def SetDirectoryID(self, dbDirID):
self.__dbDirID = dbDirID
def GetDirectoryID(self):
return self.__dbDirID
def GetDirectoryID(self):
return self.__dbDirID
def GetDirectory(self):
return self.__db.GetDirectory(self.__dbDirID)
def GetDirectory(self):
return self.__db.GetDirectory(self.__dbDirID)
def SetRepositoryID(self, dbRepositoryID):
self.__dbRepositoryID = dbRepositoryID
def SetRepositoryID(self, dbRepositoryID):
self.__dbRepositoryID = dbRepositoryID
def GetRepositoryID(self):
return self.__dbRepositoryID
def GetRepositoryID(self):
return self.__dbRepositoryID
def GetRepository(self):
return self.__db.GetRepository(self.__dbRepositoryID)
def GetRepository(self):
return self.__db.GetRepository(self.__dbRepositoryID)
def SetAuthorID(self, dbAuthorID):
self.__dbAuthorID = dbAuthorID
def SetAuthorID(self, dbAuthorID):
self.__dbAuthorID = dbAuthorID
def GetAuthorID(self):
return self.__dbAuthorID
def GetAuthorID(self):
return self.__dbAuthorID
def GetAuthor(self):
return self.__db.GetAuthor(self.__dbAuthorID)
def GetAuthor(self):
return self.__db.GetAuthor(self.__dbAuthorID)
def SetBranchID(self, dbBranchID):
self.__dbBranchID = dbBranchID
def SetBranchID(self, dbBranchID):
self.__dbBranchID = dbBranchID
def GetBranchID(self):
return self.__dbBranchID
def GetBranchID(self):
return self.__dbBranchID
def GetBranch(self):
return self.__db.GetBranch(self.__dbBranchID)
def GetBranch(self):
return self.__db.GetBranch(self.__dbBranchID)
def SetDescriptionID(self, dbDescID):
self.__dbDescID = dbDescID
def SetDescriptionID(self, dbDescID):
self.__dbDescID = dbDescID
def GetDescriptionID(self):
return self.__dbDescID
def GetDescriptionID(self):
return self.__dbDescID
def GetDescription(self):
return self.__db.GetDescription(self.__dbDescID)
def GetDescription(self):
return self.__db.GetDescription(self.__dbDescID)
## QueryEntry holds data on one match-type in the SQL database
## match is one of: "exact", "like", "glob", "regex", or "notregex"
@ -859,7 +979,7 @@ class CheckinDatabaseQuery:
## sorting
self.sort = "date"
## repository to query
## repository, branch, etc to query
self.repository_list = []
self.branch_list = []
self.directory_list = []
@ -867,7 +987,11 @@ class CheckinDatabaseQuery:
self.revision_list = []
self.author_list = []
self.comment_list = []
## text_query = Fulltext query on comments
## content_query = Fulltext query on content
self.text_query = ""
self.content_query = ""
## date range in DBI 2.0 timedate objects
self.from_date = None
@ -886,6 +1010,9 @@ class CheckinDatabaseQuery:
def SetTextQuery(self, query):
self.text_query = query
def SetContentQuery(self, query):
self.content_query = query
def SetRepository(self, repository, match = "exact"):
self.repository_list.append(QueryEntry(repository, match))
@ -953,9 +1080,23 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0):
else:
user = cfg.cvsdb.user
passwd = cfg.cvsdb.passwd
db = CheckinDatabase(cfg.cvsdb.host, cfg.cvsdb.port, cfg.cvsdb.socket, user, passwd,
cfg.cvsdb.database_name, cfg.cvsdb.row_limit, cfg.cvsdb.fulltext_min_relevance,
authorizer)
db = CheckinDatabase(
host = cfg.cvsdb.host,
port = cfg.cvsdb.port,
socket = cfg.cvsdb.socket,
user = user,
passwd = passwd,
database = cfg.cvsdb.database_name,
row_limit = cfg.cvsdb.row_limit,
min_relevance = cfg.cvsdb.fulltext_min_relevance,
authorizer = authorizer,
index_content = cfg.cvsdb.index_content,
sphinx_host = cfg.cvsdb.sphinx_host,
sphinx_port = int(cfg.cvsdb.sphinx_port),
sphinx_socket = cfg.cvsdb.sphinx_socket,
sphinx_index = cfg.cvsdb.sphinx_index,
cfg = cfg,
)
db.Connect()
return db

View File

@ -1,4 +1,3 @@
#
# Copyright (C) 1999-2009 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
@ -68,7 +67,6 @@ docroot_magic_path = '*docroot*'
viewcvs_mime_type = 'text/vnd.viewcvs-markup'
alt_mime_type = 'text/x-cvsweb-markup'
view_roots_magic = '*viewroots*'
magic_buf_size = 4096
default_mime_type = 'application/octet-stream'
# Put here the variables we need in order to hold our state - they
@ -121,9 +119,8 @@ class Request:
# check for an authenticated username
self.username = server.getenv('REMOTE_USER')
# construct MIME magic
self.ms = None
self.ms_fail = 0
# repository object cache
self.all_repos = {}
# if we allow compressed output, see if the client does too
self.gzip_compress_level = 0
@ -134,6 +131,9 @@ class Request:
string.split(http_accept_encoding, ","))):
self.gzip_compress_level = 9 # make this configurable?
def utf8(self, value):
return self.cfg.guesser().utf8(value)
def create_repos(self, rootname):
if not rootname:
return None
@ -677,7 +677,7 @@ def _validate_mimetype(value):
return value in (viewcvs_mime_type, alt_mime_type, 'text/plain')
# obvious things here. note that we don't need uppercase for alpha.
_re_validate_alpha = re.compile('^[a-z]+$')
_re_validate_alpha = re.compile('^[a-z_]+$')
_re_validate_number = re.compile('^[0-9]+$')
_re_validate_boolint = re.compile('^[01]$')
@ -743,6 +743,7 @@ _legal_params = {
'who_match' : _re_validate_alpha,
'comment' : None,
'comment_match' : _re_validate_alpha,
'search_content': None,
'querysort' : _re_validate_alpha,
'date' : _re_validate_alpha,
'hours' : _re_validate_number,
@ -988,7 +989,7 @@ def nav_path(request):
is_last = len(path_parts) == len(request.path_parts)
if request.roottype == 'cvs':
item = _item(name=cvsdb.utf8string(part), href=None)
item = _item(name=request.utf8(part), href=None)
else:
item = _item(name=part, href=None)
@ -1248,7 +1249,7 @@ def common_template_data(request, revision=None, mime_type=None):
cfg = request.cfg
where = request.where
if request.roottype == 'cvs':
where = cvsdb.utf8string(where)
where = request.utf8(where)
where = request.server.escape(where)
# Initialize data dictionary members (sorted alphanumerically)
@ -1444,28 +1445,31 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
get_lexer_for_mimetype, \
get_lexer_for_filename
from pygments.lexers._mapping import LEXERS
# Hack for shell mime types:
LEXERS['BashLexer'] = ('pygments.lexers.other', 'Bash', ('bash', 'sh'), ('*.sh',), ('application/x-sh', 'application/x-shellscript', 'text/x-sh', 'text/x-shellscript'))
encoding = 'guess'
if cfg.options.detect_encoding:
try:
import chardet
encoding = 'chardet'
except (SyntaxError, ImportError):
pass
try:
lexer = get_lexer_for_mimetype(mime_type,
encoding=encoding,
encoding='utf-8',
stripnl=False)
except ClassNotFound:
try:
lexer = get_lexer_for_filename(filename,
encoding=encoding,
encoding='utf-8',
stripnl=False)
except ClassNotFound:
use_pygments = 0
except ImportError:
use_pygments = 0
# Detect encoding by calling chardet ourselves,
# to support it in non-highlighting mode
content = fp.read()
c, encoding = cfg.guesser().guess_charset(content)
if encoding:
content = c
else:
encoding = 'unknown'
# If we aren't going to be highlighting anything, just return the
# BLAME_SOURCE. If there's no blame_source, we'll generate a fake
# one from the file contents we fetch with PATH and REV.
@ -1475,11 +1479,7 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
else:
lines = []
line_no = 0
while 1:
line = fp.readline()
if not line:
break
line = cvsdb.utf8string(line)
for line in content.split('\n'):
line_no = line_no + 1
item = vclib.Annotation(cgi.escape(line), line_no,
None, None, None, None)
@ -1508,19 +1508,11 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
self.blame_data.append(item)
self.line_no = self.line_no + 1
ps = PygmentsSink(blame_source)
fpd = fp.read()
try:
fpdat = unicode(fpd,'utf-8')
except:
try:
fpdat = unicode(fpd,'cp1251')
except:
fpdat = fpd
highlight(fpdat, lexer,
highlight(content, lexer,
HtmlFormatter(nowrap=True,
classprefix='pygments-',
encoding='utf-8'), ps)
return ps.blame_data
return ps.blame_data, encoding
def make_time_string(date, cfg):
"""Returns formatted date string in either local time or UTC.
@ -1594,6 +1586,7 @@ def calculate_mime_type(request, path_parts, rev):
return mime_type
except:
pass
# FIXME rewrite to use viewvcmagic
return guess_mime(path_parts[-1])
def markup_or_annotate(request, is_annotate):
@ -1605,21 +1598,12 @@ def markup_or_annotate(request, is_annotate):
mime_type = calculate_mime_type(request, path, rev)
if not mime_type or mime_type == default_mime_type:
if request.ms is None and not request.ms_fail:
try:
import magic
request.ms = magic.open(magic.MAGIC_NONE | magic.MAGIC_MIME)
request.ms.load()
except:
request.ms_fail = 1
if request.ms:
try:
fp, revision = request.repos.openfile(path, rev)
buffer = fp.read(magic_buf_size)
fp.close()
mime_type = request.ms.buffer(buffer)
except:
pass
try:
fp, revision = request.repos.openfile(path, rev)
mime_type = request.cfg.guesser().guess_mime(None, None, fp)
fp.close()
except:
raise
# Is this a binary type?
if is_binary(request.cfg, mime_type):
@ -1657,9 +1641,10 @@ def markup_or_annotate(request, is_annotate):
if check_freshness(request, None, revision, weak=1):
fp.close()
return
lines = markup_stream_pygments(request, cfg, blame_source, fp,
path[-1], mime_type)
lines, charset = markup_stream_pygments(request, cfg, blame_source, fp, path[-1], mime_type)
fp.close()
if mime_type.find(';') < 0:
mime_type = mime_type+'; charset='+charset
data = common_template_data(request, revision)
data.merge(ezt.TemplateData({
@ -1910,7 +1895,7 @@ def view_directory(request):
row.short_log = format_log(file.log, cfg)
row.log = htmlify(file.log, cfg.options.mangle_email_addresses)
row.lockinfo = file.lockinfo
row.name = request.server.escape(cvsdb.utf8string(file.name))
row.name = request.server.escape(request.utf8(file.name))
row.anchor = row.name
row.pathtype = (file.kind == vclib.FILE and 'file') or \
(file.kind == vclib.DIR and 'dir')
@ -2285,7 +2270,7 @@ def view_log(request):
entry.ago = html_time(request, rev.date, 1)
entry.log = rev.log or ""
if cvs:
entry.log = cvsdb.utf8string(entry.log)
entry.log = request.utf8(entry.log)
entry.log = htmlify(entry.log, cfg.options.mangle_email_addresses)
entry.size = rev.size
entry.lockinfo = rev.lockinfo
@ -2867,7 +2852,7 @@ class DiffSource:
diff_code = line[0]
output = self._format_text(line[1:])
output = cvsdb.utf8string(output)
output = self.cfg.guesser().utf8(output)
if diff_code == '+':
if self.state == 'dump':
@ -3644,6 +3629,7 @@ def view_queryform(request):
'who_match' : request.query_dict.get('who_match', 'exact'),
'comment' : request.query_dict.get('comment', ''),
'comment_match' : request.query_dict.get('comment_match', 'fulltext'),
'search_content' : request.query_dict.get('search_content', ''),
'querysort' : request.query_dict.get('querysort', 'date'),
'date' : request.query_dict.get('date', 'hours'),
'hours' : request.query_dict.get('hours', '2'),
@ -3653,6 +3639,7 @@ def view_queryform(request):
'query_hidden_values' : query_hidden_values,
'limit_changes' : limit_changes,
'dir_href' : dir_href,
'enable_search_content' : request.cfg.cvsdb.index_content,
}))
generate_page(request, "query_form", data)
@ -3791,7 +3778,8 @@ def build_commit(request, files, max_files, dir_strip, format):
plus_count = 0
minus_count = 0
found_unreadable = 0
all_repos = {}
if not request.all_repos:
request.all_repos = {}
for f in files:
dirname = f.GetDirectory()
@ -3810,17 +3798,19 @@ def build_commit(request, files, max_files, dir_strip, format):
# Check path access (since the commits database logic bypasses the
# vclib layer and, thus, the vcauth stuff that layer uses).
my_repos = all_repos.get(f.GetRepository(), '')
my_repos = request.all_repos.get(f.GetRepository(), '')
if not my_repos:
try:
my_repos = all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
my_repos = request.all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
except:
my_repos = None
if not my_repos:
return None
if my_repos['roottype'] == 'cvs':
try: where = unicode(where,'utf-8')
# we store UTF-8 in the DB
try: where = where.decode('utf-8')
except: pass
# FIXME maybe store "real" filesystem path in the DB instead of having such setting?
try: where = where.encode(cfg.options.cvs_ondisk_charset)
except: pass
path_parts = _path_parts(where)
@ -3907,24 +3897,27 @@ def build_commit(request, files, max_files, dir_strip, format):
if max_files and num_allowed > max_files:
continue
commit_files.append(_item(date=commit_time,
dir=request.server.escape(dirname),
file=request.server.escape(filename),
author=request.server.escape(f.GetAuthor()),
rev=rev,
branch=f.GetBranch(),
plus=plus,
minus=minus,
type=change_type,
dir_href=dir_href,
log_href=log_href,
view_href=view_href,
download_href=download_href,
prefer_markup=prefer_markup,
diff_href=diff_href,
root=my_repos,
path=where,
path_prev=path_prev))
commit_files.append(_item(
date=commit_time,
dir=request.server.escape(dirname),
file=request.server.escape(filename),
author=request.server.escape(f.GetAuthor()),
rev=rev,
branch=f.GetBranch(),
plus=plus,
minus=minus,
type=change_type,
snippet=f.GetSnippet(),
dir_href=dir_href,
log_href=log_href,
view_href=view_href,
download_href=download_href,
prefer_markup=prefer_markup,
diff_href=diff_href,
root=my_repos,
path=where,
path_prev=path_prev,
))
# No files survived authz checks? Let's just pretend this
# little commit didn't happen, shall we?
@ -4115,6 +4108,7 @@ def view_query(request):
who_match = request.query_dict.get('who_match', 'exact')
comment = request.query_dict.get('comment', '')
comment_match = request.query_dict.get('comment_match', 'fulltext')
search_content = request.query_dict.get('search_content', '')
querysort = request.query_dict.get('querysort', 'date')
date = request.query_dict.get('date', 'hours')
hours = request.query_dict.get('hours', '2')
@ -4126,7 +4120,7 @@ def view_query(request):
cfg.options.limit_changes))
match_types = { 'exact':1, 'like':1, 'glob':1, 'regex':1, 'notregex':1 }
sort_types = { 'date':1, 'author':1, 'file':1 }
sort_types = { 'date':1, 'date_rev':1, 'author':1, 'file':1, 'relevance':1 }
date_types = { 'hours':1, 'day':1, 'week':1, 'month':1,
'all':1, 'explicit':1 }
@ -4193,6 +4187,8 @@ def view_query(request):
query.SetComment(comment, comment_match)
else:
query.SetTextQuery(comment)
if search_content:
query.SetContentQuery(search_content)
query.SetSortMethod(querysort)
if date == 'hours':
query.SetFromDateHoursAgo(int(hours))

70
lib/viewvcmagic.py Normal file
View File

@ -0,0 +1,70 @@
#!/usr/bin/python
"""Content-type and charset guessing helpers for ViewVC.

MIME types are guessed from an explicit hint, the filename extension,
or libmagic content sniffing (in that order).  Charsets are guessed with
chardet when available, otherwise by trying UTF-8 and then a configured
fallback list of encodings.
"""

import mimetypes

# chardet is optional; without it we fall back to the configured
# encoding list in guess_charset().
have_chardet = 0
try:
    import chardet
    have_chardet = 1
except ImportError:
    pass


class ContentMagic:

    def __init__(self, encodings):
        """encodings: colon-separated fallback charset list, e.g. "utf-8:cp1251"."""
        self.encodings = encodings.split(':')
        self.mime_magic = None
        # Failures to initialize libmagic are remembered here for diagnostics
        # instead of being raised, so the rest of ViewVC keeps working.
        self.errors = []
        try:
            import magic
            self.mime_magic = magic.open(magic.MAGIC_MIME_TYPE)
            self.mime_magic.load()
        except Exception as e:
            self.errors.append(e)

    def guess_mime(self, mime, filename, tempfile):
        """Return a MIME type for the content.

        mime:     caller-supplied type hint; 'application/octet-stream' is
                  treated as "unknown" and discarded.
        filename: used for extension-based guessing via mimetypes.
        tempfile: either a filesystem path (string) or a file-like object;
                  used for libmagic content sniffing as a last resort.
        May return None/'' when nothing could be determined.
        """
        if mime == 'application/octet-stream':
            mime = ''
        if not mime and filename:
            mime = mimetypes.guess_type(filename)[0]
        if not mime and tempfile and self.mime_magic:
            if type(tempfile) == type(''):
                mime = self.mime_magic.file(tempfile)
            else:
                # Sniff only the first 4 KB; enough for magic detection.
                c = tempfile.read(4096)
                mime = self.mime_magic.buffer(c)
        return mime

    def guess_charset(self, content):
        """Return (unicode_content, charset).

        charset is None when no encoding worked; in that case content is
        returned unchanged (still a byte string).
        """
        charset = None
        if have_chardet:
            try:
                guess = chardet.detect(content)
                # BUG FIX: the original assigned detect()'s result dict to
                # `charset` itself; when detection yielded no encoding the
                # truthy dict survived, the `charset is None` fallback below
                # was skipped, and the raw dict leaked to callers.
                if guess and guess['encoding']:
                    charset = guess['encoding']
                    content = content.decode(charset)
            except Exception:
                charset = None
        else:
            # No chardet: optimistically try UTF-8 first.
            charset = 'utf-8'
            try:
                content = content.decode('utf-8')
            except Exception:
                charset = None
        # Primitive fallback: first configured encoding that decodes wins.
        if charset is None:
            for charset in self.encodings:
                try:
                    content = content.decode(charset)
                    break
                except Exception:
                    charset = None
        return (content, charset)

    def utf8(self, content):
        """Guess the charset of content and re-encode it as UTF-8 bytes.

        If no charset could be guessed, content is returned untouched.
        """
        (uni, charset) = self.guess_charset(content)
        if charset:
            return uni.encode('utf-8')
        return content

View File

@ -144,7 +144,7 @@ Browse Directory</a></p>
<tr>
<th style="text-align:right;vertical-align:top;">Comment:</th>
<td>
<input type="text" name="comment" value="[comment]" /><br />
<input type="text" name="comment" value="[comment]" size="40" /><br />
<label for="comment_match_exact">
<input type="radio" name="comment_match" id="comment_match_fulltext"
value="fulltext" [is comment_match "fulltext"]checked=""[end] />
@ -172,13 +172,21 @@ Browse Directory</a></p>
</label>
</td>
</tr>
[if-any enable_search_content]
<tr>
<th style="text-align:right;vertical-align:top;">Search content:</th>
<td><input type="text" name="search_content" value="[search_content]" size="60" /></td>
</tr>
[end]
<tr>
<th style="text-align:right;vertical-align:top;">Sort By:</th>
<td>
<select name="querysort">
<option value="date" [is querysort "date"]selected="selected"[end]>Date</option>
<option value="date_rev" [is querysort "date_rev"]selected="selected"[end]>Date (oldest first)</option>
<option value="author" [is querysort "author"]selected="selected"[end]>Author</option>
<option value="file" [is querysort "file"]selected="selected"[end]>File</option>
<option value="relevance" [is querysort "relevance"]selected="selected"[end]>Relevance</option>
</select>
</td>
</tr>

View File

@ -46,15 +46,18 @@
<tr class="vc_row_[if-index commits even]even[else]odd[end]">
<td style="vertical-align: top;">
[define rev_href][if-any commits.files.prefer_markup][commits.files.view_href][else][if-any commits.files.download_href][commits.files.download_href][end][end][end]
[if-any commits.files.rev][if-any rev_href]<a href="[rev_href]">[end][commits.files.rev][if-any rev_href]</a>[end][else]&nbsp;[end]
[if-any commits.files.rev][if-any rev_href]<a href="[rev_href]">[end][commits.files.rev][if-any rev_href]</a>[end][else]&nbsp;[end]
</td>
<td style="vertical-align: top;">
<a href="[commits.files.dir_href]">[commits.files.dir]/</a>
<a href="[commits.files.log_href]">[commits.files.file]</a>
[if-any commits.files.snippet]
<div class="snippet">[commits.files.snippet]</div>
[end]
</td>
[if-any show_branch]
<td style="vertical-align: top;">
[if-any commits.files.branch][commits.files.branch][else]&nbsp;[end]
[if-any commits.files.branch][commits.files.branch][else]&nbsp;[end]
</td>
[end]
<td style="vertical-align: top;">
@ -68,10 +71,10 @@
[is commits.files.type "Remove"]</del>[end]
</td>
<td style="vertical-align: top;">
[if-any commits.files.date][commits.files.date][else]&nbsp;[end]
[if-any commits.files.date][commits.files.date][else]&nbsp;[end]
</td>
<td style="vertical-align: top;">
[if-any commits.files.author][commits.files.author][else]&nbsp;[end]
[if-any commits.files.author][commits.files.author][else]&nbsp;[end]
</td>
</tr>
[end]