diff --git a/bin/make-database b/bin/make-database
index b8775705..f538b837 100755
--- a/bin/make-database
+++ b/bin/make-database
@@ -44,6 +44,7 @@ CREATE TABLE branches (
DROP TABLE IF EXISTS checkins;
CREATE TABLE checkins (
+ id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
type enum('Change','Add','Remove'),
ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
whoid mediumint(9) DEFAULT '0' NOT NULL,
@@ -57,7 +58,7 @@ CREATE TABLE checkins (
removedlines int(11) DEFAULT '0' NOT NULL,
descid mediumint(9),
UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
- KEY repository_when (repositoryid,ci_when),
+ KEY repositoryid_when (repositoryid,ci_when),
KEY ci_when (ci_when),
KEY whoid (whoid,ci_when),
KEY dirid (dirid),
@@ -138,6 +139,7 @@ CREATE TABLE branches (
DROP TABLE IF EXISTS commits;
CREATE TABLE commits (
+ id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
type enum('Change','Add','Remove'),
ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
whoid mediumint(9) DEFAULT '0' NOT NULL,
@@ -151,9 +153,9 @@ CREATE TABLE commits (
removedlines int(11) DEFAULT '0' NOT NULL,
descid mediumint(9),
UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
+ KEY repositoryid_when (repositoryid,ci_when),
KEY ci_when (ci_when),
- KEY whoid (whoid),
- KEY repositoryid_2 (repositoryid),
+ KEY whoid (whoid,ci_when),
KEY dirid (dirid),
KEY fileid (fileid),
KEY branchid (branchid),
@@ -253,7 +255,7 @@ Options:
[Default: ViewVC]
--help Show this usage message.
-
+
--hostname=ARG Use ARG as the hostname for the MySQL connection.
[Default: localhost]
@@ -264,7 +266,7 @@ Options:
--version=ARG Create the database using the schema employed by
version ARG of ViewVC. Valid values are:
[ "1.0" ]
-
+
""" % (os.path.basename(sys.argv[0])))
if errmsg is not None:
stream.write("[ERROR] %s.\n" % (errmsg))
diff --git a/bin/svndbadmin b/bin/svndbadmin
index 6c40ad39..8d951e01 100755
--- a/bin/svndbadmin
+++ b/bin/svndbadmin
@@ -58,7 +58,11 @@ else:
import os
import string
+import socket
+import select
import re
+import mimetypes
+import time
import svn.core
import svn.repos
@@ -68,14 +72,20 @@ import svn.delta
import cvsdb
import viewvc
import vclib
+from viewvcmagic import ContentMagic
class SvnRepo:
"""Class used to manage a connection to a SVN repository."""
- def __init__(self, path):
+ def __init__(self, path, index_content = None, tika_client = None, guesser = None,
+ svn_ignore_mimetype = False):
self.path = path
self.repo = svn.repos.svn_repos_open(path)
self.fs = svn.repos.svn_repos_fs(self.repo)
self.rev_max = svn.fs.youngest_rev(self.fs)
+ self.index_content = index_content
+ self.tika_client = tika_client
+ self.guesser = guesser
+ self.svn_ignore_mimetype = svn_ignore_mimetype
def __getitem__(self, rev):
if rev is None:
rev = self.rev_max
@@ -128,6 +138,74 @@ def _get_diff_counts(diff_fp):
line = diff_fp.readline()
return plus, minus
+class TikaClient:
+ # Create tika client
+ def __init__(self, tika_server, mime_types):
+ self.tika_server = tika_server
+ self.mime_types = mime_types
+ self.addr = tika_server.split(':')
+        # Validate that the address has the 'host:port' form
+ if len(self.addr) != 2:
+ raise Exception('tika_server value is incorrect: \''+tika_server+'\', please use \'host:port\' format')
+ self.addr = (self.addr[0], int(self.addr[1]))
+ # Build regexp for MIME types
+ m = re.split('\s+', mime_types.strip())
+ self.mime_regexp = re.compile('|'.join('^'+re.escape(i).replace('\\*', '.*')+'$' for i in m))
+
+ # Extract text content from file using Tika which runs in server mode
+ def get_text(self, filename, mime_type, log_filename):
+ if not self.mime_regexp.match(mime_type):
+ # Tika can't handle this mime type, return nothing
+ return ''
+ fd = None
+ s = None
+ text = ''
+ fsize = 0
+ try:
+ # Read original file
+ fd = open(filename, 'rb')
+ data = fd.read()
+ fsize = len(data)
+ if not fsize:
+ return ''
+ # Connect to Tika
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ s.connect(self.addr)
+ s.setblocking(0)
+ sockfd = s.fileno()
+ # Tika is somewhat delicate about network IO, so:
+ # Read and write using poll(2) system call
+ p = select.poll()
+ p.register(sockfd)
+ while 1:
+ fds = p.poll()
+ if not fds:
+ break
+ (pollfd, event) = fds[0]
+ if event & select.POLLIN:
+ # Exception or empty data means EOF...
+ try: part = os.read(sockfd, 65536)
+ except: break
+ if not part: break
+ text += part
+ if event & select.POLLOUT:
+ if not len(data):
+ # Shutdown output and forget about POLLOUT
+ s.shutdown(socket.SHUT_WR)
+ p.modify(sockfd, select.POLLIN)
+ else:
+ # Write and consume some data
+ l = os.write(sockfd, data)
+ data = data[l:]
+ if len(text) == 0:
+ raise Exception('Empty response from Tika server')
+ print "Extracted %d bytes from %s (%s) of size %d" % (len(text), log_filename, mime_type, fsize)
+ except Exception, e:
+ print "Error extracting text from %s (%s) of size %d: %s" % (log_filename, mime_type, fsize, str(e))
+ finally:
+ if fd: fd.close()
+ if s: s.close()
+ return text
class SvnRev:
"""Class used to hold information about a particular revision of
@@ -151,7 +229,7 @@ class SvnRev:
# get a root for the current revisions
fsroot = self._get_root_for_rev(rev)
-
+
# find changes in the revision
editor = svn.repos.RevisionChangeCollector(repo.fs, rev)
e_ptr, e_baton = svn.delta.make_editor(editor)
@@ -168,7 +246,7 @@ class SvnRev:
base_root = None
if change.base_path:
base_root = self._get_root_for_rev(change.base_rev)
-
+
if not change.path:
action = 'remove'
elif change.added:
@@ -184,19 +262,53 @@ class SvnRev:
['-b', '-B'])
diff_fp = diffobj.get_pipe()
plus, minus = _get_diff_counts(diff_fp)
- # TODO Indexing file contents
- # For binary files: svn.fs.contents_changed(root1, path1, root2, path2)
- # Temp file with contents is at: diffobj.tempfile2
- # Apache Tika server may even be at another host!
# CustIS Bug 50473: a workaround for svnlib behaviour in file movements (FILE1 -> FILE2 + FILE1 -> null)
if change.base_path:
- if not change.path and changes_hash.get(change.base_path, '') != '':
+ if not change.path and change.base_path in changes_hash:
minus = 0
elif change.path:
changes_hash[change.base_path] = change.path
- self.changes.append((path, action, plus, minus))
+ content = ''
+ mime = ''
+ # need to check if binary file's content changed when copying,
+ # if not, don't extract it, just get it from previous revision later
+ if repo.index_content and change.path and (not change.base_path
+ or svn.fs.contents_changed(
+ base_root and base_root or None,
+ base_root and change.base_path or None,
+ fsroot, change.path
+ )):
+ props = svn.fs.node_proplist(fsroot, change.path)
+ if not repo.svn_ignore_mimetype:
+ mime = props.get('svn:mime-type', None)
+ else:
+ mime = None
+ mime = repo.guesser.guess_mime(
+ mime,
+ os.path.basename(change.path),
+ diffobj.tempfile2
+ )
+ # Read and guess charset by ourselves for text files
+ if mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
+ try:
+ fd = open(diffobj.tempfile2, 'rb')
+ content = fd.read()
+ fd.close()
+ except: pass
+ # Guess charset
+ if content:
+ content, charset = repo.guesser.guess_charset(content)
+ if charset:
+ content = content.encode('utf-8')
+ print 'Guessed %s for %s' % (charset, change.path)
+ else:
+ print 'Failed to guess charset for %s, not indexing' % (change.path, )
+ # Try to extract content using Tika from binary documents
+ elif repo.tika_client:
+ content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
+ self.changes.append((path, action, plus, minus, content, mime))
def _get_root_for_rev(self, rev):
"""Fetch a revision root from a cache of such, or a fresh root
@@ -217,7 +329,7 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
if verbose: print "skipped (no changes)."
return
- for (path, action, plus, minus) in revision.changes:
+ for (path, action, plus, minus, content, mime) in revision.changes:
directory, file = os.path.split(path)
commit = cvsdb.CreateCommit()
commit.SetRepository(repo.path)
@@ -230,6 +342,8 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
commit.SetPlusCount(plus)
commit.SetMinusCount(minus)
commit.SetBranch(None)
+ commit.SetContent(content)
+ commit.SetMimeType(mime)
if action == 'add':
commit.SetTypeAdd()
@@ -268,7 +382,16 @@ def main(command, repository, revs=[], verbose=0, force=0):
sys.stderr.write("ERROR: " + str(e) + "\n")
sys.exit(1)
- repo = SvnRepo(repository)
+ tika_client = None
+ if cfg.utilities.tika_server:
+ tika_client = TikaClient(cfg.utilities.tika_server, cfg.utilities.tika_mime_types)
+ repo = SvnRepo(
+ path = repository,
+ index_content = cfg.cvsdb.index_content,
+ tika_client = tika_client,
+ guesser = cfg.guesser(),
+ svn_ignore_mimetype = cfg.options.svn_ignore_mimetype,
+ )
if command == 'rebuild' or (command == 'update' and not revs):
for rev in range(repo.rev_max+1):
handle_revision(db, command, repo, rev, verbose)
@@ -312,7 +435,7 @@ Usage: 1. %s [-v] rebuild REPOS-PATH
the database. If a range is specified, the revisions will be
processed in ascending order, and you may specify "HEAD" to
indicate "the youngest revision currently in the repository".
-
+
3. Purge information specific to the repository located at REPOS-PATH
from the database.
@@ -337,7 +460,7 @@ if __name__ == '__main__':
del args[index]
except ValueError:
pass
-
+
if len(args) < 3:
usage()
diff --git a/conf/viewvc.conf.dist b/conf/viewvc.conf.dist
index 3a26a4fe..0a1017ec 100644
--- a/conf/viewvc.conf.dist
+++ b/conf/viewvc.conf.dist
@@ -1,6 +1,6 @@
#---------------------------------------------------------------------------
#
-# Configuration file for ViewVC
+# Configuration file for ViewVC (4IntraNet patched version)
#
# Information on ViewVC is located at the following web site:
# http://viewvc.org/
@@ -9,7 +9,7 @@
# THE FORMAT OF THIS CONFIGURATION FILE
#
-# This file is delineated by sections, specified in [brackets]. Within
+# This file is delineated by sections, specified in [brackets]. Within
# each section, are a number of configuration settings. These settings
# take the form of: name = value. Values may be continued on the
# following line by indenting the continued line.
@@ -17,14 +17,14 @@
# WARNING: Indentation *always* means continuation. Name=value lines
# should always start in column zero.
#
-# Comments should always start in column zero, and are identified
+# Comments should always start in column zero, and are identified
# with "#".
#
-# Certain configuration settings may have multiple values. These should
-# be separated by a comma. The settings where this is allowed are noted
+# Certain configuration settings may have multiple values. These should
+# be separated by a comma. The settings where this is allowed are noted
# below. Any other setting that requires special syntax is noted at that
# setting.
-#
+#
#
# SOME TERMINOLOGY USED HEREIN
#
@@ -50,10 +50,10 @@
# recommend you pay attention to. Of course, don't try to change the
# options here -- do so in the relevant section of the configuration
# file below.
-#
+#
# For correct operation, you will probably need to change the following
# configuration variables:
-#
+#
# cvs_roots (for CVS)
# svn_roots (for Subversion)
# root_parents (for CVS or Subversion)
@@ -62,18 +62,18 @@
# rcs_dir
# mime_types_file
# the many options in the [utilities] section
-#
+#
# It is usually desirable to change the following variables:
-#
+#
# address
# forbidden
-#
+#
# To optimize delivery of ViewVC static files:
-#
+#
# docroot
-#
+#
# To customize the display of ViewVC for your site:
-#
+#
# template_dir
# the [templates] override section
#
@@ -139,7 +139,7 @@ default_root = cvs
# provided only as a convenience for ViewVC installations which are
# using the default template set, where the value of this option will
# be displayed in the footer of every ViewVC page.)
-address =
+address =
#
# This option provides a mechanism for custom key/value pairs to be
@@ -244,21 +244,47 @@ cvsnt =
# See also bin/cvsnt-rcsfile-inetd.pl
-#rcsfile_socket = 'host:port'
-# Example: rcsfile_socket = '127.0.0.1:8071'
+#rcsfile_socket = host:port
+# Example: rcsfile_socket = 127.0.0.1:8071
# Subversion command-line client, used for viewing Subversion repositories
svn =
# svn = /usr/bin/svn
# GNU diff, used for showing file version differences
-diff =
+diff =
# diff = /usr/bin/diff
# CvsGraph, a graphical CVS version graph generator (see options.use_cvsgraph)
cvsgraph =
# cvsgraph = /usr/local/bin/cvsgraph
+# Apache Tika TCP server host and port, used to extract text from binary documents
+# Note that as of 2011-09-12, Tika 0.9 has a bug which leads to hangups on processing
+# MS Word documents in server mode. So you must use the fixed version, downloaded from:
+# http://wiki.4intra.net/public/tika-app-0.9-fix-TIKA709.jar
+# (mirror) http://code.google.com/p/mediawiki4intranet/downloads/detail?name=tika-app-0.9-fix-TIKA709.jar
+# Or apply the patch by yourself and rebuild Tika from source, see patch here:
+# https://issues.apache.org/jira/browse/TIKA-709
+# Tika server should be started with command 'java -jar tika-app-0.9.jar -p PORT -t -eutf-8'
+
+#tika_server = host:port
+# Example: tika_server = 127.0.0.1:8072
+
+# This lists MIME types that can be processed by Tika
+# You may change it if your Tika is newer than 0.9 and supports more formats
+# (note) *+xml examples: xhtml+xml, rss+xml, atom+xml, docbook+xml, rdf+xml
+tika_mime_types =
+ text/*
+ application/*+xml
+ application/xml
+ application/vnd.oasis.opendocument.*
+ application/vnd.openxmlformats
+ application/vnd.ms-*
+ application/msaccess
+ application/msword
+ application/pdf
+ application/rtf
#---------------------------------------------------------------------------
[options]
@@ -358,7 +384,7 @@ svn_ignore_mimetype = 0
# directory ViewVC should consult for various things, including cached
# remote authentication credentials. If unset, Subversion will use
# the default location(s) ($HOME/.subversion, etc.)
-svn_config_dir =
+svn_config_dir =
# use the rcsparse Python module to retrieve CVS repository
# information instead of invoking rcs utilities [EXPERIMENTAL]
@@ -494,12 +520,18 @@ short_log_len = 80
# should we colorize known file content syntaxes? (requires Pygments module)
enable_syntax_coloration = 1
+# detect_encoding: Should we attempt to detect versioned file
+# character encodings? [Requires 'chardet' module]
+# Used in file list, file content display and indexing
+# See also options.encodings for naive guessing.
+detect_encoding = 1
+
# Use CvsGraph. See http://www.akhphd.au.dk/~bertho/cvsgraph/ for
-# documentation and download.
+# documentation and download.
use_cvsgraph = 0
#use_cvsgraph = 1
-# Location of the customized cvsgraph configuration file.
+# Location of the customized cvsgraph configuration file.
cvsgraph_conf = cvsgraph.conf
#
@@ -544,6 +576,17 @@ use_pagesize = 0
# Set to 0 to disable the limit.
limit_changes = 100
+# You can also use primitive charset guessing instead of chardet (options.detect_encoding)
+# Just set this to the list of possible charsets in your repository.
+# ViewVC will simply try to decode content using each of them, and pick
+# the first which succeeds. UTF-8 is always tried automatically.
+#encodings = cp1251:iso-8859-1
+
+# This setting is also required for back-links from query results to files
+# in CVS, because CVS does not recode file names to UTF-8 as Subversion does.
+# Just set it to cp1251 if you work with your CVS from Windows.
+#cvs_ondisk_charset = cp1251
+
#---------------------------------------------------------------------------
[templates]
@@ -554,7 +597,7 @@ limit_changes = 100
# use a different template for a particular view, simply uncomment the
# appropriate option below and specify the currect location of the EZT
# template file you wish to use for that view.
-#
+#
# Templates are specified relative to the configured template
# directory (see the "template_dir" option), but absolute paths may
# also be used as well.
@@ -569,13 +612,13 @@ limit_changes = 100
#diff = diff.ezt
#directory = directory.ezt
### an alternative directory view
-#directory = dir_new.ezt
+#directory = dir_new.ezt
#error = error.ezt
#file = file.ezt
#graph = graph.ezt
#log = log.ezt
### a table-based alternative log view
-#log = log_table.ezt
+#log = log_table.ezt
#query = query.ezt
#query_form = query_form.ezt
#query_results = query_results.ezt
@@ -588,22 +631,51 @@ limit_changes = 100
# Set to 1 to enable the database integration feature, 0 otherwise.
enabled = 0
-# Database hostname and port.
+# Set to 1 to enable indexing of file contents using Sphinx and Tika
+index_content = 0
+
+# Database hostname, port, and socket
#host = localhost
#port = 3306
+# On Debian Linux, enable this:
+#socket = /var/run/mysqld/mysqld.sock
# ViewVC database name.
#database_name = ViewVC
# Username and password of user with read/write privileges to the ViewVC
# database.
-#user =
-#passwd =
+#user =
+#passwd =
# Username and password of user with read privileges to the ViewVC
# database.
-#readonly_user =
-#readonly_passwd =
+#readonly_user =
+#readonly_passwd =
+
+# ViewVC can use Sphinx (http://sphinxsearch.com) full-text search engine
+# to index file contents with full history and then search over them.
+# Also, Apache Tika console application can be used in TCP server mode to
+# add support for indexing binary documents (MS Word, PDF, etc.).
+# See tika_server in [utilities].
+# Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index.
+# Index must be created in sphinx.conf by hand and have the following fields:
+# rt_field = content
+# rt_attr_string = content
+# rt_attr_string = mimetype
+# rt_attr_timestamp = ci_when
+# rt_attr_uint = whoid
+# rt_attr_uint = repositoryid
+# rt_attr_uint = dirid
+# rt_attr_uint = fileid
+# rt_attr_uint = revision
+# rt_attr_uint = branchid
+
+# Sphinx connection parameters:
+#sphinx_host =
+#sphinx_port =
+#sphinx_socket = /var/run/sphinxql.sock
+#sphinx_index = viewvc
# Limit the number of rows returned by a given query to this number.
#row_limit = 1000
@@ -616,7 +688,7 @@ enabled = 0
# Check if the repository is found in the database before showing
# the query link and RSS feeds. Set to 1 to enable check.
-#
+#
# WARNING: Enabling this check adds the cost of a database connection
# and query to most ViewVC requests. If all your roots are represented
# in the commits database, or if you don't care about the creation of
@@ -640,7 +712,7 @@ enabled = 0
#
# ViewVC allows you to customize its configuration options for
# individual virtual hosts. You might, for example, wish to expose
-# all of your Subversion repositories at http://svn.yourdomain.com/viewvc/
+# all of your Subversion repositories at http://svn.yourdomain.com/viewvc/
# and all your CVS ones at http://cvs.yourdomain.com/viewvc/, with no
# cross-exposure. Using ViewVC's virtual host (vhost) configuration
# support, you can do this. Simply create two vhost configurations
@@ -671,7 +743,7 @@ enabled = 0
# gui = guiproject.yourdomain.*
#
# [vhost-libs/general]
-# cvs_roots =
+# cvs_roots =
# svn_roots = svnroot: /var/svn/libs-repos
# default_root = svnroot
#
@@ -680,7 +752,7 @@ enabled = 0
#
# [vhost-gui/general]
# cvs_roots = cvsroot: /var/cvs/guiproject
-# svn_roots =
+# svn_roots =
# default_root = cvsroot
#
@@ -697,7 +769,7 @@ enabled = 0
#
# Here is an example showing how to enable Subversion authz-based
# authorization for only the single root named "svnroot":
-#
+#
# [root-svnroot/options]
# authorizer = svnauthz
#
@@ -726,7 +798,7 @@ enabled = 0
#
# Tests are case-sensitive.
#
-# NOTE: Again, this is for the hiding of modules within repositories, *not*
+# NOTE: Again, this is for the hiding of modules within repositories, *not*
# for the hiding of repositories (roots) themselves.
#
# Some examples:
@@ -749,7 +821,7 @@ enabled = 0
# Allow "xml", forbid other modules starting with "x", and allow the rest:
# forbidden = !xml, x*, !*
#
-forbidden =
+forbidden =
#---------------------------------------------------------------------------
[authz-forbiddenre]
@@ -792,7 +864,7 @@ forbidden =
# Only allow visibility of HTML files and the directories that hold them:
# forbiddenre = !^([^/]+|.*(/|\.html))$
#
-forbiddenre =
+forbiddenre =
#---------------------------------------------------------------------------
[authz-svnauthz]
diff --git a/lib/config.py b/lib/config.py
index cdc04386..264d7ea4 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -24,6 +24,7 @@ import vclib.ccvs
import vclib.svn
import cvsdb
import viewvc
+from viewvcmagic import ContentMagic
#########################################################################
#
@@ -47,6 +48,7 @@ class Config:
'root_parents', 'allowed_views', 'mime_types_files')
def __init__(self):
+ self.__guesser = None
for section in self._sections:
setattr(self, section, _sub_config())
@@ -66,7 +68,6 @@ class Config:
if rootname:
self._process_root_options(self.parser, rootname)
self.expand_root_parents()
- cvsdb.setencs(self.options.encodings.split(':'))
r = {}
for i in self.rewritehtml.__dict__.keys():
if i[-8:] == '.replace':
@@ -201,7 +202,7 @@ class Config:
pass
else:
raise IllegalOverrideSection('root', section)
-
+
def overlay_root_options(self, rootname):
"Overly per-root options atop the existing option set."
if not self.conf_path:
@@ -217,7 +218,7 @@ class Config:
for option in parser.options(section):
d[option] = parser.get(section, option)
return d.items()
-
+
def get_authorizer_params(self, authorizer, rootname=None):
if not self.conf_path:
return {}
@@ -236,7 +237,12 @@ class Config:
params[key] = value
params['__config'] = self
return params
-
+
+ def guesser(self):
+ if not self.__guesser:
+ self.__guesser = ContentMagic(self.options.encodings)
+ return self.__guesser
+
def set_defaults(self):
"Set some default values in the configuration."
@@ -258,6 +264,8 @@ class Config:
self.utilities.svn = ''
self.utilities.diff = ''
self.utilities.cvsgraph = ''
+ self.utilities.tika_server = ''
+ self.utilities.tika_mime_types = ''
self.options.root_as_url_component = 1
self.options.checkout_magic = 0
@@ -302,7 +310,7 @@ class Config:
self.options.limit_changes = 100
self.options.cvs_ondisk_charset = 'cp1251'
self.options.binary_mime_re = '^(?!text/|.*\Wxml)'
- self.options.encodings = 'utf-8:cp1251:iso-8859-1'
+ self.options.encodings = 'cp1251:iso-8859-1'
self.templates.diff = None
self.templates.directory = None
@@ -316,6 +324,7 @@ class Config:
self.templates.roots = None
self.cvsdb.enabled = 0
+ self.cvsdb.index_content = 0
self.cvsdb.host = ''
self.cvsdb.port = 3306
self.cvsdb.socket = ''
@@ -323,12 +332,17 @@ class Config:
self.cvsdb.user = ''
self.cvsdb.passwd = ''
self.cvsdb.readonly_user = ''
- self.cvsdb.readonly_passwd = ''
+ self.cvsdb.readonly_passwd = ''
self.cvsdb.row_limit = 1000
self.cvsdb.rss_row_limit = 100
self.cvsdb.check_database_for_root = 0
self.cvsdb.fulltext_min_relevance = 0.2
+ self.cvsdb.sphinx_host = ''
+ self.cvsdb.sphinx_port = 3307
+ self.cvsdb.sphinx_socket = ''
+ self.cvsdb.sphinx_index = ''
+
def _startswith(somestr, substr):
return somestr[:len(substr)] == substr
diff --git a/lib/cvsdb.py b/lib/cvsdb.py
index e5e7a7e1..a614c5e0 100644
--- a/lib/cvsdb.py
+++ b/lib/cvsdb.py
@@ -15,6 +15,7 @@ import sys
import string
import time
import re
+import cgi
import vclib
import dbi
@@ -36,22 +37,12 @@ error = "cvsdb error"
## defined to actually be complete; it should run well off of any DBI 2.0
## complient database interface
-encs = [ "utf-8", "cp1251", "iso-8859-1" ]
-
-def utf8string(value):
- for e in encs:
- try:
- value = value.decode(e)
- break
- except: pass
- return value.encode("utf-8")
-
-def setencs(e):
- global encs
- encs = e
-
class CheckinDatabase:
- def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, authorizer = None):
+ def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
+ authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
+ sphinx_socket = None, sphinx_index = None):
+ self.cfg = cfg
+
self._host = host
self._port = port
self._socket = socket
@@ -63,11 +54,21 @@ class CheckinDatabase:
self._min_relevance = min_relevance
self.authorizer = authorizer
+ # Sphinx settings
+ self.index_content = index_content
+ self.sphinx_host = sphinx_host
+ self.sphinx_port = sphinx_port
+ self.sphinx_socket = sphinx_socket
+ self.sphinx_index = sphinx_index
+
## database lookup caches
self._get_cache = {}
self._get_id_cache = {}
self._desc_id_cache = {}
+ # Sphinx connection None by default
+ self.sphinx = None
+
def Connect(self):
self.db = dbi.connect(
self._host, self._port, self._socket, self._user, self._passwd, self._database)
@@ -83,12 +84,17 @@ class CheckinDatabase:
else:
self._version = 0
if self._version > CURRENT_SCHEMA_VERSION:
- raise DatabaseVersionError("Database version %d is newer than the "
- "last version supported by this "
- "software." % (self._version))
+ raise DatabaseVersionError("Database version %d is newer than the "
+ "last version supported by this "
+ "software." % (self._version))
+ if self.index_content:
+ self.sphinx = dbi.connect(self.sphinx_host, self.sphinx_port, self.sphinx_socket, '', '', '')
+
+ def utf8(self, value):
+ return self.cfg.guesser().utf8(value)
def sql_get_id(self, table, column, value, auto_set):
- value = utf8string(value)
+ value = self.utf8(value)
sql = "SELECT id FROM %s WHERE %s=%%s" % (table, column)
sql_args = (value, )
@@ -172,7 +178,7 @@ class CheckinDatabase:
temp2[id] = value
return value
-
+
def get_list(self, table, field_index):
sql = "SELECT * FROM %s" % (table)
cursor = self.db.cursor()
@@ -198,7 +204,7 @@ class CheckinDatabase:
break
list.append(row[0])
return list
-
+
def GetMetadataValue(self, name):
sql = "SELECT value FROM metadata WHERE name=%s"
sql_args = (name)
@@ -209,7 +215,7 @@ class CheckinDatabase:
except TypeError:
return None
return value
-
+
def SetMetadataValue(self, name, value):
assert(self._version > 0)
sql = "REPLACE INTO metadata (name, value) VALUES (%s, %s)"
@@ -222,7 +228,7 @@ class CheckinDatabase:
"\tname = %s\n"
"\tvalue = %s\n"
% (str(e), name, value))
-
+
def GetBranchID(self, branch, auto_set = 1):
return self.get_id("branches", "branch", branch, auto_set)
@@ -240,13 +246,13 @@ class CheckinDatabase:
def GetFile(self, id):
return self.get("files", "file", id)
-
+
def GetAuthorID(self, author, auto_set = 1):
return self.get_id("people", "who", author, auto_set)
def GetAuthor(self, id):
return self.get("people", "who", id)
-
+
def GetRepositoryID(self, repository, auto_set = 1):
return self.get_id("repositories", "repository", repository, auto_set)
@@ -257,7 +263,7 @@ class CheckinDatabase:
return self.get_list("repositories", repository)
def SQLGetDescriptionID(self, description, auto_set = 1):
- description = utf8string(description)
+ description = self.utf8(description)
## lame string hash, blame Netscape -JMP
hash = len(description)
@@ -330,7 +336,7 @@ class CheckinDatabase:
ci_when = cursor.fetchone()[0]
except TypeError:
return None
-
+
return dbi.TicksFromDateTime(ci_when)
def AddCommitList(self, commit_list):
@@ -338,48 +344,55 @@ class CheckinDatabase:
self.AddCommit(commit)
def AddCommit(self, commit):
- ci_when = dbi.DateTimeFromTicks(commit.GetTime() or 0.0)
- ci_type = commit.GetTypeString()
- who_id = self.GetAuthorID(commit.GetAuthor())
- repository_id = self.GetRepositoryID(commit.GetRepository())
- directory_id = self.GetDirectoryID(commit.GetDirectory())
- file_id = self.GetFileID(commit.GetFile())
- revision = commit.GetRevision()
- sticky_tag = "NULL"
- branch_id = self.GetBranchID(commit.GetBranch())
- plus_count = commit.GetPlusCount() or '0'
- minus_count = commit.GetMinusCount() or '0'
- description_id = self.GetDescriptionID(commit.GetDescription())
+ props = {
+ 'type' : commit.GetTypeString(),
+ 'ci_when' : dbi.DateTimeFromTicks(commit.GetTime() or 0.0),
+ 'whoid' : self.GetAuthorID(commit.GetAuthor()),
+ 'repositoryid' : self.GetRepositoryID(commit.GetRepository()),
+ 'dirid' : self.GetDirectoryID(commit.GetDirectory()),
+ 'fileid' : self.GetFileID(commit.GetFile()),
+ 'revision' : commit.GetRevision(),
+ 'branchid' : self.GetBranchID(commit.GetBranch()),
+ 'addedlines' : commit.GetPlusCount() or '0',
+ 'removedlines' : commit.GetMinusCount() or '0',
+ 'descid' : self.GetDescriptionID(commit.GetDescription()),
+ }
commits_table = self._version >= 1 and 'commits' or 'checkins'
- sql = "REPLACE INTO %s" % (commits_table)
- sql = sql + \
- " (type,ci_when,whoid,repositoryid,dirid,fileid,revision,"\
- " stickytag,branchid,addedlines,removedlines,descid)"\
- "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
- sql_args = (ci_type, ci_when, who_id, repository_id,
- directory_id, file_id, revision, sticky_tag, branch_id,
- plus_count, minus_count, description_id)
cursor = self.db.cursor()
try:
- cursor.execute(sql, sql_args)
+ # MySQL-specific INSERT-or-UPDATE with ID retrieval
+ cursor.execute(
+ 'INSERT INTO '+commits_table+'('+','.join(i for i in props)+') VALUES ('+
+ ', '.join('%s' for i in props)+') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), '+
+ ', '.join(i+'=VALUES('+i+')' for i in props),
+ tuple(props[i] for i in props)
+ )
+ commit_id = cursor.lastrowid
+ if self.index_content:
+ sphcur = self.sphinx.cursor()
+ content = commit.GetContent()
+ props['ci_when'] = str(int(commit.GetTime() or 0))
+ if len(content):
+ props['content'] = content
+ # Now, stored MIME type is only needed while searching
+ # It is guessed again when the file is displayed
+ props['mimetype'] = commit.GetMimeType()
+ props['id'] = str(commit_id)
+ del props['addedlines']
+ del props['removedlines']
+ del props['descid']
+ del props['type']
+ sphcur.execute(
+ 'INSERT INTO '+self.sphinx_index+'('+','.join(i for i in props)+') VALUES ('+
+ ','.join('%s' for i in props)+')',
+ tuple(props[i] for i in props)
+ )
except Exception, e:
- raise Exception("Error adding commit: '%s'\n"
- "Values were:\n"
- "\ttype = %s\n"
- "\tci_when = %s\n"
- "\twhoid = %s\n"
- "\trepositoryid = %s\n"
- "\tdirid = %s\n"
- "\tfileid = %s\n"
- "\trevision = %s\n"
- "\tstickytag = %s\n"
- "\tbranchid = %s\n"
- "\taddedlines = %s\n"
- "\tremovedlines = %s\n"
- "\tdescid = %s\n"
- % ((str(e), ) + sql_args))
+ print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+
+ "\n".join(i+'='+str(props[i]) for i in props))
+ raise
def SQLQueryListString(self, field, query_entry_list):
sqlList = []
@@ -414,6 +427,67 @@ class CheckinDatabase:
return "(%s)" % (string.join(sqlList, " OR "))
+ def query_ids(self, in_field, table, id_field, name_field, lst):
+ if not len(lst):
+ return None
+ cond = self.SQLQueryListString(name_field, lst)
+ cursor = self.db.cursor()
+ cursor.execute('SELECT %s FROM %s WHERE %s' % (id_field, table, cond))
+ ids = list(str(row[0]) for row in cursor)
+ if not len(ids):
+ return None
+ return "%s IN (%s)" % (in_field, ','.join(ids))
+
+ def CreateSphinxQueryString(self, query):
+ condList = [
+ 'MATCH(%s)' % (self.db.literal(query.content_query), ),
+ self.query_ids('repositoryid', 'repositories', 'id', 'repository', query.repository_list),
+ self.query_ids('branchid', 'branches', 'id', 'branch', query.branch_list),
+ self.query_ids('dirid', 'dirs', 'id', 'dir', query.directory_list),
+ self.query_ids('fileid', 'files', 'id', 'file', query.file_list),
+ self.query_ids('authorid', 'people', 'id', 'who', query.author_list),
+ self.query_ids('descid', 'descs', 'id', 'description', query.comment_list),
+ ]
+
+ if len(query.revision_list):
+ condList.append("revision IN ("+','.join(self.db.literal(s) for s in query.revision_list)+")")
+ if query.from_date:
+ condList.append('ci_when>='+str(dbi.TicksFromDateTime(query.from_date)))
+ if query.to_date:
+ condList.append('ci_when<='+str(dbi.TicksFromDateTime(query.to_date)))
+
+ if query.sort == 'date':
+ order_by = 'ORDER BY `ci_when` DESC, `relevance` DESC'
+ elif query.sort == 'date_rev':
+ order_by = 'ORDER BY `ci_when` ASC, `relevance` DESC'
+ else: # /* if query.sort == 'relevance' */
+ order_by = 'ORDER BY `relevance` DESC'
+
+ conditions = string.join((i for i in condList if i), " AND ")
+ conditions = conditions and "WHERE %s" % conditions
+
+ ## limit the number of rows requested or we could really slam
+ ## a server with a large database
+ limit = ""
+ if query.limit:
+ limit = "LIMIT %s" % (str(query.limit))
+ elif self._row_limit:
+ limit = "LIMIT %s" % (str(self._row_limit))
+
+ fields = "id `id`, WEIGHT() `relevance`, `content`, `mimetype`"
+
+ return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit)
+
+ # Get commits by their IDs
+ def CreateIdQueryString(self, ids):
+ commits_table = self._version >= 1 and 'commits' or 'checkins'
+ return (
+ 'SELECT %s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name'
+ ' FROM %s, repositories, dirs, files'
+ ' WHERE %s.id IN (%s) AND repositoryid=repositories.id'
+ ' AND dirid=dirs.id AND fileid=files.id' % (commits_table, commits_table, commits_table, ','.join(ids))
+ )
+
def CreateSQLQueryString(self, query):
commits_table = self._version >= 1 and 'commits' or 'checkins'
fields = [
@@ -427,7 +501,7 @@ class CheckinDatabase:
("dirs", "(%s.dirid=dirs.id)" % (commits_table)),
("files", "(%s.fileid=files.id)" % (commits_table))]
condList = []
-
+
if len(query.text_query):
tableList.append(("descs", "(descs.id=%s.descid)" % (commits_table)))
temp = "MATCH (descs.description) AGAINST (%s" % (self.db.literal(query.text_query))
@@ -435,6 +509,7 @@ class CheckinDatabase:
fields.append("%s) AS relevance" % temp)
else:
fields.append("'' AS relevance")
+ fields.append("'' AS snippet")
if len(query.repository_list):
temp = self.SQLQueryListString("repositories.repository",
@@ -478,16 +553,18 @@ class CheckinDatabase:
temp = "(%s.ci_when<=\"%s\")" % (commits_table, str(query.to_date))
condList.append(temp)
- if query.sort == "date":
- order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
+ if query.sort == "relevance" and len(query.text_query):
+ order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
+ elif query.sort == "date_rev":
+ order_by = "ORDER BY %s.ci_when ASC,descid,%s.repositoryid" % (commits_table, commits_table)
elif query.sort == "author":
tableList.append(("people", "(%s.whoid=people.id)" % (commits_table)))
order_by = "ORDER BY people.who,descid,%s.repositoryid" % (commits_table)
elif query.sort == "file":
tableList.append(("files", "(%s.fileid=files.id)" % (commits_table)))
order_by = "ORDER BY files.file,descid,%s.repositoryid" % (commits_table)
- elif query.sort == "relevance" and len(query.text_query):
- order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
+ else: # /* if query.sort == "date": */
+ order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
## exclude duplicates from the table list, and split out join
## conditions from table names. In future, the join conditions
@@ -517,7 +594,7 @@ class CheckinDatabase:
fields, tables, conditions, order_by, limit)
return sql
-
+
def check_commit_access(self, repos, dir, file, rev):
if self.authorizer:
rootname = repos.split('/')
@@ -528,19 +605,60 @@ class CheckinDatabase:
return True
def RunQuery(self, query):
- sql = self.CreateSQLQueryString(query)
- cursor = self.db.cursor()
- cursor.execute(sql)
+ if len(query.content_query) and self.sphinx:
+ # Use Sphinx to search on document content
+ sql = self.CreateSphinxQueryString(query)
+ cursor = self.sphinx.cursor()
+ cursor.execute(sql)
+ sphinx_rows = list((str(docid), rel, content, mimetype) for docid, rel, content, mimetype in cursor)
+ if len(sphinx_rows):
+ # Fetch snippets
+ snippet_options = {
+ 'around': 15,
+ 'limit': 200,
+                    'before_match': '<b>',
+                    'after_match': '</b>',
+ 'chunk_separator': ' ... ',
+ }
+ preformatted_mime = 'text/(?!html|xml).*'
+ snippets = {}
+ bm_html = cgi.escape(snippet_options['before_match'])
+ am_html = cgi.escape(snippet_options['after_match'])
+ for docid, rel, content, mimetype in sphinx_rows:
+ cursor.execute(
+ 'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
+ (content, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
+ )
+ s, = cursor.fetchone()
+ s = cgi.escape(s)
+ if re.match(preformatted_mime, mimetype):
+                        s = s.replace('\n', '<br />')
+ s = s.replace(bm_html, snippet_options['before_match'])
+ s = s.replace(am_html, snippet_options['after_match'])
+ snippets[docid] = s
+ # Fetch all fields from MySQL
+ sql = self.CreateIdQueryString((docid for (docid, _, _, _) in sphinx_rows))
+ cursor = self.db.cursor()
+ cursor.execute(sql)
+ byid = {}
+ for row in cursor:
+ byid[str(row[0])] = row
+ rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _, _) in sphinx_rows if docid in byid)
+ else:
+ rows = []
+ else:
+ # Use regular queries when document content is not searched
+ sql = self.CreateSQLQueryString(query)
+ cursor = self.db.cursor()
+ cursor.execute(sql)
+ rows = list(cursor)
- while 1:
- row = cursor.fetchone()
- if not row:
- break
-
- (dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
+ # Convert rows to commit objects
+ for row in rows:
+ (dbId, dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
dbFileID, dbRevision, dbStickyTag, dbBranchID, dbAddedLines,
dbRemovedLines, dbDescID, dbRepositoryName, dbDirName,
- dbFileName, dbRelevance) = row
+ dbFileName, dbRelevance, dbSnippet) = row
if not self.check_commit_access(dbRepositoryName, dbDirName, dbFileName, dbRevision):
continue
@@ -564,6 +682,7 @@ class CheckinDatabase:
commit.SetMinusCount(dbRemovedLines)
commit.SetDescriptionID(dbDescID)
commit.SetRelevance(dbRelevance)
+ commit.SetSnippet(dbSnippet)
query.AddCommit(commit)
@@ -623,46 +742,21 @@ class CheckinDatabase:
raise UnknownRepositoryError("Unknown repository '%s'"
% (repository))
- if (self._version >= 1):
- self.sql_delete('repositories', 'id', rep_id)
- self.sql_purge('commits', 'repositoryid', 'id', 'repositories')
- self.sql_purge('files', 'id', 'fileid', 'commits')
- self.sql_purge('dirs', 'id', 'dirid', 'commits')
- self.sql_purge('branches', 'id', 'branchid', 'commits')
- self.sql_purge('descs', 'id', 'descid', 'commits')
- self.sql_purge('people', 'id', 'whoid', 'commits')
- else:
- sql = "SELECT * FROM checkins WHERE repositoryid=%s"
- sql_args = (rep_id, )
- cursor = self.db.cursor()
- cursor.execute(sql, sql_args)
- checkins = []
- while 1:
- try:
- (ci_type, ci_when, who_id, repository_id,
- dir_id, file_id, revision, sticky_tag, branch_id,
- plus_count, minus_count, description_id) = \
- cursor.fetchone()
- except TypeError:
- break
- checkins.append([file_id, dir_id, branch_id,
- description_id, who_id])
-
- #self.sql_delete('repositories', 'id', rep_id)
- self.sql_delete('checkins', 'repositoryid', rep_id)
- for checkin in checkins:
- self.sql_delete('files', 'id', checkin[0], 'fileid')
- self.sql_delete('dirs', 'id', checkin[1], 'dirid')
- self.sql_delete('branches', 'id', checkin[2], 'branchid')
- self.sql_delete('descs', 'id', checkin[3], 'descid')
- self.sql_delete('people', 'id', checkin[4], 'whoid')
+ checkins_table = self._version >= 1 and 'commits' or 'checkins'
+ self.sql_delete('repositories', 'id', rep_id)
+ self.sql_purge(checkins_table, 'repositoryid', 'id', 'repositories')
+ self.sql_purge('files', 'id', 'fileid', checkins_table)
+ self.sql_purge('dirs', 'id', 'dirid', checkins_table)
+ self.sql_purge('branches', 'id', 'branchid', checkins_table)
+ self.sql_purge('descs', 'id', 'descid', checkins_table)
+ self.sql_purge('people', 'id', 'whoid', checkins_table)
# Reset all internal id caches. We could be choosier here,
# but let's just be as safe as possible.
self._get_cache = {}
self._get_id_cache = {}
self._desc_id_cache = {}
-
+
class DatabaseVersionError(Exception):
pass
@@ -678,7 +772,7 @@ class Commit:
CHANGE = 0
ADD = 1
REMOVE = 2
-
+
def __init__(self):
self.__directory = ''
self.__file = ''
@@ -690,15 +784,20 @@ class Commit:
self.__minuscount = ''
self.__description = ''
self.__relevance = ''
+ self.__snippet = ''
self.__gmt_time = 0.0
self.__type = Commit.CHANGE
+ self.__content = ''
+ self.__mimetype = ''
+ self.__base_path = ''
+ self.__base_rev = ''
def SetRepository(self, repository):
self.__repository = repository
def GetRepository(self):
return self.__repository
-
+
def SetDirectory(self, dir):
self.__directory = dir
@@ -710,7 +809,7 @@ class Commit:
def GetFile(self):
return self.__file
-
+
def SetRevision(self, revision):
self.__revision = revision
@@ -758,12 +857,19 @@ class Commit:
def GetDescription(self):
return self.__description
+ # Relevance and snippet are used when querying commit database
def SetRelevance(self, relevance):
self.__relevance = relevance
def GetRelevance(self):
return self.__relevance
+ def SetSnippet(self, snippet):
+ self.__snippet = snippet
+
+ def GetSnippet(self):
+ return self.__snippet
+
def SetTypeChange(self):
self.__type = Commit.CHANGE
@@ -784,66 +890,80 @@ class Commit:
elif self.__type == Commit.REMOVE:
return 'Remove'
+ # File content (extracted text), optional, indexed with Sphinx
+ def SetContent(self, content):
+ self.__content = content
+
+ def GetContent(self):
+ return self.__content
+
+ # MIME type, optional, now only stored in Sphinx
+ def SetMimeType(self, mimetype):
+ self.__mimetype = mimetype
+
+ def GetMimeType(self):
+ return self.__mimetype
+
## LazyCommit overrides a few methods of Commit to only retrieve
 ## its properties as they are needed
class LazyCommit(Commit):
- def __init__(self, db):
- Commit.__init__(self)
- self.__db = db
+ def __init__(self, db):
+ Commit.__init__(self)
+ self.__db = db
- def SetFileID(self, dbFileID):
- self.__dbFileID = dbFileID
+ def SetFileID(self, dbFileID):
+ self.__dbFileID = dbFileID
- def GetFileID(self):
- return self.__dbFileID
+ def GetFileID(self):
+ return self.__dbFileID
- def GetFile(self):
- return self.__db.GetFile(self.__dbFileID)
+ def GetFile(self):
+ return self.__db.GetFile(self.__dbFileID)
- def SetDirectoryID(self, dbDirID):
- self.__dbDirID = dbDirID
+ def SetDirectoryID(self, dbDirID):
+ self.__dbDirID = dbDirID
- def GetDirectoryID(self):
- return self.__dbDirID
+ def GetDirectoryID(self):
+ return self.__dbDirID
- def GetDirectory(self):
- return self.__db.GetDirectory(self.__dbDirID)
+ def GetDirectory(self):
+ return self.__db.GetDirectory(self.__dbDirID)
- def SetRepositoryID(self, dbRepositoryID):
- self.__dbRepositoryID = dbRepositoryID
+ def SetRepositoryID(self, dbRepositoryID):
+ self.__dbRepositoryID = dbRepositoryID
- def GetRepositoryID(self):
- return self.__dbRepositoryID
+ def GetRepositoryID(self):
+ return self.__dbRepositoryID
- def GetRepository(self):
- return self.__db.GetRepository(self.__dbRepositoryID)
+ def GetRepository(self):
+ return self.__db.GetRepository(self.__dbRepositoryID)
- def SetAuthorID(self, dbAuthorID):
- self.__dbAuthorID = dbAuthorID
+ def SetAuthorID(self, dbAuthorID):
+ self.__dbAuthorID = dbAuthorID
- def GetAuthorID(self):
- return self.__dbAuthorID
+ def GetAuthorID(self):
+ return self.__dbAuthorID
- def GetAuthor(self):
- return self.__db.GetAuthor(self.__dbAuthorID)
+ def GetAuthor(self):
+ return self.__db.GetAuthor(self.__dbAuthorID)
- def SetBranchID(self, dbBranchID):
- self.__dbBranchID = dbBranchID
+ def SetBranchID(self, dbBranchID):
+ self.__dbBranchID = dbBranchID
- def GetBranchID(self):
- return self.__dbBranchID
+ def GetBranchID(self):
+ return self.__dbBranchID
- def GetBranch(self):
- return self.__db.GetBranch(self.__dbBranchID)
+ def GetBranch(self):
+ return self.__db.GetBranch(self.__dbBranchID)
- def SetDescriptionID(self, dbDescID):
- self.__dbDescID = dbDescID
+ def SetDescriptionID(self, dbDescID):
+ self.__dbDescID = dbDescID
- def GetDescriptionID(self):
- return self.__dbDescID
+ def GetDescriptionID(self):
+ return self.__dbDescID
- def GetDescription(self):
- return self.__db.GetDescription(self.__dbDescID)
+ def GetDescription(self):
+ return self.__db.GetDescription(self.__dbDescID)
## QueryEntry holds data on one match-type in the SQL database
## match is: "exact", "like", or "regex"
@@ -858,8 +978,8 @@ class CheckinDatabaseQuery:
def __init__(self):
## sorting
self.sort = "date"
-
- ## repository to query
+
+ ## repository, branch, etc to query
self.repository_list = []
self.branch_list = []
self.directory_list = []
@@ -867,7 +987,11 @@ class CheckinDatabaseQuery:
self.revision_list = []
self.author_list = []
self.comment_list = []
+
+ ## text_query = Fulltext query on comments
+ ## content_query = Fulltext query on content
self.text_query = ""
+ self.content_query = ""
## date range in DBI 2.0 timedate objects
self.from_date = None
@@ -886,6 +1010,9 @@ class CheckinDatabaseQuery:
def SetTextQuery(self, query):
self.text_query = query
+ def SetContentQuery(self, query):
+ self.content_query = query
+
def SetRepository(self, repository, match = "exact"):
self.repository_list.append(QueryEntry(repository, match))
@@ -921,7 +1048,7 @@ class CheckinDatabaseQuery:
def SetFromDateHoursAgo(self, hours_ago):
ticks = time.time() - (3600 * hours_ago)
self.from_date = dbi.DateTimeFromTicks(ticks)
-
+
def SetFromDateDaysAgo(self, days_ago):
ticks = time.time() - (86400 * days_ago)
self.from_date = dbi.DateTimeFromTicks(ticks)
@@ -942,7 +1069,7 @@ class CheckinDatabaseQuery:
##
def CreateCommit():
return Commit()
-
+
def CreateCheckinQuery():
return CheckinDatabaseQuery()
@@ -953,9 +1080,23 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0):
else:
user = cfg.cvsdb.user
passwd = cfg.cvsdb.passwd
- db = CheckinDatabase(cfg.cvsdb.host, cfg.cvsdb.port, cfg.cvsdb.socket, user, passwd,
- cfg.cvsdb.database_name, cfg.cvsdb.row_limit, cfg.cvsdb.fulltext_min_relevance,
- authorizer)
+ db = CheckinDatabase(
+ host = cfg.cvsdb.host,
+ port = cfg.cvsdb.port,
+ socket = cfg.cvsdb.socket,
+ user = user,
+ passwd = passwd,
+ database = cfg.cvsdb.database_name,
+ row_limit = cfg.cvsdb.row_limit,
+ min_relevance = cfg.cvsdb.fulltext_min_relevance,
+ authorizer = authorizer,
+ index_content = cfg.cvsdb.index_content,
+ sphinx_host = cfg.cvsdb.sphinx_host,
+ sphinx_port = int(cfg.cvsdb.sphinx_port),
+ sphinx_socket = cfg.cvsdb.sphinx_socket,
+ sphinx_index = cfg.cvsdb.sphinx_index,
+ cfg = cfg,
+ )
db.Connect()
return db
diff --git a/lib/vclib/ccvs/bincvs.py b/lib/vclib/ccvs/bincvs.py
index c2b9430d..fee243fb 100644
--- a/lib/vclib/ccvs/bincvs.py
+++ b/lib/vclib/ccvs/bincvs.py
@@ -31,8 +31,8 @@ import popen
class BaseCVSRepository(vclib.Repository):
def __init__(self, name, rootpath, authorizer, utilities):
if not os.path.isdir(rootpath):
- raise vclib.ReposNotFound(name)
-
+ raise vclib.ReposNotFound(name)
+
self.name = name
self.rootpath = rootpath
self.auth = authorizer
@@ -53,7 +53,7 @@ class BaseCVSRepository(vclib.Repository):
def authorizer(self):
return self.auth
-
+
def itemtype(self, path_parts, rev):
basepath = self._getpath(path_parts)
kind = None
@@ -74,12 +74,12 @@ class BaseCVSRepository(vclib.Repository):
def itemprops(self, path_parts, rev):
self.itemtype(path_parts, rev) # does auth-check
return {} # CVS doesn't support properties
-
+
def listdir(self, path_parts, rev, options):
if self.itemtype(path_parts, rev) != vclib.DIR: # does auth-check
raise vclib.Error("Path '%s' is not a directory."
% (string.join(path_parts, "/")))
-
+
# Only RCS files (*,v) and subdirs are returned.
data = [ ]
full_name = self._getpath(path_parts)
@@ -115,7 +115,7 @@ class BaseCVSRepository(vclib.Repository):
data.append(CVSDirEntry(name, kind, errors, 1))
return data
-
+
def _getpath(self, path_parts):
return apply(os.path.join, (self.rootpath,) + tuple(path_parts))
@@ -177,7 +177,7 @@ class BinCVSRepository(BaseCVSRepository):
used_rlog = 0
tip_rev = None # used only if we have to fallback to using rlog
- fp = self.rcs_popen('co', (rev_flag, full_name), 'rb')
+ fp = self.rcs_popen('co', (rev_flag, full_name), 'rb')
try:
filename, revision = _parse_co_header(fp)
except COMissingRevision:
@@ -191,14 +191,14 @@ class BinCVSRepository(BaseCVSRepository):
used_rlog = 1
if not tip_rev:
raise vclib.Error("Unable to find valid revision")
- fp = self.rcs_popen('co', ('-p' + tip_rev.string, full_name), 'rb')
+ fp = self.rcs_popen('co', ('-p' + tip_rev.string, full_name), 'rb')
filename, revision = _parse_co_header(fp)
-
+
if filename is None:
# CVSNT's co exits without any output if a dead revision is requested.
# Bug at http://www.cvsnt.org/cgi-bin/bugzilla/show_bug.cgi?id=190
# As a workaround, we invoke rlog to find the first non-dead revision
- # that precedes it and check out that revision instead. Of course,
+ # that precedes it and check out that revision instead. Of course,
# if we've already invoked rlog above, we just reuse its output.
if not used_rlog:
tip_rev = self._get_tip_revision(full_name + ',v', rev)
@@ -207,7 +207,7 @@ class BinCVSRepository(BaseCVSRepository):
raise vclib.Error(
'Could not find non-dead revision preceding "%s"' % rev)
fp = self.rcs_popen('co', ('-p' + tip_rev.undead.string,
- full_name), 'rb')
+ full_name), 'rb')
filename, revision = _parse_co_header(fp)
if filename is None:
@@ -278,7 +278,7 @@ class BinCVSRepository(BaseCVSRepository):
if self.itemtype(path_parts, rev) != vclib.FILE: # does auth-check
raise vclib.Error("Path '%s' is not a file."
% (string.join(path_parts, "/")))
-
+
# Invoke rlog
rcsfile = self.rcsfile(path_parts, 1)
if rev and options.get('cvs_pass_rev', 0):
@@ -341,7 +341,7 @@ class BinCVSRepository(BaseCVSRepository):
def revinfo(self, rev):
raise vclib.UnsupportedFeature
-
+
def rawdiff(self, path_parts1, rev1, path_parts2, rev2, type, options={}):
"""see vclib.Repository.rawdiff docstring
@@ -439,9 +439,9 @@ def _match_revs_tags(revlist, taglist):
example: if revision is 1.2.3.4, parent is 1.2
"undead"
- If the revision is dead, then this is a reference to the first
+ If the revision is dead, then this is a reference to the first
previous revision which isn't dead, otherwise it's a reference
- to itself. If all the previous revisions are dead it's None.
+ to itself. If all the previous revisions are dead it's None.
"branch_number"
tuple representing branch number or empty tuple if on trunk
@@ -653,7 +653,7 @@ def _parse_co_header(fp):
pass
else:
break
-
+
raise COMalformedOutput, "Unable to find revision in co output stream"
# if your rlog doesn't use 77 '=' characters, then this must change
@@ -674,7 +674,7 @@ _EOF_ERROR = 'error message found' # rlog issued an error
# ^rlog\: (.*)(?:\:\d+)?\: (.*)$
#
# But for some reason the windows version of rlog omits the "rlog: " prefix
-# for the first error message when the standard error stream has been
+# for the first error message when the standard error stream has been
# redirected to a file or pipe. (the prefix is present in subsequent errors
# and when rlog is run from the console). So the expression below is more
# complicated
@@ -703,7 +703,7 @@ def _parse_log_header(fp):
Returns: filename, default branch, tag dictionary, lock dictionary,
rlog error message, and eof flag
"""
-
+
filename = head = branch = msg = ""
taginfo = { } # tag name => number
lockinfo = { } # revision => locker
@@ -732,7 +732,7 @@ def _parse_log_header(fp):
else:
# oops. this line isn't lock info. stop parsing tags.
state = 0
-
+
if state == 0:
if line[:9] == 'RCS file:':
filename = line[10:-1]
@@ -902,7 +902,7 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter):
except ValueError:
view_tag = None
else:
- tags.append(view_tag)
+ tags.append(view_tag)
# Match up tags and revisions
_match_revs_tags(revs, tags)
@@ -910,13 +910,13 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter):
# Match up lockinfo and revision
for rev in revs:
rev.lockinfo = lockinfo.get(rev.string)
-
+
# Add artificial ViewVC tag HEAD, which acts like a non-branch tag pointing
# at the latest revision on the MAIN branch. The HEAD revision doesn't have
# anything to do with the "head" revision number specified in the RCS file
# and in rlog output. HEAD refers to the revision that the CVS and RCS co
# commands will check out by default, whereas the "head" field just refers
- # to the highest revision on the trunk.
+ # to the highest revision on the trunk.
taginfo['HEAD'] = _add_tag('HEAD', taginfo['MAIN'].co_rev)
# Determine what revisions to return
@@ -954,7 +954,7 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter):
_remove_tag(view_tag)
else:
filtered_revs = revs
-
+
return filtered_revs
def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
@@ -1004,7 +1004,7 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
= _parse_log_header(rlog)
if eof == _EOF_LOG:
- # the rlog output ended early. this can happen on errors that rlog
+ # the rlog output ended early. this can happen on errors that rlog
# thinks are so serious that it stops parsing the current file and
# refuses to parse any of the files that come after it. one of the
# errors that triggers this obnoxious behavior looks like:
@@ -1052,8 +1052,8 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
tag = None
# we don't care about the specific values -- just the keys and whether
- # the values point to branches or revisions. this the fastest way to
- # merge the set of keys and keep values that allow us to make the
+ # the values point to branches or revisions. this the fastest way to
+ # merge the set of keys and keep values that allow us to make the
# distinction between branch tags and normal tags
alltags.update(taginfo)
@@ -1098,7 +1098,7 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
file.dead = 0
#file.errors.append("No revisions exist on %s" % (view_tag or "MAIN"))
file.absent = 1
-
+
# done with this file now, skip the rest of this file's revisions
if not eof:
_skip_file(rlog)
@@ -1211,7 +1211,7 @@ def _newest_file(dirpath):
newest_time = 0
### FIXME: This sucker is leaking unauthorized paths! ###
-
+
for subfile in os.listdir(dirpath):
### filter CVS locks? stale NFS handles?
if subfile[-2:] != ',v':
diff --git a/lib/viewvc.py b/lib/viewvc.py
index 84f5db9a..44a93a5d 100644
--- a/lib/viewvc.py
+++ b/lib/viewvc.py
@@ -1,4 +1,3 @@
-#
# Copyright (C) 1999-2009 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
@@ -68,7 +67,6 @@ docroot_magic_path = '*docroot*'
viewcvs_mime_type = 'text/vnd.viewcvs-markup'
alt_mime_type = 'text/x-cvsweb-markup'
view_roots_magic = '*viewroots*'
-magic_buf_size = 4096
default_mime_type = 'application/octet-stream'
# Put here the variables we need in order to hold our state - they
@@ -121,9 +119,8 @@ class Request:
# check for an authenticated username
self.username = server.getenv('REMOTE_USER')
- # construct MIME magic
- self.ms = None
- self.ms_fail = 0
+ # repository object cache
+ self.all_repos = {}
# if we allow compressed output, see if the client does too
self.gzip_compress_level = 0
@@ -134,6 +131,9 @@ class Request:
string.split(http_accept_encoding, ","))):
self.gzip_compress_level = 9 # make this configurable?
+ def utf8(self, value):
+ return self.cfg.guesser().utf8(value)
+
def create_repos(self, rootname):
if not rootname:
return None
@@ -677,7 +677,7 @@ def _validate_mimetype(value):
return value in (viewcvs_mime_type, alt_mime_type, 'text/plain')
# obvious things here. note that we don't need uppercase for alpha.
-_re_validate_alpha = re.compile('^[a-z]+$')
+_re_validate_alpha = re.compile('^[a-z_]+$')
_re_validate_number = re.compile('^[0-9]+$')
_re_validate_boolint = re.compile('^[01]$')
@@ -743,6 +743,7 @@ _legal_params = {
'who_match' : _re_validate_alpha,
'comment' : None,
'comment_match' : _re_validate_alpha,
+ 'search_content': None,
'querysort' : _re_validate_alpha,
'date' : _re_validate_alpha,
'hours' : _re_validate_number,
@@ -988,7 +989,7 @@ def nav_path(request):
is_last = len(path_parts) == len(request.path_parts)
if request.roottype == 'cvs':
- item = _item(name=cvsdb.utf8string(part), href=None)
+ item = _item(name=request.utf8(part), href=None)
else:
item = _item(name=part, href=None)
@@ -1248,7 +1249,7 @@ def common_template_data(request, revision=None, mime_type=None):
cfg = request.cfg
where = request.where
if request.roottype == 'cvs':
- where = cvsdb.utf8string(where)
+ where = request.utf8(where)
where = request.server.escape(where)
# Initialize data dictionary members (sorted alphanumerically)
@@ -1444,28 +1445,31 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
get_lexer_for_mimetype, \
get_lexer_for_filename
from pygments.lexers._mapping import LEXERS
+ # Hack for shell mime types:
LEXERS['BashLexer'] = ('pygments.lexers.other', 'Bash', ('bash', 'sh'), ('*.sh',), ('application/x-sh', 'application/x-shellscript', 'text/x-sh', 'text/x-shellscript'))
- encoding = 'guess'
- if cfg.options.detect_encoding:
- try:
- import chardet
- encoding = 'chardet'
- except (SyntaxError, ImportError):
- pass
try:
lexer = get_lexer_for_mimetype(mime_type,
- encoding=encoding,
+ encoding='utf-8',
stripnl=False)
except ClassNotFound:
try:
lexer = get_lexer_for_filename(filename,
- encoding=encoding,
+ encoding='utf-8',
stripnl=False)
except ClassNotFound:
use_pygments = 0
except ImportError:
use_pygments = 0
+ # Detect encoding by calling chardet ourselves,
+ # to support it in non-highlighting mode
+ content = fp.read()
+ c, encoding = cfg.guesser().guess_charset(content)
+ if encoding:
+ content = c
+ else:
+ encoding = 'unknown'
+
# If we aren't going to be highlighting anything, just return the
# BLAME_SOURCE. If there's no blame_source, we'll generate a fake
# one from the file contents we fetch with PATH and REV.
@@ -1475,11 +1479,7 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
else:
lines = []
line_no = 0
- while 1:
- line = fp.readline()
- if not line:
- break
- line = cvsdb.utf8string(line)
+ for line in content.split('\n'):
line_no = line_no + 1
item = vclib.Annotation(cgi.escape(line), line_no,
None, None, None, None)
@@ -1508,19 +1508,11 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
self.blame_data.append(item)
self.line_no = self.line_no + 1
ps = PygmentsSink(blame_source)
- fpd = fp.read()
- try:
- fpdat = unicode(fpd,'utf-8')
- except:
- try:
- fpdat = unicode(fpd,'cp1251')
- except:
- fpdat = fpd
- highlight(fpdat, lexer,
+ highlight(content, lexer,
HtmlFormatter(nowrap=True,
classprefix='pygments-',
encoding='utf-8'), ps)
- return ps.blame_data
+ return ps.blame_data, encoding
def make_time_string(date, cfg):
"""Returns formatted date string in either local time or UTC.
@@ -1594,6 +1586,7 @@ def calculate_mime_type(request, path_parts, rev):
return mime_type
except:
pass
+ # FIXME rewrite to use viewvcmagic
return guess_mime(path_parts[-1])
def markup_or_annotate(request, is_annotate):
@@ -1605,21 +1598,12 @@ def markup_or_annotate(request, is_annotate):
mime_type = calculate_mime_type(request, path, rev)
if not mime_type or mime_type == default_mime_type:
- if request.ms is None and not request.ms_fail:
- try:
- import magic
- request.ms = magic.open(magic.MAGIC_NONE | magic.MAGIC_MIME)
- request.ms.load()
- except:
- request.ms_fail = 1
- if request.ms:
- try:
- fp, revision = request.repos.openfile(path, rev)
- buffer = fp.read(magic_buf_size)
- fp.close()
- mime_type = request.ms.buffer(buffer)
- except:
- pass
+ try:
+ fp, revision = request.repos.openfile(path, rev)
+ mime_type = request.cfg.guesser().guess_mime(None, None, fp)
+ fp.close()
+ except:
+ raise
# Is this a binary type?
if is_binary(request.cfg, mime_type):
@@ -1657,9 +1641,10 @@ def markup_or_annotate(request, is_annotate):
if check_freshness(request, None, revision, weak=1):
fp.close()
return
- lines = markup_stream_pygments(request, cfg, blame_source, fp,
- path[-1], mime_type)
+ lines, charset = markup_stream_pygments(request, cfg, blame_source, fp, path[-1], mime_type)
fp.close()
+ if mime_type.find(';') < 0:
+ mime_type = mime_type+'; charset='+charset
data = common_template_data(request, revision)
data.merge(ezt.TemplateData({
@@ -1910,7 +1895,7 @@ def view_directory(request):
row.short_log = format_log(file.log, cfg)
row.log = htmlify(file.log, cfg.options.mangle_email_addresses)
row.lockinfo = file.lockinfo
- row.name = request.server.escape(cvsdb.utf8string(file.name))
+ row.name = request.server.escape(request.utf8(file.name))
row.anchor = row.name
row.pathtype = (file.kind == vclib.FILE and 'file') or \
(file.kind == vclib.DIR and 'dir')
@@ -2285,7 +2270,7 @@ def view_log(request):
entry.ago = html_time(request, rev.date, 1)
entry.log = rev.log or ""
if cvs:
- entry.log = cvsdb.utf8string(entry.log)
+ entry.log = request.utf8(entry.log)
entry.log = htmlify(entry.log, cfg.options.mangle_email_addresses)
entry.size = rev.size
entry.lockinfo = rev.lockinfo
@@ -2770,7 +2755,7 @@ class DiffSource:
self.save_line = None
self.line_number = None
self.prev_line_number = None
-
+
# keep track of where we are during an iteration
self.idx = -1
self.last = None
@@ -2867,7 +2852,7 @@ class DiffSource:
diff_code = line[0]
output = self._format_text(line[1:])
- output = cvsdb.utf8string(output)
+ output = self.cfg.guesser().utf8(output)
if diff_code == '+':
if self.state == 'dump':
@@ -3644,6 +3629,7 @@ def view_queryform(request):
'who_match' : request.query_dict.get('who_match', 'exact'),
'comment' : request.query_dict.get('comment', ''),
'comment_match' : request.query_dict.get('comment_match', 'fulltext'),
+ 'search_content' : request.query_dict.get('search_content', ''),
'querysort' : request.query_dict.get('querysort', 'date'),
'date' : request.query_dict.get('date', 'hours'),
'hours' : request.query_dict.get('hours', '2'),
@@ -3653,6 +3639,7 @@ def view_queryform(request):
'query_hidden_values' : query_hidden_values,
'limit_changes' : limit_changes,
'dir_href' : dir_href,
+ 'enable_search_content' : request.cfg.cvsdb.index_content,
}))
generate_page(request, "query_form", data)
@@ -3791,7 +3778,8 @@ def build_commit(request, files, max_files, dir_strip, format):
plus_count = 0
minus_count = 0
found_unreadable = 0
- all_repos = {}
+ if not request.all_repos:
+ request.all_repos = {}
for f in files:
dirname = f.GetDirectory()
@@ -3810,17 +3798,19 @@ def build_commit(request, files, max_files, dir_strip, format):
# Check path access (since the commits database logic bypasses the
# vclib layer and, thus, the vcauth stuff that layer uses).
- my_repos = all_repos.get(f.GetRepository(), '')
+ my_repos = request.all_repos.get(f.GetRepository(), '')
if not my_repos:
try:
- my_repos = all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
+ my_repos = request.all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
except:
my_repos = None
if not my_repos:
return None
if my_repos['roottype'] == 'cvs':
- try: where = unicode(where,'utf-8')
+ # we store UTF-8 in the DB
+ try: where = where.decode('utf-8')
except: pass
+ # FIXME maybe store "real" filesystem path in the DB instead of having such setting?
try: where = where.encode(cfg.options.cvs_ondisk_charset)
except: pass
path_parts = _path_parts(where)
@@ -3907,24 +3897,27 @@ def build_commit(request, files, max_files, dir_strip, format):
if max_files and num_allowed > max_files:
continue
- commit_files.append(_item(date=commit_time,
- dir=request.server.escape(dirname),
- file=request.server.escape(filename),
- author=request.server.escape(f.GetAuthor()),
- rev=rev,
- branch=f.GetBranch(),
- plus=plus,
- minus=minus,
- type=change_type,
- dir_href=dir_href,
- log_href=log_href,
- view_href=view_href,
- download_href=download_href,
- prefer_markup=prefer_markup,
- diff_href=diff_href,
- root=my_repos,
- path=where,
- path_prev=path_prev))
+ commit_files.append(_item(
+ date=commit_time,
+ dir=request.server.escape(dirname),
+ file=request.server.escape(filename),
+ author=request.server.escape(f.GetAuthor()),
+ rev=rev,
+ branch=f.GetBranch(),
+ plus=plus,
+ minus=minus,
+ type=change_type,
+ snippet=f.GetSnippet(),
+ dir_href=dir_href,
+ log_href=log_href,
+ view_href=view_href,
+ download_href=download_href,
+ prefer_markup=prefer_markup,
+ diff_href=diff_href,
+ root=my_repos,
+ path=where,
+ path_prev=path_prev,
+ ))
# No files survived authz checks? Let's just pretend this
# little commit didn't happen, shall we?
@@ -4115,6 +4108,7 @@ def view_query(request):
who_match = request.query_dict.get('who_match', 'exact')
comment = request.query_dict.get('comment', '')
comment_match = request.query_dict.get('comment_match', 'fulltext')
+ search_content = request.query_dict.get('search_content', '')
querysort = request.query_dict.get('querysort', 'date')
date = request.query_dict.get('date', 'hours')
hours = request.query_dict.get('hours', '2')
@@ -4126,7 +4120,7 @@ def view_query(request):
cfg.options.limit_changes))
match_types = { 'exact':1, 'like':1, 'glob':1, 'regex':1, 'notregex':1 }
- sort_types = { 'date':1, 'author':1, 'file':1 }
+ sort_types = { 'date':1, 'date_rev':1, 'author':1, 'file':1, 'relevance':1 }
date_types = { 'hours':1, 'day':1, 'week':1, 'month':1,
'all':1, 'explicit':1 }
@@ -4193,6 +4187,8 @@ def view_query(request):
query.SetComment(comment, comment_match)
else:
query.SetTextQuery(comment)
+ if search_content:
+ query.SetContentQuery(search_content)
query.SetSortMethod(querysort)
if date == 'hours':
query.SetFromDateHoursAgo(int(hours))
diff --git a/lib/viewvcmagic.py b/lib/viewvcmagic.py
new file mode 100644
index 00000000..5f8b3ea8
--- /dev/null
+++ b/lib/viewvcmagic.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+import mimetypes
+
+have_chardet = 0
+try:
+ import chardet
+ have_chardet = 1
+except ImportError: pass
+
+class ContentMagic:
+
+ def __init__(self, encodings):
+ self.encodings = encodings.split(':')
+ self.mime_magic = None
+ self.errors = []
+ # Try to load magic
+ try:
+ import magic
+ self.mime_magic = magic.open(magic.MAGIC_MIME_TYPE)
+ self.mime_magic.load()
+ except Exception, e:
+ self.errors.append(e)
+
+ # returns MIME type
+ def guess_mime(self, mime, filename, tempfile):
+ if mime == 'application/octet-stream':
+ mime = ''
+ if not mime and filename:
+ mime = mimetypes.guess_type(filename)[0]
+ if not mime and tempfile and self.mime_magic:
+ if type(tempfile) == type(''):
+ mime = self.mime_magic.file(tempfile)
+ else:
+ c = tempfile.read(4096)
+ mime = self.mime_magic.buffer(c)
+ return mime
+
+ # returns (utf8_content, charset)
+ def guess_charset(self, content):
+ # Try to guess with chardet
+ charset = None
+ if have_chardet:
+ # Try chardet
+ try:
+                guess = chardet.detect(content)
+                if guess and guess['encoding']:
+                    charset = guess['encoding']
+                    content = content.decode(charset)
+            except Exception: charset = None
+ else:
+ # Try UTF-8
+ charset = 'utf-8'
+ try: content = content.decode('utf-8')
+            except UnicodeError: charset = None
+ # Then try to guess primitively
+ if charset is None:
+ for charset in self.encodings:
+ try:
+ content = content.decode(charset)
+ break
+                except (UnicodeError, LookupError): charset = None
+ return (content, charset)
+
+ # guess and encode return value into UTF-8
+ def utf8(self, content):
+ (uni, charset) = self.guess_charset(content)
+ if charset:
+ return uni.encode('utf-8')
+ return content
diff --git a/templates/query_form.ezt b/templates/query_form.ezt
index ec28ffdc..4919bc5b 100644
--- a/templates/query_form.ezt
+++ b/templates/query_form.ezt
@@ -144,7 +144,7 @@ Browse Directory