From e363cf19b162e7770ab9ebe2e8a5a2857212db87 Mon Sep 17 00:00:00 2001
From: vfilippov <vfilippov@6955db30-a419-402b-8a0d-67ecbb4d7f56>
Date: Tue, 27 Sep 2011 16:13:53 +0000
Subject: [PATCH] Bug 82651 - Tika&Sphinx&chardet content indexing (done!)

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1388 6955db30-a419-402b-8a0d-67ecbb4d7f56
---
 bin/make-database           |  12 +-
 bin/svndbadmin              | 149 +++++++++++-
 conf/viewvc.conf.dist       | 146 ++++++++---
 lib/config.py               |  26 +-
 lib/cvsdb.py                | 465 +++++++++++++++++++++++-------------
 lib/vclib/ccvs/bincvs.py    |  56 ++---
 lib/viewvc.py               | 148 ++++++------
 lib/viewvcmagic.py          |  70 ++++++
 templates/query_form.ezt    |  10 +-
 templates/query_results.ezt |  11 +-
 10 files changed, 761 insertions(+), 332 deletions(-)
 create mode 100644 lib/viewvcmagic.py

diff --git a/bin/make-database b/bin/make-database
index b8775705..f538b837 100755
--- a/bin/make-database
+++ b/bin/make-database
@@ -44,6 +44,7 @@ CREATE TABLE branches (
 
 DROP TABLE IF EXISTS checkins;
 CREATE TABLE checkins (
+  id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
   type enum('Change','Add','Remove'),
   ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
   whoid mediumint(9) DEFAULT '0' NOT NULL,
@@ -57,7 +58,7 @@ CREATE TABLE checkins (
   removedlines int(11) DEFAULT '0' NOT NULL,
   descid mediumint(9),
   UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
-  KEY repository_when (repositoryid,ci_when),
+  KEY repositoryid_when (repositoryid,ci_when),
   KEY ci_when (ci_when),
   KEY whoid (whoid,ci_when),
   KEY dirid (dirid),
@@ -138,6 +139,7 @@ CREATE TABLE branches (
 
 DROP TABLE IF EXISTS commits;
 CREATE TABLE commits (
+  id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
   type enum('Change','Add','Remove'),
   ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
   whoid mediumint(9) DEFAULT '0' NOT NULL,
@@ -151,9 +153,9 @@ CREATE TABLE commits (
   removedlines int(11) DEFAULT '0' NOT NULL,
   descid mediumint(9),
   UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
+  KEY repositoryid_when (repositoryid,ci_when),
   KEY ci_when (ci_when),
-  KEY whoid (whoid),
-  KEY repositoryid_2 (repositoryid),
+  KEY whoid (whoid,ci_when),
   KEY dirid (dirid),
   KEY fileid (fileid),
   KEY branchid (branchid),
@@ -253,7 +255,7 @@ Options:
                       [Default: ViewVC]
 
   --help              Show this usage message.
-  
+
   --hostname=ARG      Use ARG as the hostname for the MySQL connection.
                       [Default: localhost]
 
@@ -264,7 +266,7 @@ Options:
   --version=ARG       Create the database using the schema employed by
                       version ARG of ViewVC.  Valid values are:
                       [ "1.0" ]
-                            
+
 """ % (os.path.basename(sys.argv[0])))
   if errmsg is not None:
     stream.write("[ERROR] %s.\n" % (errmsg))
diff --git a/bin/svndbadmin b/bin/svndbadmin
index 6c40ad39..8d951e01 100755
--- a/bin/svndbadmin
+++ b/bin/svndbadmin
@@ -58,7 +58,11 @@ else:
 
 import os
 import string
+import socket
+import select
 import re
+import mimetypes
+import time
 
 import svn.core
 import svn.repos
@@ -68,14 +72,20 @@ import svn.delta
 import cvsdb
 import viewvc
 import vclib
+from viewvcmagic import ContentMagic
 
 class SvnRepo:
     """Class used to manage a connection to a SVN repository."""
-    def __init__(self, path):
+    def __init__(self, path, index_content = None, tika_client = None, guesser = None,
+                 svn_ignore_mimetype = False):
         self.path = path
         self.repo = svn.repos.svn_repos_open(path)
         self.fs = svn.repos.svn_repos_fs(self.repo)
         self.rev_max = svn.fs.youngest_rev(self.fs)
+        self.index_content = index_content
+        self.tika_client = tika_client
+        self.guesser = guesser
+        self.svn_ignore_mimetype = svn_ignore_mimetype
     def __getitem__(self, rev):
         if rev is None:
             rev = self.rev_max
@@ -128,6 +138,74 @@ def _get_diff_counts(diff_fp):
         line = diff_fp.readline()
     return plus, minus
 
+class TikaClient:  # TCP client that sends a file to a Tika server and collects the server's response
+    # Create a Tika client from a 'host:port' address string and a whitespace-separated list of MIME patterns
+    def __init__(self, tika_server, mime_types):
+        self.tika_server = tika_server
+        self.mime_types = mime_types
+        self.addr = tika_server.split(':')
+        # The address must consist of exactly two ':'-separated parts: host and port
+        if len(self.addr) != 2:
+            raise Exception('tika_server value is incorrect: \''+tika_server+'\', please use \'host:port\' format')
+        self.addr = (self.addr[0], int(self.addr[1]))
+        # Build one regexp from the MIME patterns: each pattern is anchored, and '*' in it matches any run of characters
+        m = re.split('\s+', mime_types.strip())
+        self.mime_regexp = re.compile('|'.join('^'+re.escape(i).replace('\\*', '.*')+'$' for i in m))
+
+    # Send the file 'filename' to the Tika server and return the response; errors are printed, not raised ('log_filename' is used only in messages)
+    def get_text(self, filename, mime_type, log_filename):
+        if not self.mime_regexp.match(mime_type):
+            # This MIME type matches none of the configured patterns, return nothing
+            return ''
+        fd = None
+        s = None
+        text = ''
+        fsize = 0
+        try:
+            # Read the whole original file into memory
+            fd = open(filename, 'rb')
+            data = fd.read()
+            fsize = len(data)
+            if not fsize:  # empty file: nothing to send, the 'finally' clause still closes fd
+                return ''
+            # Connect to Tika
+            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            s.connect(self.addr)
+            s.setblocking(0)  # non-blocking: reads and writes below happen only when poll() reports readiness
+            sockfd = s.fileno()
+            # Tika is somewhat delicate about network IO, so:
+            # Read and write using poll(2) system call
+            p = select.poll()
+            p.register(sockfd)  # no event mask is passed, so poll's default mask applies
+            while 1:  # NOTE(review): an event with neither POLLIN nor POLLOUT set is ignored and polled again - confirm this cannot spin
+                fds = p.poll()
+                if not fds:
+                    break
+                (pollfd, event) = fds[0]  # only one descriptor is registered, so the first entry is the socket
+                if event & select.POLLIN:
+                    # A read error or an empty read is treated as the end of the response
+                    try: part = os.read(sockfd, 65536)
+                    except: break
+                    if not part: break
+                    text += part
+                if event & select.POLLOUT:
+                    if not len(data):
+                        # Everything is sent: shut down the write side and poll only for input from now on
+                        s.shutdown(socket.SHUT_WR)
+                        p.modify(sockfd, select.POLLIN)
+                    else:
+                        # Write as much as the socket accepts and drop the written part from the buffer
+                        l = os.write(sockfd, data)
+                        data = data[l:]
+            if len(text) == 0:
+                raise Exception('Empty response from Tika server')
+            print "Extracted %d bytes from %s (%s) of size %d" % (len(text), log_filename, mime_type, fsize)
+        except Exception, e:
+            print "Error extracting text from %s (%s) of size %d: %s" % (log_filename, mime_type, fsize, str(e))
+        finally:
+            if fd: fd.close()
+            if s: s.close()
+        return text  # whatever was received before success or failure; '' if nothing was read
 
 class SvnRev:
     """Class used to hold information about a particular revision of
@@ -151,7 +229,7 @@ class SvnRev:
 
         # get a root for the current revisions
         fsroot = self._get_root_for_rev(rev)
-        
+
         # find changes in the revision
         editor = svn.repos.RevisionChangeCollector(repo.fs, rev)
         e_ptr, e_baton = svn.delta.make_editor(editor)
@@ -168,7 +246,7 @@ class SvnRev:
             base_root = None
             if change.base_path:
                 base_root = self._get_root_for_rev(change.base_rev)
-                
+
             if not change.path:
                 action = 'remove'
             elif change.added:
@@ -184,19 +262,53 @@ class SvnRev:
                                       ['-b', '-B'])
             diff_fp = diffobj.get_pipe()
             plus, minus = _get_diff_counts(diff_fp)
-            # TODO Indexing file contents
-            # For binary files: svn.fs.contents_changed(root1, path1, root2, path2)
-            # Temp file with contents is at: diffobj.tempfile2
-            # Apache Tika server may even be at another host!
 
             # CustIS Bug 50473: a workaround for svnlib behaviour in file movements (FILE1 -> FILE2 + FILE1 -> null)
             if change.base_path:
-                if not change.path and changes_hash.get(change.base_path, '') != '':
+                if not change.path and change.base_path in changes_hash:
                     minus = 0
                 elif change.path:
                     changes_hash[change.base_path] = change.path
 
-            self.changes.append((path, action, plus, minus))
+            content = ''
+            mime = ''
+            # need to check if binary file's content changed when copying,
+            # if not, don't extract it, just get it from previous revision later
+            if repo.index_content and change.path and (not change.base_path
+                or svn.fs.contents_changed(
+                    base_root and base_root or None,
+                    base_root and change.base_path or None,
+                    fsroot, change.path
+                )):
+                    props = svn.fs.node_proplist(fsroot, change.path)
+                    if not repo.svn_ignore_mimetype:
+                        mime = props.get('svn:mime-type', None)
+                    else:
+                        mime = None
+                    mime = repo.guesser.guess_mime(
+                        mime,
+                        os.path.basename(change.path),
+                        diffobj.tempfile2
+                    )
+                    # Read and guess charset by ourselves for text files
+                    if mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
+                        try:
+                            fd = open(diffobj.tempfile2, 'rb')
+                            content = fd.read()
+                            fd.close()
+                        except: pass
+                        # Guess charset
+                        if content:
+                            content, charset = repo.guesser.guess_charset(content)
+                            if charset:
+                                content = content.encode('utf-8')
+                                print 'Guessed %s for %s' % (charset, change.path)
+                            else:
+                                print 'Failed to guess charset for %s, not indexing' % (change.path, )
+                    # Try to extract content using Tika from binary documents
+                    elif repo.tika_client:
+                        content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
+            self.changes.append((path, action, plus, minus, content, mime))
 
     def _get_root_for_rev(self, rev):
         """Fetch a revision root from a cache of such, or a fresh root
@@ -217,7 +329,7 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
         if verbose: print "skipped (no changes)."
         return
 
-    for (path, action, plus, minus) in revision.changes:
+    for (path, action, plus, minus, content, mime) in revision.changes:
         directory, file = os.path.split(path)
         commit = cvsdb.CreateCommit()
         commit.SetRepository(repo.path)
@@ -230,6 +342,8 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
         commit.SetPlusCount(plus)
         commit.SetMinusCount(minus)
         commit.SetBranch(None)
+        commit.SetContent(content)
+        commit.SetMimeType(mime)
 
         if action == 'add':
             commit.SetTypeAdd()
@@ -268,7 +382,16 @@ def main(command, repository, revs=[], verbose=0, force=0):
                 sys.stderr.write("ERROR: " + str(e) + "\n")
                 sys.exit(1)
 
-    repo = SvnRepo(repository)
+    tika_client = None
+    if cfg.utilities.tika_server:
+        tika_client = TikaClient(cfg.utilities.tika_server, cfg.utilities.tika_mime_types)
+    repo = SvnRepo(
+        path = repository,
+        index_content = cfg.cvsdb.index_content,
+        tika_client = tika_client,
+        guesser = cfg.guesser(),
+        svn_ignore_mimetype = cfg.options.svn_ignore_mimetype,
+    )
     if command == 'rebuild' or (command == 'update' and not revs):
         for rev in range(repo.rev_max+1):
             handle_revision(db, command, repo, rev, verbose)
@@ -312,7 +435,7 @@ Usage: 1. %s [-v] rebuild REPOS-PATH
     the database.  If a range is specified, the revisions will be
     processed in ascending order, and you may specify "HEAD" to
     indicate "the youngest revision currently in the repository".
-    
+
 3.  Purge information specific to the repository located at REPOS-PATH
     from the database.
 
@@ -337,7 +460,7 @@ if __name__ == '__main__':
         del args[index]
     except ValueError:
         pass
-        
+
     if len(args) < 3:
         usage()
 
diff --git a/conf/viewvc.conf.dist b/conf/viewvc.conf.dist
index 3a26a4fe..0a1017ec 100644
--- a/conf/viewvc.conf.dist
+++ b/conf/viewvc.conf.dist
@@ -1,6 +1,6 @@
 #---------------------------------------------------------------------------
 #
-# Configuration file for ViewVC
+# Configuration file for ViewVC (4IntraNet patched version)
 #
 # Information on ViewVC is located at the following web site:
 #     http://viewvc.org/
@@ -9,7 +9,7 @@
 
 # THE FORMAT OF THIS CONFIGURATION FILE
 #
-#    This file is delineated by sections, specified in [brackets].  Within 
+#    This file is delineated by sections, specified in [brackets].  Within
 #    each section, are a number of configuration settings.  These settings
 #    take the form of: name = value.  Values may be continued on the
 #    following line by indenting the continued line.
@@ -17,14 +17,14 @@
 #    WARNING:  Indentation *always* means continuation.  Name=value lines
 #    should always start in column zero.
 #
-#    Comments should always start in column zero, and are identified 
+#    Comments should always start in column zero, and are identified
 #    with "#".
 #
-#    Certain configuration settings may have multiple values.  These should 
-#    be separated by a comma.  The settings where this is allowed are noted 
+#    Certain configuration settings may have multiple values.  These should
+#    be separated by a comma.  The settings where this is allowed are noted
 #    below.  Any other setting that requires special syntax is noted at that
 #    setting.
-# 
+#
 #
 # SOME TERMINOLOGY USED HEREIN
 #
@@ -50,10 +50,10 @@
 #    recommend you pay attention to.  Of course, don't try to change the
 #    options here -- do so in the relevant section of the configuration
 #    file below.
-#    
+#
 #    For correct operation, you will probably need to change the following
 #    configuration variables:
-#   
+#
 #       cvs_roots (for CVS)
 #       svn_roots (for Subversion)
 #       root_parents (for CVS or Subversion)
@@ -62,18 +62,18 @@
 #       rcs_dir
 #       mime_types_file
 #       the many options in the [utilities] section
-#   
+#
 #    It is usually desirable to change the following variables:
-#   
+#
 #       address
 #       forbidden
-#   
+#
 #    To optimize delivery of ViewVC static files:
-#   
+#
 #       docroot
-#   
+#
 #    To customize the display of ViewVC for your site:
-#   
+#
 #       template_dir
 #       the [templates] override section
 #
@@ -139,7 +139,7 @@ default_root = cvs
 # provided only as a convenience for ViewVC installations which are
 # using the default template set, where the value of this option will
 # be displayed in the footer of every ViewVC page.)
-address = 
+address =
 
 #
 # This option provides a mechanism for custom key/value pairs to be
@@ -244,21 +244,47 @@ cvsnt =
 
 # See also bin/cvsnt-rcsfile-inetd.pl
 
-#rcsfile_socket = 'host:port'
-# Example: rcsfile_socket = '127.0.0.1:8071'
+#rcsfile_socket = host:port
+# Example: rcsfile_socket = 127.0.0.1:8071
 
 # Subversion command-line client, used for viewing Subversion repositories
 svn =
 # svn = /usr/bin/svn
 
 # GNU diff, used for showing file version differences
-diff = 
+diff =
 # diff = /usr/bin/diff
 
 # CvsGraph, a graphical CVS version graph generator (see options.use_cvsgraph)
 cvsgraph =
 # cvsgraph = /usr/local/bin/cvsgraph
 
+# Apache Tika TCP server host and port, used to extract text from binary documents
+# Note that as of 2011-09-12, Tika 0.9 has a bug which causes it to hang when processing
+# MS Word documents in server mode, so you must use the fixed version, downloaded from:
+# http://wiki.4intra.net/public/tika-app-0.9-fix-TIKA709.jar
+# (mirror) http://code.google.com/p/mediawiki4intranet/downloads/detail?name=tika-app-0.9-fix-TIKA709.jar
+# Or apply the patch yourself and rebuild Tika from source; the patch is here:
+# https://issues.apache.org/jira/browse/TIKA-709
+# Tika server should be started with command 'java -jar tika-app-0.9.jar -p PORT -t -eutf-8'
+
+#tika_server = host:port
+# Example: tika_server = 127.0.0.1:8072
+
+# This lists MIME types that can be processed by Tika
+# You may change it if your Tika is newer than 0.9 and supports more formats
+# (note) *+xml examples: xhtml+xml, rss+xml, atom+xml, docbook+xml, rdf+xml
+tika_mime_types =
+    text/*
+    application/*+xml
+    application/xml
+    application/vnd.oasis.opendocument.*
+    application/vnd.openxmlformats
+    application/vnd.ms-*
+    application/msaccess
+    application/msword
+    application/pdf
+    application/rtf
 
 #---------------------------------------------------------------------------
 [options]
@@ -358,7 +384,7 @@ svn_ignore_mimetype = 0
 # directory ViewVC should consult for various things, including cached
 # remote authentication credentials.  If unset, Subversion will use
 # the default location(s) ($HOME/.subversion, etc.)
-svn_config_dir = 
+svn_config_dir =
 
 # use the rcsparse Python module to retrieve CVS repository
 # information instead of invoking rcs utilities [EXPERIMENTAL]
@@ -494,12 +520,18 @@ short_log_len = 80
 # should we colorize known file content syntaxes?  (requires Pygments module)
 enable_syntax_coloration = 1
 
+# detect_encoding: Should we attempt to detect versioned file
+# character encodings?  [Requires 'chardet' module]
+# Used in file list, file content display and indexing
+# See also options.encodings for naive guessing.
+detect_encoding = 1
+
 # Use CvsGraph. See http://www.akhphd.au.dk/~bertho/cvsgraph/ for
-# documentation and download. 
+# documentation and download.
 use_cvsgraph = 0
 #use_cvsgraph = 1
 
-# Location of the customized cvsgraph configuration file.  
+# Location of the customized cvsgraph configuration file.
 cvsgraph_conf = cvsgraph.conf
 
 #
@@ -544,6 +576,17 @@ use_pagesize = 0
 # Set to 0 to disable the limit.
 limit_changes = 100
 
+# You can also use primitive charset guessing instead of chardet (options.detect_encoding)
+# Just set this to the list of possible charsets in your repository.
+# ViewVC will simply try to decode content using each of them, and pick
+# the first which succeeds. UTF-8 is always tried automatically.
+#encodings = cp1251:iso-8859-1
+
+# This is also required for back-links from query results to files in CVS,
+# because CVS doesn't recode file names to UTF-8 as Subversion does.
+# Set it to cp1251 if you work with your CVS repository from Windows.
+#cvs_ondisk_charset = cp1251
+
 #---------------------------------------------------------------------------
 [templates]
 
@@ -554,7 +597,7 @@ limit_changes = 100
 # use a different template for a particular view, simply uncomment the
 # appropriate option below and specify the currect location of the EZT
 # template file you wish to use for that view.
-# 
+#
 # Templates are specified relative to the configured template
 # directory (see the "template_dir" option), but absolute paths may
 # also be used as well.
@@ -569,13 +612,13 @@ limit_changes = 100
 #diff = diff.ezt
 #directory = directory.ezt
 ### an alternative directory view
-#directory = dir_new.ezt   
+#directory = dir_new.ezt
 #error = error.ezt
 #file = file.ezt
 #graph = graph.ezt
 #log = log.ezt
 ### a table-based alternative log view
-#log = log_table.ezt  
+#log = log_table.ezt
 #query = query.ezt
 #query_form = query_form.ezt
 #query_results = query_results.ezt
@@ -588,22 +631,51 @@ limit_changes = 100
 # Set to 1 to enable the database integration feature, 0 otherwise.
 enabled = 0
 
-# Database hostname and port.
+# Set to 1 to enable indexing of file contents using Sphinx and Tika
+index_content = 0
+
+# Database hostname, port, and socket
 #host = localhost
 #port = 3306
+# On Debian Linux, enable this:
+#socket = /var/run/mysqld/mysqld.sock
 
 # ViewVC database name.
 #database_name = ViewVC
 
 # Username and password of user with read/write privileges to the ViewVC
 # database.
-#user = 
-#passwd = 
+#user =
+#passwd =
 
 # Username and password of user with read privileges to the ViewVC
 # database.
-#readonly_user = 
-#readonly_passwd = 
+#readonly_user =
+#readonly_passwd =
+
+# ViewVC can use Sphinx (http://sphinxsearch.com) full-text search engine
+# to index file contents with full history and then search over them.
+# Also, the Apache Tika console application can be used in TCP server mode to
+# add support for indexing binary documents (MS Word, PDF, etc.).
+# See tika_server in [utilities].
+# Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index.
+# Index must be created in sphinx.conf by hand and have the following fields:
+#  rt_field = content
+#  rt_attr_string = content
+#  rt_attr_string = mimetype
+#  rt_attr_timestamp = ci_when
+#  rt_attr_uint = whoid
+#  rt_attr_uint = repositoryid
+#  rt_attr_uint = dirid
+#  rt_attr_uint = fileid
+#  rt_attr_uint = revision
+#  rt_attr_uint = branchid
+
+# Sphinx connection parameters:
+#sphinx_host =
+#sphinx_port =
+#sphinx_socket = /var/run/sphinxql.sock
+#sphinx_index = viewvc
 
 # Limit the number of rows returned by a given query to this number.
 #row_limit = 1000
@@ -616,7 +688,7 @@ enabled = 0
 
 # Check if the repository is found in the database before showing
 # the query link and RSS feeds.  Set to 1 to enable check.
-# 
+#
 # WARNING: Enabling this check adds the cost of a database connection
 # and query to most ViewVC requests.  If all your roots are represented
 # in the commits database, or if you don't care about the creation of
@@ -640,7 +712,7 @@ enabled = 0
 #
 # ViewVC allows you to customize its configuration options for
 # individual virtual hosts.  You might, for example, wish to expose
-# all of your Subversion repositories at http://svn.yourdomain.com/viewvc/ 
+# all of your Subversion repositories at http://svn.yourdomain.com/viewvc/
 # and all your CVS ones at http://cvs.yourdomain.com/viewvc/, with no
 # cross-exposure.  Using ViewVC's virtual host (vhost) configuration
 # support, you can do this.  Simply create two vhost configurations
@@ -671,7 +743,7 @@ enabled = 0
 # gui = guiproject.yourdomain.*
 #
 # [vhost-libs/general]
-# cvs_roots = 
+# cvs_roots =
 # svn_roots = svnroot: /var/svn/libs-repos
 # default_root = svnroot
 #
@@ -680,7 +752,7 @@ enabled = 0
 #
 # [vhost-gui/general]
 # cvs_roots = cvsroot: /var/cvs/guiproject
-# svn_roots = 
+# svn_roots =
 # default_root = cvsroot
 #
 
@@ -697,7 +769,7 @@ enabled = 0
 #
 # Here is an example showing how to enable Subversion authz-based
 # authorization for only the single root named "svnroot":
-# 
+#
 # [root-svnroot/options]
 # authorizer = svnauthz
 #
@@ -726,7 +798,7 @@ enabled = 0
 #
 # Tests are case-sensitive.
 #
-# NOTE: Again, this is for the hiding of modules within repositories, *not* 
+# NOTE: Again, this is for the hiding of modules within repositories, *not*
 # for the hiding of repositories (roots) themselves.
 #
 # Some examples:
@@ -749,7 +821,7 @@ enabled = 0
 #    Allow "xml", forbid other modules starting with "x", and allow the rest:
 #       forbidden = !xml, x*, !*
 #
-forbidden = 
+forbidden =
 
 #---------------------------------------------------------------------------
 [authz-forbiddenre]
@@ -792,7 +864,7 @@ forbidden =
 #    Only allow visibility of HTML files and the directories that hold them:
 #       forbiddenre = !^([^/]+|.*(/|\.html))$
 #
-forbiddenre = 
+forbiddenre =
 
 #---------------------------------------------------------------------------
 [authz-svnauthz]
diff --git a/lib/config.py b/lib/config.py
index cdc04386..264d7ea4 100644
--- a/lib/config.py
+++ b/lib/config.py
@@ -24,6 +24,7 @@ import vclib.ccvs
 import vclib.svn
 import cvsdb
 import viewvc
+from viewvcmagic import ContentMagic
 
 #########################################################################
 #
@@ -47,6 +48,7 @@ class Config:
                         'root_parents', 'allowed_views', 'mime_types_files')
 
   def __init__(self):
+    self.__guesser = None
     for section in self._sections:
       setattr(self, section, _sub_config())
 
@@ -66,7 +68,6 @@ class Config:
     if rootname:
       self._process_root_options(self.parser, rootname)
     self.expand_root_parents()
-    cvsdb.setencs(self.options.encodings.split(':'))
     r = {}
     for i in self.rewritehtml.__dict__.keys():
       if i[-8:] == '.replace':
@@ -201,7 +202,7 @@ class Config:
           pass
         else:
           raise IllegalOverrideSection('root', section)
-          
+
   def overlay_root_options(self, rootname):
     "Overly per-root options atop the existing option set."
     if not self.conf_path:
@@ -217,7 +218,7 @@ class Config:
       for option in parser.options(section):
         d[option] = parser.get(section, option)
       return d.items()
-    
+
   def get_authorizer_params(self, authorizer, rootname=None):
     if not self.conf_path:
       return {}
@@ -236,7 +237,12 @@ class Config:
             params[key] = value
     params['__config'] = self
     return params
-  
+
+  def guesser(self):  # return the ContentMagic helper for this configuration
+    if not self.__guesser:  # created on first use from options.encodings, then reused
+      self.__guesser = ContentMagic(self.options.encodings)
+    return self.__guesser
+
   def set_defaults(self):
     "Set some default values in the configuration."
 
@@ -258,6 +264,8 @@ class Config:
     self.utilities.svn = ''
     self.utilities.diff = ''
     self.utilities.cvsgraph = ''
+    self.utilities.tika_server = ''
+    self.utilities.tika_mime_types = ''
 
     self.options.root_as_url_component = 1
     self.options.checkout_magic = 0
@@ -302,7 +310,7 @@ class Config:
     self.options.limit_changes = 100
     self.options.cvs_ondisk_charset = 'cp1251'
     self.options.binary_mime_re = '^(?!text/|.*\Wxml)'
-    self.options.encodings = 'utf-8:cp1251:iso-8859-1'
+    self.options.encodings = 'cp1251:iso-8859-1'
 
     self.templates.diff = None
     self.templates.directory = None
@@ -316,6 +324,7 @@ class Config:
     self.templates.roots = None
 
     self.cvsdb.enabled = 0
+    self.cvsdb.index_content = 0
     self.cvsdb.host = ''
     self.cvsdb.port = 3306
     self.cvsdb.socket = ''
@@ -323,12 +332,17 @@ class Config:
     self.cvsdb.user = ''
     self.cvsdb.passwd = ''
     self.cvsdb.readonly_user = ''
-    self.cvsdb.readonly_passwd = '' 
+    self.cvsdb.readonly_passwd = ''
     self.cvsdb.row_limit = 1000
     self.cvsdb.rss_row_limit = 100
     self.cvsdb.check_database_for_root = 0
     self.cvsdb.fulltext_min_relevance = 0.2
 
+    self.cvsdb.sphinx_host = ''
+    self.cvsdb.sphinx_port = 3307
+    self.cvsdb.sphinx_socket = ''
+    self.cvsdb.sphinx_index = ''
+
 def _startswith(somestr, substr):
   return somestr[:len(substr)] == substr
 
diff --git a/lib/cvsdb.py b/lib/cvsdb.py
index e5e7a7e1..a614c5e0 100644
--- a/lib/cvsdb.py
+++ b/lib/cvsdb.py
@@ -15,6 +15,7 @@ import sys
 import string
 import time
 import re
+import cgi
 
 import vclib
 import dbi
@@ -36,22 +37,12 @@ error = "cvsdb error"
 ## defined to actually be complete; it should run well off of any DBI 2.0
 ## complient database interface
 
-encs = [ "utf-8", "cp1251", "iso-8859-1" ]
-
-def utf8string(value):
-    for e in encs:
-        try:
-            value = value.decode(e)
-            break
-        except: pass
-    return value.encode("utf-8")
-
-def setencs(e):
-    global encs
-    encs = e
-
 class CheckinDatabase:
-    def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, authorizer = None):
+    def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
+                 authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
+                 sphinx_socket = None, sphinx_index = None):
+        self.cfg = cfg
+
         self._host = host
         self._port = port
         self._socket = socket
@@ -63,11 +54,21 @@ class CheckinDatabase:
         self._min_relevance = min_relevance
         self.authorizer = authorizer
 
+        # Sphinx settings
+        self.index_content = index_content
+        self.sphinx_host = sphinx_host
+        self.sphinx_port = sphinx_port
+        self.sphinx_socket = sphinx_socket
+        self.sphinx_index = sphinx_index
+
         ## database lookup caches
         self._get_cache = {}
         self._get_id_cache = {}
         self._desc_id_cache = {}
 
+        # Sphinx connection None by default
+        self.sphinx = None
+
     def Connect(self):
         self.db = dbi.connect(
             self._host, self._port, self._socket, self._user, self._passwd, self._database)
@@ -83,12 +84,17 @@ class CheckinDatabase:
         else:
             self._version = 0
         if self._version > CURRENT_SCHEMA_VERSION:
-           raise DatabaseVersionError("Database version %d is newer than the "
-                                      "last version supported by this "
-                                      "software." % (self._version))
+            raise DatabaseVersionError("Database version %d is newer than the "
+                                       "last version supported by this "
+                                       "software." % (self._version))
+        if self.index_content:
+            self.sphinx = dbi.connect(self.sphinx_host, self.sphinx_port, self.sphinx_socket, '', '', '')
+
+    def utf8(self, value):  # pass 'value' through the utf8() method of the object returned by cfg.guesser()
+        return self.cfg.guesser().utf8(value)
 
     def sql_get_id(self, table, column, value, auto_set):
-        value = utf8string(value)
+        value = self.utf8(value)
 
         sql = "SELECT id FROM %s WHERE %s=%%s" % (table, column)
         sql_args = (value, )
@@ -172,7 +178,7 @@ class CheckinDatabase:
 
         temp2[id] = value
         return value
-        
+
     def get_list(self, table, field_index):
         sql = "SELECT * FROM %s" % (table)
         cursor = self.db.cursor()
@@ -198,7 +204,7 @@ class CheckinDatabase:
                 break
             list.append(row[0])
         return list
-        
+
     def GetMetadataValue(self, name):
         sql = "SELECT value FROM metadata WHERE name=%s"
         sql_args = (name)
@@ -209,7 +215,7 @@ class CheckinDatabase:
         except TypeError:
             return None
         return value
-        
+
     def SetMetadataValue(self, name, value):
         assert(self._version > 0)
         sql = "REPLACE INTO metadata (name, value) VALUES (%s, %s)"
@@ -222,7 +228,7 @@ class CheckinDatabase:
                             "\tname  = %s\n"
                             "\tvalue = %s\n"
                             % (str(e), name, value))
-        
+
     def GetBranchID(self, branch, auto_set = 1):
         return self.get_id("branches", "branch", branch, auto_set)
 
@@ -240,13 +246,13 @@ class CheckinDatabase:
 
     def GetFile(self, id):
         return self.get("files", "file", id)
-    
+
     def GetAuthorID(self, author, auto_set = 1):
         return self.get_id("people", "who", author, auto_set)
 
     def GetAuthor(self, id):
         return self.get("people", "who", id)
-    
+
     def GetRepositoryID(self, repository, auto_set = 1):
         return self.get_id("repositories", "repository", repository, auto_set)
 
@@ -257,7 +263,7 @@ class CheckinDatabase:
         return self.get_list("repositories", repository)
 
     def SQLGetDescriptionID(self, description, auto_set = 1):
-        description = utf8string(description)
+        description = self.utf8(description)
         ## lame string hash, blame Netscape -JMP
         hash = len(description)
 
@@ -330,7 +336,7 @@ class CheckinDatabase:
             ci_when = cursor.fetchone()[0]
         except TypeError:
             return None
-        
+
         return dbi.TicksFromDateTime(ci_when)
 
     def AddCommitList(self, commit_list):
@@ -338,48 +344,55 @@ class CheckinDatabase:
             self.AddCommit(commit)
 
     def AddCommit(self, commit):
+        """Upsert `commit` into MySQL and, when enabled, index its content in Sphinx."""
-        ci_when = dbi.DateTimeFromTicks(commit.GetTime() or 0.0)
-        ci_type = commit.GetTypeString()
-        who_id = self.GetAuthorID(commit.GetAuthor())
-        repository_id = self.GetRepositoryID(commit.GetRepository())
-        directory_id = self.GetDirectoryID(commit.GetDirectory())
-        file_id = self.GetFileID(commit.GetFile())
-        revision = commit.GetRevision()
-        sticky_tag = "NULL"
-        branch_id = self.GetBranchID(commit.GetBranch())
-        plus_count = commit.GetPlusCount() or '0'
-        minus_count = commit.GetMinusCount() or '0'
-        description_id = self.GetDescriptionID(commit.GetDescription())
+        props = {
+            'type'         : commit.GetTypeString(),
+            'ci_when'      : dbi.DateTimeFromTicks(commit.GetTime() or 0.0),
+            'whoid'        : self.GetAuthorID(commit.GetAuthor()),
+            'repositoryid' : self.GetRepositoryID(commit.GetRepository()),
+            'dirid'        : self.GetDirectoryID(commit.GetDirectory()),
+            'fileid'       : self.GetFileID(commit.GetFile()),
+            'revision'     : commit.GetRevision(),
+            'branchid'     : self.GetBranchID(commit.GetBranch()),
+            'addedlines'   : commit.GetPlusCount() or '0',
+            'removedlines' : commit.GetMinusCount() or '0',
+            'descid'       : self.GetDescriptionID(commit.GetDescription()),
+        }
 
         commits_table = self._version >= 1 and 'commits' or 'checkins'
-        sql = "REPLACE INTO %s" % (commits_table)
-        sql = sql + \
-              "  (type,ci_when,whoid,repositoryid,dirid,fileid,revision,"\
-              "   stickytag,branchid,addedlines,removedlines,descid)"\
-              "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
-        sql_args = (ci_type, ci_when, who_id, repository_id,
-                    directory_id, file_id, revision, sticky_tag, branch_id,
-                    plus_count, minus_count, description_id)
 
         cursor = self.db.cursor()
         try:
-            cursor.execute(sql, sql_args)
+            # MySQL-specific INSERT-or-UPDATE with ID retrieval: LAST_INSERT_ID(id)
+            # makes cursor.lastrowid report the id of an already existing row too.
+            cursor.execute(
+                'INSERT INTO '+commits_table+'('+','.join(i for i in props)+') VALUES ('+
+                ', '.join('%s' for i in props)+') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), '+
+                ', '.join(i+'=VALUES('+i+')' for i in props),
+                tuple(props[i] for i in props)
+            )
+            commit_id = cursor.lastrowid
+            content = self.index_content and commit.GetContent()
+            if content:
+                # The Sphinx row carries ci_when as integer ticks and omits the
+                # line counts, the description id and the change type.
+                props['ci_when'] = str(int(commit.GetTime() or 0))
+                for unindexed in ('addedlines', 'removedlines', 'descid', 'type'):
+                    del props[unindexed]
+                props['id'] = str(commit_id)
+                props['content'] = content
+                # Now, stored MIME type is only needed while searching
+                # It is guessed again when the file is displayed
+                props['mimetype'] = commit.GetMimeType()
+                # REPLACE, not INSERT: commit_id may belong to a commit that is
+                # already in the index, and INSERT fails on a duplicate id.
+                self.sphinx.cursor().execute(
+                    'REPLACE INTO '+self.sphinx_index+'('+','.join(i for i in props)+') VALUES ('+
+                    ','.join('%s' for i in props)+')', tuple(props[i] for i in props))
         except Exception, e:
-            raise Exception("Error adding commit: '%s'\n"
-                            "Values were:\n"
-                            "\ttype         = %s\n"
-                            "\tci_when      = %s\n"
-                            "\twhoid        = %s\n"
-                            "\trepositoryid = %s\n"
-                            "\tdirid        = %s\n"
-                            "\tfileid       = %s\n"
-                            "\trevision     = %s\n"
-                            "\tstickytag    = %s\n"
-                            "\tbranchid     = %s\n"
-                            "\taddedlines   = %s\n"
-                            "\tremovedlines = %s\n"
-                            "\tdescid       = %s\n"
-                            % ((str(e), ) + sql_args))
+            print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+
+                "\n".join(i+'='+str(props[i]) for i in props))
+            raise
 
     def SQLQueryListString(self, field, query_entry_list):
         sqlList = []
@@ -414,6 +427,67 @@ class CheckinDatabase:
 
         return "(%s)" % (string.join(sqlList, " OR "))
 
+    def query_ids(self, in_field, table, id_field, name_field, lst):
+        # Translate the name filters in `lst` into "<in_field> IN (<ids>)" via the
+        # MySQL lookup `table`.  Returns None only when no filter was requested.
+        if not len(lst):
+            return None
+        cond = self.SQLQueryListString(name_field, lst)
+        cursor = self.db.cursor()
+        cursor.execute('SELECT %s FROM %s WHERE %s' % (id_field, table, cond))
+        # No matching name => "IN (0)", which matches nothing (assumes ids are auto-increment, >0)
+        return "%s IN (%s)" % (in_field, ','.join([str(row[0]) for row in cursor] or ['0']))
+
+    def CreateSphinxQueryString(self, query):
+        # Build the SphinxQL SELECT for a content search.  Name filters are
+        # resolved to ids through MySQL (query_ids); attribute names have to be
+        # the column names AddCommit inserts into the index.
+        condList = [
+            'MATCH(%s)' % (self.db.literal(query.content_query), ),
+            self.query_ids('repositoryid', 'repositories', 'id', 'repository', query.repository_list),
+            self.query_ids('branchid', 'branches', 'id', 'branch', query.branch_list),
+            self.query_ids('dirid', 'dirs', 'id', 'dir', query.directory_list),
+            self.query_ids('fileid', 'files', 'id', 'file', query.file_list),
+            # author ids are indexed as `whoid` (AddCommit never writes `authorid`)
+            self.query_ids('whoid', 'people', 'id', 'who', query.author_list),
+            # NOTE(review): AddCommit deletes `descid` before indexing -- confirm
+            # that the index actually provides this attribute
+            self.query_ids('descid', 'descs', 'id', 'description', query.comment_list),
+        ]
+        if len(query.revision_list):
+            condList.append("revision IN ("+','.join(self.db.literal(s) for s in query.revision_list)+")")
+        if query.from_date:
+            condList.append('ci_when>='+str(dbi.TicksFromDateTime(query.from_date)))
+        if query.to_date:
+            condList.append('ci_when<='+str(dbi.TicksFromDateTime(query.to_date)))
+
+        if query.sort == 'date':
+            order_by = 'ORDER BY `ci_when` DESC, `relevance` DESC'
+        elif query.sort == 'date_rev':
+            order_by = 'ORDER BY `ci_when` ASC, `relevance` DESC'
+        else: # /* if query.sort == 'relevance' */
+            order_by = 'ORDER BY `relevance` DESC'
+
+        conditions = string.join((i for i in condList if i), " AND ")
+        conditions = conditions and "WHERE %s" % conditions
+
+        ## limit the number of rows requested or we could really slam
+        ## a server with a large database
+        row_limit = query.limit or self._row_limit
+        limit = row_limit and "LIMIT %s" % (str(row_limit)) or ""
+        return "SELECT id `id`, WEIGHT() `relevance`, `content`, `mimetype` FROM %s %s %s %s" % (
+            self.sphinx_index, conditions, order_by, limit)
+
+    # Get commits by their IDs
+    def CreateIdQueryString(self, ids):
+        # `ids` must yield strings; the result selects every commit column
+        # plus the repository, directory and file names.
+        tbl = self._version >= 1 and 'commits' or 'checkins'
+        sql = ('SELECT %s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name'
+               ' FROM %s, repositories, dirs, files WHERE %s.id IN (%s) AND repositoryid=repositories.id'
+               ' AND dirid=dirs.id AND fileid=files.id')
+        return sql % (tbl, tbl, tbl, ','.join(ids))
+
     def CreateSQLQueryString(self, query):
         commits_table = self._version >= 1 and 'commits' or 'checkins'
         fields = [
@@ -427,7 +501,7 @@ class CheckinDatabase:
             ("dirs", "(%s.dirid=dirs.id)" % (commits_table)),
             ("files", "(%s.fileid=files.id)" % (commits_table))]
         condList = []
-        
+
         if len(query.text_query):
             tableList.append(("descs", "(descs.id=%s.descid)" % (commits_table)))
             temp = "MATCH (descs.description) AGAINST (%s" % (self.db.literal(query.text_query))
@@ -435,6 +509,7 @@ class CheckinDatabase:
             fields.append("%s) AS relevance" % temp)
         else:
             fields.append("'' AS relevance")
+        fields.append("'' AS snippet")
 
         if len(query.repository_list):
             temp = self.SQLQueryListString("repositories.repository",
@@ -478,16 +553,18 @@ class CheckinDatabase:
             temp = "(%s.ci_when<=\"%s\")" % (commits_table, str(query.to_date))
             condList.append(temp)
 
-        if query.sort == "date":
-            order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
+        if query.sort == "relevance" and len(query.text_query):
+            order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
+        elif query.sort == "date_rev":
+            order_by = "ORDER BY %s.ci_when ASC,descid,%s.repositoryid" % (commits_table, commits_table)
         elif query.sort == "author":
             tableList.append(("people", "(%s.whoid=people.id)" % (commits_table)))
             order_by = "ORDER BY people.who,descid,%s.repositoryid" % (commits_table)
         elif query.sort == "file":
             tableList.append(("files", "(%s.fileid=files.id)" % (commits_table)))
             order_by = "ORDER BY files.file,descid,%s.repositoryid" % (commits_table)
-        elif query.sort == "relevance" and len(query.text_query):
-            order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
+        else: # /* if query.sort == "date": */
+            order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
 
         ## exclude duplicates from the table list, and split out join
         ## conditions from table names.  In future, the join conditions
@@ -517,7 +594,7 @@ class CheckinDatabase:
             fields, tables, conditions, order_by, limit)
 
         return sql
-    
+
     def check_commit_access(self, repos, dir, file, rev):
         if self.authorizer:
             rootname = repos.split('/')
@@ -528,19 +605,60 @@ class CheckinDatabase:
         return True
 
     def RunQuery(self, query):
-        sql = self.CreateSQLQueryString(query)
-        cursor = self.db.cursor()
-        cursor.execute(sql)
+        if len(query.content_query) and self.sphinx:
+            # Use Sphinx to search on document content
+            sql = self.CreateSphinxQueryString(query)
+            cursor = self.sphinx.cursor()
+            cursor.execute(sql)
+            sphinx_rows = list((str(docid), rel, content, mimetype) for docid, rel, content, mimetype in cursor)
+            if len(sphinx_rows):
+                # Fetch snippets
+                snippet_options = {
+                    'around': 15,
+                    'limit': 200,
+                    'before_match': '<span style="color:red">',
+                    'after_match': '</span>',
+                    'chunk_separator': ' ... ',
+                }
+                preformatted_mime = 'text/(?!html|xml).*'
+                snippets = {}
+                bm_html = cgi.escape(snippet_options['before_match'])
+                am_html = cgi.escape(snippet_options['after_match'])
+                for docid, rel, content, mimetype in sphinx_rows:
+                    cursor.execute(
+                        'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
+                        (content, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
+                    )
+                    s, = cursor.fetchone()
+                    s = cgi.escape(s)
+                    if re.match(preformatted_mime, mimetype):
+                        s = s.replace('\n', '<br />')
+                    s = s.replace(bm_html, snippet_options['before_match'])
+                    s = s.replace(am_html, snippet_options['after_match'])
+                    snippets[docid] = s
+                # Fetch all fields from MySQL
+                sql = self.CreateIdQueryString((docid for (docid, _, _, _) in sphinx_rows))
+                cursor = self.db.cursor()
+                cursor.execute(sql)
+                byid = {}
+                for row in cursor:
+                    byid[str(row[0])] = row
+                rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _, _) in sphinx_rows if docid in byid)
+            else:
+                rows = []
+        else:
+            # Use regular queries when document content is not searched
+            sql = self.CreateSQLQueryString(query)
+            cursor = self.db.cursor()
+            cursor.execute(sql)
+            rows = list(cursor)
 
-        while 1:
-            row = cursor.fetchone()
-            if not row:
-                break
-
-            (dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
+        # Convert rows to commit objects
+        for row in rows:
+            (dbId, dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
              dbFileID, dbRevision, dbStickyTag, dbBranchID, dbAddedLines,
              dbRemovedLines, dbDescID, dbRepositoryName, dbDirName,
-             dbFileName, dbRelevance) = row
+             dbFileName, dbRelevance, dbSnippet) = row
 
             if not self.check_commit_access(dbRepositoryName, dbDirName, dbFileName, dbRevision):
                 continue
@@ -564,6 +682,7 @@ class CheckinDatabase:
             commit.SetMinusCount(dbRemovedLines)
             commit.SetDescriptionID(dbDescID)
             commit.SetRelevance(dbRelevance)
+            commit.SetSnippet(dbSnippet)
 
             query.AddCommit(commit)
 
@@ -623,46 +742,21 @@ class CheckinDatabase:
             raise UnknownRepositoryError("Unknown repository '%s'"
                                          % (repository))
 
-        if (self._version >= 1):
-            self.sql_delete('repositories', 'id', rep_id)
-            self.sql_purge('commits', 'repositoryid', 'id', 'repositories')
-            self.sql_purge('files', 'id', 'fileid', 'commits')
-            self.sql_purge('dirs', 'id', 'dirid', 'commits')
-            self.sql_purge('branches', 'id', 'branchid', 'commits')
-            self.sql_purge('descs', 'id', 'descid', 'commits')
-            self.sql_purge('people', 'id', 'whoid', 'commits')
-        else:
-            sql = "SELECT * FROM checkins WHERE repositoryid=%s"
-            sql_args = (rep_id, )
-            cursor = self.db.cursor()
-            cursor.execute(sql, sql_args)
-            checkins = []
-            while 1:
-                try:
-                    (ci_type, ci_when, who_id, repository_id,
-                     dir_id, file_id, revision, sticky_tag, branch_id,
-                     plus_count, minus_count, description_id) = \
-                     cursor.fetchone()
-                except TypeError:
-                    break
-                checkins.append([file_id, dir_id, branch_id,
-                                 description_id, who_id])
-
-            #self.sql_delete('repositories', 'id', rep_id)
-            self.sql_delete('checkins', 'repositoryid', rep_id)
-            for checkin in checkins:
-                self.sql_delete('files', 'id', checkin[0], 'fileid')
-                self.sql_delete('dirs', 'id', checkin[1], 'dirid')
-                self.sql_delete('branches', 'id', checkin[2], 'branchid')
-                self.sql_delete('descs', 'id', checkin[3], 'descid')
-                self.sql_delete('people', 'id', checkin[4], 'whoid')
+        checkins_table = self._version >= 1 and 'commits' or 'checkins'
+        self.sql_delete('repositories', 'id', rep_id)
+        self.sql_purge(checkins_table, 'repositoryid', 'id', 'repositories')
+        self.sql_purge('files', 'id', 'fileid', checkins_table)
+        self.sql_purge('dirs', 'id', 'dirid', checkins_table)
+        self.sql_purge('branches', 'id', 'branchid', checkins_table)
+        self.sql_purge('descs', 'id', 'descid', checkins_table)
+        self.sql_purge('people', 'id', 'whoid', checkins_table)
 
         # Reset all internal id caches.  We could be choosier here,
         # but let's just be as safe as possible.
         self._get_cache = {}
         self._get_id_cache = {}
         self._desc_id_cache = {}
-        
+
 
 class DatabaseVersionError(Exception):
     pass
@@ -678,7 +772,7 @@ class Commit:
     CHANGE = 0
     ADD = 1
     REMOVE = 2
-    
+
     def __init__(self):
         self.__directory = ''
         self.__file = ''
@@ -690,15 +784,20 @@ class Commit:
         self.__minuscount = ''
         self.__description = ''
         self.__relevance = ''
+        self.__snippet = ''
         self.__gmt_time = 0.0
         self.__type = Commit.CHANGE
+        self.__content = ''
+        self.__mimetype = ''
+        self.__base_path = ''
+        self.__base_rev = ''
 
     def SetRepository(self, repository):
         self.__repository = repository
 
     def GetRepository(self):
         return self.__repository
-        
+
     def SetDirectory(self, dir):
         self.__directory = dir
 
@@ -710,7 +809,7 @@ class Commit:
 
     def GetFile(self):
         return self.__file
-        
+
     def SetRevision(self, revision):
         self.__revision = revision
 
@@ -758,12 +857,19 @@ class Commit:
     def GetDescription(self):
         return self.__description
 
+    # Relevance and snippet are used when querying commit database
     def SetRelevance(self, relevance):
         self.__relevance = relevance
 
     def GetRelevance(self):
         return self.__relevance
 
+    def SetSnippet(self, snippet):  # RunQuery stores '' for plain SQL hits, highlighted HTML for Sphinx hits
+        self.__snippet = snippet
+
+    def GetSnippet(self):  # '' until SetSnippet is called
+        return self.__snippet
+
     def SetTypeChange(self):
         self.__type = Commit.CHANGE
 
@@ -784,66 +890,80 @@ class Commit:
         elif self.__type == Commit.REMOVE:
             return 'Remove'
 
+    # File content (extracted text), optional, indexed with Sphinx
+    def SetContent(self, content):  # read back by CheckinDatabase.AddCommit when content indexing is on
+        self.__content = content
+
+    def GetContent(self):  # '' (the default) makes AddCommit skip the Sphinx insert
+        return self.__content
+
+    # MIME type, optional, now only stored in Sphinx
+    def SetMimeType(self, mimetype):  # AddCommit writes it to the Sphinx index next to the content
+        self.__mimetype = mimetype
+
+    def GetMimeType(self):  # '' until SetMimeType is called
+        return self.__mimetype
+
 ## LazyCommit overrides a few methods of Commit to only retrieve
 ## it's properties as they are needed
 class LazyCommit(Commit):
-  def __init__(self, db):
-    Commit.__init__(self)
-    self.__db = db
+    def __init__(self, db):  # db: object whose Get*() methods resolve the ids stored below
+        Commit.__init__(self)
+        self.__db = db
 
-  def SetFileID(self, dbFileID):
-    self.__dbFileID = dbFileID
+    def SetFileID(self, dbFileID):
+        self.__dbFileID = dbFileID
 
-  def GetFileID(self):
-    return self.__dbFileID
+    def GetFileID(self):  # AttributeError if SetFileID was never called (__init__ sets no default); same for the other *ID getters
+        return self.__dbFileID
 
-  def GetFile(self):
-    return self.__db.GetFile(self.__dbFileID)
+    def GetFile(self):  # asks db on every call instead of storing the name
+        return self.__db.GetFile(self.__dbFileID)
 
-  def SetDirectoryID(self, dbDirID):
-    self.__dbDirID = dbDirID
+    def SetDirectoryID(self, dbDirID):
+        self.__dbDirID = dbDirID
 
-  def GetDirectoryID(self):
-    return self.__dbDirID
+    def GetDirectoryID(self):
+        return self.__dbDirID
 
-  def GetDirectory(self):
-    return self.__db.GetDirectory(self.__dbDirID)
+    def GetDirectory(self):  # asks db on every call
+        return self.__db.GetDirectory(self.__dbDirID)
 
-  def SetRepositoryID(self, dbRepositoryID):
-    self.__dbRepositoryID = dbRepositoryID
+    def SetRepositoryID(self, dbRepositoryID):
+        self.__dbRepositoryID = dbRepositoryID
 
-  def GetRepositoryID(self):
-    return self.__dbRepositoryID
+    def GetRepositoryID(self):
+        return self.__dbRepositoryID
 
-  def GetRepository(self):
-    return self.__db.GetRepository(self.__dbRepositoryID)
+    def GetRepository(self):  # asks db on every call
+        return self.__db.GetRepository(self.__dbRepositoryID)
 
-  def SetAuthorID(self, dbAuthorID):
-    self.__dbAuthorID = dbAuthorID
+    def SetAuthorID(self, dbAuthorID):
+        self.__dbAuthorID = dbAuthorID
 
-  def GetAuthorID(self):
-    return self.__dbAuthorID
+    def GetAuthorID(self):
+        return self.__dbAuthorID
 
-  def GetAuthor(self):
-    return self.__db.GetAuthor(self.__dbAuthorID)
+    def GetAuthor(self):  # asks db on every call
+        return self.__db.GetAuthor(self.__dbAuthorID)
 
-  def SetBranchID(self, dbBranchID):
-    self.__dbBranchID = dbBranchID
+    def SetBranchID(self, dbBranchID):
+        self.__dbBranchID = dbBranchID
 
-  def GetBranchID(self):
-    return self.__dbBranchID
+    def GetBranchID(self):
+        return self.__dbBranchID
 
-  def GetBranch(self):
-    return self.__db.GetBranch(self.__dbBranchID)
+    def GetBranch(self):  # asks db on every call
+        return self.__db.GetBranch(self.__dbBranchID)
 
-  def SetDescriptionID(self, dbDescID):
-    self.__dbDescID = dbDescID
+    def SetDescriptionID(self, dbDescID):
+        self.__dbDescID = dbDescID
 
-  def GetDescriptionID(self):
-    return self.__dbDescID
+    def GetDescriptionID(self):
+        return self.__dbDescID
 
-  def GetDescription(self):
-    return self.__db.GetDescription(self.__dbDescID)
+    def GetDescription(self):  # asks db on every call
+        return self.__db.GetDescription(self.__dbDescID)
 
 ## QueryEntry holds data on one match-type in the SQL database
 ## match is: "exact", "like", or "regex"
@@ -858,8 +978,8 @@ class CheckinDatabaseQuery:
     def __init__(self):
         ## sorting
         self.sort = "date"
-        
-        ## repository to query
+
+        ## repository, branch, etc to query
         self.repository_list = []
         self.branch_list = []
         self.directory_list = []
@@ -867,7 +987,11 @@ class CheckinDatabaseQuery:
         self.revision_list = []
         self.author_list = []
         self.comment_list = []
+
+        ## text_query = Fulltext query on comments
+        ## content_query = Fulltext query on content
         self.text_query = ""
+        self.content_query = ""
 
         ## date range in DBI 2.0 timedate objects
         self.from_date = None
@@ -886,6 +1010,9 @@ class CheckinDatabaseQuery:
     def SetTextQuery(self, query):
         self.text_query = query
 
+    def SetContentQuery(self, query):  # a non-empty value makes CheckinDatabase.RunQuery search via Sphinx when it is connected
+        self.content_query = query
+
     def SetRepository(self, repository, match = "exact"):
         self.repository_list.append(QueryEntry(repository, match))
 
@@ -921,7 +1048,7 @@ class CheckinDatabaseQuery:
     def SetFromDateHoursAgo(self, hours_ago):
         ticks = time.time() - (3600 * hours_ago)
         self.from_date = dbi.DateTimeFromTicks(ticks)
-        
+
     def SetFromDateDaysAgo(self, days_ago):
         ticks = time.time() - (86400 * days_ago)
         self.from_date = dbi.DateTimeFromTicks(ticks)
@@ -942,7 +1069,7 @@ class CheckinDatabaseQuery:
 ##
 def CreateCommit():
     return Commit()
-    
+
 def CreateCheckinQuery():
     return CheckinDatabaseQuery()
 
@@ -953,9 +1080,23 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0):
     else:
         user = cfg.cvsdb.user
         passwd = cfg.cvsdb.passwd
-    db = CheckinDatabase(cfg.cvsdb.host, cfg.cvsdb.port, cfg.cvsdb.socket, user, passwd,
-                         cfg.cvsdb.database_name, cfg.cvsdb.row_limit, cfg.cvsdb.fulltext_min_relevance,
-                         authorizer)
+    db = CheckinDatabase(
+        host = cfg.cvsdb.host,
+        port = cfg.cvsdb.port,
+        socket = cfg.cvsdb.socket,
+        user = user,
+        passwd = passwd,
+        database = cfg.cvsdb.database_name,
+        row_limit = cfg.cvsdb.row_limit,
+        min_relevance = cfg.cvsdb.fulltext_min_relevance,
+        authorizer = authorizer,
+        index_content = cfg.cvsdb.index_content,
+        sphinx_host = cfg.cvsdb.sphinx_host,
+        sphinx_port = int(cfg.cvsdb.sphinx_port),
+        sphinx_socket = cfg.cvsdb.sphinx_socket,
+        sphinx_index = cfg.cvsdb.sphinx_index,
+        cfg = cfg,
+    )
     db.Connect()
     return db
 
diff --git a/lib/vclib/ccvs/bincvs.py b/lib/vclib/ccvs/bincvs.py
index c2b9430d..fee243fb 100644
--- a/lib/vclib/ccvs/bincvs.py
+++ b/lib/vclib/ccvs/bincvs.py
@@ -31,8 +31,8 @@ import popen
 class BaseCVSRepository(vclib.Repository):
   def __init__(self, name, rootpath, authorizer, utilities):
     if not os.path.isdir(rootpath):
-      raise vclib.ReposNotFound(name) 
-   
+      raise vclib.ReposNotFound(name)
+
     self.name = name
     self.rootpath = rootpath
     self.auth = authorizer
@@ -53,7 +53,7 @@ class BaseCVSRepository(vclib.Repository):
 
   def authorizer(self):
     return self.auth
-  
+
   def itemtype(self, path_parts, rev):
     basepath = self._getpath(path_parts)
     kind = None
@@ -74,12 +74,12 @@ class BaseCVSRepository(vclib.Repository):
   def itemprops(self, path_parts, rev):
     self.itemtype(path_parts, rev)  # does auth-check
     return {}  # CVS doesn't support properties
-  
+
   def listdir(self, path_parts, rev, options):
     if self.itemtype(path_parts, rev) != vclib.DIR:  # does auth-check
       raise vclib.Error("Path '%s' is not a directory."
                         % (string.join(path_parts, "/")))
-    
+
     # Only RCS files (*,v) and subdirs are returned.
     data = [ ]
     full_name = self._getpath(path_parts)
@@ -115,7 +115,7 @@ class BaseCVSRepository(vclib.Repository):
           data.append(CVSDirEntry(name, kind, errors, 1))
 
     return data
-    
+
   def _getpath(self, path_parts):
     return apply(os.path.join, (self.rootpath,) + tuple(path_parts))
 
@@ -177,7 +177,7 @@ class BinCVSRepository(BaseCVSRepository):
     used_rlog = 0
     tip_rev = None  # used only if we have to fallback to using rlog
 
-    fp = self.rcs_popen('co', (rev_flag, full_name), 'rb') 
+    fp = self.rcs_popen('co', (rev_flag, full_name), 'rb')
     try:
       filename, revision = _parse_co_header(fp)
     except COMissingRevision:
@@ -191,14 +191,14 @@ class BinCVSRepository(BaseCVSRepository):
         used_rlog = 1
       if not tip_rev:
         raise vclib.Error("Unable to find valid revision")
-      fp = self.rcs_popen('co', ('-p' + tip_rev.string, full_name), 'rb') 
+      fp = self.rcs_popen('co', ('-p' + tip_rev.string, full_name), 'rb')
       filename, revision = _parse_co_header(fp)
-      
+
     if filename is None:
       # CVSNT's co exits without any output if a dead revision is requested.
       # Bug at http://www.cvsnt.org/cgi-bin/bugzilla/show_bug.cgi?id=190
       # As a workaround, we invoke rlog to find the first non-dead revision
-      # that precedes it and check out that revision instead.  Of course, 
+      # that precedes it and check out that revision instead.  Of course,
       # if we've already invoked rlog above, we just reuse its output.
       if not used_rlog:
         tip_rev = self._get_tip_revision(full_name + ',v', rev)
@@ -207,7 +207,7 @@ class BinCVSRepository(BaseCVSRepository):
         raise vclib.Error(
           'Could not find non-dead revision preceding "%s"' % rev)
       fp = self.rcs_popen('co', ('-p' + tip_rev.undead.string,
-                                 full_name), 'rb') 
+                                 full_name), 'rb')
       filename, revision = _parse_co_header(fp)
 
     if filename is None:
@@ -278,7 +278,7 @@ class BinCVSRepository(BaseCVSRepository):
     if self.itemtype(path_parts, rev) != vclib.FILE:  # does auth-check
       raise vclib.Error("Path '%s' is not a file."
                         % (string.join(path_parts, "/")))
-    
+
     # Invoke rlog
     rcsfile = self.rcsfile(path_parts, 1)
     if rev and options.get('cvs_pass_rev', 0):
@@ -341,7 +341,7 @@ class BinCVSRepository(BaseCVSRepository):
 
   def revinfo(self, rev):
     raise vclib.UnsupportedFeature
-  
+
   def rawdiff(self, path_parts1, rev1, path_parts2, rev2, type, options={}):
     """see vclib.Repository.rawdiff docstring
 
@@ -439,9 +439,9 @@ def _match_revs_tags(revlist, taglist):
       example: if revision is 1.2.3.4, parent is 1.2
 
     "undead"
-      If the revision is dead, then this is a reference to the first 
+      If the revision is dead, then this is a reference to the first
       previous revision which isn't dead, otherwise it's a reference
-      to itself. If all the previous revisions are dead it's None. 
+      to itself. If all the previous revisions are dead it's None.
 
     "branch_number"
       tuple representing branch number or empty tuple if on trunk
@@ -653,7 +653,7 @@ def _parse_co_header(fp):
       pass
     else:
       break
-    
+
   raise COMalformedOutput, "Unable to find revision in co output stream"
 
 # if your rlog doesn't use 77 '=' characters, then this must change
@@ -674,7 +674,7 @@ _EOF_ERROR = 'error message found'      # rlog issued an error
 #   ^rlog\: (.*)(?:\:\d+)?\: (.*)$
 #
 # But for some reason the windows version of rlog omits the "rlog: " prefix
-# for the first error message when the standard error stream has been 
+# for the first error message when the standard error stream has been
 # redirected to a file or pipe. (the prefix is present in subsequent errors
 # and when rlog is run from the console). So the expression below is more
 # complicated
@@ -703,7 +703,7 @@ def _parse_log_header(fp):
   Returns: filename, default branch, tag dictionary, lock dictionary,
   rlog error message, and eof flag
   """
-  
+
   filename = head = branch = msg = ""
   taginfo = { }   # tag name => number
   lockinfo = { }  # revision => locker
@@ -732,7 +732,7 @@ def _parse_log_header(fp):
       else:
         # oops. this line isn't lock info. stop parsing tags.
         state = 0
-      
+
     if state == 0:
       if line[:9] == 'RCS file:':
         filename = line[10:-1]
@@ -902,7 +902,7 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter):
     except ValueError:
       view_tag = None
     else:
-      tags.append(view_tag)  
+      tags.append(view_tag)
 
   # Match up tags and revisions
   _match_revs_tags(revs, tags)
@@ -910,13 +910,13 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter):
   # Match up lockinfo and revision
   for rev in revs:
     rev.lockinfo = lockinfo.get(rev.string)
-      
+
   # Add artificial ViewVC tag HEAD, which acts like a non-branch tag pointing
   # at the latest revision on the MAIN branch. The HEAD revision doesn't have
   # anything to do with the "head" revision number specified in the RCS file
   # and in rlog output. HEAD refers to the revision that the CVS and RCS co
   # commands will check out by default, whereas the "head" field just refers
-  # to the highest revision on the trunk.  
+  # to the highest revision on the trunk.
   taginfo['HEAD'] = _add_tag('HEAD', taginfo['MAIN'].co_rev)
 
   # Determine what revisions to return
@@ -954,7 +954,7 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter):
       _remove_tag(view_tag)
   else:
     filtered_revs = revs
-  
+
   return filtered_revs
 
 def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
@@ -1004,7 +1004,7 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
         = _parse_log_header(rlog)
 
       if eof == _EOF_LOG:
-        # the rlog output ended early. this can happen on errors that rlog 
+        # the rlog output ended early. this can happen on errors that rlog
         # thinks are so serious that it stops parsing the current file and
         # refuses to parse any of the files that come after it. one of the
         # errors that triggers this obnoxious behavior looks like:
@@ -1052,8 +1052,8 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
         tag = None
 
       # we don't care about the specific values -- just the keys and whether
-      # the values point to branches or revisions. this the fastest way to 
-      # merge the set of keys and keep values that allow us to make the 
+      # the values point to branches or revisions. this is the fastest way to
+      # merge the set of keys and keep values that allow us to make the
       # distinction between branch tags and normal tags
       alltags.update(taginfo)
 
@@ -1098,7 +1098,7 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs):
         file.dead = 0
         #file.errors.append("No revisions exist on %s" % (view_tag or "MAIN"))
         file.absent = 1
-        
+
       # done with this file now, skip the rest of this file's revisions
       if not eof:
         _skip_file(rlog)
@@ -1211,7 +1211,7 @@ def _newest_file(dirpath):
   newest_time = 0
 
   ### FIXME:  This sucker is leaking unauthorized paths! ###
-  
+
   for subfile in os.listdir(dirpath):
     ### filter CVS locks? stale NFS handles?
     if subfile[-2:] != ',v':
diff --git a/lib/viewvc.py b/lib/viewvc.py
index 84f5db9a..44a93a5d 100644
--- a/lib/viewvc.py
+++ b/lib/viewvc.py
@@ -1,4 +1,3 @@
-#
 # Copyright (C) 1999-2009 The ViewCVS Group. All Rights Reserved.
 #
 # By using this file, you agree to the terms and conditions set forth in
@@ -68,7 +67,6 @@ docroot_magic_path = '*docroot*'
 viewcvs_mime_type = 'text/vnd.viewcvs-markup'
 alt_mime_type = 'text/x-cvsweb-markup'
 view_roots_magic = '*viewroots*'
-magic_buf_size = 4096
 default_mime_type = 'application/octet-stream'
 
 # Put here the variables we need in order to hold our state - they
@@ -121,9 +119,8 @@ class Request:
     # check for an authenticated username
     self.username = server.getenv('REMOTE_USER')
 
-    # construct MIME magic
-    self.ms = None
-    self.ms_fail = 0
+    # repository object cache
+    self.all_repos = {}
 
     # if we allow compressed output, see if the client does too
     self.gzip_compress_level = 0
@@ -134,6 +131,9 @@ class Request:
                               string.split(http_accept_encoding, ","))):
         self.gzip_compress_level = 9  # make this configurable?
 
+  def utf8(self, value):
+    return self.cfg.guesser().utf8(value)  # convert via the cfg.guesser() object (presumably a viewvcmagic.ContentMagic -- confirm in config.py)
+
   def create_repos(self, rootname):
     if not rootname:
       return None
@@ -677,7 +677,7 @@ def _validate_mimetype(value):
   return value in (viewcvs_mime_type, alt_mime_type, 'text/plain')
 
 # obvious things here. note that we don't need uppercase for alpha.
-_re_validate_alpha = re.compile('^[a-z]+$')
+_re_validate_alpha = re.compile('^[a-z_]+$')
 _re_validate_number = re.compile('^[0-9]+$')
 _re_validate_boolint = re.compile('^[01]$')
 
@@ -743,6 +743,7 @@ _legal_params = {
   'who_match'     : _re_validate_alpha,
   'comment'       : None,
   'comment_match' : _re_validate_alpha,
+  'search_content': None,
   'querysort'     : _re_validate_alpha,
   'date'          : _re_validate_alpha,
   'hours'         : _re_validate_number,
@@ -988,7 +989,7 @@ def nav_path(request):
     is_last = len(path_parts) == len(request.path_parts)
 
     if request.roottype == 'cvs':
-      item = _item(name=cvsdb.utf8string(part), href=None)
+      item = _item(name=request.utf8(part), href=None)
     else:
       item = _item(name=part, href=None)
 
@@ -1248,7 +1249,7 @@ def common_template_data(request, revision=None, mime_type=None):
   cfg = request.cfg
   where = request.where
   if request.roottype == 'cvs':
-    where = cvsdb.utf8string(where)
+    where = request.utf8(where)
   where = request.server.escape(where)
 
   # Initialize data dictionary members (sorted alphanumerically)
@@ -1444,28 +1445,31 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
                                 get_lexer_for_mimetype, \
                                 get_lexer_for_filename
     from pygments.lexers._mapping import LEXERS
+    # Hack for shell mime types:
     LEXERS['BashLexer'] = ('pygments.lexers.other', 'Bash', ('bash', 'sh'), ('*.sh',), ('application/x-sh', 'application/x-shellscript', 'text/x-sh', 'text/x-shellscript'))
-    encoding = 'guess'
-    if cfg.options.detect_encoding:
-      try:
-        import chardet
-        encoding = 'chardet'
-      except (SyntaxError, ImportError):
-        pass
     try:
       lexer = get_lexer_for_mimetype(mime_type,
-                                     encoding=encoding,
+                                     encoding='utf-8',
                                      stripnl=False)
     except ClassNotFound:
       try:
         lexer = get_lexer_for_filename(filename,
-                                       encoding=encoding,
+                                       encoding='utf-8',
                                        stripnl=False)
       except ClassNotFound:
         use_pygments = 0
   except ImportError:
     use_pygments = 0
 
+  # Detect encoding by calling chardet ourselves,
+  # to support it in non-highlighting mode
+  content = fp.read()
+  c, encoding = cfg.guesser().guess_charset(content)
+  if encoding:
+    content = c
+  else:
+    encoding = 'unknown'
+
   # If we aren't going to be highlighting anything, just return the
   # BLAME_SOURCE.  If there's no blame_source, we'll generate a fake
   # one from the file contents we fetch with PATH and REV.
@@ -1475,11 +1479,7 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
     else:
       lines = []
       line_no = 0
-      while 1:
-        line = fp.readline()
-        if not line:
-          break
-        line = cvsdb.utf8string(line)
+      for line in content.split('\n'):
         line_no = line_no + 1
         item = vclib.Annotation(cgi.escape(line), line_no,
                                 None, None, None, None)
@@ -1508,19 +1508,11 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
         self.blame_data.append(item)
       self.line_no = self.line_no + 1
   ps = PygmentsSink(blame_source)
-  fpd = fp.read()
-  try:
-    fpdat = unicode(fpd,'utf-8')
-  except:
-    try:
-      fpdat = unicode(fpd,'cp1251')
-    except:
-      fpdat = fpd
-  highlight(fpdat, lexer,
+  highlight(content, lexer,
             HtmlFormatter(nowrap=True,
                           classprefix='pygments-',
                           encoding='utf-8'), ps)
-  return ps.blame_data
+  return ps.blame_data, encoding
 
 def make_time_string(date, cfg):
   """Returns formatted date string in either local time or UTC.
@@ -1594,6 +1586,7 @@ def calculate_mime_type(request, path_parts, rev):
         return mime_type
     except:
       pass
+  # FIXME rewrite to use viewvcmagic
   return guess_mime(path_parts[-1])
 
 def markup_or_annotate(request, is_annotate):
@@ -1605,21 +1598,12 @@ def markup_or_annotate(request, is_annotate):
   mime_type = calculate_mime_type(request, path, rev)
 
   if not mime_type or mime_type == default_mime_type:
-    if request.ms is None and not request.ms_fail:
-      try:
-        import magic
-        request.ms = magic.open(magic.MAGIC_NONE | magic.MAGIC_MIME)
-        request.ms.load()
-      except:
-        request.ms_fail = 1
-    if request.ms:
-      try:
-        fp, revision = request.repos.openfile(path, rev)
-        buffer = fp.read(magic_buf_size)
-        fp.close()
-        mime_type = request.ms.buffer(buffer)
-      except:
-        pass
+    try:
+      fp, revision = request.repos.openfile(path, rev)
+      mime_type = request.cfg.guesser().guess_mime(None, None, fp)
+      fp.close()
+    except:
+      raise
 
   # Is this a binary type?
   if is_binary(request.cfg, mime_type):
@@ -1657,9 +1641,10 @@ def markup_or_annotate(request, is_annotate):
     if check_freshness(request, None, revision, weak=1):
       fp.close()
       return
-    lines = markup_stream_pygments(request, cfg, blame_source, fp,
-                                   path[-1], mime_type)
+    lines, charset = markup_stream_pygments(request, cfg, blame_source, fp, path[-1], mime_type)
     fp.close()
+    if mime_type.find(';') < 0:
+      mime_type = mime_type+'; charset='+charset
 
   data = common_template_data(request, revision)
   data.merge(ezt.TemplateData({
@@ -1910,7 +1895,7 @@ def view_directory(request):
       row.short_log = format_log(file.log, cfg)
       row.log = htmlify(file.log, cfg.options.mangle_email_addresses)
     row.lockinfo = file.lockinfo
-    row.name = request.server.escape(cvsdb.utf8string(file.name))
+    row.name = request.server.escape(request.utf8(file.name))
     row.anchor = row.name
     row.pathtype = (file.kind == vclib.FILE and 'file') or \
                    (file.kind == vclib.DIR and 'dir')
@@ -2285,7 +2270,7 @@ def view_log(request):
       entry.ago = html_time(request, rev.date, 1)
     entry.log = rev.log or ""
     if cvs:
-      entry.log = cvsdb.utf8string(entry.log)
+      entry.log = request.utf8(entry.log)
     entry.log = htmlify(entry.log, cfg.options.mangle_email_addresses)
     entry.size = rev.size
     entry.lockinfo = rev.lockinfo
@@ -2770,7 +2755,7 @@ class DiffSource:
     self.save_line = None
     self.line_number = None
     self.prev_line_number = None
-    
+
     # keep track of where we are during an iteration
     self.idx = -1
     self.last = None
@@ -2867,7 +2852,7 @@ class DiffSource:
 
     diff_code = line[0]
     output = self._format_text(line[1:])
-    output = cvsdb.utf8string(output)
+    output = self.cfg.guesser().utf8(output)
 
     if diff_code == '+':
       if self.state == 'dump':
@@ -3644,6 +3629,7 @@ def view_queryform(request):
     'who_match' : request.query_dict.get('who_match', 'exact'),
     'comment' : request.query_dict.get('comment', ''),
     'comment_match' : request.query_dict.get('comment_match', 'fulltext'),
+    'search_content' : request.query_dict.get('search_content', ''),
     'querysort' : request.query_dict.get('querysort', 'date'),
     'date' : request.query_dict.get('date', 'hours'),
     'hours' : request.query_dict.get('hours', '2'),
@@ -3653,6 +3639,7 @@ def view_queryform(request):
     'query_hidden_values' : query_hidden_values,
     'limit_changes' : limit_changes,
     'dir_href' : dir_href,
+    'enable_search_content' : request.cfg.cvsdb.index_content,
   }))
 
   generate_page(request, "query_form", data)
@@ -3791,7 +3778,8 @@ def build_commit(request, files, max_files, dir_strip, format):
   plus_count = 0
   minus_count = 0
   found_unreadable = 0
-  all_repos = {}
+  if not request.all_repos:
+    request.all_repos = {}
 
   for f in files:
     dirname = f.GetDirectory()
@@ -3810,17 +3798,19 @@ def build_commit(request, files, max_files, dir_strip, format):
 
     # Check path access (since the commits database logic bypasses the
     # vclib layer and, thus, the vcauth stuff that layer uses).
-    my_repos = all_repos.get(f.GetRepository(), '')
+    my_repos = request.all_repos.get(f.GetRepository(), '')
     if not my_repos:
       try:
-        my_repos = all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
+        my_repos = request.all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
       except:
         my_repos = None
     if not my_repos:
       return None
     if my_repos['roottype'] == 'cvs':
-      try: where = unicode(where,'utf-8')
+      # we store UTF-8 in the DB
+      try: where = where.decode('utf-8')
       except: pass
+      # FIXME maybe store the "real" filesystem path in the DB instead of having such a setting?
       try: where = where.encode(cfg.options.cvs_ondisk_charset)
       except: pass
     path_parts = _path_parts(where)
@@ -3907,24 +3897,27 @@ def build_commit(request, files, max_files, dir_strip, format):
     if max_files and num_allowed > max_files:
       continue
 
-    commit_files.append(_item(date=commit_time,
-                              dir=request.server.escape(dirname),
-                              file=request.server.escape(filename),
-                              author=request.server.escape(f.GetAuthor()),
-                              rev=rev,
-                              branch=f.GetBranch(),
-                              plus=plus,
-                              minus=minus,
-                              type=change_type,
-                              dir_href=dir_href,
-                              log_href=log_href,
-                              view_href=view_href,
-                              download_href=download_href,
-                              prefer_markup=prefer_markup,
-                              diff_href=diff_href,
-                              root=my_repos,
-                              path=where,
-                              path_prev=path_prev))
+    commit_files.append(_item(
+      date=commit_time,
+      dir=request.server.escape(dirname),
+      file=request.server.escape(filename),
+      author=request.server.escape(f.GetAuthor()),
+      rev=rev,
+      branch=f.GetBranch(),
+      plus=plus,
+      minus=minus,
+      type=change_type,
+      snippet=f.GetSnippet(),
+      dir_href=dir_href,
+      log_href=log_href,
+      view_href=view_href,
+      download_href=download_href,
+      prefer_markup=prefer_markup,
+      diff_href=diff_href,
+      root=my_repos,
+      path=where,
+      path_prev=path_prev,
+    ))
 
   # No files survived authz checks?  Let's just pretend this
   # little commit didn't happen, shall we?
@@ -4115,6 +4108,7 @@ def view_query(request):
   who_match = request.query_dict.get('who_match', 'exact')
   comment = request.query_dict.get('comment', '')
   comment_match = request.query_dict.get('comment_match', 'fulltext')
+  search_content = request.query_dict.get('search_content', '')
   querysort = request.query_dict.get('querysort', 'date')
   date = request.query_dict.get('date', 'hours')
   hours = request.query_dict.get('hours', '2')
@@ -4126,7 +4120,7 @@ def view_query(request):
                                              cfg.options.limit_changes))
 
   match_types = { 'exact':1, 'like':1, 'glob':1, 'regex':1, 'notregex':1 }
-  sort_types = { 'date':1, 'author':1, 'file':1 }
+  sort_types = { 'date':1, 'date_rev':1, 'author':1, 'file':1, 'relevance':1 }
   date_types = { 'hours':1, 'day':1, 'week':1, 'month':1,
                  'all':1, 'explicit':1 }
 
@@ -4193,6 +4187,8 @@ def view_query(request):
       query.SetComment(comment, comment_match)
     else:
       query.SetTextQuery(comment)
+  if search_content:
+    query.SetContentQuery(search_content)
   query.SetSortMethod(querysort)
   if date == 'hours':
     query.SetFromDateHoursAgo(int(hours))
diff --git a/lib/viewvcmagic.py b/lib/viewvcmagic.py
new file mode 100644
index 00000000..5f8b3ea8
--- /dev/null
+++ b/lib/viewvcmagic.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+import mimetypes
+
+have_chardet = 0
+try:
+    import chardet
+    have_chardet = 1
+except: pass
+
+class ContentMagic:
+    # Best-effort guesser of MIME types and character sets for file content.
+    def __init__(self, encodings):
+        self.encodings = encodings.split(':')  # colon-separated list of fallback charsets
+        self.mime_magic = None  # stays None if the 'magic' module cannot be loaded
+        self.errors = []  # exceptions caught while loading magic are collected here
+        # Try to load the 'magic' module; on failure record the error instead of raising
+        try:
+            import magic
+            self.mime_magic = magic.open(magic.MAGIC_MIME_TYPE)
+            self.mime_magic.load()
+        except Exception, e:
+            self.errors.append(e)
+
+    # returns MIME type: 'mime' if usable, else guessed from 'filename', else from 'tempfile' via magic
+    def guess_mime(self, mime, filename, tempfile):
+        if mime == 'application/octet-stream':
+            mime = ''  # treat the generic binary type as "unknown" so it gets re-guessed
+        if not mime and filename:
+            mime = mimetypes.guess_type(filename)[0]
+        if not mime and tempfile and self.mime_magic:
+            if type(tempfile) == type(''):  # a plain string is taken as a file path
+                mime = self.mime_magic.file(tempfile)
+            else:
+                c = tempfile.read(4096)  # otherwise a file-like object: sniff its first 4096 bytes
+                mime = self.mime_magic.buffer(c)
+        return mime
+
+    # returns (decoded_content, charset); charset is None and content is unchanged if nothing decoded
+    def guess_charset(self, content):
+        # Prefer chardet when it was importable, otherwise try plain UTF-8 first
+        charset = None
+        if have_chardet:
+            # no detected 'encoding' leaves the raw detect() result in charset, so decode() raises and we fall through
+            try:
+                charset = chardet.detect(content)
+                if charset and charset['encoding']:
+                    charset = charset['encoding']
+                content = content.decode(charset)
+            except: charset = None  # detection or decoding failed; use the fallback list below
+        else:
+            # No chardet: try UTF-8
+            charset = 'utf-8'
+            try: content = content.decode('utf-8')
+            except: charset = None
+        # Then fall back to the encodings given to the constructor, in order; first successful decode wins
+        if charset is None:
+            for charset in self.encodings:
+                try:
+                    content = content.decode(charset)
+                    break
+                except: charset = None  # reset so None is returned if every encoding fails
+        return (content, charset)
+
+    # returns content re-encoded as UTF-8, or content unchanged when no charset could be guessed
+    def utf8(self, content):
+        (uni, charset) = self.guess_charset(content)
+        if charset:
+            return uni.encode('utf-8')
+        return content
diff --git a/templates/query_form.ezt b/templates/query_form.ezt
index ec28ffdc..4919bc5b 100644
--- a/templates/query_form.ezt
+++ b/templates/query_form.ezt
@@ -144,7 +144,7 @@ Browse Directory</a></p>
   <tr>
     <th style="text-align:right;vertical-align:top;">Comment:</th>
     <td>
-      <input type="text" name="comment" value="[comment]" /><br />
+      <input type="text" name="comment" value="[comment]" size="40" /><br />
       <label for="comment_match_exact">
         <input type="radio" name="comment_match" id="comment_match_fulltext"
            value="fulltext" [is comment_match "fulltext"]checked=""[end] />
@@ -172,13 +172,21 @@ Browse Directory</a></p>
       </label>
     </td>
   </tr>
+  [if-any enable_search_content]
+  <tr>
+    <th style="text-align:right;vertical-align:top;">Search content:</th>
+    <td><input type="text" name="search_content" value="[search_content]" size="60" /></td>
+  </tr>
+  [end]
   <tr>
     <th style="text-align:right;vertical-align:top;">Sort By:</th>
     <td>
       <select name="querysort">
         <option value="date" [is querysort "date"]selected="selected"[end]>Date</option>
+        <option value="date_rev" [is querysort "date_rev"]selected="selected"[end]>Date (oldest first)</option>
         <option value="author" [is querysort "author"]selected="selected"[end]>Author</option>
         <option value="file" [is querysort "file"]selected="selected"[end]>File</option>
+        <option value="relevance" [is querysort "relevance"]selected="selected"[end]>Relevance</option>
       </select>
     </td>
   </tr>
diff --git a/templates/query_results.ezt b/templates/query_results.ezt
index 4cb71d48..dee029fe 100644
--- a/templates/query_results.ezt
+++ b/templates/query_results.ezt
@@ -46,15 +46,18 @@
     <tr class="vc_row_[if-index commits even]even[else]odd[end]">
       <td style="vertical-align: top;">
         [define rev_href][if-any commits.files.prefer_markup][commits.files.view_href][else][if-any commits.files.download_href][commits.files.download_href][end][end][end]
-	[if-any commits.files.rev][if-any rev_href]<a href="[rev_href]">[end][commits.files.rev][if-any rev_href]</a>[end][else]&nbsp;[end]
+        [if-any commits.files.rev][if-any rev_href]<a href="[rev_href]">[end][commits.files.rev][if-any rev_href]</a>[end][else]&nbsp;[end]
       </td>
       <td style="vertical-align: top;">
         <a href="[commits.files.dir_href]">[commits.files.dir]/</a>
         <a href="[commits.files.log_href]">[commits.files.file]</a>
+        [if-any commits.files.snippet]
+          <div class="snippet">[commits.files.snippet]</div>
+        [end]
       </td>
 [if-any show_branch]
       <td style="vertical-align: top;">
-	[if-any commits.files.branch][commits.files.branch][else]&nbsp;[end]
+        [if-any commits.files.branch][commits.files.branch][else]&nbsp;[end]
       </td>
 [end]
       <td style="vertical-align: top;">
@@ -68,10 +71,10 @@
         [is commits.files.type "Remove"]</del>[end]
       </td>
       <td style="vertical-align: top;">
-	[if-any commits.files.date][commits.files.date][else]&nbsp;[end]
+        [if-any commits.files.date][commits.files.date][else]&nbsp;[end]
       </td>
       <td style="vertical-align: top;">
-	[if-any commits.files.author][commits.files.author][else]&nbsp;[end]
+        [if-any commits.files.author][commits.files.author][else]&nbsp;[end]
       </td>
     </tr>
   [end]