Bug 82651 - Tika&Sphinx&chardet content indexing (done!)

git-svn-id: svn://svn.office.custis.ru/3rdparty/viewvc.org/trunk@1388 6955db30-a419-402b-8a0d-67ecbb4d7f56
remotes/github/custis
vfilippov 2011-09-27 16:13:53 +00:00 committed by Vitaliy Filippov
parent 83c7e6fe49
commit e363cf19b1
10 changed files with 761 additions and 332 deletions

View File

@ -44,6 +44,7 @@ CREATE TABLE branches (
DROP TABLE IF EXISTS checkins;
CREATE TABLE checkins (
id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
type enum('Change','Add','Remove'),
ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
whoid mediumint(9) DEFAULT '0' NOT NULL,
@ -57,7 +58,7 @@ CREATE TABLE checkins (
removedlines int(11) DEFAULT '0' NOT NULL,
descid mediumint(9),
UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
KEY repository_when (repositoryid,ci_when),
KEY repositoryid_when (repositoryid,ci_when),
KEY ci_when (ci_when),
KEY whoid (whoid,ci_when),
KEY dirid (dirid),
@ -138,6 +139,7 @@ CREATE TABLE branches (
DROP TABLE IF EXISTS commits;
CREATE TABLE commits (
id int NOT NULL AUTO_INCREMENT PRIMARY KEY,
type enum('Change','Add','Remove'),
ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL,
whoid mediumint(9) DEFAULT '0' NOT NULL,
@ -151,9 +153,9 @@ CREATE TABLE commits (
removedlines int(11) DEFAULT '0' NOT NULL,
descid mediumint(9),
UNIQUE repositoryid (repositoryid,dirid,fileid,revision),
KEY repositoryid_when (repositoryid,ci_when),
KEY ci_when (ci_when),
KEY whoid (whoid),
KEY repositoryid_2 (repositoryid),
KEY whoid (whoid,ci_when),
KEY dirid (dirid),
KEY fileid (fileid),
KEY branchid (branchid),

View File

@ -58,7 +58,11 @@ else:
import os
import string
import socket
import select
import re
import mimetypes
import time
import svn.core
import svn.repos
@ -68,14 +72,20 @@ import svn.delta
import cvsdb
import viewvc
import vclib
from viewvcmagic import ContentMagic
class SvnRepo:
"""Class used to manage a connection to a SVN repository."""
def __init__(self, path):
# Open the Subversion repository at `path` and record the collaborators
# used for content indexing:
#   index_content      - flag enabling file-content extraction for cvsdb
#   tika_client        - optional TikaClient used to extract text from binary files
#   guesser            - optional ContentMagic used to guess MIME types / charsets
#   svn_ignore_mimetype - when true, the svn:mime-type property is ignored
def __init__(self, path, index_content = None, tika_client = None, guesser = None,
svn_ignore_mimetype = False):
self.path = path
# Open repository and filesystem handles via the SVN bindings
self.repo = svn.repos.svn_repos_open(path)
self.fs = svn.repos.svn_repos_fs(self.repo)
# Youngest (highest) revision number currently in the repository
self.rev_max = svn.fs.youngest_rev(self.fs)
self.index_content = index_content
self.tika_client = tika_client
self.guesser = guesser
self.svn_ignore_mimetype = svn_ignore_mimetype
def __getitem__(self, rev):
if rev is None:
rev = self.rev_max
@ -128,6 +138,74 @@ def _get_diff_counts(diff_fp):
line = diff_fp.readline()
return plus, minus
class TikaClient:
    """TCP client for an Apache Tika server running in text-extraction mode.

    The server reads a document on its socket input and writes the extracted
    plain text back (start it with
    'java -jar tika-app-0.9.jar -p PORT -t -eutf-8', see the config sample).
    """

    def __init__(self, tika_server, mime_types):
        """tika_server: 'host:port' string of the running Tika server.
        mime_types: whitespace-separated MIME type patterns Tika can handle;
        '*' acts as a wildcard (e.g. 'application/vnd.ms-*')."""
        self.tika_server = tika_server
        self.mime_types = mime_types
        # Split address into (host, port)
        addr = tika_server.split(':')
        if len(addr) != 2:
            raise Exception('tika_server value is incorrect: \''+tika_server+'\', please use \'host:port\' format')
        self.addr = (addr[0], int(addr[1]))
        # Build one anchored regexp out of all patterns ('*' -> '.*')
        patterns = re.split(r'\s+', mime_types.strip())
        self.mime_regexp = re.compile(
            '|'.join('^'+re.escape(p).replace('\\*', '.*')+'$' for p in patterns))

    def get_text(self, filename, mime_type, log_filename):
        """Extract text content from `filename` (of type `mime_type`) using
        the Tika server. Returns '' when the MIME type is unsupported, the
        file is empty, or on any error (errors are logged, never raised).
        `log_filename` is only used in log messages."""
        if not self.mime_regexp.match(mime_type):
            # Tika can't handle this mime type, return nothing
            return ''
        fd = None
        s = None
        text = ''
        fsize = 0
        try:
            # Read original file
            fd = open(filename, 'rb')
            data = fd.read()
            fsize = len(data)
            if not fsize:
                return ''
            # Connect to Tika and switch the socket to non-blocking mode
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.connect(self.addr)
            s.setblocking(0)
            sockfd = s.fileno()
            # Tika is somewhat delicate about network IO, so:
            # read and write concurrently using the poll(2) system call
            p = select.poll()
            p.register(sockfd)
            while 1:
                fds = p.poll()
                if not fds:
                    break
                (pollfd, event) = fds[0]
                if event & select.POLLIN:
                    # An OS error or empty data means EOF
                    try:
                        part = os.read(sockfd, 65536)
                    except OSError:
                        break
                    if not part:
                        break
                    text += part
                if event & select.POLLOUT:
                    if not len(data):
                        # All input sent: shutdown output and forget about POLLOUT
                        s.shutdown(socket.SHUT_WR)
                        p.modify(sockfd, select.POLLIN)
                    else:
                        # Write and consume some data
                        written = os.write(sockfd, data)
                        data = data[written:]
            if len(text) == 0:
                raise Exception('Empty response from Tika server')
            print("Extracted %d bytes from %s (%s) of size %d" % (len(text), log_filename, mime_type, fsize))
        except Exception as e:
            # Best-effort: log and return whatever was extracted (usually '')
            print("Error extracting text from %s (%s) of size %d: %s" % (log_filename, mime_type, fsize, str(e)))
        finally:
            if fd:
                fd.close()
            if s:
                s.close()
        return text
class SvnRev:
"""Class used to hold information about a particular revision of
@ -184,19 +262,53 @@ class SvnRev:
['-b', '-B'])
diff_fp = diffobj.get_pipe()
plus, minus = _get_diff_counts(diff_fp)
# TODO Indexing file contents
# For binary files: svn.fs.contents_changed(root1, path1, root2, path2)
# Temp file with contents is at: diffobj.tempfile2
# Apache Tika server may even be at another host!
# CustIS Bug 50473: a workaround for svnlib behaviour in file movements (FILE1 -> FILE2 + FILE1 -> null)
if change.base_path:
if not change.path and changes_hash.get(change.base_path, '') != '':
if not change.path and change.base_path in changes_hash:
minus = 0
elif change.path:
changes_hash[change.base_path] = change.path
self.changes.append((path, action, plus, minus))
content = ''
mime = ''
# need to check if binary file's content changed when copying,
# if not, don't extract it, just get it from previous revision later
if repo.index_content and change.path and (not change.base_path
or svn.fs.contents_changed(
base_root and base_root or None,
base_root and change.base_path or None,
fsroot, change.path
)):
props = svn.fs.node_proplist(fsroot, change.path)
if not repo.svn_ignore_mimetype:
mime = props.get('svn:mime-type', None)
else:
mime = None
mime = repo.guesser.guess_mime(
mime,
os.path.basename(change.path),
diffobj.tempfile2
)
# Read and guess charset by ourselves for text files
if mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')):
try:
fd = open(diffobj.tempfile2, 'rb')
content = fd.read()
fd.close()
except: pass
# Guess charset
if content:
content, charset = repo.guesser.guess_charset(content)
if charset:
content = content.encode('utf-8')
print 'Guessed %s for %s' % (charset, change.path)
else:
print 'Failed to guess charset for %s, not indexing' % (change.path, )
# Try to extract content using Tika from binary documents
elif repo.tika_client:
content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
self.changes.append((path, action, plus, minus, content, mime))
def _get_root_for_rev(self, rev):
"""Fetch a revision root from a cache of such, or a fresh root
@ -217,7 +329,7 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
if verbose: print "skipped (no changes)."
return
for (path, action, plus, minus) in revision.changes:
for (path, action, plus, minus, content, mime) in revision.changes:
directory, file = os.path.split(path)
commit = cvsdb.CreateCommit()
commit.SetRepository(repo.path)
@ -230,6 +342,8 @@ def handle_revision(db, command, repo, rev, verbose, force=0):
commit.SetPlusCount(plus)
commit.SetMinusCount(minus)
commit.SetBranch(None)
commit.SetContent(content)
commit.SetMimeType(mime)
if action == 'add':
commit.SetTypeAdd()
@ -268,7 +382,16 @@ def main(command, repository, revs=[], verbose=0, force=0):
sys.stderr.write("ERROR: " + str(e) + "\n")
sys.exit(1)
repo = SvnRepo(repository)
tika_client = None
if cfg.utilities.tika_server:
tika_client = TikaClient(cfg.utilities.tika_server, cfg.utilities.tika_mime_types)
repo = SvnRepo(
path = repository,
index_content = cfg.cvsdb.index_content,
tika_client = tika_client,
guesser = cfg.guesser(),
svn_ignore_mimetype = cfg.options.svn_ignore_mimetype,
)
if command == 'rebuild' or (command == 'update' and not revs):
for rev in range(repo.rev_max+1):
handle_revision(db, command, repo, rev, verbose)

View File

@ -1,6 +1,6 @@
#---------------------------------------------------------------------------
#
# Configuration file for ViewVC
# Configuration file for ViewVC (4IntraNet patched version)
#
# Information on ViewVC is located at the following web site:
# http://viewvc.org/
@ -244,8 +244,8 @@ cvsnt =
# See also bin/cvsnt-rcsfile-inetd.pl
#rcsfile_socket = 'host:port'
# Example: rcsfile_socket = '127.0.0.1:8071'
#rcsfile_socket = host:port
# Example: rcsfile_socket = 127.0.0.1:8071
# Subversion command-line client, used for viewing Subversion repositories
svn =
@ -259,6 +259,32 @@ diff =
cvsgraph =
# cvsgraph = /usr/local/bin/cvsgraph
# Apache Tika TCP server host and port, used to extract text from binary documents
# Note that as of 2011-09-12, Tika 0.9 has a bug which leads to hangups on processing
# M$Word documents in server mode. So you must use the fixed version, downloaded from:
# http://wiki.4intra.net/public/tika-app-0.9-fix-TIKA709.jar
# (mirror) http://code.google.com/p/mediawiki4intranet/downloads/detail?name=tika-app-0.9-fix-TIKA709.jar
# Or apply the patch by yourself and rebuild Tika from source, see patch here:
# https://issues.apache.org/jira/browse/TIKA-709
# Tika server should be started with command 'java -jar tika-app-0.9.jar -p PORT -t -eutf-8'
#tika_server = host:port
# Example: tika_server = 127.0.0.1:8072
# This lists MIME types that can be processed by Tika
# You may change it if your Tika is newer than 0.9 and supports more formats
# (note) *+xml examples: xhtml+xml, rss+xml, atom+xml, docbook+xml, rdf+xml
tika_mime_types =
text/*
application/*+xml
application/xml
application/vnd.oasis.opendocument.*
application/vnd.openxmlformats*
application/vnd.ms-*
application/msaccess
application/msword
application/pdf
application/rtf
#---------------------------------------------------------------------------
[options]
@ -494,6 +520,12 @@ short_log_len = 80
# should we colorize known file content syntaxes? (requires Pygments module)
enable_syntax_coloration = 1
# detect_encoding: Should we attempt to detect versioned file
# character encodings? [Requires 'chardet' module]
# Used in file list, file content display and indexing
# See also options.encodings for naive guessing.
detect_encoding = 1
# Use CvsGraph. See http://www.akhphd.au.dk/~bertho/cvsgraph/ for
# documentation and download.
use_cvsgraph = 0
@ -544,6 +576,17 @@ use_pagesize = 0
# Set to 0 to disable the limit.
limit_changes = 100
# You can also use primitive charset guessing instead of chardet (options.detect_encoding)
# Just set this to the list of possible charsets in your repository.
# ViewVC will simply try to decode content using each of them, and pick
# the first which succeeds. UTF-8 is always tried automatically.
#encodings = cp1251:iso-8859-1
# Sadly this is also required - for back-links from query results to files
# in CVS, because it doesn't recode file names to UTF-8 as Subversion does.
# Just set to cp1251 if you work with your CVS from Windowz.
#cvs_ondisk_charset = cp1251
#---------------------------------------------------------------------------
[templates]
@ -588,9 +631,14 @@ limit_changes = 100
# Set to 1 to enable the database integration feature, 0 otherwise.
enabled = 0
# Database hostname and port.
# Set to 1 to enable indexing of file contents using Sphinx and Tika
index_content = 0
# Database hostname, port, and socket
#host = localhost
#port = 3306
# On Debian Linux, enable this:
#socket = /var/run/mysqld/mysqld.sock
# ViewVC database name.
#database_name = ViewVC
@ -605,6 +653,30 @@ enabled = 0
#readonly_user =
#readonly_passwd =
# ViewVC can use Sphinx (http://sphinxsearch.com) full-text search engine
# to index file contents with full history and then search over them.
# Also, Apache Tika console application can be used in TCP server mode to
# add support for indexing binary documents (M$Word, PDF, etc.).
# See tika_server in [utilities].
# Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index.
# Index must be created in sphinx.conf by hand and have the following fields:
# rt_field = content
# rt_attr_string = content
# rt_attr_string = mimetype
# rt_attr_timestamp = ci_when
# rt_attr_uint = whoid
# rt_attr_uint = repositoryid
# rt_attr_uint = dirid
# rt_attr_uint = fileid
# rt_attr_uint = revision
# rt_attr_uint = branchid
# Sphinx connection parameters:
#sphinx_host =
#sphinx_port =
#sphinx_socket = /var/run/sphinxql.sock
#sphinx_index = viewvc
# Limit the number of rows returned by a given query to this number.
#row_limit = 1000

View File

@ -24,6 +24,7 @@ import vclib.ccvs
import vclib.svn
import cvsdb
import viewvc
from viewvcmagic import ContentMagic
#########################################################################
#
@ -47,6 +48,7 @@ class Config:
'root_parents', 'allowed_views', 'mime_types_files')
def __init__(self):
self.__guesser = None
for section in self._sections:
setattr(self, section, _sub_config())
@ -66,7 +68,6 @@ class Config:
if rootname:
self._process_root_options(self.parser, rootname)
self.expand_root_parents()
cvsdb.setencs(self.options.encodings.split(':'))
r = {}
for i in self.rewritehtml.__dict__.keys():
if i[-8:] == '.replace':
@ -237,6 +238,11 @@ class Config:
params['__config'] = self
return params
# Return a lazily-created, cached ContentMagic instance configured with
# the options.encodings charset list; shared by cvsdb and the indexers
# for MIME-type and charset guessing.
def guesser(self):
if not self.__guesser:
self.__guesser = ContentMagic(self.options.encodings)
return self.__guesser
def set_defaults(self):
"Set some default values in the configuration."
@ -258,6 +264,8 @@ class Config:
self.utilities.svn = ''
self.utilities.diff = ''
self.utilities.cvsgraph = ''
self.utilities.tika_server = ''
self.utilities.tika_mime_types = ''
self.options.root_as_url_component = 1
self.options.checkout_magic = 0
@ -302,7 +310,7 @@ class Config:
self.options.limit_changes = 100
self.options.cvs_ondisk_charset = 'cp1251'
self.options.binary_mime_re = '^(?!text/|.*\Wxml)'
self.options.encodings = 'utf-8:cp1251:iso-8859-1'
self.options.encodings = 'cp1251:iso-8859-1'
self.templates.diff = None
self.templates.directory = None
@ -316,6 +324,7 @@ class Config:
self.templates.roots = None
self.cvsdb.enabled = 0
self.cvsdb.index_content = 0
self.cvsdb.host = ''
self.cvsdb.port = 3306
self.cvsdb.socket = ''
@ -329,6 +338,11 @@ class Config:
self.cvsdb.check_database_for_root = 0
self.cvsdb.fulltext_min_relevance = 0.2
self.cvsdb.sphinx_host = ''
self.cvsdb.sphinx_port = 3307
self.cvsdb.sphinx_socket = ''
self.cvsdb.sphinx_index = ''
def _startswith(somestr, substr):
return somestr[:len(substr)] == substr

View File

@ -15,6 +15,7 @@ import sys
import string
import time
import re
import cgi
import vclib
import dbi
@ -36,22 +37,12 @@ error = "cvsdb error"
## defined to actually be complete; it should run well off of any DBI 2.0
## compliant database interface
encs = [ "utf-8", "cp1251", "iso-8859-1" ]
def utf8string(value):
for e in encs:
try:
value = value.decode(e)
break
except: pass
return value.encode("utf-8")
def setencs(e):
global encs
encs = e
class CheckinDatabase:
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, authorizer = None):
def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg,
authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None,
sphinx_socket = None, sphinx_index = None):
self.cfg = cfg
self._host = host
self._port = port
self._socket = socket
@ -63,11 +54,21 @@ class CheckinDatabase:
self._min_relevance = min_relevance
self.authorizer = authorizer
# Sphinx settings
self.index_content = index_content
self.sphinx_host = sphinx_host
self.sphinx_port = sphinx_port
self.sphinx_socket = sphinx_socket
self.sphinx_index = sphinx_index
## database lookup caches
self._get_cache = {}
self._get_id_cache = {}
self._desc_id_cache = {}
# Sphinx connection None by default
self.sphinx = None
def Connect(self):
self.db = dbi.connect(
self._host, self._port, self._socket, self._user, self._passwd, self._database)
@ -83,12 +84,17 @@ class CheckinDatabase:
else:
self._version = 0
if self._version > CURRENT_SCHEMA_VERSION:
raise DatabaseVersionError("Database version %d is newer than the "
"last version supported by this "
"software." % (self._version))
raise DatabaseVersionError("Database version %d is newer than the "
"last version supported by this "
"software." % (self._version))
if self.index_content:
self.sphinx = dbi.connect(self.sphinx_host, self.sphinx_port, self.sphinx_socket, '', '', '')
# Re-encode `value` to UTF-8 via the configured ContentMagic guesser
# (replaces the old module-level utf8string() helper).
def utf8(self, value):
return self.cfg.guesser().utf8(value)
def sql_get_id(self, table, column, value, auto_set):
value = utf8string(value)
value = self.utf8(value)
sql = "SELECT id FROM %s WHERE %s=%%s" % (table, column)
sql_args = (value, )
@ -257,7 +263,7 @@ class CheckinDatabase:
return self.get_list("repositories", repository)
def SQLGetDescriptionID(self, description, auto_set = 1):
description = utf8string(description)
description = self.utf8(description)
## lame string hash, blame Netscape -JMP
hash = len(description)
@ -338,48 +344,55 @@ class CheckinDatabase:
self.AddCommit(commit)
def AddCommit(self, commit):
# Insert (or update) one commit row in MySQL and, when content indexing is
# enabled, mirror the commit (with extracted text content and MIME type)
# into the Sphinx realtime index.
# NOTE(review): this span is a rendered diff without +/- markers. The plain
# assignments below (ci_when .. description_id) together with the
# REPLACE INTO / sql_args statements are the REMOVED pre-change code; the
# `props` dict and the INSERT ... ON DUPLICATE KEY UPDATE path are the
# ADDED replacement. Verify against the real file before relying on order.
ci_when = dbi.DateTimeFromTicks(commit.GetTime() or 0.0)
ci_type = commit.GetTypeString()
who_id = self.GetAuthorID(commit.GetAuthor())
repository_id = self.GetRepositoryID(commit.GetRepository())
directory_id = self.GetDirectoryID(commit.GetDirectory())
file_id = self.GetFileID(commit.GetFile())
revision = commit.GetRevision()
sticky_tag = "NULL"
branch_id = self.GetBranchID(commit.GetBranch())
plus_count = commit.GetPlusCount() or '0'
minus_count = commit.GetMinusCount() or '0'
description_id = self.GetDescriptionID(commit.GetDescription())
# Column name -> value map; keys double as the SQL column list below
props = {
'type' : commit.GetTypeString(),
'ci_when' : dbi.DateTimeFromTicks(commit.GetTime() or 0.0),
'whoid' : self.GetAuthorID(commit.GetAuthor()),
'repositoryid' : self.GetRepositoryID(commit.GetRepository()),
'dirid' : self.GetDirectoryID(commit.GetDirectory()),
'fileid' : self.GetFileID(commit.GetFile()),
'revision' : commit.GetRevision(),
'branchid' : self.GetBranchID(commit.GetBranch()),
'addedlines' : commit.GetPlusCount() or '0',
'removedlines' : commit.GetMinusCount() or '0',
'descid' : self.GetDescriptionID(commit.GetDescription()),
}
# Schema version 1 renamed the table from 'checkins' to 'commits'
commits_table = self._version >= 1 and 'commits' or 'checkins'
sql = "REPLACE INTO %s" % (commits_table)
sql = sql + \
" (type,ci_when,whoid,repositoryid,dirid,fileid,revision,"\
" stickytag,branchid,addedlines,removedlines,descid)"\
"VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
sql_args = (ci_type, ci_when, who_id, repository_id,
directory_id, file_id, revision, sticky_tag, branch_id,
plus_count, minus_count, description_id)
cursor = self.db.cursor()
try:
cursor.execute(sql, sql_args)
# MySQL-specific INSERT-or-UPDATE with ID retrieval
cursor.execute(
'INSERT INTO '+commits_table+'('+','.join(i for i in props)+') VALUES ('+
', '.join('%s' for i in props)+') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), '+
', '.join(i+'=VALUES('+i+')' for i in props),
tuple(props[i] for i in props)
)
commit_id = cursor.lastrowid
if self.index_content:
sphcur = self.sphinx.cursor()
content = commit.GetContent()
# Sphinx stores ci_when as a unix-timestamp attribute, not DATETIME
props['ci_when'] = str(int(commit.GetTime() or 0))
if len(content):
props['content'] = content
# Now, stored MIME type is only needed while searching
# It is guessed again when the file is displayed
props['mimetype'] = commit.GetMimeType()
props['id'] = str(commit_id)
# Drop MySQL-only columns that have no attribute in the Sphinx index
del props['addedlines']
del props['removedlines']
del props['descid']
del props['type']
sphcur.execute(
'INSERT INTO '+self.sphinx_index+'('+','.join(i for i in props)+') VALUES ('+
','.join('%s' for i in props)+')',
tuple(props[i] for i in props)
)
except Exception, e:
# NOTE(review): the multi-line `raise Exception(... % sql_args)` below is
# the REMOVED error report; the print+bare-raise after it replaced it.
raise Exception("Error adding commit: '%s'\n"
"Values were:\n"
"\ttype = %s\n"
"\tci_when = %s\n"
"\twhoid = %s\n"
"\trepositoryid = %s\n"
"\tdirid = %s\n"
"\tfileid = %s\n"
"\trevision = %s\n"
"\tstickytag = %s\n"
"\tbranchid = %s\n"
"\taddedlines = %s\n"
"\tremovedlines = %s\n"
"\tdescid = %s\n"
% ((str(e), ) + sql_args))
print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+
"\n".join(i+'='+str(props[i]) for i in props))
raise
def SQLQueryListString(self, field, query_entry_list):
sqlList = []
@ -414,6 +427,67 @@ class CheckinDatabase:
return "(%s)" % (string.join(sqlList, " OR "))
# Build an SQL "in_field IN (...)" condition by resolving a list of
# name-match entries (`lst`, QueryEntry objects) against the given lookup
# table: SELECT id_field FROM table WHERE <name_field matches lst>.
# Returns None when the list is empty or nothing matched, so callers can
# simply drop the condition.
def query_ids(self, in_field, table, id_field, name_field, lst):
if not len(lst):
return None
cond = self.SQLQueryListString(name_field, lst)
cursor = self.db.cursor()
cursor.execute('SELECT %s FROM %s WHERE %s' % (id_field, table, cond))
# Collect matched ids as strings for direct interpolation below
ids = list(str(row[0]) for row in cursor)
if not len(ids):
return None
return "%s IN (%s)" % (in_field, ','.join(ids))
def CreateSphinxQueryString(self, query):
    """Build a SphinxQL SELECT for full-text search over indexed file content.

    Name-based filters (repository, branch, dir, file, author, comment) are
    first resolved to numeric ids in MySQL via query_ids(), then applied as
    attribute conditions on the Sphinx realtime index. Returns the complete
    SELECT statement as a string.
    """
    condList = [
        'MATCH(%s)' % (self.db.literal(query.content_query), ),
        self.query_ids('repositoryid', 'repositories', 'id', 'repository', query.repository_list),
        self.query_ids('branchid', 'branches', 'id', 'branch', query.branch_list),
        self.query_ids('dirid', 'dirs', 'id', 'dir', query.directory_list),
        self.query_ids('fileid', 'files', 'id', 'file', query.file_list),
        # BUG FIX: the Sphinx index stores the author id in attribute 'whoid'
        # (see AddCommit and the rt_attr_uint list in the [cvsdb] sample
        # config), not 'authorid' -- the old name made author filters fail.
        self.query_ids('whoid', 'people', 'id', 'who', query.author_list),
        # NOTE(review): 'descid' is not among the documented rt_attr_uint
        # fields and AddCommit deletes it before the Sphinx INSERT -- verify
        # this filter actually works against your index.
        self.query_ids('descid', 'descs', 'id', 'description', query.comment_list),
    ]
    if len(query.revision_list):
        condList.append("revision IN ("+','.join(self.db.literal(s) for s in query.revision_list)+")")
    # Date range filters use unix timestamps (rt_attr_timestamp ci_when)
    if query.from_date:
        condList.append('ci_when>='+str(dbi.TicksFromDateTime(query.from_date)))
    if query.to_date:
        condList.append('ci_when<='+str(dbi.TicksFromDateTime(query.to_date)))
    # Date sorts keep relevance as the secondary key
    if query.sort == 'date':
        order_by = 'ORDER BY `ci_when` DESC, `relevance` DESC'
    elif query.sort == 'date_rev':
        order_by = 'ORDER BY `ci_when` ASC, `relevance` DESC'
    else: # /* if query.sort == 'relevance' */
        order_by = 'ORDER BY `relevance` DESC'
    # Unmatched/empty filters resolve to None and are dropped here
    conditions = string.join((i for i in condList if i), " AND ")
    conditions = conditions and "WHERE %s" % conditions
    ## limit the number of rows requested or we could really slam
    ## a server with a large database
    limit = ""
    if query.limit:
        limit = "LIMIT %s" % (str(query.limit))
    elif self._row_limit:
        limit = "LIMIT %s" % (str(self._row_limit))
    fields = "id `id`, WEIGHT() `relevance`, `content`, `mimetype`"
    return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit)
# Get commits by their IDs
def CreateIdQueryString(self, ids):
    # Build the MySQL query that fetches full commit rows (plus the resolved
    # repository / directory / file names) for an iterable of string ids.
    table = 'commits' if self._version >= 1 else 'checkins'
    id_list = ','.join(ids)
    sql = (
        'SELECT %(t)s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name'
        ' FROM %(t)s, repositories, dirs, files'
        ' WHERE %(t)s.id IN (%(ids)s) AND repositoryid=repositories.id'
        ' AND dirid=dirs.id AND fileid=files.id'
    ) % {'t': table, 'ids': id_list}
    return sql
def CreateSQLQueryString(self, query):
commits_table = self._version >= 1 and 'commits' or 'checkins'
fields = [
@ -435,6 +509,7 @@ class CheckinDatabase:
fields.append("%s) AS relevance" % temp)
else:
fields.append("'' AS relevance")
fields.append("'' AS snippet")
if len(query.repository_list):
temp = self.SQLQueryListString("repositories.repository",
@ -478,16 +553,18 @@ class CheckinDatabase:
temp = "(%s.ci_when<=\"%s\")" % (commits_table, str(query.to_date))
condList.append(temp)
if query.sort == "date":
order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
if query.sort == "relevance" and len(query.text_query):
order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
elif query.sort == "date_rev":
order_by = "ORDER BY %s.ci_when ASC,descid,%s.repositoryid" % (commits_table, commits_table)
elif query.sort == "author":
tableList.append(("people", "(%s.whoid=people.id)" % (commits_table)))
order_by = "ORDER BY people.who,descid,%s.repositoryid" % (commits_table)
elif query.sort == "file":
tableList.append(("files", "(%s.fileid=files.id)" % (commits_table)))
order_by = "ORDER BY files.file,descid,%s.repositoryid" % (commits_table)
elif query.sort == "relevance" and len(query.text_query):
order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
else: # /* if query.sort == "date": */
order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table)
## exclude duplicates from the table list, and split out join
## conditions from table names. In future, the join conditions
@ -528,19 +605,60 @@ class CheckinDatabase:
return True
def RunQuery(self, query):
sql = self.CreateSQLQueryString(query)
cursor = self.db.cursor()
cursor.execute(sql)
if len(query.content_query) and self.sphinx:
# Use Sphinx to search on document content
sql = self.CreateSphinxQueryString(query)
cursor = self.sphinx.cursor()
cursor.execute(sql)
sphinx_rows = list((str(docid), rel, content, mimetype) for docid, rel, content, mimetype in cursor)
if len(sphinx_rows):
# Fetch snippets
snippet_options = {
'around': 15,
'limit': 200,
'before_match': '<span style="color:red">',
'after_match': '</span>',
'chunk_separator': ' ... ',
}
preformatted_mime = 'text/(?!html|xml).*'
snippets = {}
bm_html = cgi.escape(snippet_options['before_match'])
am_html = cgi.escape(snippet_options['after_match'])
for docid, rel, content, mimetype in sphinx_rows:
cursor.execute(
'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')',
(content, self.sphinx_index, query.content_query) + tuple(snippet_options.values())
)
s, = cursor.fetchone()
s = cgi.escape(s)
if re.match(preformatted_mime, mimetype):
s = s.replace('\n', '<br />')
s = s.replace(bm_html, snippet_options['before_match'])
s = s.replace(am_html, snippet_options['after_match'])
snippets[docid] = s
# Fetch all fields from MySQL
sql = self.CreateIdQueryString((docid for (docid, _, _, _) in sphinx_rows))
cursor = self.db.cursor()
cursor.execute(sql)
byid = {}
for row in cursor:
byid[str(row[0])] = row
rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _, _) in sphinx_rows if docid in byid)
else:
rows = []
else:
# Use regular queries when document content is not searched
sql = self.CreateSQLQueryString(query)
cursor = self.db.cursor()
cursor.execute(sql)
rows = list(cursor)
while 1:
row = cursor.fetchone()
if not row:
break
(dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
# Convert rows to commit objects
for row in rows:
(dbId, dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID,
dbFileID, dbRevision, dbStickyTag, dbBranchID, dbAddedLines,
dbRemovedLines, dbDescID, dbRepositoryName, dbDirName,
dbFileName, dbRelevance) = row
dbFileName, dbRelevance, dbSnippet) = row
if not self.check_commit_access(dbRepositoryName, dbDirName, dbFileName, dbRevision):
continue
@ -564,6 +682,7 @@ class CheckinDatabase:
commit.SetMinusCount(dbRemovedLines)
commit.SetDescriptionID(dbDescID)
commit.SetRelevance(dbRelevance)
commit.SetSnippet(dbSnippet)
query.AddCommit(commit)
@ -623,39 +742,14 @@ class CheckinDatabase:
raise UnknownRepositoryError("Unknown repository '%s'"
% (repository))
if (self._version >= 1):
self.sql_delete('repositories', 'id', rep_id)
self.sql_purge('commits', 'repositoryid', 'id', 'repositories')
self.sql_purge('files', 'id', 'fileid', 'commits')
self.sql_purge('dirs', 'id', 'dirid', 'commits')
self.sql_purge('branches', 'id', 'branchid', 'commits')
self.sql_purge('descs', 'id', 'descid', 'commits')
self.sql_purge('people', 'id', 'whoid', 'commits')
else:
sql = "SELECT * FROM checkins WHERE repositoryid=%s"
sql_args = (rep_id, )
cursor = self.db.cursor()
cursor.execute(sql, sql_args)
checkins = []
while 1:
try:
(ci_type, ci_when, who_id, repository_id,
dir_id, file_id, revision, sticky_tag, branch_id,
plus_count, minus_count, description_id) = \
cursor.fetchone()
except TypeError:
break
checkins.append([file_id, dir_id, branch_id,
description_id, who_id])
#self.sql_delete('repositories', 'id', rep_id)
self.sql_delete('checkins', 'repositoryid', rep_id)
for checkin in checkins:
self.sql_delete('files', 'id', checkin[0], 'fileid')
self.sql_delete('dirs', 'id', checkin[1], 'dirid')
self.sql_delete('branches', 'id', checkin[2], 'branchid')
self.sql_delete('descs', 'id', checkin[3], 'descid')
self.sql_delete('people', 'id', checkin[4], 'whoid')
checkins_table = self._version >= 1 and 'commits' or 'checkins'
self.sql_delete('repositories', 'id', rep_id)
self.sql_purge(checkins_table, 'repositoryid', 'id', 'repositories')
self.sql_purge('files', 'id', 'fileid', checkins_table)
self.sql_purge('dirs', 'id', 'dirid', checkins_table)
self.sql_purge('branches', 'id', 'branchid', checkins_table)
self.sql_purge('descs', 'id', 'descid', checkins_table)
self.sql_purge('people', 'id', 'whoid', checkins_table)
# Reset all internal id caches. We could be choosier here,
# but let's just be as safe as possible.
@ -690,8 +784,13 @@ class Commit:
self.__minuscount = ''
self.__description = ''
self.__relevance = ''
self.__snippet = ''
self.__gmt_time = 0.0
self.__type = Commit.CHANGE
self.__content = ''
self.__mimetype = ''
self.__base_path = ''
self.__base_rev = ''
def SetRepository(self, repository):
self.__repository = repository
@ -758,12 +857,19 @@ class Commit:
def GetDescription(self):
return self.__description
# Relevance and snippet are used when querying commit database
# relevance: search-engine ranking value; snippet: HTML excerpt of the
# matched content (built in RunQuery for Sphinx content searches)
def SetRelevance(self, relevance):
self.__relevance = relevance
def GetRelevance(self):
return self.__relevance
def SetSnippet(self, snippet):
self.__snippet = snippet
def GetSnippet(self):
return self.__snippet
def SetTypeChange(self):
self.__type = Commit.CHANGE
@ -784,66 +890,80 @@ class Commit:
elif self.__type == Commit.REMOVE:
return 'Remove'
# File content (extracted text), optional, indexed with Sphinx
def SetContent(self, content):
self.__content = content
def GetContent(self):
return self.__content
# MIME type, optional, now only stored in Sphinx
# (it is guessed again when the file is displayed)
def SetMimeType(self, mimetype):
self.__mimetype = mimetype
def GetMimeType(self):
return self.__mimetype
## LazyCommit overrides a few methods of Commit to only retrieve
## it's properties as they are needed
class LazyCommit(Commit):
def __init__(self, db):
Commit.__init__(self)
self.__db = db
def __init__(self, db):
Commit.__init__(self)
self.__db = db
def SetFileID(self, dbFileID):
self.__dbFileID = dbFileID
def SetFileID(self, dbFileID):
self.__dbFileID = dbFileID
def GetFileID(self):
return self.__dbFileID
def GetFileID(self):
return self.__dbFileID
def GetFile(self):
return self.__db.GetFile(self.__dbFileID)
def GetFile(self):
return self.__db.GetFile(self.__dbFileID)
def SetDirectoryID(self, dbDirID):
self.__dbDirID = dbDirID
def SetDirectoryID(self, dbDirID):
self.__dbDirID = dbDirID
def GetDirectoryID(self):
return self.__dbDirID
def GetDirectoryID(self):
return self.__dbDirID
def GetDirectory(self):
return self.__db.GetDirectory(self.__dbDirID)
def GetDirectory(self):
return self.__db.GetDirectory(self.__dbDirID)
def SetRepositoryID(self, dbRepositoryID):
self.__dbRepositoryID = dbRepositoryID
def SetRepositoryID(self, dbRepositoryID):
self.__dbRepositoryID = dbRepositoryID
def GetRepositoryID(self):
return self.__dbRepositoryID
def GetRepositoryID(self):
return self.__dbRepositoryID
def GetRepository(self):
return self.__db.GetRepository(self.__dbRepositoryID)
def GetRepository(self):
return self.__db.GetRepository(self.__dbRepositoryID)
def SetAuthorID(self, dbAuthorID):
self.__dbAuthorID = dbAuthorID
def SetAuthorID(self, dbAuthorID):
self.__dbAuthorID = dbAuthorID
def GetAuthorID(self):
return self.__dbAuthorID
def GetAuthorID(self):
return self.__dbAuthorID
def GetAuthor(self):
return self.__db.GetAuthor(self.__dbAuthorID)
def GetAuthor(self):
return self.__db.GetAuthor(self.__dbAuthorID)
def SetBranchID(self, dbBranchID):
self.__dbBranchID = dbBranchID
def SetBranchID(self, dbBranchID):
self.__dbBranchID = dbBranchID
def GetBranchID(self):
return self.__dbBranchID
def GetBranchID(self):
return self.__dbBranchID
def GetBranch(self):
return self.__db.GetBranch(self.__dbBranchID)
def GetBranch(self):
return self.__db.GetBranch(self.__dbBranchID)
def SetDescriptionID(self, dbDescID):
self.__dbDescID = dbDescID
def SetDescriptionID(self, dbDescID):
self.__dbDescID = dbDescID
def GetDescriptionID(self):
return self.__dbDescID
def GetDescriptionID(self):
return self.__dbDescID
def GetDescription(self):
return self.__db.GetDescription(self.__dbDescID)
def GetDescription(self):
return self.__db.GetDescription(self.__dbDescID)
## QueryEntry holds data on one match-type in the SQL database
## match is one of: "exact", "like", "glob", "regex", or "notregex"
@ -859,7 +979,7 @@ class CheckinDatabaseQuery:
## sorting
self.sort = "date"
## repository to query
## repository, branch, etc to query
self.repository_list = []
self.branch_list = []
self.directory_list = []
@ -867,7 +987,11 @@ class CheckinDatabaseQuery:
self.revision_list = []
self.author_list = []
self.comment_list = []
## text_query = Fulltext query on comments
## content_query = Fulltext query on content
self.text_query = ""
self.content_query = ""
## date range in DBI 2.0 timedate objects
self.from_date = None
@ -886,6 +1010,9 @@ class CheckinDatabaseQuery:
def SetTextQuery(self, query):
self.text_query = query
def SetContentQuery(self, query):
self.content_query = query
def SetRepository(self, repository, match = "exact"):
self.repository_list.append(QueryEntry(repository, match))
@ -953,9 +1080,23 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0):
else:
user = cfg.cvsdb.user
passwd = cfg.cvsdb.passwd
db = CheckinDatabase(cfg.cvsdb.host, cfg.cvsdb.port, cfg.cvsdb.socket, user, passwd,
cfg.cvsdb.database_name, cfg.cvsdb.row_limit, cfg.cvsdb.fulltext_min_relevance,
authorizer)
db = CheckinDatabase(
host = cfg.cvsdb.host,
port = cfg.cvsdb.port,
socket = cfg.cvsdb.socket,
user = user,
passwd = passwd,
database = cfg.cvsdb.database_name,
row_limit = cfg.cvsdb.row_limit,
min_relevance = cfg.cvsdb.fulltext_min_relevance,
authorizer = authorizer,
index_content = cfg.cvsdb.index_content,
sphinx_host = cfg.cvsdb.sphinx_host,
sphinx_port = int(cfg.cvsdb.sphinx_port),
sphinx_socket = cfg.cvsdb.sphinx_socket,
sphinx_index = cfg.cvsdb.sphinx_index,
cfg = cfg,
)
db.Connect()
return db

View File

@ -1,4 +1,3 @@
#
# Copyright (C) 1999-2009 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
@ -68,7 +67,6 @@ docroot_magic_path = '*docroot*'
viewcvs_mime_type = 'text/vnd.viewcvs-markup'
alt_mime_type = 'text/x-cvsweb-markup'
view_roots_magic = '*viewroots*'
magic_buf_size = 4096
default_mime_type = 'application/octet-stream'
# Put here the variables we need in order to hold our state - they
@ -121,9 +119,8 @@ class Request:
# check for an authenticated username
self.username = server.getenv('REMOTE_USER')
# construct MIME magic
self.ms = None
self.ms_fail = 0
# repository object cache
self.all_repos = {}
# if we allow compressed output, see if the client does too
self.gzip_compress_level = 0
@ -134,6 +131,9 @@ class Request:
string.split(http_accept_encoding, ","))):
self.gzip_compress_level = 9 # make this configurable?
def utf8(self, value):
return self.cfg.guesser().utf8(value)
def create_repos(self, rootname):
if not rootname:
return None
@ -677,7 +677,7 @@ def _validate_mimetype(value):
return value in (viewcvs_mime_type, alt_mime_type, 'text/plain')
# obvious things here. note that we don't need uppercase for alpha.
_re_validate_alpha = re.compile('^[a-z]+$')
_re_validate_alpha = re.compile('^[a-z_]+$')
_re_validate_number = re.compile('^[0-9]+$')
_re_validate_boolint = re.compile('^[01]$')
@ -743,6 +743,7 @@ _legal_params = {
'who_match' : _re_validate_alpha,
'comment' : None,
'comment_match' : _re_validate_alpha,
'search_content': None,
'querysort' : _re_validate_alpha,
'date' : _re_validate_alpha,
'hours' : _re_validate_number,
@ -988,7 +989,7 @@ def nav_path(request):
is_last = len(path_parts) == len(request.path_parts)
if request.roottype == 'cvs':
item = _item(name=cvsdb.utf8string(part), href=None)
item = _item(name=request.utf8(part), href=None)
else:
item = _item(name=part, href=None)
@ -1248,7 +1249,7 @@ def common_template_data(request, revision=None, mime_type=None):
cfg = request.cfg
where = request.where
if request.roottype == 'cvs':
where = cvsdb.utf8string(where)
where = request.utf8(where)
where = request.server.escape(where)
# Initialize data dictionary members (sorted alphanumerically)
@ -1444,28 +1445,31 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
get_lexer_for_mimetype, \
get_lexer_for_filename
from pygments.lexers._mapping import LEXERS
# Hack for shell mime types:
LEXERS['BashLexer'] = ('pygments.lexers.other', 'Bash', ('bash', 'sh'), ('*.sh',), ('application/x-sh', 'application/x-shellscript', 'text/x-sh', 'text/x-shellscript'))
encoding = 'guess'
if cfg.options.detect_encoding:
try:
import chardet
encoding = 'chardet'
except (SyntaxError, ImportError):
pass
try:
lexer = get_lexer_for_mimetype(mime_type,
encoding=encoding,
encoding='utf-8',
stripnl=False)
except ClassNotFound:
try:
lexer = get_lexer_for_filename(filename,
encoding=encoding,
encoding='utf-8',
stripnl=False)
except ClassNotFound:
use_pygments = 0
except ImportError:
use_pygments = 0
# Detect encoding by calling chardet ourselves,
# to support it in non-highlighting mode
content = fp.read()
c, encoding = cfg.guesser().guess_charset(content)
if encoding:
content = c
else:
encoding = 'unknown'
# If we aren't going to be highlighting anything, just return the
# BLAME_SOURCE. If there's no blame_source, we'll generate a fake
# one from the file contents we fetch with PATH and REV.
@ -1475,11 +1479,7 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
else:
lines = []
line_no = 0
while 1:
line = fp.readline()
if not line:
break
line = cvsdb.utf8string(line)
for line in content.split('\n'):
line_no = line_no + 1
item = vclib.Annotation(cgi.escape(line), line_no,
None, None, None, None)
@ -1508,19 +1508,11 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type):
self.blame_data.append(item)
self.line_no = self.line_no + 1
ps = PygmentsSink(blame_source)
fpd = fp.read()
try:
fpdat = unicode(fpd,'utf-8')
except:
try:
fpdat = unicode(fpd,'cp1251')
except:
fpdat = fpd
highlight(fpdat, lexer,
highlight(content, lexer,
HtmlFormatter(nowrap=True,
classprefix='pygments-',
encoding='utf-8'), ps)
return ps.blame_data
return ps.blame_data, encoding
def make_time_string(date, cfg):
"""Returns formatted date string in either local time or UTC.
@ -1594,6 +1586,7 @@ def calculate_mime_type(request, path_parts, rev):
return mime_type
except:
pass
# FIXME rewrite to use viewvcmagic
return guess_mime(path_parts[-1])
def markup_or_annotate(request, is_annotate):
@ -1605,21 +1598,12 @@ def markup_or_annotate(request, is_annotate):
mime_type = calculate_mime_type(request, path, rev)
if not mime_type or mime_type == default_mime_type:
if request.ms is None and not request.ms_fail:
try:
import magic
request.ms = magic.open(magic.MAGIC_NONE | magic.MAGIC_MIME)
request.ms.load()
except:
request.ms_fail = 1
if request.ms:
try:
fp, revision = request.repos.openfile(path, rev)
buffer = fp.read(magic_buf_size)
fp.close()
mime_type = request.ms.buffer(buffer)
except:
pass
try:
fp, revision = request.repos.openfile(path, rev)
mime_type = request.cfg.guesser().guess_mime(None, None, fp)
fp.close()
except:
raise
# Is this a binary type?
if is_binary(request.cfg, mime_type):
@ -1657,9 +1641,10 @@ def markup_or_annotate(request, is_annotate):
if check_freshness(request, None, revision, weak=1):
fp.close()
return
lines = markup_stream_pygments(request, cfg, blame_source, fp,
path[-1], mime_type)
lines, charset = markup_stream_pygments(request, cfg, blame_source, fp, path[-1], mime_type)
fp.close()
if mime_type.find(';') < 0:
mime_type = mime_type+'; charset='+charset
data = common_template_data(request, revision)
data.merge(ezt.TemplateData({
@ -1910,7 +1895,7 @@ def view_directory(request):
row.short_log = format_log(file.log, cfg)
row.log = htmlify(file.log, cfg.options.mangle_email_addresses)
row.lockinfo = file.lockinfo
row.name = request.server.escape(cvsdb.utf8string(file.name))
row.name = request.server.escape(request.utf8(file.name))
row.anchor = row.name
row.pathtype = (file.kind == vclib.FILE and 'file') or \
(file.kind == vclib.DIR and 'dir')
@ -2285,7 +2270,7 @@ def view_log(request):
entry.ago = html_time(request, rev.date, 1)
entry.log = rev.log or ""
if cvs:
entry.log = cvsdb.utf8string(entry.log)
entry.log = request.utf8(entry.log)
entry.log = htmlify(entry.log, cfg.options.mangle_email_addresses)
entry.size = rev.size
entry.lockinfo = rev.lockinfo
@ -2867,7 +2852,7 @@ class DiffSource:
diff_code = line[0]
output = self._format_text(line[1:])
output = cvsdb.utf8string(output)
output = self.cfg.guesser().utf8(output)
if diff_code == '+':
if self.state == 'dump':
@ -3644,6 +3629,7 @@ def view_queryform(request):
'who_match' : request.query_dict.get('who_match', 'exact'),
'comment' : request.query_dict.get('comment', ''),
'comment_match' : request.query_dict.get('comment_match', 'fulltext'),
'search_content' : request.query_dict.get('search_content', ''),
'querysort' : request.query_dict.get('querysort', 'date'),
'date' : request.query_dict.get('date', 'hours'),
'hours' : request.query_dict.get('hours', '2'),
@ -3653,6 +3639,7 @@ def view_queryform(request):
'query_hidden_values' : query_hidden_values,
'limit_changes' : limit_changes,
'dir_href' : dir_href,
'enable_search_content' : request.cfg.cvsdb.index_content,
}))
generate_page(request, "query_form", data)
@ -3791,7 +3778,8 @@ def build_commit(request, files, max_files, dir_strip, format):
plus_count = 0
minus_count = 0
found_unreadable = 0
all_repos = {}
if not request.all_repos:
request.all_repos = {}
for f in files:
dirname = f.GetDirectory()
@ -3810,17 +3798,19 @@ def build_commit(request, files, max_files, dir_strip, format):
# Check path access (since the commits database logic bypasses the
# vclib layer and, thus, the vcauth stuff that layer uses).
my_repos = all_repos.get(f.GetRepository(), '')
my_repos = request.all_repos.get(f.GetRepository(), '')
if not my_repos:
try:
my_repos = all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
my_repos = request.all_repos[f.GetRepository()] = request.create_repos(f.GetRepository())
except:
my_repos = None
if not my_repos:
return None
if my_repos['roottype'] == 'cvs':
try: where = unicode(where,'utf-8')
# we store UTF-8 in the DB
try: where = where.decode('utf-8')
except: pass
# FIXME maybe store "real" filesystem path in the DB instead of having such setting?
try: where = where.encode(cfg.options.cvs_ondisk_charset)
except: pass
path_parts = _path_parts(where)
@ -3907,24 +3897,27 @@ def build_commit(request, files, max_files, dir_strip, format):
if max_files and num_allowed > max_files:
continue
commit_files.append(_item(date=commit_time,
dir=request.server.escape(dirname),
file=request.server.escape(filename),
author=request.server.escape(f.GetAuthor()),
rev=rev,
branch=f.GetBranch(),
plus=plus,
minus=minus,
type=change_type,
dir_href=dir_href,
log_href=log_href,
view_href=view_href,
download_href=download_href,
prefer_markup=prefer_markup,
diff_href=diff_href,
root=my_repos,
path=where,
path_prev=path_prev))
commit_files.append(_item(
date=commit_time,
dir=request.server.escape(dirname),
file=request.server.escape(filename),
author=request.server.escape(f.GetAuthor()),
rev=rev,
branch=f.GetBranch(),
plus=plus,
minus=minus,
type=change_type,
snippet=f.GetSnippet(),
dir_href=dir_href,
log_href=log_href,
view_href=view_href,
download_href=download_href,
prefer_markup=prefer_markup,
diff_href=diff_href,
root=my_repos,
path=where,
path_prev=path_prev,
))
# No files survived authz checks? Let's just pretend this
# little commit didn't happen, shall we?
@ -4115,6 +4108,7 @@ def view_query(request):
who_match = request.query_dict.get('who_match', 'exact')
comment = request.query_dict.get('comment', '')
comment_match = request.query_dict.get('comment_match', 'fulltext')
search_content = request.query_dict.get('search_content', '')
querysort = request.query_dict.get('querysort', 'date')
date = request.query_dict.get('date', 'hours')
hours = request.query_dict.get('hours', '2')
@ -4126,7 +4120,7 @@ def view_query(request):
cfg.options.limit_changes))
match_types = { 'exact':1, 'like':1, 'glob':1, 'regex':1, 'notregex':1 }
sort_types = { 'date':1, 'author':1, 'file':1 }
sort_types = { 'date':1, 'date_rev':1, 'author':1, 'file':1, 'relevance':1 }
date_types = { 'hours':1, 'day':1, 'week':1, 'month':1,
'all':1, 'explicit':1 }
@ -4193,6 +4187,8 @@ def view_query(request):
query.SetComment(comment, comment_match)
else:
query.SetTextQuery(comment)
if search_content:
query.SetContentQuery(search_content)
query.SetSortMethod(querysort)
if date == 'hours':
query.SetFromDateHoursAgo(int(hours))

70
lib/viewvcmagic.py Normal file
View File

@ -0,0 +1,70 @@
#!/usr/bin/python
"""Content-type and charset guessing helpers for ViewVC.

MIME types are guessed from an explicit hint, the filename extension,
or libmagic content sniffing (in that order).  Charsets are guessed with
chardet when available, otherwise by trying UTF-8 and then a configured
fallback list of encodings.
"""

import mimetypes

# chardet is optional; without it we fall back to the configured
# encoding list in guess_charset().
have_chardet = 0
try:
    import chardet
    have_chardet = 1
except ImportError:
    pass


class ContentMagic:

    def __init__(self, encodings):
        """encodings: colon-separated fallback charset list, e.g. "utf-8:cp1251"."""
        self.encodings = encodings.split(':')
        self.mime_magic = None
        # Failures to initialize libmagic are remembered here for diagnostics
        # instead of being raised, so the rest of ViewVC keeps working.
        self.errors = []
        try:
            import magic
            self.mime_magic = magic.open(magic.MAGIC_MIME_TYPE)
            self.mime_magic.load()
        except Exception as e:
            self.errors.append(e)

    def guess_mime(self, mime, filename, tempfile):
        """Return a MIME type for the content.

        mime:     caller-supplied type hint; 'application/octet-stream' is
                  treated as "unknown" and discarded.
        filename: used for extension-based guessing via mimetypes.
        tempfile: either a filesystem path (string) or a file-like object;
                  used for libmagic content sniffing as a last resort.
        May return None/'' when nothing could be determined.
        """
        if mime == 'application/octet-stream':
            mime = ''
        if not mime and filename:
            mime = mimetypes.guess_type(filename)[0]
        if not mime and tempfile and self.mime_magic:
            if type(tempfile) == type(''):
                mime = self.mime_magic.file(tempfile)
            else:
                # Sniff only the first 4 KB; enough for magic detection.
                c = tempfile.read(4096)
                mime = self.mime_magic.buffer(c)
        return mime

    def guess_charset(self, content):
        """Return (unicode_content, charset).

        charset is None when no encoding worked; in that case content is
        returned unchanged (still a byte string).
        """
        charset = None
        if have_chardet:
            try:
                guess = chardet.detect(content)
                # BUG FIX: the original assigned detect()'s result dict to
                # `charset` itself; when detection yielded no encoding the
                # truthy dict survived, the `charset is None` fallback below
                # was skipped, and the raw dict leaked to callers.
                if guess and guess['encoding']:
                    charset = guess['encoding']
                    content = content.decode(charset)
            except Exception:
                charset = None
        else:
            # No chardet: optimistically try UTF-8 first.
            charset = 'utf-8'
            try:
                content = content.decode('utf-8')
            except Exception:
                charset = None
        # Primitive fallback: first configured encoding that decodes wins.
        if charset is None:
            for charset in self.encodings:
                try:
                    content = content.decode(charset)
                    break
                except Exception:
                    charset = None
        return (content, charset)

    def utf8(self, content):
        """Guess the charset of content and re-encode it as UTF-8 bytes.

        If no charset could be guessed, content is returned untouched.
        """
        (uni, charset) = self.guess_charset(content)
        if charset:
            return uni.encode('utf-8')
        return content

View File

@ -144,7 +144,7 @@ Browse Directory</a></p>
<tr>
<th style="text-align:right;vertical-align:top;">Comment:</th>
<td>
<input type="text" name="comment" value="[comment]" /><br />
<input type="text" name="comment" value="[comment]" size="40" /><br />
<label for="comment_match_exact">
<input type="radio" name="comment_match" id="comment_match_fulltext"
value="fulltext" [is comment_match "fulltext"]checked=""[end] />
@ -172,13 +172,21 @@ Browse Directory</a></p>
</label>
</td>
</tr>
[if-any enable_search_content]
<tr>
<th style="text-align:right;vertical-align:top;">Search content:</th>
<td><input type="text" name="search_content" value="[search_content]" size="60" /></td>
</tr>
[end]
<tr>
<th style="text-align:right;vertical-align:top;">Sort By:</th>
<td>
<select name="querysort">
<option value="date" [is querysort "date"]selected="selected"[end]>Date</option>
<option value="date_rev" [is querysort "date_rev"]selected="selected"[end]>Date (oldest first)</option>
<option value="author" [is querysort "author"]selected="selected"[end]>Author</option>
<option value="file" [is querysort "file"]selected="selected"[end]>File</option>
<option value="relevance" [is querysort "relevance"]selected="selected"[end]>Relevance</option>
</select>
</td>
</tr>

View File

@ -46,15 +46,18 @@
<tr class="vc_row_[if-index commits even]even[else]odd[end]">
<td style="vertical-align: top;">
[define rev_href][if-any commits.files.prefer_markup][commits.files.view_href][else][if-any commits.files.download_href][commits.files.download_href][end][end][end]
[if-any commits.files.rev][if-any rev_href]<a href="[rev_href]">[end][commits.files.rev][if-any rev_href]</a>[end][else]&nbsp;[end]
[if-any commits.files.rev][if-any rev_href]<a href="[rev_href]">[end][commits.files.rev][if-any rev_href]</a>[end][else]&nbsp;[end]
</td>
<td style="vertical-align: top;">
<a href="[commits.files.dir_href]">[commits.files.dir]/</a>
<a href="[commits.files.log_href]">[commits.files.file]</a>
[if-any commits.files.snippet]
<div class="snippet">[commits.files.snippet]</div>
[end]
</td>
[if-any show_branch]
<td style="vertical-align: top;">
[if-any commits.files.branch][commits.files.branch][else]&nbsp;[end]
[if-any commits.files.branch][commits.files.branch][else]&nbsp;[end]
</td>
[end]
<td style="vertical-align: top;">
@ -68,10 +71,10 @@
[is commits.files.type "Remove"]</del>[end]
</td>
<td style="vertical-align: top;">
[if-any commits.files.date][commits.files.date][else]&nbsp;[end]
[if-any commits.files.date][commits.files.date][else]&nbsp;[end]
</td>
<td style="vertical-align: top;">
[if-any commits.files.author][commits.files.author][else]&nbsp;[end]
[if-any commits.files.author][commits.files.author][else]&nbsp;[end]
</td>
</tr>
[end]