diff --git a/bin/make-database b/bin/make-database index b8775705..f538b837 100755 --- a/bin/make-database +++ b/bin/make-database @@ -44,6 +44,7 @@ CREATE TABLE branches ( DROP TABLE IF EXISTS checkins; CREATE TABLE checkins ( + id int NOT NULL AUTO_INCREMENT PRIMARY KEY, type enum('Change','Add','Remove'), ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL, whoid mediumint(9) DEFAULT '0' NOT NULL, @@ -57,7 +58,7 @@ CREATE TABLE checkins ( removedlines int(11) DEFAULT '0' NOT NULL, descid mediumint(9), UNIQUE repositoryid (repositoryid,dirid,fileid,revision), - KEY repository_when (repositoryid,ci_when), + KEY repositoryid_when (repositoryid,ci_when), KEY ci_when (ci_when), KEY whoid (whoid,ci_when), KEY dirid (dirid), @@ -138,6 +139,7 @@ CREATE TABLE branches ( DROP TABLE IF EXISTS commits; CREATE TABLE commits ( + id int NOT NULL AUTO_INCREMENT PRIMARY KEY, type enum('Change','Add','Remove'), ci_when datetime DEFAULT '0000-00-00 00:00:00' NOT NULL, whoid mediumint(9) DEFAULT '0' NOT NULL, @@ -151,9 +153,9 @@ CREATE TABLE commits ( removedlines int(11) DEFAULT '0' NOT NULL, descid mediumint(9), UNIQUE repositoryid (repositoryid,dirid,fileid,revision), + KEY repositoryid_when (repositoryid,ci_when), KEY ci_when (ci_when), - KEY whoid (whoid), - KEY repositoryid_2 (repositoryid), + KEY whoid (whoid,ci_when), KEY dirid (dirid), KEY fileid (fileid), KEY branchid (branchid), @@ -253,7 +255,7 @@ Options: [Default: ViewVC] --help Show this usage message. - + --hostname=ARG Use ARG as the hostname for the MySQL connection. [Default: localhost] @@ -264,7 +266,7 @@ Options: --version=ARG Create the database using the schema employed by version ARG of ViewVC. 
class TikaClient:
  """Minimal client for an Apache Tika instance running in TCP server mode.

  The whole source document is streamed to the server over a plain TCP
  socket, and everything the server sends back is collected as the
  extracted text.
  """

  def __init__(self, tika_server, mime_types):
    """Remember the server address and compile the MIME type filter.

    tika_server -- server address in 'host:port' form.
    mime_types  -- whitespace-separated list of MIME types this client
                   will send to Tika; '*' matches any run of characters.

    Raises Exception when tika_server is not in 'host:port' form, and
    ValueError when the port part is not an integer.
    """
    self.tika_server = tika_server
    self.mime_types = mime_types
    # Split address
    self.addr = tika_server.split(':')
    if len(self.addr) != 2:
      raise Exception('tika_server value is incorrect: \''+tika_server+'\', please use \'host:port\' format')
    self.addr = (self.addr[0], int(self.addr[1]))
    # Build regexp for MIME types: every entry becomes an anchored
    # alternative, with the escaped '*' turned back into a wildcard.
    m = re.split(r'\s+', mime_types.strip())
    self.mime_regexp = re.compile('|'.join('^'+re.escape(i).replace('\\*', '.*')+'$' for i in m))

  def get_text(self, filename, mime_type, log_filename):
    """Extract text from a file using Tika which runs in server mode.

    filename     -- path of the file whose content is sent to Tika.
    mime_type    -- MIME type of the file; checked against the filter
                    built in __init__.
    log_filename -- name used for the file in progress/error messages.

    Returns the bytes received from Tika.  Returns '' when the MIME type
    is not accepted or the file is empty.  Errors are reported on stdout
    and whatever was received so far (possibly nothing) is returned.
    """
    if mime_type is None or not self.mime_regexp.match(mime_type):
      # Tika can't handle this mime type, return nothing
      return ''
    s = None
    parts = []
    fsize = 0
    try:
      # Read original file
      fd = open(filename, 'rb')
      try:
        data = fd.read()
      finally:
        fd.close()
      fsize = len(data)
      if not fsize:
        return ''
      # Connect to Tika
      s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
      s.connect(self.addr)
      s.setblocking(0)
      sockfd = s.fileno()
      # Tika is somewhat delicate about network IO, so:
      # Read and write using poll(2) system call
      p = select.poll()
      p.register(sockfd)
      while 1:
        fds = p.poll()
        if not fds:
          break
        (pollfd, event) = fds[0]
        if event & select.POLLIN:
          # Exception or empty data means EOF...
          try:
            part = os.read(sockfd, 65536)
          except OSError:
            break
          if not part:
            break
          parts.append(part)
        elif event & (select.POLLHUP | select.POLLERR | select.POLLNVAL):
          # The peer is gone and nothing is left to read.  Without this
          # branch the loop would spin forever, because poll() keeps
          # reporting the condition and no other branch consumes it.
          break
        if event & select.POLLOUT:
          if not len(data):
            # Shutdown output and forget about POLLOUT
            s.shutdown(socket.SHUT_WR)
            p.modify(sockfd, select.POLLIN)
          else:
            # Write and consume some data
            l = os.write(sockfd, data)
            data = data[l:]
      if not parts:
        raise Exception('Empty response from Tika server')
      print("Extracted %d bytes from %s (%s) of size %d" % (sum(len(part) for part in parts), log_filename, mime_type, fsize))
    except Exception as e:
      print("Error extracting text from %s (%s) of size %d: %s" % (log_filename, mime_type, fsize, str(e)))
    finally:
      if s: s.close()
    # Joining once avoids the quadratic cost of repeated concatenation;
    # under Python 2 a b'' literal is the ordinary str type.
    return b''.join(parts)
svn.fs.contents_changed(root1, path1, root2, path2) - # Temp file with contents is at: diffobj.tempfile2 - # Apache Tika server may even be at another host! # CustIS Bug 50473: a workaround for svnlib behaviour in file movements (FILE1 -> FILE2 + FILE1 -> null) if change.base_path: - if not change.path and changes_hash.get(change.base_path, '') != '': + if not change.path and change.base_path in changes_hash: minus = 0 elif change.path: changes_hash[change.base_path] = change.path - self.changes.append((path, action, plus, minus)) + content = '' + mime = '' + # need to check if binary file's content changed when copying, + # if not, don't extract it, just get it from previous revision later + if repo.index_content and change.path and (not change.base_path + or svn.fs.contents_changed( + base_root and base_root or None, + base_root and change.base_path or None, + fsroot, change.path + )): + props = svn.fs.node_proplist(fsroot, change.path) + if not repo.svn_ignore_mimetype: + mime = props.get('svn:mime-type', None) + else: + mime = None + mime = repo.guesser.guess_mime( + mime, + os.path.basename(change.path), + diffobj.tempfile2 + ) + # Read and guess charset by ourselves for text files + if mime.startswith('text/') or (mime.startswith('application/') and mime.endswith('xml')): + try: + fd = open(diffobj.tempfile2, 'rb') + content = fd.read() + fd.close() + except: pass + # Guess charset + if content: + content, charset = repo.guesser.guess_charset(content) + if charset: + content = content.encode('utf-8') + print 'Guessed %s for %s' % (charset, change.path) + else: + print 'Failed to guess charset for %s, not indexing' % (change.path, ) + # Try to extract content using Tika from binary documents + elif repo.tika_client: + content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path) + self.changes.append((path, action, plus, minus, content, mime)) def _get_root_for_rev(self, rev): """Fetch a revision root from a cache of such, or a fresh root @@ -217,7 
+329,7 @@ def handle_revision(db, command, repo, rev, verbose, force=0): if verbose: print "skipped (no changes)." return - for (path, action, plus, minus) in revision.changes: + for (path, action, plus, minus, content, mime) in revision.changes: directory, file = os.path.split(path) commit = cvsdb.CreateCommit() commit.SetRepository(repo.path) @@ -230,6 +342,8 @@ def handle_revision(db, command, repo, rev, verbose, force=0): commit.SetPlusCount(plus) commit.SetMinusCount(minus) commit.SetBranch(None) + commit.SetContent(content) + commit.SetMimeType(mime) if action == 'add': commit.SetTypeAdd() @@ -268,7 +382,16 @@ def main(command, repository, revs=[], verbose=0, force=0): sys.stderr.write("ERROR: " + str(e) + "\n") sys.exit(1) - repo = SvnRepo(repository) + tika_client = None + if cfg.utilities.tika_server: + tika_client = TikaClient(cfg.utilities.tika_server, cfg.utilities.tika_mime_types) + repo = SvnRepo( + path = repository, + index_content = cfg.cvsdb.index_content, + tika_client = tika_client, + guesser = cfg.guesser(), + svn_ignore_mimetype = cfg.options.svn_ignore_mimetype, + ) if command == 'rebuild' or (command == 'update' and not revs): for rev in range(repo.rev_max+1): handle_revision(db, command, repo, rev, verbose) @@ -312,7 +435,7 @@ Usage: 1. %s [-v] rebuild REPOS-PATH the database. If a range is specified, the revisions will be processed in ascending order, and you may specify "HEAD" to indicate "the youngest revision currently in the repository". - + 3. Purge information specific to the repository located at REPOS-PATH from the database. 
@@ -337,7 +460,7 @@ if __name__ == '__main__': del args[index] except ValueError: pass - + if len(args) < 3: usage() diff --git a/conf/viewvc.conf.dist b/conf/viewvc.conf.dist index 3a26a4fe..0a1017ec 100644 --- a/conf/viewvc.conf.dist +++ b/conf/viewvc.conf.dist @@ -1,6 +1,6 @@ #--------------------------------------------------------------------------- # -# Configuration file for ViewVC +# Configuration file for ViewVC (4IntraNet patched version) # # Information on ViewVC is located at the following web site: # http://viewvc.org/ @@ -9,7 +9,7 @@ # THE FORMAT OF THIS CONFIGURATION FILE # -# This file is delineated by sections, specified in [brackets]. Within +# This file is delineated by sections, specified in [brackets]. Within # each section, are a number of configuration settings. These settings # take the form of: name = value. Values may be continued on the # following line by indenting the continued line. @@ -17,14 +17,14 @@ # WARNING: Indentation *always* means continuation. Name=value lines # should always start in column zero. # -# Comments should always start in column zero, and are identified +# Comments should always start in column zero, and are identified # with "#". # -# Certain configuration settings may have multiple values. These should -# be separated by a comma. The settings where this is allowed are noted +# Certain configuration settings may have multiple values. These should +# be separated by a comma. The settings where this is allowed are noted # below. Any other setting that requires special syntax is noted at that # setting. -# +# # # SOME TERMINOLOGY USED HEREIN # @@ -50,10 +50,10 @@ # recommend you pay attention to. Of course, don't try to change the # options here -- do so in the relevant section of the configuration # file below. 
-# +# # For correct operation, you will probably need to change the following # configuration variables: -# +# # cvs_roots (for CVS) # svn_roots (for Subversion) # root_parents (for CVS or Subversion) @@ -62,18 +62,18 @@ # rcs_dir # mime_types_file # the many options in the [utilities] section -# +# # It is usually desirable to change the following variables: -# +# # address # forbidden -# +# # To optimize delivery of ViewVC static files: -# +# # docroot -# +# # To customize the display of ViewVC for your site: -# +# # template_dir # the [templates] override section # @@ -139,7 +139,7 @@ default_root = cvs # provided only as a convenience for ViewVC installations which are # using the default template set, where the value of this option will # be displayed in the footer of every ViewVC page.) -address = +address = # # This option provides a mechanism for custom key/value pairs to be @@ -244,21 +244,47 @@ cvsnt = # See also bin/cvsnt-rcsfile-inetd.pl -#rcsfile_socket = 'host:port' -# Example: rcsfile_socket = '127.0.0.1:8071' +#rcsfile_socket = host:port +# Example: rcsfile_socket = 127.0.0.1:8071 # Subversion command-line client, used for viewing Subversion repositories svn = # svn = /usr/bin/svn # GNU diff, used for showing file version differences -diff = +diff = # diff = /usr/bin/diff # CvsGraph, a graphical CVS version graph generator (see options.use_cvsgraph) cvsgraph = # cvsgraph = /usr/local/bin/cvsgraph +# Apache Tika TCP server host and port, used to extract text from binary documents +# Note that as of 2011-09-12, Tika 0.9 has a bug which leads to hangups on processing +# M$Word documents in server mode. 
So you must use the fixed version, downloaded from: +# http://wiki.4intra.net/public/tika-app-0.9-fix-TIKA709.jar +# (mirror) http://code.google.com/p/mediawiki4intranet/downloads/detail?name=tika-app-0.9-fix-TIKA709.jar +# Or apply the patch by yourself and rebuild Tika from source, see patch here: +# https://issues.apache.org/jira/browse/TIKA-709 +# Tika server should be started with command 'java -jar tika-app-0.9.jar -p PORT -t -eutf-8' + +#tika_server = host:port +# Example: tika_server = 127.0.0.1:8072 + +# This lists MIME types that can be processed by Tika +# You may change it if your Tika is newer than 0.9 and supports more formats +# (note) *+xml examples: xhtml+xml, rss+xml, atom+xml, docbook+xml, rdf+xml +tika_mime_types = + text/* + application/*+xml + application/xml + application/vnd.oasis.opendocument.* + application/vnd.openxmlformats + application/vnd.ms-* + application/msaccess + application/msword + application/pdf + application/rtf #--------------------------------------------------------------------------- [options] @@ -358,7 +384,7 @@ svn_ignore_mimetype = 0 # directory ViewVC should consult for various things, including cached # remote authentication credentials. If unset, Subversion will use # the default location(s) ($HOME/.subversion, etc.) -svn_config_dir = +svn_config_dir = # use the rcsparse Python module to retrieve CVS repository # information instead of invoking rcs utilities [EXPERIMENTAL] @@ -494,12 +520,18 @@ short_log_len = 80 # should we colorize known file content syntaxes? (requires Pygments module) enable_syntax_coloration = 1 +# detect_encoding: Should we attempt to detect versioned file +# character encodings? [Requires 'chardet' module] +# Used in file list, file content display and indexing +# See also options.encodings for naive guessing. +detect_encoding = 1 + # Use CvsGraph. See http://www.akhphd.au.dk/~bertho/cvsgraph/ for -# documentation and download. +# documentation and download. 
use_cvsgraph = 0 #use_cvsgraph = 1 -# Location of the customized cvsgraph configuration file. +# Location of the customized cvsgraph configuration file. cvsgraph_conf = cvsgraph.conf # @@ -544,6 +576,17 @@ use_pagesize = 0 # Set to 0 to disable the limit. limit_changes = 100 +# You can also use primitive charset guessing instead of chardet (options.detect_encoding) +# Just set this to the list of possible charsets in your repository. +# ViewVC will simply try to decode content using each of them, and pick +# the first which succeeds. UTF-8 is always tried automatically. +#encodings = cp1251:iso-8859-1 + +# Sadly this is also required - for back-links from query results to files +# in CVS, because it doesn't recode file names to UTF-8 as Subversion does. +# Just set to cp1251 if you work with your CVS from Windowz. +#cvs_ondisk_charset = cp1251 + #--------------------------------------------------------------------------- [templates] @@ -554,7 +597,7 @@ limit_changes = 100 # use a different template for a particular view, simply uncomment the # appropriate option below and specify the currect location of the EZT # template file you wish to use for that view. -# +# # Templates are specified relative to the configured template # directory (see the "template_dir" option), but absolute paths may # also be used as well. @@ -569,13 +612,13 @@ limit_changes = 100 #diff = diff.ezt #directory = directory.ezt ### an alternative directory view -#directory = dir_new.ezt +#directory = dir_new.ezt #error = error.ezt #file = file.ezt #graph = graph.ezt #log = log.ezt ### a table-based alternative log view -#log = log_table.ezt +#log = log_table.ezt #query = query.ezt #query_form = query_form.ezt #query_results = query_results.ezt @@ -588,22 +631,51 @@ limit_changes = 100 # Set to 1 to enable the database integration feature, 0 otherwise. enabled = 0 -# Database hostname and port. 
+# Set to 1 to enable indexing of file contents using Sphinx and Tika +index_content = 0 + +# Database hostname, port, and socket #host = localhost #port = 3306 +# On Debian Linux, enable this: +#socket = /var/run/mysqld/mysqld.sock # ViewVC database name. #database_name = ViewVC # Username and password of user with read/write privileges to the ViewVC # database. -#user = -#passwd = +#user = +#passwd = # Username and password of user with read privileges to the ViewVC # database. -#readonly_user = -#readonly_passwd = +#readonly_user = +#readonly_passwd = + +# ViewVC can use Sphinx (http://sphinxsearch.com) full-text search engine +# to index file contents with full history and then search over them. +# Also, Apache Tika console application can be used in TCP server mode to +# add support for indexing binary documents (MS Word, PDF, etc.). +# See tika_server in [utilities]. +# Requires Sphinx >= 0.9.9 with a real-time updatable SphinxQL index. +# Index must be created in sphinx.conf by hand and have the following fields: +# rt_field = content +# rt_attr_string = content +# rt_attr_string = mimetype +# rt_attr_timestamp = ci_when +# rt_attr_uint = whoid +# rt_attr_uint = repositoryid +# rt_attr_uint = dirid +# rt_attr_uint = fileid +# rt_attr_uint = revision +# rt_attr_uint = branchid + +# Sphinx connection parameters: +#sphinx_host = +#sphinx_port = +#sphinx_socket = /var/run/sphinxql.sock +#sphinx_index = viewvc # Limit the number of rows returned by a given query to this number. #row_limit = 1000 @@ -616,7 +688,7 @@ enabled = 0 # Check if the repository is found in the database before showing # the query link and RSS feeds. Set to 1 to enable check. -# +# # WARNING: Enabling this check adds the cost of a database connection # and query to most ViewVC requests. 
If all your roots are represented # in the commits database, or if you don't care about the creation of @@ -640,7 +712,7 @@ enabled = 0 # # ViewVC allows you to customize its configuration options for # individual virtual hosts. You might, for example, wish to expose -# all of your Subversion repositories at http://svn.yourdomain.com/viewvc/ +# all of your Subversion repositories at http://svn.yourdomain.com/viewvc/ # and all your CVS ones at http://cvs.yourdomain.com/viewvc/, with no # cross-exposure. Using ViewVC's virtual host (vhost) configuration # support, you can do this. Simply create two vhost configurations @@ -671,7 +743,7 @@ enabled = 0 # gui = guiproject.yourdomain.* # # [vhost-libs/general] -# cvs_roots = +# cvs_roots = # svn_roots = svnroot: /var/svn/libs-repos # default_root = svnroot # @@ -680,7 +752,7 @@ enabled = 0 # # [vhost-gui/general] # cvs_roots = cvsroot: /var/cvs/guiproject -# svn_roots = +# svn_roots = # default_root = cvsroot # @@ -697,7 +769,7 @@ enabled = 0 # # Here is an example showing how to enable Subversion authz-based # authorization for only the single root named "svnroot": -# +# # [root-svnroot/options] # authorizer = svnauthz # @@ -726,7 +798,7 @@ enabled = 0 # # Tests are case-sensitive. # -# NOTE: Again, this is for the hiding of modules within repositories, *not* +# NOTE: Again, this is for the hiding of modules within repositories, *not* # for the hiding of repositories (roots) themselves. 
# # Some examples: @@ -749,7 +821,7 @@ enabled = 0 # Allow "xml", forbid other modules starting with "x", and allow the rest: # forbidden = !xml, x*, !* # -forbidden = +forbidden = #--------------------------------------------------------------------------- [authz-forbiddenre] @@ -792,7 +864,7 @@ forbidden = # Only allow visibility of HTML files and the directories that hold them: # forbiddenre = !^([^/]+|.*(/|\.html))$ # -forbiddenre = +forbiddenre = #--------------------------------------------------------------------------- [authz-svnauthz] diff --git a/lib/config.py b/lib/config.py index cdc04386..264d7ea4 100644 --- a/lib/config.py +++ b/lib/config.py @@ -24,6 +24,7 @@ import vclib.ccvs import vclib.svn import cvsdb import viewvc +from viewvcmagic import ContentMagic ######################################################################### # @@ -47,6 +48,7 @@ class Config: 'root_parents', 'allowed_views', 'mime_types_files') def __init__(self): + self.__guesser = None for section in self._sections: setattr(self, section, _sub_config()) @@ -66,7 +68,6 @@ class Config: if rootname: self._process_root_options(self.parser, rootname) self.expand_root_parents() - cvsdb.setencs(self.options.encodings.split(':')) r = {} for i in self.rewritehtml.__dict__.keys(): if i[-8:] == '.replace': @@ -201,7 +202,7 @@ class Config: pass else: raise IllegalOverrideSection('root', section) - + def overlay_root_options(self, rootname): "Overly per-root options atop the existing option set." 
if not self.conf_path: @@ -217,7 +218,7 @@ class Config: for option in parser.options(section): d[option] = parser.get(section, option) return d.items() - + def get_authorizer_params(self, authorizer, rootname=None): if not self.conf_path: return {} @@ -236,7 +237,12 @@ class Config: params[key] = value params['__config'] = self return params - + + def guesser(self): + if not self.__guesser: + self.__guesser = ContentMagic(self.options.encodings) + return self.__guesser + def set_defaults(self): "Set some default values in the configuration." @@ -258,6 +264,8 @@ class Config: self.utilities.svn = '' self.utilities.diff = '' self.utilities.cvsgraph = '' + self.utilities.tika_server = '' + self.utilities.tika_mime_types = '' self.options.root_as_url_component = 1 self.options.checkout_magic = 0 @@ -302,7 +310,7 @@ class Config: self.options.limit_changes = 100 self.options.cvs_ondisk_charset = 'cp1251' self.options.binary_mime_re = '^(?!text/|.*\Wxml)' - self.options.encodings = 'utf-8:cp1251:iso-8859-1' + self.options.encodings = 'cp1251:iso-8859-1' self.templates.diff = None self.templates.directory = None @@ -316,6 +324,7 @@ class Config: self.templates.roots = None self.cvsdb.enabled = 0 + self.cvsdb.index_content = 0 self.cvsdb.host = '' self.cvsdb.port = 3306 self.cvsdb.socket = '' @@ -323,12 +332,17 @@ class Config: self.cvsdb.user = '' self.cvsdb.passwd = '' self.cvsdb.readonly_user = '' - self.cvsdb.readonly_passwd = '' + self.cvsdb.readonly_passwd = '' self.cvsdb.row_limit = 1000 self.cvsdb.rss_row_limit = 100 self.cvsdb.check_database_for_root = 0 self.cvsdb.fulltext_min_relevance = 0.2 + self.cvsdb.sphinx_host = '' + self.cvsdb.sphinx_port = 3307 + self.cvsdb.sphinx_socket = '' + self.cvsdb.sphinx_index = '' + def _startswith(somestr, substr): return somestr[:len(substr)] == substr diff --git a/lib/cvsdb.py b/lib/cvsdb.py index e5e7a7e1..a614c5e0 100644 --- a/lib/cvsdb.py +++ b/lib/cvsdb.py @@ -15,6 +15,7 @@ import sys import string import time import re 
+import cgi import vclib import dbi @@ -36,22 +37,12 @@ error = "cvsdb error" ## defined to actually be complete; it should run well off of any DBI 2.0 ## complient database interface -encs = [ "utf-8", "cp1251", "iso-8859-1" ] - -def utf8string(value): - for e in encs: - try: - value = value.decode(e) - break - except: pass - return value.encode("utf-8") - -def setencs(e): - global encs - encs = e - class CheckinDatabase: - def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, authorizer = None): + def __init__(self, host, port, socket, user, passwd, database, row_limit, min_relevance, cfg, + authorizer = None, index_content = 0, sphinx_host = None, sphinx_port = None, + sphinx_socket = None, sphinx_index = None): + self.cfg = cfg + self._host = host self._port = port self._socket = socket @@ -63,11 +54,21 @@ class CheckinDatabase: self._min_relevance = min_relevance self.authorizer = authorizer + # Sphinx settings + self.index_content = index_content + self.sphinx_host = sphinx_host + self.sphinx_port = sphinx_port + self.sphinx_socket = sphinx_socket + self.sphinx_index = sphinx_index + ## database lookup caches self._get_cache = {} self._get_id_cache = {} self._desc_id_cache = {} + # Sphinx connection None by default + self.sphinx = None + def Connect(self): self.db = dbi.connect( self._host, self._port, self._socket, self._user, self._passwd, self._database) @@ -83,12 +84,17 @@ class CheckinDatabase: else: self._version = 0 if self._version > CURRENT_SCHEMA_VERSION: - raise DatabaseVersionError("Database version %d is newer than the " - "last version supported by this " - "software." % (self._version)) + raise DatabaseVersionError("Database version %d is newer than the " + "last version supported by this " + "software." 
% (self._version)) + if self.index_content: + self.sphinx = dbi.connect(self.sphinx_host, self.sphinx_port, self.sphinx_socket, '', '', '') + + def utf8(self, value): + return self.cfg.guesser().utf8(value) def sql_get_id(self, table, column, value, auto_set): - value = utf8string(value) + value = self.utf8(value) sql = "SELECT id FROM %s WHERE %s=%%s" % (table, column) sql_args = (value, ) @@ -172,7 +178,7 @@ class CheckinDatabase: temp2[id] = value return value - + def get_list(self, table, field_index): sql = "SELECT * FROM %s" % (table) cursor = self.db.cursor() @@ -198,7 +204,7 @@ class CheckinDatabase: break list.append(row[0]) return list - + def GetMetadataValue(self, name): sql = "SELECT value FROM metadata WHERE name=%s" sql_args = (name) @@ -209,7 +215,7 @@ class CheckinDatabase: except TypeError: return None return value - + def SetMetadataValue(self, name, value): assert(self._version > 0) sql = "REPLACE INTO metadata (name, value) VALUES (%s, %s)" @@ -222,7 +228,7 @@ class CheckinDatabase: "\tname = %s\n" "\tvalue = %s\n" % (str(e), name, value)) - + def GetBranchID(self, branch, auto_set = 1): return self.get_id("branches", "branch", branch, auto_set) @@ -240,13 +246,13 @@ class CheckinDatabase: def GetFile(self, id): return self.get("files", "file", id) - + def GetAuthorID(self, author, auto_set = 1): return self.get_id("people", "who", author, auto_set) def GetAuthor(self, id): return self.get("people", "who", id) - + def GetRepositoryID(self, repository, auto_set = 1): return self.get_id("repositories", "repository", repository, auto_set) @@ -257,7 +263,7 @@ class CheckinDatabase: return self.get_list("repositories", repository) def SQLGetDescriptionID(self, description, auto_set = 1): - description = utf8string(description) + description = self.utf8(description) ## lame string hash, blame Netscape -JMP hash = len(description) @@ -330,7 +336,7 @@ class CheckinDatabase: ci_when = cursor.fetchone()[0] except TypeError: return None - + return 
dbi.TicksFromDateTime(ci_when) def AddCommitList(self, commit_list): @@ -338,48 +344,55 @@ class CheckinDatabase: self.AddCommit(commit) def AddCommit(self, commit): - ci_when = dbi.DateTimeFromTicks(commit.GetTime() or 0.0) - ci_type = commit.GetTypeString() - who_id = self.GetAuthorID(commit.GetAuthor()) - repository_id = self.GetRepositoryID(commit.GetRepository()) - directory_id = self.GetDirectoryID(commit.GetDirectory()) - file_id = self.GetFileID(commit.GetFile()) - revision = commit.GetRevision() - sticky_tag = "NULL" - branch_id = self.GetBranchID(commit.GetBranch()) - plus_count = commit.GetPlusCount() or '0' - minus_count = commit.GetMinusCount() or '0' - description_id = self.GetDescriptionID(commit.GetDescription()) + props = { + 'type' : commit.GetTypeString(), + 'ci_when' : dbi.DateTimeFromTicks(commit.GetTime() or 0.0), + 'whoid' : self.GetAuthorID(commit.GetAuthor()), + 'repositoryid' : self.GetRepositoryID(commit.GetRepository()), + 'dirid' : self.GetDirectoryID(commit.GetDirectory()), + 'fileid' : self.GetFileID(commit.GetFile()), + 'revision' : commit.GetRevision(), + 'branchid' : self.GetBranchID(commit.GetBranch()), + 'addedlines' : commit.GetPlusCount() or '0', + 'removedlines' : commit.GetMinusCount() or '0', + 'descid' : self.GetDescriptionID(commit.GetDescription()), + } commits_table = self._version >= 1 and 'commits' or 'checkins' - sql = "REPLACE INTO %s" % (commits_table) - sql = sql + \ - " (type,ci_when,whoid,repositoryid,dirid,fileid,revision,"\ - " stickytag,branchid,addedlines,removedlines,descid)"\ - "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" - sql_args = (ci_type, ci_when, who_id, repository_id, - directory_id, file_id, revision, sticky_tag, branch_id, - plus_count, minus_count, description_id) cursor = self.db.cursor() try: - cursor.execute(sql, sql_args) + # MySQL-specific INSERT-or-UPDATE with ID retrieval + cursor.execute( + 'INSERT INTO '+commits_table+'('+','.join(i for i in props)+') VALUES ('+ + ', '.join('%s' for i 
in props)+') ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), '+ + ', '.join(i+'=VALUES('+i+')' for i in props), + tuple(props[i] for i in props) + ) + commit_id = cursor.lastrowid + if self.index_content: + sphcur = self.sphinx.cursor() + content = commit.GetContent() + props['ci_when'] = str(int(commit.GetTime() or 0)) + if len(content): + props['content'] = content + # Now, stored MIME type is only needed while searching + # It is guessed again when the file is displayed + props['mimetype'] = commit.GetMimeType() + props['id'] = str(commit_id) + del props['addedlines'] + del props['removedlines'] + del props['descid'] + del props['type'] + sphcur.execute( + 'INSERT INTO '+self.sphinx_index+'('+','.join(i for i in props)+') VALUES ('+ + ','.join('%s' for i in props)+')', + tuple(props[i] for i in props) + ) except Exception, e: - raise Exception("Error adding commit: '%s'\n" - "Values were:\n" - "\ttype = %s\n" - "\tci_when = %s\n" - "\twhoid = %s\n" - "\trepositoryid = %s\n" - "\tdirid = %s\n" - "\tfileid = %s\n" - "\trevision = %s\n" - "\tstickytag = %s\n" - "\tbranchid = %s\n" - "\taddedlines = %s\n" - "\tremovedlines = %s\n" - "\tdescid = %s\n" - % ((str(e), ) + sql_args)) + print ("Error adding commit: '"+str(e)+"'\nValues were:\n"+ + "\n".join(i+'='+str(props[i]) for i in props)) + raise def SQLQueryListString(self, field, query_entry_list): sqlList = [] @@ -414,6 +427,67 @@ class CheckinDatabase: return "(%s)" % (string.join(sqlList, " OR ")) + def query_ids(self, in_field, table, id_field, name_field, lst): + if not len(lst): + return None + cond = self.SQLQueryListString(name_field, lst) + cursor = self.db.cursor() + cursor.execute('SELECT %s FROM %s WHERE %s' % (id_field, table, cond)) + ids = list(str(row[0]) for row in cursor) + if not len(ids): + return None + return "%s IN (%s)" % (in_field, ','.join(ids)) + + def CreateSphinxQueryString(self, query): + condList = [ + 'MATCH(%s)' % (self.db.literal(query.content_query), ), + 
self.query_ids('repositoryid', 'repositories', 'id', 'repository', query.repository_list), + self.query_ids('branchid', 'branches', 'id', 'branch', query.branch_list), + self.query_ids('dirid', 'dirs', 'id', 'dir', query.directory_list), + self.query_ids('fileid', 'files', 'id', 'file', query.file_list), + self.query_ids('authorid', 'people', 'id', 'who', query.author_list), + self.query_ids('descid', 'descs', 'id', 'description', query.comment_list), + ] + + if len(query.revision_list): + condList.append("revision IN ("+','.join(self.db.literal(s) for s in query.revision_list)+")") + if query.from_date: + condList.append('ci_when>='+str(dbi.TicksFromDateTime(query.from_date))) + if query.to_date: + condList.append('ci_when<='+str(dbi.TicksFromDateTime(query.to_date))) + + if query.sort == 'date': + order_by = 'ORDER BY `ci_when` DESC, `relevance` DESC' + elif query.sort == 'date_rev': + order_by = 'ORDER BY `ci_when` ASC, `relevance` DESC' + else: # /* if query.sort == 'relevance' */ + order_by = 'ORDER BY `relevance` DESC' + + conditions = string.join((i for i in condList if i), " AND ") + conditions = conditions and "WHERE %s" % conditions + + ## limit the number of rows requested or we could really slam + ## a server with a large database + limit = "" + if query.limit: + limit = "LIMIT %s" % (str(query.limit)) + elif self._row_limit: + limit = "LIMIT %s" % (str(self._row_limit)) + + fields = "id `id`, WEIGHT() `relevance`, `content`, `mimetype`" + + return "SELECT %s FROM %s %s %s %s" % (fields, self.sphinx_index, conditions, order_by, limit) + + # Get commits by their IDs + def CreateIdQueryString(self, ids): + commits_table = self._version >= 1 and 'commits' or 'checkins' + return ( + 'SELECT %s.*, repositories.repository AS repository_name, dirs.dir AS dir_name, files.file AS file_name' + ' FROM %s, repositories, dirs, files' + ' WHERE %s.id IN (%s) AND repositoryid=repositories.id' + ' AND dirid=dirs.id AND fileid=files.id' % (commits_table, commits_table, 
commits_table, ','.join(ids)) + ) + def CreateSQLQueryString(self, query): commits_table = self._version >= 1 and 'commits' or 'checkins' fields = [ @@ -427,7 +501,7 @@ class CheckinDatabase: ("dirs", "(%s.dirid=dirs.id)" % (commits_table)), ("files", "(%s.fileid=files.id)" % (commits_table))] condList = [] - + if len(query.text_query): tableList.append(("descs", "(descs.id=%s.descid)" % (commits_table))) temp = "MATCH (descs.description) AGAINST (%s" % (self.db.literal(query.text_query)) @@ -435,6 +509,7 @@ class CheckinDatabase: fields.append("%s) AS relevance" % temp) else: fields.append("'' AS relevance") + fields.append("'' AS snippet") if len(query.repository_list): temp = self.SQLQueryListString("repositories.repository", @@ -478,16 +553,18 @@ class CheckinDatabase: temp = "(%s.ci_when<=\"%s\")" % (commits_table, str(query.to_date)) condList.append(temp) - if query.sort == "date": - order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table) + if query.sort == "relevance" and len(query.text_query): + order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table) + elif query.sort == "date_rev": + order_by = "ORDER BY %s.ci_when ASC,descid,%s.repositoryid" % (commits_table, commits_table) elif query.sort == "author": tableList.append(("people", "(%s.whoid=people.id)" % (commits_table))) order_by = "ORDER BY people.who,descid,%s.repositoryid" % (commits_table) elif query.sort == "file": tableList.append(("files", "(%s.fileid=files.id)" % (commits_table))) order_by = "ORDER BY files.file,descid,%s.repositoryid" % (commits_table) - elif query.sort == "relevance" and len(query.text_query): - order_by = "ORDER BY relevance DESC,%s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table) + else: # /* if query.sort == "date": */ + order_by = "ORDER BY %s.ci_when DESC,descid,%s.repositoryid" % (commits_table, commits_table) ## exclude duplicates from the table list, and split 
out join ## conditions from table names. In future, the join conditions @@ -517,7 +594,7 @@ class CheckinDatabase: fields, tables, conditions, order_by, limit) return sql - + def check_commit_access(self, repos, dir, file, rev): if self.authorizer: rootname = repos.split('/') @@ -528,19 +605,60 @@ class CheckinDatabase: return True def RunQuery(self, query): - sql = self.CreateSQLQueryString(query) - cursor = self.db.cursor() - cursor.execute(sql) + if len(query.content_query) and self.sphinx: + # Use Sphinx to search on document content + sql = self.CreateSphinxQueryString(query) + cursor = self.sphinx.cursor() + cursor.execute(sql) + sphinx_rows = list((str(docid), rel, content, mimetype) for docid, rel, content, mimetype in cursor) + if len(sphinx_rows): + # Fetch snippets + snippet_options = { + 'around': 15, + 'limit': 200, + 'before_match': '', + 'after_match': '', + 'chunk_separator': ' ... ', + } + preformatted_mime = 'text/(?!html|xml).*' + snippets = {} + bm_html = cgi.escape(snippet_options['before_match']) + am_html = cgi.escape(snippet_options['after_match']) + for docid, rel, content, mimetype in sphinx_rows: + cursor.execute( + 'CALL SNIPPETS(%s, %s, %s'+''.join(', %s AS '+i for i in snippet_options)+')', + (content, self.sphinx_index, query.content_query) + tuple(snippet_options.values()) + ) + s, = cursor.fetchone() + s = cgi.escape(s) + if re.match(preformatted_mime, mimetype): + s = s.replace('\n', '
') + s = s.replace(bm_html, snippet_options['before_match']) + s = s.replace(am_html, snippet_options['after_match']) + snippets[docid] = s + # Fetch all fields from MySQL + sql = self.CreateIdQueryString((docid for (docid, _, _, _) in sphinx_rows)) + cursor = self.db.cursor() + cursor.execute(sql) + byid = {} + for row in cursor: + byid[str(row[0])] = row + rows = list(byid[docid] + (rel, snippets[docid]) for (docid, rel, _, _) in sphinx_rows if docid in byid) + else: + rows = [] + else: + # Use regular queries when document content is not searched + sql = self.CreateSQLQueryString(query) + cursor = self.db.cursor() + cursor.execute(sql) + rows = list(cursor) - while 1: - row = cursor.fetchone() - if not row: - break - - (dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID, + # Convert rows to commit objects + for row in rows: + (dbId, dbType, dbCI_When, dbAuthorID, dbRepositoryID, dbDirID, dbFileID, dbRevision, dbStickyTag, dbBranchID, dbAddedLines, dbRemovedLines, dbDescID, dbRepositoryName, dbDirName, - dbFileName, dbRelevance) = row + dbFileName, dbRelevance, dbSnippet) = row if not self.check_commit_access(dbRepositoryName, dbDirName, dbFileName, dbRevision): continue @@ -564,6 +682,7 @@ class CheckinDatabase: commit.SetMinusCount(dbRemovedLines) commit.SetDescriptionID(dbDescID) commit.SetRelevance(dbRelevance) + commit.SetSnippet(dbSnippet) query.AddCommit(commit) @@ -623,46 +742,21 @@ class CheckinDatabase: raise UnknownRepositoryError("Unknown repository '%s'" % (repository)) - if (self._version >= 1): - self.sql_delete('repositories', 'id', rep_id) - self.sql_purge('commits', 'repositoryid', 'id', 'repositories') - self.sql_purge('files', 'id', 'fileid', 'commits') - self.sql_purge('dirs', 'id', 'dirid', 'commits') - self.sql_purge('branches', 'id', 'branchid', 'commits') - self.sql_purge('descs', 'id', 'descid', 'commits') - self.sql_purge('people', 'id', 'whoid', 'commits') - else: - sql = "SELECT * FROM checkins WHERE repositoryid=%s" - sql_args = 
(rep_id, ) - cursor = self.db.cursor() - cursor.execute(sql, sql_args) - checkins = [] - while 1: - try: - (ci_type, ci_when, who_id, repository_id, - dir_id, file_id, revision, sticky_tag, branch_id, - plus_count, minus_count, description_id) = \ - cursor.fetchone() - except TypeError: - break - checkins.append([file_id, dir_id, branch_id, - description_id, who_id]) - - #self.sql_delete('repositories', 'id', rep_id) - self.sql_delete('checkins', 'repositoryid', rep_id) - for checkin in checkins: - self.sql_delete('files', 'id', checkin[0], 'fileid') - self.sql_delete('dirs', 'id', checkin[1], 'dirid') - self.sql_delete('branches', 'id', checkin[2], 'branchid') - self.sql_delete('descs', 'id', checkin[3], 'descid') - self.sql_delete('people', 'id', checkin[4], 'whoid') + checkins_table = self._version >= 1 and 'commits' or 'checkins' + self.sql_delete('repositories', 'id', rep_id) + self.sql_purge(checkins_table, 'repositoryid', 'id', 'repositories') + self.sql_purge('files', 'id', 'fileid', checkins_table) + self.sql_purge('dirs', 'id', 'dirid', checkins_table) + self.sql_purge('branches', 'id', 'branchid', checkins_table) + self.sql_purge('descs', 'id', 'descid', checkins_table) + self.sql_purge('people', 'id', 'whoid', checkins_table) # Reset all internal id caches. We could be choosier here, # but let's just be as safe as possible. 
self._get_cache = {} self._get_id_cache = {} self._desc_id_cache = {} - + class DatabaseVersionError(Exception): pass @@ -678,7 +772,7 @@ class Commit: CHANGE = 0 ADD = 1 REMOVE = 2 - + def __init__(self): self.__directory = '' self.__file = '' @@ -690,15 +784,20 @@ class Commit: self.__minuscount = '' self.__description = '' self.__relevance = '' + self.__snippet = '' self.__gmt_time = 0.0 self.__type = Commit.CHANGE + self.__content = '' + self.__mimetype = '' + self.__base_path = '' + self.__base_rev = '' def SetRepository(self, repository): self.__repository = repository def GetRepository(self): return self.__repository - + def SetDirectory(self, dir): self.__directory = dir @@ -710,7 +809,7 @@ class Commit: def GetFile(self): return self.__file - + def SetRevision(self, revision): self.__revision = revision @@ -758,12 +857,19 @@ class Commit: def GetDescription(self): return self.__description + # Relevance and snippet are used when querying commit database def SetRelevance(self, relevance): self.__relevance = relevance def GetRelevance(self): return self.__relevance + def SetSnippet(self, snippet): + self.__snippet = snippet + + def GetSnippet(self): + return self.__snippet + def SetTypeChange(self): self.__type = Commit.CHANGE @@ -784,66 +890,80 @@ class Commit: elif self.__type == Commit.REMOVE: return 'Remove' + # File content (extracted text), optional, indexed with Sphinx + def SetContent(self, content): + self.__content = content + + def GetContent(self): + return self.__content + + # MIME type, optional, now only stored in Sphinx + def SetMimeType(self, mimetype): + self.__mimetype = mimetype + + def GetMimeType(self): + return self.__mimetype + ## LazyCommit overrides a few methods of Commit to only retrieve ## it's properties as they are needed class LazyCommit(Commit): - def __init__(self, db): - Commit.__init__(self) - self.__db = db + def __init__(self, db): + Commit.__init__(self) + self.__db = db - def SetFileID(self, dbFileID): - 
self.__dbFileID = dbFileID + def SetFileID(self, dbFileID): + self.__dbFileID = dbFileID - def GetFileID(self): - return self.__dbFileID + def GetFileID(self): + return self.__dbFileID - def GetFile(self): - return self.__db.GetFile(self.__dbFileID) + def GetFile(self): + return self.__db.GetFile(self.__dbFileID) - def SetDirectoryID(self, dbDirID): - self.__dbDirID = dbDirID + def SetDirectoryID(self, dbDirID): + self.__dbDirID = dbDirID - def GetDirectoryID(self): - return self.__dbDirID + def GetDirectoryID(self): + return self.__dbDirID - def GetDirectory(self): - return self.__db.GetDirectory(self.__dbDirID) + def GetDirectory(self): + return self.__db.GetDirectory(self.__dbDirID) - def SetRepositoryID(self, dbRepositoryID): - self.__dbRepositoryID = dbRepositoryID + def SetRepositoryID(self, dbRepositoryID): + self.__dbRepositoryID = dbRepositoryID - def GetRepositoryID(self): - return self.__dbRepositoryID + def GetRepositoryID(self): + return self.__dbRepositoryID - def GetRepository(self): - return self.__db.GetRepository(self.__dbRepositoryID) + def GetRepository(self): + return self.__db.GetRepository(self.__dbRepositoryID) - def SetAuthorID(self, dbAuthorID): - self.__dbAuthorID = dbAuthorID + def SetAuthorID(self, dbAuthorID): + self.__dbAuthorID = dbAuthorID - def GetAuthorID(self): - return self.__dbAuthorID + def GetAuthorID(self): + return self.__dbAuthorID - def GetAuthor(self): - return self.__db.GetAuthor(self.__dbAuthorID) + def GetAuthor(self): + return self.__db.GetAuthor(self.__dbAuthorID) - def SetBranchID(self, dbBranchID): - self.__dbBranchID = dbBranchID + def SetBranchID(self, dbBranchID): + self.__dbBranchID = dbBranchID - def GetBranchID(self): - return self.__dbBranchID + def GetBranchID(self): + return self.__dbBranchID - def GetBranch(self): - return self.__db.GetBranch(self.__dbBranchID) + def GetBranch(self): + return self.__db.GetBranch(self.__dbBranchID) - def SetDescriptionID(self, dbDescID): - self.__dbDescID = dbDescID + def 
SetDescriptionID(self, dbDescID): + self.__dbDescID = dbDescID - def GetDescriptionID(self): - return self.__dbDescID + def GetDescriptionID(self): + return self.__dbDescID - def GetDescription(self): - return self.__db.GetDescription(self.__dbDescID) + def GetDescription(self): + return self.__db.GetDescription(self.__dbDescID) ## QueryEntry holds data on one match-type in the SQL database ## match is: "exact", "like", or "regex" @@ -858,8 +978,8 @@ class CheckinDatabaseQuery: def __init__(self): ## sorting self.sort = "date" - - ## repository to query + + ## repository, branch, etc to query self.repository_list = [] self.branch_list = [] self.directory_list = [] @@ -867,7 +987,11 @@ class CheckinDatabaseQuery: self.revision_list = [] self.author_list = [] self.comment_list = [] + + ## text_query = Fulltext query on comments + ## content_query = Fulltext query on content self.text_query = "" + self.content_query = "" ## date range in DBI 2.0 timedate objects self.from_date = None @@ -886,6 +1010,9 @@ class CheckinDatabaseQuery: def SetTextQuery(self, query): self.text_query = query + def SetContentQuery(self, query): + self.content_query = query + def SetRepository(self, repository, match = "exact"): self.repository_list.append(QueryEntry(repository, match)) @@ -921,7 +1048,7 @@ class CheckinDatabaseQuery: def SetFromDateHoursAgo(self, hours_ago): ticks = time.time() - (3600 * hours_ago) self.from_date = dbi.DateTimeFromTicks(ticks) - + def SetFromDateDaysAgo(self, days_ago): ticks = time.time() - (86400 * days_ago) self.from_date = dbi.DateTimeFromTicks(ticks) @@ -942,7 +1069,7 @@ class CheckinDatabaseQuery: ## def CreateCommit(): return Commit() - + def CreateCheckinQuery(): return CheckinDatabaseQuery() @@ -953,9 +1080,23 @@ def ConnectDatabase(cfg, authorizer=None, readonly=0): else: user = cfg.cvsdb.user passwd = cfg.cvsdb.passwd - db = CheckinDatabase(cfg.cvsdb.host, cfg.cvsdb.port, cfg.cvsdb.socket, user, passwd, - cfg.cvsdb.database_name, 
cfg.cvsdb.row_limit, cfg.cvsdb.fulltext_min_relevance, - authorizer) + db = CheckinDatabase( + host = cfg.cvsdb.host, + port = cfg.cvsdb.port, + socket = cfg.cvsdb.socket, + user = user, + passwd = passwd, + database = cfg.cvsdb.database_name, + row_limit = cfg.cvsdb.row_limit, + min_relevance = cfg.cvsdb.fulltext_min_relevance, + authorizer = authorizer, + index_content = cfg.cvsdb.index_content, + sphinx_host = cfg.cvsdb.sphinx_host, + sphinx_port = int(cfg.cvsdb.sphinx_port), + sphinx_socket = cfg.cvsdb.sphinx_socket, + sphinx_index = cfg.cvsdb.sphinx_index, + cfg = cfg, + ) db.Connect() return db diff --git a/lib/vclib/ccvs/bincvs.py b/lib/vclib/ccvs/bincvs.py index c2b9430d..fee243fb 100644 --- a/lib/vclib/ccvs/bincvs.py +++ b/lib/vclib/ccvs/bincvs.py @@ -31,8 +31,8 @@ import popen class BaseCVSRepository(vclib.Repository): def __init__(self, name, rootpath, authorizer, utilities): if not os.path.isdir(rootpath): - raise vclib.ReposNotFound(name) - + raise vclib.ReposNotFound(name) + self.name = name self.rootpath = rootpath self.auth = authorizer @@ -53,7 +53,7 @@ class BaseCVSRepository(vclib.Repository): def authorizer(self): return self.auth - + def itemtype(self, path_parts, rev): basepath = self._getpath(path_parts) kind = None @@ -74,12 +74,12 @@ class BaseCVSRepository(vclib.Repository): def itemprops(self, path_parts, rev): self.itemtype(path_parts, rev) # does auth-check return {} # CVS doesn't support properties - + def listdir(self, path_parts, rev, options): if self.itemtype(path_parts, rev) != vclib.DIR: # does auth-check raise vclib.Error("Path '%s' is not a directory." % (string.join(path_parts, "/"))) - + # Only RCS files (*,v) and subdirs are returned. 
data = [ ] full_name = self._getpath(path_parts) @@ -115,7 +115,7 @@ class BaseCVSRepository(vclib.Repository): data.append(CVSDirEntry(name, kind, errors, 1)) return data - + def _getpath(self, path_parts): return apply(os.path.join, (self.rootpath,) + tuple(path_parts)) @@ -177,7 +177,7 @@ class BinCVSRepository(BaseCVSRepository): used_rlog = 0 tip_rev = None # used only if we have to fallback to using rlog - fp = self.rcs_popen('co', (rev_flag, full_name), 'rb') + fp = self.rcs_popen('co', (rev_flag, full_name), 'rb') try: filename, revision = _parse_co_header(fp) except COMissingRevision: @@ -191,14 +191,14 @@ class BinCVSRepository(BaseCVSRepository): used_rlog = 1 if not tip_rev: raise vclib.Error("Unable to find valid revision") - fp = self.rcs_popen('co', ('-p' + tip_rev.string, full_name), 'rb') + fp = self.rcs_popen('co', ('-p' + tip_rev.string, full_name), 'rb') filename, revision = _parse_co_header(fp) - + if filename is None: # CVSNT's co exits without any output if a dead revision is requested. # Bug at http://www.cvsnt.org/cgi-bin/bugzilla/show_bug.cgi?id=190 # As a workaround, we invoke rlog to find the first non-dead revision - # that precedes it and check out that revision instead. Of course, + # that precedes it and check out that revision instead. Of course, # if we've already invoked rlog above, we just reuse its output. if not used_rlog: tip_rev = self._get_tip_revision(full_name + ',v', rev) @@ -207,7 +207,7 @@ class BinCVSRepository(BaseCVSRepository): raise vclib.Error( 'Could not find non-dead revision preceding "%s"' % rev) fp = self.rcs_popen('co', ('-p' + tip_rev.undead.string, - full_name), 'rb') + full_name), 'rb') filename, revision = _parse_co_header(fp) if filename is None: @@ -278,7 +278,7 @@ class BinCVSRepository(BaseCVSRepository): if self.itemtype(path_parts, rev) != vclib.FILE: # does auth-check raise vclib.Error("Path '%s' is not a file." 
% (string.join(path_parts, "/"))) - + # Invoke rlog rcsfile = self.rcsfile(path_parts, 1) if rev and options.get('cvs_pass_rev', 0): @@ -341,7 +341,7 @@ class BinCVSRepository(BaseCVSRepository): def revinfo(self, rev): raise vclib.UnsupportedFeature - + def rawdiff(self, path_parts1, rev1, path_parts2, rev2, type, options={}): """see vclib.Repository.rawdiff docstring @@ -439,9 +439,9 @@ def _match_revs_tags(revlist, taglist): example: if revision is 1.2.3.4, parent is 1.2 "undead" - If the revision is dead, then this is a reference to the first + If the revision is dead, then this is a reference to the first previous revision which isn't dead, otherwise it's a reference - to itself. If all the previous revisions are dead it's None. + to itself. If all the previous revisions are dead it's None. "branch_number" tuple representing branch number or empty tuple if on trunk @@ -653,7 +653,7 @@ def _parse_co_header(fp): pass else: break - + raise COMalformedOutput, "Unable to find revision in co output stream" # if your rlog doesn't use 77 '=' characters, then this must change @@ -674,7 +674,7 @@ _EOF_ERROR = 'error message found' # rlog issued an error # ^rlog\: (.*)(?:\:\d+)?\: (.*)$ # # But for some reason the windows version of rlog omits the "rlog: " prefix -# for the first error message when the standard error stream has been +# for the first error message when the standard error stream has been # redirected to a file or pipe. (the prefix is present in subsequent errors # and when rlog is run from the console). So the expression below is more # complicated @@ -703,7 +703,7 @@ def _parse_log_header(fp): Returns: filename, default branch, tag dictionary, lock dictionary, rlog error message, and eof flag """ - + filename = head = branch = msg = "" taginfo = { } # tag name => number lockinfo = { } # revision => locker @@ -732,7 +732,7 @@ def _parse_log_header(fp): else: # oops. this line isn't lock info. stop parsing tags. 
state = 0 - + if state == 0: if line[:9] == 'RCS file:': filename = line[10:-1] @@ -902,7 +902,7 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter): except ValueError: view_tag = None else: - tags.append(view_tag) + tags.append(view_tag) # Match up tags and revisions _match_revs_tags(revs, tags) @@ -910,13 +910,13 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter): # Match up lockinfo and revision for rev in revs: rev.lockinfo = lockinfo.get(rev.string) - + # Add artificial ViewVC tag HEAD, which acts like a non-branch tag pointing # at the latest revision on the MAIN branch. The HEAD revision doesn't have # anything to do with the "head" revision number specified in the RCS file # and in rlog output. HEAD refers to the revision that the CVS and RCS co # commands will check out by default, whereas the "head" field just refers - # to the highest revision on the trunk. + # to the highest revision on the trunk. taginfo['HEAD'] = _add_tag('HEAD', taginfo['MAIN'].co_rev) # Determine what revisions to return @@ -954,7 +954,7 @@ def _file_log(revs, taginfo, lockinfo, cur_branch, filter): _remove_tag(view_tag) else: filtered_revs = revs - + return filtered_revs def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs): @@ -1004,7 +1004,7 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs): = _parse_log_header(rlog) if eof == _EOF_LOG: - # the rlog output ended early. this can happen on errors that rlog + # the rlog output ended early. this can happen on errors that rlog # thinks are so serious that it stops parsing the current file and # refuses to parse any of the files that come after it. one of the # errors that triggers this obnoxious behavior looks like: @@ -1052,8 +1052,8 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs): tag = None # we don't care about the specific values -- just the keys and whether - # the values point to branches or revisions. 
this the fastest way to - # merge the set of keys and keep values that allow us to make the + # the values point to branches or revisions. this the fastest way to + # merge the set of keys and keep values that allow us to make the # distinction between branch tags and normal tags alltags.update(taginfo) @@ -1098,7 +1098,7 @@ def _get_logs(repos, dir_path_parts, entries, view_tag, get_dirs): file.dead = 0 #file.errors.append("No revisions exist on %s" % (view_tag or "MAIN")) file.absent = 1 - + # done with this file now, skip the rest of this file's revisions if not eof: _skip_file(rlog) @@ -1211,7 +1211,7 @@ def _newest_file(dirpath): newest_time = 0 ### FIXME: This sucker is leaking unauthorized paths! ### - + for subfile in os.listdir(dirpath): ### filter CVS locks? stale NFS handles? if subfile[-2:] != ',v': diff --git a/lib/viewvc.py b/lib/viewvc.py index 84f5db9a..44a93a5d 100644 --- a/lib/viewvc.py +++ b/lib/viewvc.py @@ -1,4 +1,3 @@ -# # Copyright (C) 1999-2009 The ViewCVS Group. All Rights Reserved. # # By using this file, you agree to the terms and conditions set forth in @@ -68,7 +67,6 @@ docroot_magic_path = '*docroot*' viewcvs_mime_type = 'text/vnd.viewcvs-markup' alt_mime_type = 'text/x-cvsweb-markup' view_roots_magic = '*viewroots*' -magic_buf_size = 4096 default_mime_type = 'application/octet-stream' # Put here the variables we need in order to hold our state - they @@ -121,9 +119,8 @@ class Request: # check for an authenticated username self.username = server.getenv('REMOTE_USER') - # construct MIME magic - self.ms = None - self.ms_fail = 0 + # repository object cache + self.all_repos = {} # if we allow compressed output, see if the client does too self.gzip_compress_level = 0 @@ -134,6 +131,9 @@ class Request: string.split(http_accept_encoding, ","))): self.gzip_compress_level = 9 # make this configurable? 
+ def utf8(self, value): + return self.cfg.guesser().utf8(value) + def create_repos(self, rootname): if not rootname: return None @@ -677,7 +677,7 @@ def _validate_mimetype(value): return value in (viewcvs_mime_type, alt_mime_type, 'text/plain') # obvious things here. note that we don't need uppercase for alpha. -_re_validate_alpha = re.compile('^[a-z]+$') +_re_validate_alpha = re.compile('^[a-z_]+$') _re_validate_number = re.compile('^[0-9]+$') _re_validate_boolint = re.compile('^[01]$') @@ -743,6 +743,7 @@ _legal_params = { 'who_match' : _re_validate_alpha, 'comment' : None, 'comment_match' : _re_validate_alpha, + 'search_content': None, 'querysort' : _re_validate_alpha, 'date' : _re_validate_alpha, 'hours' : _re_validate_number, @@ -988,7 +989,7 @@ def nav_path(request): is_last = len(path_parts) == len(request.path_parts) if request.roottype == 'cvs': - item = _item(name=cvsdb.utf8string(part), href=None) + item = _item(name=request.utf8(part), href=None) else: item = _item(name=part, href=None) @@ -1248,7 +1249,7 @@ def common_template_data(request, revision=None, mime_type=None): cfg = request.cfg where = request.where if request.roottype == 'cvs': - where = cvsdb.utf8string(where) + where = request.utf8(where) where = request.server.escape(where) # Initialize data dictionary members (sorted alphanumerically) @@ -1444,28 +1445,31 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type): get_lexer_for_mimetype, \ get_lexer_for_filename from pygments.lexers._mapping import LEXERS + # Hack for shell mime types: LEXERS['BashLexer'] = ('pygments.lexers.other', 'Bash', ('bash', 'sh'), ('*.sh',), ('application/x-sh', 'application/x-shellscript', 'text/x-sh', 'text/x-shellscript')) - encoding = 'guess' - if cfg.options.detect_encoding: - try: - import chardet - encoding = 'chardet' - except (SyntaxError, ImportError): - pass try: lexer = get_lexer_for_mimetype(mime_type, - encoding=encoding, + encoding='utf-8', stripnl=False) except 
ClassNotFound: try: lexer = get_lexer_for_filename(filename, - encoding=encoding, + encoding='utf-8', stripnl=False) except ClassNotFound: use_pygments = 0 except ImportError: use_pygments = 0 + # Detect encoding by calling chardet ourselves, + # to support it in non-highlighting mode + content = fp.read() + c, encoding = cfg.guesser().guess_charset(content) + if encoding: + content = c + else: + encoding = 'unknown' + # If we aren't going to be highlighting anything, just return the # BLAME_SOURCE. If there's no blame_source, we'll generate a fake # one from the file contents we fetch with PATH and REV. @@ -1475,11 +1479,7 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type): else: lines = [] line_no = 0 - while 1: - line = fp.readline() - if not line: - break - line = cvsdb.utf8string(line) + for line in content.split('\n'): line_no = line_no + 1 item = vclib.Annotation(cgi.escape(line), line_no, None, None, None, None) @@ -1508,19 +1508,11 @@ def markup_stream_pygments(request, cfg, blame_data, fp, filename, mime_type): self.blame_data.append(item) self.line_no = self.line_no + 1 ps = PygmentsSink(blame_source) - fpd = fp.read() - try: - fpdat = unicode(fpd,'utf-8') - except: - try: - fpdat = unicode(fpd,'cp1251') - except: - fpdat = fpd - highlight(fpdat, lexer, + highlight(content, lexer, HtmlFormatter(nowrap=True, classprefix='pygments-', encoding='utf-8'), ps) - return ps.blame_data + return ps.blame_data, encoding def make_time_string(date, cfg): """Returns formatted date string in either local time or UTC. 
@@ -1594,6 +1586,7 @@ def calculate_mime_type(request, path_parts, rev): return mime_type except: pass + # FIXME rewrite to use viewvcmagic return guess_mime(path_parts[-1]) def markup_or_annotate(request, is_annotate): @@ -1605,21 +1598,12 @@ def markup_or_annotate(request, is_annotate): mime_type = calculate_mime_type(request, path, rev) if not mime_type or mime_type == default_mime_type: - if request.ms is None and not request.ms_fail: - try: - import magic - request.ms = magic.open(magic.MAGIC_NONE | magic.MAGIC_MIME) - request.ms.load() - except: - request.ms_fail = 1 - if request.ms: - try: - fp, revision = request.repos.openfile(path, rev) - buffer = fp.read(magic_buf_size) - fp.close() - mime_type = request.ms.buffer(buffer) - except: - pass + try: + fp, revision = request.repos.openfile(path, rev) + mime_type = request.cfg.guesser().guess_mime(None, None, fp) + fp.close() + except: + raise # Is this a binary type? if is_binary(request.cfg, mime_type): @@ -1657,9 +1641,10 @@ def markup_or_annotate(request, is_annotate): if check_freshness(request, None, revision, weak=1): fp.close() return - lines = markup_stream_pygments(request, cfg, blame_source, fp, - path[-1], mime_type) + lines, charset = markup_stream_pygments(request, cfg, blame_source, fp, path[-1], mime_type) fp.close() + if mime_type.find(';') < 0: + mime_type = mime_type+'; charset='+charset data = common_template_data(request, revision) data.merge(ezt.TemplateData({ @@ -1910,7 +1895,7 @@ def view_directory(request): row.short_log = format_log(file.log, cfg) row.log = htmlify(file.log, cfg.options.mangle_email_addresses) row.lockinfo = file.lockinfo - row.name = request.server.escape(cvsdb.utf8string(file.name)) + row.name = request.server.escape(request.utf8(file.name)) row.anchor = row.name row.pathtype = (file.kind == vclib.FILE and 'file') or \ (file.kind == vclib.DIR and 'dir') @@ -2285,7 +2270,7 @@ def view_log(request): entry.ago = html_time(request, rev.date, 1) entry.log = rev.log or "" 
if cvs: - entry.log = cvsdb.utf8string(entry.log) + entry.log = request.utf8(entry.log) entry.log = htmlify(entry.log, cfg.options.mangle_email_addresses) entry.size = rev.size entry.lockinfo = rev.lockinfo @@ -2770,7 +2755,7 @@ class DiffSource: self.save_line = None self.line_number = None self.prev_line_number = None - + # keep track of where we are during an iteration self.idx = -1 self.last = None @@ -2867,7 +2852,7 @@ class DiffSource: diff_code = line[0] output = self._format_text(line[1:]) - output = cvsdb.utf8string(output) + output = self.cfg.guesser().utf8(output) if diff_code == '+': if self.state == 'dump': @@ -3644,6 +3629,7 @@ def view_queryform(request): 'who_match' : request.query_dict.get('who_match', 'exact'), 'comment' : request.query_dict.get('comment', ''), 'comment_match' : request.query_dict.get('comment_match', 'fulltext'), + 'search_content' : request.query_dict.get('search_content', ''), 'querysort' : request.query_dict.get('querysort', 'date'), 'date' : request.query_dict.get('date', 'hours'), 'hours' : request.query_dict.get('hours', '2'), @@ -3653,6 +3639,7 @@ def view_queryform(request): 'query_hidden_values' : query_hidden_values, 'limit_changes' : limit_changes, 'dir_href' : dir_href, + 'enable_search_content' : request.cfg.cvsdb.index_content, })) generate_page(request, "query_form", data) @@ -3791,7 +3778,8 @@ def build_commit(request, files, max_files, dir_strip, format): plus_count = 0 minus_count = 0 found_unreadable = 0 - all_repos = {} + if not request.all_repos: + request.all_repos = {} for f in files: dirname = f.GetDirectory() @@ -3810,17 +3798,19 @@ def build_commit(request, files, max_files, dir_strip, format): # Check path access (since the commits database logic bypasses the # vclib layer and, thus, the vcauth stuff that layer uses). 
- my_repos = all_repos.get(f.GetRepository(), '') + my_repos = request.all_repos.get(f.GetRepository(), '') if not my_repos: try: - my_repos = all_repos[f.GetRepository()] = request.create_repos(f.GetRepository()) + my_repos = request.all_repos[f.GetRepository()] = request.create_repos(f.GetRepository()) except: my_repos = None if not my_repos: return None if my_repos['roottype'] == 'cvs': - try: where = unicode(where,'utf-8') + # we store UTF-8 in the DB + try: where = where.decode('utf-8') except: pass + # FIXME maybe store "real" filesystem path in the DB instead of having such setting? try: where = where.encode(cfg.options.cvs_ondisk_charset) except: pass path_parts = _path_parts(where) @@ -3907,24 +3897,27 @@ def build_commit(request, files, max_files, dir_strip, format): if max_files and num_allowed > max_files: continue - commit_files.append(_item(date=commit_time, - dir=request.server.escape(dirname), - file=request.server.escape(filename), - author=request.server.escape(f.GetAuthor()), - rev=rev, - branch=f.GetBranch(), - plus=plus, - minus=minus, - type=change_type, - dir_href=dir_href, - log_href=log_href, - view_href=view_href, - download_href=download_href, - prefer_markup=prefer_markup, - diff_href=diff_href, - root=my_repos, - path=where, - path_prev=path_prev)) + commit_files.append(_item( + date=commit_time, + dir=request.server.escape(dirname), + file=request.server.escape(filename), + author=request.server.escape(f.GetAuthor()), + rev=rev, + branch=f.GetBranch(), + plus=plus, + minus=minus, + type=change_type, + snippet=f.GetSnippet(), + dir_href=dir_href, + log_href=log_href, + view_href=view_href, + download_href=download_href, + prefer_markup=prefer_markup, + diff_href=diff_href, + root=my_repos, + path=where, + path_prev=path_prev, + )) # No files survived authz checks? Let's just pretend this # little commit didn't happen, shall we? 
@@ -4115,6 +4108,7 @@ def view_query(request): who_match = request.query_dict.get('who_match', 'exact') comment = request.query_dict.get('comment', '') comment_match = request.query_dict.get('comment_match', 'fulltext') + search_content = request.query_dict.get('search_content', '') querysort = request.query_dict.get('querysort', 'date') date = request.query_dict.get('date', 'hours') hours = request.query_dict.get('hours', '2') @@ -4126,7 +4120,7 @@ def view_query(request): cfg.options.limit_changes)) match_types = { 'exact':1, 'like':1, 'glob':1, 'regex':1, 'notregex':1 } - sort_types = { 'date':1, 'author':1, 'file':1 } + sort_types = { 'date':1, 'date_rev':1, 'author':1, 'file':1, 'relevance':1 } date_types = { 'hours':1, 'day':1, 'week':1, 'month':1, 'all':1, 'explicit':1 } @@ -4193,6 +4187,8 @@ def view_query(request): query.SetComment(comment, comment_match) else: query.SetTextQuery(comment) + if search_content: + query.SetContentQuery(search_content) query.SetSortMethod(querysort) if date == 'hours': query.SetFromDateHoursAgo(int(hours)) diff --git a/lib/viewvcmagic.py b/lib/viewvcmagic.py new file mode 100644 index 00000000..5f8b3ea8 --- /dev/null +++ b/lib/viewvcmagic.py @@ -0,0 +1,70 @@ +#!/usr/bin/python + +import mimetypes + +have_chardet = 0 +try: + import chardet + have_chardet = 1 +except: pass + +class ContentMagic: + + def __init__(self, encodings): + self.encodings = encodings.split(':') + self.mime_magic = None + self.errors = [] + # Try to load magic + try: + import magic + self.mime_magic = magic.open(magic.MAGIC_MIME_TYPE) + self.mime_magic.load() + except Exception, e: + self.errors.append(e) + + # returns MIME type + def guess_mime(self, mime, filename, tempfile): + if mime == 'application/octet-stream': + mime = '' + if not mime and filename: + mime = mimetypes.guess_type(filename)[0] + if not mime and tempfile and self.mime_magic: + if type(tempfile) == type(''): + mime = self.mime_magic.file(tempfile) + else: + c = tempfile.read(4096) + 
mime = self.mime_magic.buffer(c) + return mime + + # returns (utf8_content, charset) + def guess_charset(self, content): + # Try to guess with chardet + charset = None + if have_chardet: + # Try chardet + try: + charset = chardet.detect(content) + if charset and charset['encoding']: + charset = charset['encoding'] + content = content.decode(charset) + except: charset = None + else: + # Try UTF-8 + charset = 'utf-8' + try: content = content.decode('utf-8') + except: charset = None + # Then try to guess primitively + if charset is None: + for charset in self.encodings: + try: + content = content.decode(charset) + break + except: charset = None + return (content, charset) + + # guess and encode return value into UTF-8 + def utf8(self, content): + (uni, charset) = self.guess_charset(content) + if charset: + return uni.encode('utf-8') + return content diff --git a/templates/query_form.ezt b/templates/query_form.ezt index ec28ffdc..4919bc5b 100644 --- a/templates/query_form.ezt +++ b/templates/query_form.ezt @@ -144,7 +144,7 @@ Browse Directory

Comment: -
+
+ [if-any enable_search_content] + + Search content: + + + [end] Sort By: diff --git a/templates/query_results.ezt b/templates/query_results.ezt index 4cb71d48..dee029fe 100644 --- a/templates/query_results.ezt +++ b/templates/query_results.ezt @@ -46,15 +46,18 @@ [define rev_href][if-any commits.files.prefer_markup][commits.files.view_href][else][if-any commits.files.download_href][commits.files.download_href][end][end][end] - [if-any commits.files.rev][if-any rev_href][end][commits.files.rev][if-any rev_href][end][else] [end] + [if-any commits.files.rev][if-any rev_href][end][commits.files.rev][if-any rev_href][end][else] [end] [commits.files.dir]/ [commits.files.file] + [if-any commits.files.snippet] +
[commits.files.snippet]
+ [end] [if-any show_branch] - [if-any commits.files.branch][commits.files.branch][else] [end] + [if-any commits.files.branch][commits.files.branch][else] [end] [end] @@ -68,10 +71,10 @@ [is commits.files.type "Remove"][end] - [if-any commits.files.date][commits.files.date][else] [end] + [if-any commits.files.date][commits.files.date][else] [end] - [if-any commits.files.author][commits.files.author][else] [end] + [if-any commits.files.author][commits.files.author][else] [end] [end]