#!/usr/bin/env python
# -*-python-*-
#
# Copyright (C) 2004-2013 The ViewCVS Group. All Rights Reserved.
# Copyright (C) 2004-2007 James Henstridge
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewVC
# distribution or at http://viewvc.org/license-1.html.
#
# For more information, visit http://viewvc.org/
#
# -----------------------------------------------------------------------
#
# administrative program for loading Subversion revision information
# into the checkin database.  It can be used to add a single revision
# to the database, or rebuild/update all revisions.
#
# To add all the checkins from a Subversion repository to the checkin
# database, run the following:
#    /path/to/svndbadmin rebuild /path/to/repo
#
# This script can also be called from the Subversion post-commit hook,
# something like this:
#    REPOS="$1"
#    REV="$2"
#    /path/to/svndbadmin update "$REPOS" "$REV"
#
# If you allow changes to revision properties in your repository, you
# might also want to set up something similar in the
# post-revprop-change hook using "update" with the --force option to
# keep the checkin database consistent with the repository.
#
# -----------------------------------------------------------------------
#

#########################################################################
#
# INSTALL-TIME CONFIGURATION
#
# These values will be set during the installation process. During
# development, there will be no 'viewvcinstallpath.py'
#
# NOTE(review): the import below is unconditional, so running this script
# without a 'viewvcinstallpath' module on sys.path raises ImportError even
# though the fallback branch for an empty LIBRARY_DIR exists -- confirm
# whether a development fallback is wanted here.
#

import viewvcinstallpath
LIBRARY_DIR = viewvcinstallpath.LIBRARY_DIR
CONF_PATHNAME = viewvcinstallpath.CONF_PATHNAME

# Adjust sys.path to include our library directory
import sys
import os

if LIBRARY_DIR:
    sys.path.insert(0, LIBRARY_DIR)
else:
    sys.path.insert(0, os.path.abspath(os.path.join(sys.argv[0],
                                                    "../../lib")))

#########################################################################

import os
import string
import socket
import select
import re
import mimetypes
import time

import svn.core
import svn.repos
import svn.fs
import svn.delta

import cvsdb
import viewvc
import vclib
# The script calls vclib.svn.canonicalize_rootpath(); import the
# subpackage explicitly instead of relying on another module having
# imported it as a side effect.
import vclib.svn
from viewvcmagic import ContentMagic


class SvnRepo:
    """Class used to manage a connection to a SVN repository.

    Opens the repository at PATH, remembers its youngest revision in
    `rev_max`, and carries the content-indexing settings (index_content,
    tika_client, guesser, svn_ignore_mimetype, verbose) that SvnRev reads
    while collecting changes.
    """

    def __init__(self, path, index_content = None, tika_client = None,
                 guesser = None, svn_ignore_mimetype = False,
                 verbose = False):
        self.path = path
        self.repo = svn.repos.svn_repos_open(path)
        self.fs = svn.repos.svn_repos_fs(self.repo)
        self.rev_max = svn.fs.youngest_rev(self.fs)
        self.index_content = index_content
        self.tika_client = tika_client
        self.guesser = guesser
        self.verbose = verbose
        self.svn_ignore_mimetype = svn_ignore_mimetype

    def __getitem__(self, rev):
        """Return an SvnRev for revision REV.

        None means the youngest revision; a negative number counts back
        from the youngest revision (-1 is the youngest itself).
        """
        if rev is None:
            rev = self.rev_max
        elif rev < 0:
            rev = rev + self.rev_max + 1
        assert 0 <= rev <= self.rev_max, "revision out of range"
        rev = SvnRev(self, rev)
        return rev


# Matches the change commands of a "normal" diff: LaR, FcT and RdL, where
# each range is either N or N,M.
_re_diff_change_command = re.compile(
    r'^(\d+)(?:,(\d+))?([acd])(\d+)(?:,(\d+))?')


class StupidBufferedReader:
    """Minimal line reader on top of a file-like object.

    Reads from FP in chunks of BUFFER bytes and hands out one line at a
    time (including its trailing newline), either through readline() or
    by iteration.
    """

    def __init__(self, fp, buffer = 262144):
        self.fp = fp
        self.bufsize = buffer
        self.buffer = ''
        self.eof = False

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; raise StopIteration once input is exhausted."""
        line = self.readline()
        if not line:
            # readline() returns '' only at end of input, so do not hand
            # out a spurious empty item before stopping.
            raise StopIteration
        return line

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def readline(self):
        """Return the next line, or '' when there is nothing left to read.

        The last line is returned as-is even if it has no trailing
        newline.
        """
        if self.eof:
            return ''
        p = self.buffer.find('\n')
        while p < 0:
            b = self.fp.read(self.bufsize)
            if not b:
                r = self.buffer
                self.buffer = ''
                self.eof = True
                return r
            # The old part of the buffer is known to contain no newline,
            # so only the freshly read part needs to be searched.
            start = len(self.buffer)
            self.buffer = self.buffer + b
            p = self.buffer.find('\n', start)
        r = self.buffer[0:p+1]
        self.buffer = self.buffer[p+1:]
        return r


def _get_diff_counts(diff_fp):
    """Calculate the plus/minus counts by parsing the output of a
    normal diff.  The reasons for choosing Normal diff format are:
      - the output is short, so should be quicker to parse.
      - only the change commands need be parsed to calculate the counts.
      - All file data is prefixed, so won't be mistaken for a change
        command.
    This code is based on the description of the format found in the
    GNU diff manual.

    DIFF_FP is any iterable of diff output lines.  Returns a
    (plus, minus) tuple of added and removed line counts."""

    plus, minus = 0, 0
    for line in diff_fp:
        match = _re_diff_change_command.match(line)
        if not match:
            continue
        # size of first range
        if match.group(2):
            count1 = int(match.group(2)) - int(match.group(1)) + 1
        else:
            count1 = 1
        cmd = match.group(3)
        # size of second range
        if match.group(5):
            count2 = int(match.group(5)) - int(match.group(4)) + 1
        else:
            count2 = 1

        if cmd == 'a':
            # LaR - insert after line L of file1 range R of file2
            plus = plus + count2
        elif cmd == 'c':
            # FcT - replace range F of file1 with range T of file2
            minus = minus + count1
            plus = plus + count2
        elif cmd == 'd':
            # RdL - remove range R of file1, which would have been
            # at line L of file2
            minus = minus + count1
    return plus, minus


class TikaClient:
    """Client that sends file contents to a Tika server over a TCP socket
    and collects whatever the server sends back as extracted text."""

    # Create tika client
    def __init__(self, tika_server, mime_types, verbose):
        """TIKA_SERVER is a 'host:port' string; MIME_TYPES is a
        whitespace-separated list of MIME type patterns in which '*'
        matches any run of characters.

        Raises ValueError when TIKA_SERVER is not in 'host:port' form.
        """
        self.tika_server = tika_server
        self.mime_types = mime_types
        self.verbose = verbose
        # Split address
        self.addr = tika_server.split(':')
        if len(self.addr) != 2:
            raise ValueError('tika_server value is incorrect: \'' +
                             tika_server +
                             '\', please use \'host:port\' format')
        self.addr = (self.addr[0], int(self.addr[1]))
        # Build regexp for MIME types
        m = re.split(r'\s+', mime_types.strip())
        self.mime_regexp = re.compile(
            '|'.join('^' + re.escape(i).replace('\\*', '.*') + '$'
                     for i in m))

    # Extract text content from file using Tika which runs in server mode
    def get_text(self, filename, mime_type, log_filename):
        """Return the text Tika extracts from FILENAME.

        Returns '' when MIME_TYPE is empty or not one of the configured
        types, or when the file is empty.  Errors are not propagated:
        they are reported (only when verbose) using LOG_FILENAME, and
        whatever text was received so far is returned.
        """
        if not mime_type or not self.mime_regexp.match(mime_type):
            # Tika can't handle this mime type, return nothing
            return ''
        fd = None
        s = None
        text = ''
        fsize = 0
        try:
            # Read original file
            fd = open(filename, 'rb')
            data = fd.read()
            fsize = len(data)
            if not fsize:
                return ''
            # Connect to Tika
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.connect(self.addr)
            s.setblocking(0)
            sockfd = s.fileno()
            # Tika is somewhat delicate about network IO, so:
            # Read and write using poll(2) system call
            p = select.poll()
            p.register(sockfd)
            while 1:
                fds = p.poll()
                if not fds:
                    break
                (pollfd, event) = fds[0]
                if not event & (select.POLLIN | select.POLLOUT):
                    # Only error/hang-up conditions were reported; there
                    # is nothing to read or write, so looping again would
                    # just spin on the same event.
                    break
                if event & select.POLLIN:
                    # Exception or empty data means EOF...
                    try:
                        part = os.read(sockfd, 65536)
                    except (OSError, IOError):
                        break
                    if not part:
                        break
                    text += part
                if event & select.POLLOUT:
                    if not len(data):
                        # Shutdown output and forget about POLLOUT
                        s.shutdown(socket.SHUT_WR)
                        p.modify(sockfd, select.POLLIN)
                    else:
                        # Write and consume some data
                        l = os.write(sockfd, data)
                        data = data[l:]
            if len(text) == 0:
                raise Exception('Empty response from Tika server')
            if self.verbose:
                print("Extracted %d bytes from %s (%s) of size %d"
                      % (len(text), log_filename, mime_type, fsize))
        except Exception as e:
            if self.verbose:
                print("Error extracting text from %s (%s) of size %d: %s"
                      % (log_filename, mime_type, fsize, str(e)))
        finally:
            if fd:
                fd.close()
            if s:
                s.close()
        return text


def _get_change_action(change):
    """Classify a collected change as 'remove', 'add' or 'change'."""
    # note that prior to 1.4 Subversion's bindings didn't give us
    # change.action, but that's okay because back then deleted paths
    # always had a change.path of None.
    if hasattr(change, 'action') \
       and change.action == svn.repos.CHANGE_ACTION_DELETE:
        return 'remove'
    if not change.path:
        return 'remove'
    if change.added:
        return 'add'
    return 'change'


class SvnRev:
    """Class used to hold information about a particular revision of
    the repository.

    After construction, `author`, `date` (seconds since epoch, or None
    if the date property could not be parsed) and `log` hold the revision
    properties, and `changes` is a list of
    (path, action, plus, minus, content, mime) tuples, one per changed
    file."""

    def __init__(self, repo, rev):
        self.repo = repo
        self.rev = rev
        self.rev_roots = {} # cache of revision roots

        # revision properties ...
        revprops = svn.fs.revision_proplist(repo.fs, rev)
        self.author = str(revprops.get(svn.core.SVN_PROP_REVISION_AUTHOR, ''))
        self.date = str(revprops.get(svn.core.SVN_PROP_REVISION_DATE, ''))
        self.log = str(revprops.get(svn.core.SVN_PROP_REVISION_LOG, ''))

        # convert the date string to seconds since epoch ...
        try:
            self.date = svn.core.svn_time_from_cstring(self.date) // 1000000
        except Exception:
            self.date = None

        # get a root for the current revisions
        fsroot = self._get_root_for_rev(rev)

        # find changes in the revision
        editor = svn.repos.ChangeCollector(repo.fs, fsroot)
        e_ptr, e_baton = svn.delta.make_editor(editor)
        svn.repos.svn_repos_replay(fsroot, e_ptr, e_baton)

        self.changes = []
        changes_hash = {}
        for path, change in editor.changes.items():
            # skip non-file changes
            if change.item_kind != svn.core.svn_node_file:
                continue

            # deal with the change types we handle
            base_root = None
            if change.base_path:
                base_root = self._get_root_for_rev(change.base_rev)

            # figure out what kind of change this is, and get a diff
            # object for it.
            action = _get_change_action(change)
            if action == 'remove':
                diffobj = svn.fs.FileDiff(base_root, change.base_path,
                                          None, None, None, ['-b', '-B'])
            else:
                diffobj = svn.fs.FileDiff(base_root, change.base_path,
                                          fsroot, change.path,
                                          None, ['-b', '-B'])

            diff_fp = StupidBufferedReader(diffobj.get_pipe())
            plus, minus = _get_diff_counts(diff_fp)

            # CustIS Bug 50473: a workaround for svnlib behaviour in file
            # movements (FILE1 -> FILE2 + FILE1 -> null)
            if change.base_path:
                if not change.path and change.base_path in changes_hash:
                    minus = 0
                elif change.path:
                    changes_hash[change.base_path] = change.path

            content, mime = self._get_content(change, action, base_root,
                                              fsroot, diffobj)
            self.changes.append((path, action, plus, minus, content, mime))

    def _get_content(self, change, action, base_root, fsroot, diffobj):
        """Return (content, mime) to index for CHANGE.

        Both are '' when content indexing is off, the file was removed,
        or the file was copied without its contents changing.
        """
        repo = self.repo
        content = ''
        mime = ''
        if not repo.index_content or action == 'remove' or not change.path:
            return content, mime
        # need to check if binary file's content changed when copying,
        # if not, don't extract it, just get it from previous revision later
        if change.base_path and not svn.fs.contents_changed(
                base_root or None,
                (base_root and change.base_path) or None,
                fsroot, change.path):
            return content, mime

        props = svn.fs.node_proplist(fsroot, change.path)
        if not repo.svn_ignore_mimetype:
            mime = props.get('svn:mime-type', None)
        else:
            mime = None
        mime = repo.guesser.guess_mime(
            mime,
            os.path.basename(change.path),
            diffobj.tempfile2
        )

        # Read and guess charset by ourselves for text files.  The whole
        # test is guarded by `mime` so that an empty/None guess cannot be
        # dereferenced.
        if mime and (mime.startswith('text/') or
                     (mime.startswith('application/') and
                      mime.endswith('xml'))):
            try:
                with open(diffobj.tempfile2, 'rb') as fd:
                    content = fd.read()
            except Exception as e:
                # Reading is best-effort: an unreadable file is simply
                # not indexed.
                if repo.verbose:
                    print('Failed to read %s: %s' % (change.path, str(e)))
            # Guess charset
            if content:
                content, charset = repo.guesser.guess_charset(content)
                if charset:
                    content = content.encode('utf-8')
                    if repo.verbose:
                        print('Guessed %s for %s' % (charset, change.path))
                elif repo.verbose:
                    print('Failed to guess charset for %s, not indexing'
                          % (change.path, ))
        # Try to extract content using Tika from binary documents
        elif repo.tika_client:
            content = repo.tika_client.get_text(diffobj.tempfile2, mime,
                                                change.path)
        return content, mime

    def _get_root_for_rev(self, rev):
        """Fetch a revision root from a cache of such, or a fresh root
        (which is then cached for later use)."""
        if rev not in self.rev_roots:
            self.rev_roots[rev] = svn.fs.revision_root(self.repo.fs, rev)
        return self.rev_roots[rev]


def handle_revision(db, command, repo, rev, verbose, force=0):
    """Adds a particular revision of the repository to the checkin database.

    For the 'update' command, commits that db.CheckCommit() reports as
    already present are skipped unless FORCE is set."""
    revision = repo[rev]
    committed = 0

    if verbose:
        # No newline: the outcome is appended to the same line below.
        sys.stdout.write("Building commit info for revision %d... " % (rev))

    if not revision.changes:
        if verbose:
            print("skipped (no changes).")
        return

    for (path, action, plus, minus, content, mime) in revision.changes:
        directory, filename = os.path.split(path)
        commit = cvsdb.CreateCommit()
        commit.SetRepository(repo.path)
        commit.SetDirectory(directory)
        commit.SetFile(filename)
        commit.SetRevision(str(rev))
        commit.SetAuthor(revision.author)
        commit.SetDescription(revision.log)
        commit.SetTime(revision.date)
        commit.SetPlusCount(plus)
        commit.SetMinusCount(minus)
        commit.SetBranch(None)
        commit.SetContent(content)
        commit.SetMimeType(mime)

        if action == 'add':
            commit.SetTypeAdd()
        elif action == 'remove':
            commit.SetTypeRemove()
        elif action == 'change':
            commit.SetTypeChange()

        if command == 'update':
            result = db.CheckCommit(commit)
            if result and not force:
                continue # already recorded

        # commit to database
        db.AddCommit(commit)
        committed = 1

    if verbose:
        if committed:
            print("done.")
        else:
            print("skipped (already recorded).")


def main(command, repository, revs=None, verbose=0, force=0):
    """Run COMMAND ('rebuild', 'update' or 'purge') for REPOSITORY.

    REVS is either empty/None (all revisions) or a two-item sequence
    giving a revision range, in which None stands for the youngest
    revision.  The caller's sequence is not modified.
    """
    # Work on a private copy: the range is normalised and sorted in place.
    revs = list(revs) if revs else []

    cfg = viewvc.load_config(CONF_PATHNAME)
    db = cvsdb.ConnectDatabase(cfg)

    repository = os.path.realpath(repository)

    # Purge what must be purged.
    if command in ('rebuild', 'purge'):
        if verbose:
            print("Purging commit info for repository root `%s'" % repository)
        try:
            db.PurgeRepository(repository)
        except cvsdb.UnknownRepositoryError as e:
            if command == 'purge':
                sys.stderr.write("ERROR: " + str(e) + "\n")
                sys.exit(1)

    if command == 'purge':
        # Nothing below is needed for purging, and opening the Subversion
        # repository would fail if it has already been removed from disk.
        return

    tika_client = None
    if cfg.utilities.tika_server:
        tika_client = TikaClient(cfg.utilities.tika_server,
                                 cfg.utilities.tika_mime_types,
                                 verbose)
    repo = SvnRepo(
        path = repository,
        index_content = cfg.cvsdb.index_content,
        tika_client = tika_client,
        guesser = cfg.guesser(),
        svn_ignore_mimetype = cfg.options.svn_ignore_mimetype,
        verbose = verbose,
    )

    # Record what must be recorded.
    if command == 'rebuild' or (command == 'update' and not revs):
        for rev in range(repo.rev_max+1):
            handle_revision(db, command, repo, rev, verbose, force)
    elif command == 'update':
        if revs[0] is None:
            revs[0] = repo.rev_max
        if revs[1] is None:
            revs[1] = repo.rev_max
        revs.sort()
        for rev in range(revs[0], revs[1]+1):
            handle_revision(db, command, repo, rev, verbose, force)


def _rev2int(r):
    """Convert a revision argument to an int, or to None for 'HEAD'.

    Raises ValueError for non-numeric or negative input."""
    if r == 'HEAD':
        r = None
    else:
        r = int(r)
        if r < 0:
            raise ValueError("invalid revision '%d'" % (r))
    return r


def usage():
    """Write the usage message to stderr and exit with status 1."""
    cmd = os.path.basename(sys.argv[0])
    sys.stderr.write(
"""Administer the ViewVC checkins database data for the Subversion
repository located at REPOS-PATH.

Usage: 1. %s [-v] rebuild REPOS-PATH
       2. %s [-v] update REPOS-PATH [REV[:REV2]] [--force]
       3. %s [-v] purge REPOS-PATH

1.  Rebuild the commit database information for the repository located
    at REPOS-PATH across all revisions, after first purging
    information specific to that repository (if any).

2.  Update the commit database information for the repository located
    at REPOS-PATH across all revisions or, optionally, only for the
    specified revision REV (or revision range REV:REV2).  This is just
    like rebuilding, except that, unless --force is specified, no
    commit information will be stored for commits already present in
    the database.  If a range is specified, the revisions will be
    processed in ascending order, and you may specify "HEAD" to
    indicate "the youngest revision currently in the repository".

3.  Purge information specific to the repository located at REPOS-PATH
    from the database.

Use the -v flag to cause this script to give progress information as it works.

""" % (cmd, cmd, cmd))
    sys.exit(1)


if __name__ == '__main__':
    verbose = 0
    force = 0
    # Work on a copy so option stripping does not rewrite sys.argv.
    args = list(sys.argv)
    if '-v' in args:
        verbose = 1
        args.remove('-v')
    if '--force' in args:
        force = 1
        args.remove('--force')

    if len(args) < 3:
        usage()

    command = args[1].lower()
    if command not in ('rebuild', 'update', 'purge'):
        sys.stderr.write('ERROR: unknown command %s\n' % command)
        usage()

    revs = []
    if len(args) > 3:
        if command == 'rebuild':
            sys.stderr.write('ERROR: rebuild no longer accepts a revision '
                             'number argument. Use update --force.\n')
            usage()
        elif command != 'update':
            usage()
        try:
            revs = [_rev2int(r) for r in args[3].split(':')]
            if len(revs) > 2:
                raise ValueError("too many revisions in range")
            if len(revs) == 1:
                revs.append(revs[0])
        except ValueError:
            sys.stderr.write('ERROR: invalid revision specification "%s"\n'
                             % args[3])
            usage()

    try:
        repository = vclib.svn.canonicalize_rootpath(args[2])
        repository = cvsdb.CleanRepository(os.path.abspath(repository))
        main(command, repository, revs, verbose, force)
    except KeyboardInterrupt:
        print('')
        print('** break **')
    sys.exit(0)