#!/usr/bin/env python
# -*-python-*-
|
|
#
|
|
# Copyright (C) 2004-2013 The ViewCVS Group. All Rights Reserved.
|
|
# Copyright (C) 2004-2007 James Henstridge
|
|
#
|
|
# By using this file, you agree to the terms and conditions set forth in
|
|
# the LICENSE.html file which can be found at the top level of the ViewVC
|
|
# distribution or at http://viewvc.org/license-1.html.
|
|
#
|
|
# For more information, visit http://viewvc.org/
|
|
#
|
|
# -----------------------------------------------------------------------
|
|
#
|
|
# administrative program for loading Subversion revision information
|
|
# into the checkin database. It can be used to add a single revision
|
|
# to the database, or rebuild/update all revisions.
|
|
#
|
|
# To add all the checkins from a Subversion repository to the checkin
|
|
# database, run the following:
|
|
# /path/to/svndbadmin rebuild /path/to/repo
|
|
#
|
|
# This script can also be called from the Subversion post-commit hook,
|
|
# something like this:
|
|
# REPOS="$1"
|
|
# REV="$2"
|
|
# /path/to/svndbadmin update "$REPOS" "$REV"
|
|
#
|
|
# If you allow changes to revision properties in your repository, you
|
|
# might also want to set up something similar in the
|
|
# post-revprop-change hook using "update" with the --force option to
|
|
# keep the checkin database consistent with the repository.
|
|
#
|
|
# -----------------------------------------------------------------------
|
|
#
|
|
|
|
#########################################################################
|
|
# INSTALL-TIME CONFIGURATION
|
|
#
|
|
# These values will be set during the installation process. During
|
|
# development, there will be no 'viewvcinstallpath.py'
|
|
#
|
|
|
|
import viewvcinstallpath

# Both values come from the viewvcinstallpath module (see the
# INSTALL-TIME CONFIGURATION note above).
LIBRARY_DIR = viewvcinstallpath.LIBRARY_DIR
CONF_PATHNAME = viewvcinstallpath.CONF_PATHNAME

# Adjust sys.path to include our library directory
import sys
import os

# Use the configured library directory when it is set; otherwise fall back
# to a "lib" directory resolved relative to the path of this script.
sys.path.insert(
    0,
    LIBRARY_DIR or os.path.abspath(os.path.join(sys.argv[0], "../../lib")))
|
|
|
|
#########################################################################
|
|
|
|
import os
|
|
import string
|
|
import socket
|
|
import select
|
|
import re
|
|
import mimetypes
|
|
import time
|
|
|
|
import svn.core
|
|
import svn.repos
|
|
import svn.fs
|
|
import svn.delta
|
|
|
|
import cvsdb
|
|
import viewvc
|
|
import vclib
|
|
from viewvcmagic import ContentMagic
|
|
|
|
class SvnRepo:
    """Handle on an opened Subversion repository, together with the
    content-indexing options used while loading its revisions."""

    def __init__(self, path, index_content = None, tika_client = None, guesser = None,
                 svn_ignore_mimetype = False, verbose = False):
        # Indexing options are only remembered here; SvnRev consults them.
        self.index_content = index_content
        self.tika_client = tika_client
        self.guesser = guesser
        self.svn_ignore_mimetype = svn_ignore_mimetype
        self.verbose = verbose

        # Open the repository and look up its youngest revision number.
        self.path = path
        self.repo = svn.repos.svn_repos_open(path)
        self.fs = svn.repos.svn_repos_fs(self.repo)
        self.rev_max = svn.fs.youngest_rev(self.fs)

    def __getitem__(self, rev):
        """Return the SvnRev for revision `rev`.

        None selects the youngest revision; a negative number counts
        backwards from the youngest one (-1 is the youngest itself)."""
        youngest = self.rev_max
        if rev is None:
            number = youngest
        elif rev < 0:
            number = rev + youngest + 1
        else:
            number = rev
        assert 0 <= number <= youngest
        return SvnRev(self, number)
|
|
|
|
_re_diff_change_command = re.compile('^(\d+)(?:,(\d+))?([acd])(\d+)(?:,(\d+))?')
|
|
|
|
class StupidBufferedReader:
    """Line-oriented reader layered on any object offering read(size).

    Data is pulled from `fp` in chunks of `buffer` characters and handed
    out one line at a time, either through readline() or by iterating
    over the reader."""

    def __init__(self, fp, buffer = 262144):
        self.fp = fp
        self.bufsize = buffer
        # Text read from fp; everything before self._pos has already been
        # returned to the caller.
        self.buffer = ''
        self._pos = 0
        self.eof = False

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; raise StopIteration once input is exhausted."""
        line = self.readline()
        if not line:
            # readline() only returns '' at end of input, so the empty
            # string is never yielded as a bogus last line.
            raise StopIteration
        return line

    # Same iterator protocol under its Python 3 name.
    __next__ = next

    def readline(self):
        """Return the next line including its trailing newline.

        The last line is returned as-is if it is not newline-terminated;
        after that (and for empty input) '' is returned."""
        if self.eof:
            return ''
        end = self.buffer.find('\n', self._pos)
        while end < 0:
            chunk = self.fp.read(self.bufsize)
            if not len(chunk):
                # Source exhausted: hand back whatever is still pending.
                rest = self.buffer[self._pos:]
                self.buffer = ''
                self._pos = 0
                self.eof = True
                return rest
            # Drop the consumed prefix only when refilling, and look for the
            # newline only in the new chunk: the pending part has none.
            pending = len(self.buffer) - self._pos
            self.buffer = self.buffer[self._pos:] + chunk
            self._pos = 0
            end = self.buffer.find('\n', pending)
        line = self.buffer[self._pos:end + 1]
        self._pos = end + 1
        return line
|
|
|
|
def _get_diff_counts(diff_fp):
    """Return (plus, minus): the added and removed line counts found in
    the normal-format diff read line by line from diff_fp.

    Only the change commands ("LaR", "FcT", "RdL") are inspected; their
    line ranges give the counts directly.  Every other line of a normal
    diff is prefixed file data or a separator and cannot be mistaken for
    a change command, so it is skipped.  The command format follows the
    description in the GNU diff manual."""
    added = removed = 0
    for text in diff_fp:
        found = _re_diff_change_command.match(text)
        if found is None:
            continue
        first_start, first_end, kind, second_start, second_end = found.groups()

        # A range given without an end line covers exactly one line.
        if first_end:
            first_span = int(first_end) - int(first_start) + 1
        else:
            first_span = 1
        if second_end:
            second_span = int(second_end) - int(second_start) + 1
        else:
            second_span = 1

        # 'd' removes the file1 range, 'a' adds the file2 range and
        # 'c' replaces the former with the latter (so it does both).
        if kind in ('c', 'd'):
            removed += first_span
        if kind in ('a', 'c'):
            added += second_span
    return added, removed
|
|
|
|
class TikaClient:
    """Client for a Tika instance running in server mode, used to extract
    plain text from documents of the configured MIME types."""

    def __init__(self, tika_server, mime_types, verbose):
        """Create tika client.

        tika_server -- server address as a 'host:port' string
        mime_types  -- whitespace-separated MIME types to hand to Tika;
                       '*' within a type matches any run of characters
        verbose     -- when true, report extraction results on stdout

        Raises Exception if tika_server is not of the form 'host:port'
        (a non-numeric port makes int() raise ValueError)."""
        self.tika_server = tika_server
        self.mime_types = mime_types
        self.verbose = verbose
        # Split address
        host_port = tika_server.split(':')
        if len(host_port) != 2:
            raise Exception('tika_server value is incorrect: \''+tika_server+'\', please use \'host:port\' format')
        self.addr = (host_port[0], int(host_port[1]))
        # Build regexp for MIME types: one anchored alternative per type
        m = re.split(r'\s+', mime_types.strip())
        self.mime_regexp = re.compile('|'.join('^'+re.escape(i).replace('\\*', '.*')+'$' for i in m))

    def get_text(self, filename, mime_type, log_filename):
        """Extract text content from file using Tika which runs in server mode.

        filename     -- path of the file whose content is sent to Tika
        mime_type    -- MIME type of the file; None or a type not matching
                        the configured ones yields '' without any I/O
        log_filename -- name used for the file in verbose messages

        Returns the data received from the server ('' for an empty file).
        Errors are not propagated: they are reported in verbose mode and
        whatever was received up to that point is returned."""
        if mime_type is None or not self.mime_regexp.match(mime_type):
            # Tika can't handle this mime type, return nothing
            return ''
        fd = None
        s = None
        parts = []
        fsize = 0
        try:
            # Read original file
            fd = open(filename, 'rb')
            data = fd.read()
            fsize = len(data)
            if not fsize:
                return ''
            # Connect to Tika
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.connect(self.addr)
            s.setblocking(0)
            sockfd = s.fileno()
            # Tika is somewhat delicate about network IO, so:
            # Read and write using poll(2) system call
            p = select.poll()
            p.register(sockfd)
            while 1:
                fds = p.poll()
                if not fds:
                    break
                (pollfd, event) = fds[0]
                if event & select.POLLIN:
                    # Read error or empty data means EOF...
                    try:
                        part = os.read(sockfd, 65536)
                    except OSError:
                        break
                    if not part:
                        break
                    parts.append(part)
                elif event & (select.POLLERR | select.POLLHUP | select.POLLNVAL):
                    # Nothing left to read and the connection is broken or
                    # closed; without this the loop would spin forever.
                    break
                if event & select.POLLOUT:
                    if not len(data):
                        # Shutdown output and forget about POLLOUT
                        s.shutdown(socket.SHUT_WR)
                        p.modify(sockfd, select.POLLIN)
                    else:
                        # Write and consume some data
                        l = os.write(sockfd, data)
                        data = data[l:]
            if not parts:
                raise Exception('Empty response from Tika server')
            if self.verbose:
                received = sum(len(part) for part in parts)
                print("Extracted %d bytes from %s (%s) of size %d" % (received, log_filename, mime_type, fsize))
        except Exception as e:
            if self.verbose:
                print("Error extracting text from %s (%s) of size %d: %s" % (log_filename, mime_type, fsize, str(e)))
        finally:
            if fd: fd.close()
            if s: s.close()
        # Join once instead of growing a string chunk by chunk.
        return b''.join(parts)
|
|
|
|
class SvnRev:
    """Class used to hold information about a particular revision of
    the repository.

    After construction the instance carries the revision's author, log
    message and date (seconds since epoch, or None if it could not be
    parsed) and `changes`, a list of
    (path, action, plus, minus, content, mime) tuples, one per changed
    file, where action is 'add', 'remove' or 'change'."""

    def __init__(self, repo, rev):
        """Collect the properties and file changes of revision `rev`.

        repo -- the SvnRepo the revision belongs to
        rev  -- the revision number"""
        self.repo = repo
        self.rev = rev
        self.rev_roots = {} # cache of revision roots

        # revision properties ...
        revprops = svn.fs.revision_proplist(repo.fs, rev)
        self.author = str(revprops.get(svn.core.SVN_PROP_REVISION_AUTHOR,''))
        self.date = str(revprops.get(svn.core.SVN_PROP_REVISION_DATE, ''))
        self.log = str(revprops.get(svn.core.SVN_PROP_REVISION_LOG, ''))

        # convert the date string to seconds since epoch ...
        try:
            self.date = svn.core.svn_time_from_cstring(self.date) / 1000000
        except Exception:
            self.date = None

        # get a root for the current revisions
        fsroot = self._get_root_for_rev(rev)

        # find changes in the revision
        editor = svn.repos.ChangeCollector(repo.fs, fsroot)
        e_ptr, e_baton = svn.delta.make_editor(editor)
        svn.repos.svn_repos_replay(fsroot, e_ptr, e_baton)

        self.changes = []
        changes_hash = {}
        for path, change in editor.changes.items():
            # skip non-file changes
            if change.item_kind != svn.core.svn_node_file:
                continue

            # deal with the change types we handle
            base_root = None
            if change.base_path:
                base_root = self._get_root_for_rev(change.base_rev)

            # figure out what kind of change this is, and get a diff
            # object for it.  note that prior to 1.4 Subversion's
            # bindings didn't give us change.action, but that's okay
            # because back then deleted paths always had a change.path
            # of None.
            if hasattr(change, 'action') \
               and change.action == svn.repos.CHANGE_ACTION_DELETE:
                action = 'remove'
            elif not change.path:
                action = 'remove'
            elif change.added:
                action = 'add'
            else:
                action = 'change'

            if action == 'remove':
                diffobj = svn.fs.FileDiff(base_root, change.base_path, None, None, None, ['-b', '-B'])
            else:
                diffobj = svn.fs.FileDiff(base_root, change.base_path, fsroot, change.path, None, ['-b', '-B'])

            diff_fp = diffobj.get_pipe()
            diff_fp = StupidBufferedReader(diff_fp)
            plus, minus = _get_diff_counts(diff_fp)

            # CustIS Bug 50473: a workaround for svnlib behaviour in file movements (FILE1 -> FILE2 + FILE1 -> null)
            if change.base_path:
                if not change.path and change.base_path in changes_hash:
                    minus = 0
                elif change.path:
                    changes_hash[change.base_path] = change.path

            content = ''
            mime = ''
            # need to check if binary file's content changed when copying,
            # if not, don't extract it, just get it from previous revision later
            if repo.index_content and action != 'remove' and change.path and (not change.base_path
                or svn.fs.contents_changed(
                    base_root and base_root or None,
                    base_root and change.base_path or None,
                    fsroot, change.path
                )):
                props = svn.fs.node_proplist(fsroot, change.path)
                if not repo.svn_ignore_mimetype:
                    mime = props.get('svn:mime-type', None)
                else:
                    mime = None
                mime = repo.guesser.guess_mime(
                    mime,
                    os.path.basename(change.path),
                    diffobj.tempfile2
                )
                # Read and guess charset by ourselves for text files
                if self._is_text_mime(mime):
                    try:
                        fd = open(diffobj.tempfile2, 'rb')
                        try:
                            content = fd.read()
                        finally:
                            fd.close()
                    except Exception as e:
                        # Best effort: an unreadable file is simply not indexed
                        if repo.verbose:
                            print('Failed to read content of %s: %s' % (change.path, e))
                    # Guess charset
                    if content:
                        content, charset = repo.guesser.guess_charset(content)
                        if charset:
                            content = content.encode('utf-8')
                            if repo.verbose:
                                print('Guessed %s for %s' % (charset, change.path))
                        elif repo.verbose:
                            print('Failed to guess charset for %s, not indexing' % (change.path, ))
                # Try to extract content using Tika from binary documents
                elif repo.tika_client and mime is not None:
                    content = repo.tika_client.get_text(diffobj.tempfile2, mime, change.path)
            self.changes.append((path, action, plus, minus, content, mime))

    @staticmethod
    def _is_text_mime(mime):
        """Return True if MIME type `mime` is 'text/*' or an
        'application/*' type ending in 'xml'; False otherwise, including
        for None or an empty string."""
        if not mime:
            return False
        return mime.startswith('text/') or (
            mime.startswith('application/') and mime.endswith('xml'))

    def _get_root_for_rev(self, rev):
        """Fetch a revision root from a cache of such, or a fresh root
        (which is then cached for later use)."""
        if rev not in self.rev_roots:
            self.rev_roots[rev] = svn.fs.revision_root(self.repo.fs, rev)
        return self.rev_roots[rev]
|
|
|
|
|
|
def handle_revision(db, command, repo, rev, verbose, force=0):
|
|
"""Adds a particular revision of the repository to the checkin database."""
|
|
revision = repo[rev]
|
|
committed = 0
|
|
|
|
if verbose: print "Building commit info for revision %d..." % (rev),
|
|
|
|
if not revision.changes:
|
|
if verbose: print "skipped (no changes)."
|
|
return
|
|
|
|
for (path, action, plus, minus, content, mime) in revision.changes:
|
|
directory, file = os.path.split(path)
|
|
commit = cvsdb.CreateCommit()
|
|
commit.SetRepository(repo.path)
|
|
commit.SetDirectory(directory)
|
|
commit.SetFile(file)
|
|
commit.SetRevision(str(rev))
|
|
commit.SetAuthor(revision.author)
|
|
commit.SetDescription(revision.log)
|
|
commit.SetTime(revision.date)
|
|
commit.SetPlusCount(plus)
|
|
commit.SetMinusCount(minus)
|
|
commit.SetBranch(None)
|
|
commit.SetContent(content)
|
|
commit.SetMimeType(mime)
|
|
|
|
if action == 'add':
|
|
commit.SetTypeAdd()
|
|
elif action == 'remove':
|
|
commit.SetTypeRemove()
|
|
elif action == 'change':
|
|
commit.SetTypeChange()
|
|
|
|
if command == 'update':
|
|
result = db.CheckCommit(commit)
|
|
if result and not force:
|
|
continue # already recorded
|
|
|
|
# commit to database
|
|
db.AddCommit(commit)
|
|
committed = 1
|
|
|
|
if verbose:
|
|
if committed:
|
|
print "done."
|
|
else:
|
|
print "skipped (already recorded)."
|
|
|
|
def main(command, repository, revs=None, verbose=0, force=0):
    """Run one administrative command against the checkin database.

    command    -- 'rebuild', 'update' or 'purge'
    repository -- path of the Subversion repository
    revs       -- for 'update': empty/None to process all revisions, or a
                  two-item sequence giving the range bounds, where None
                  stands for the youngest revision.  The sequence is not
                  modified.
    verbose    -- when true, print progress information
    force      -- when true, 'update' re-records commits already present

    Exits with status 1 if 'purge' is asked for a repository the database
    does not know."""
    cfg = viewvc.load_config(CONF_PATHNAME)
    db = cvsdb.ConnectDatabase(cfg)

    repository = os.path.realpath(repository)

    # Purge what must be purged.
    if command in ('rebuild', 'purge'):
        if verbose:
            print("Purging commit info for repository root `%s'" % repository)
        try:
            db.PurgeRepository(repository)
        except cvsdb.UnknownRepositoryError as e:
            # An unknown repository only matters for an explicit purge; a
            # rebuild simply has nothing to remove first.
            if command == 'purge':
                sys.stderr.write("ERROR: " + str(e) + "\n")
                sys.exit(1)

    if command == 'purge':
        # Nothing gets recorded, so there is no need to open the
        # repository (which may not even exist on disk any more).
        return

    tika_client = None
    if cfg.utilities.tika_server:
        tika_client = TikaClient(cfg.utilities.tika_server, cfg.utilities.tika_mime_types, verbose)
    repo = SvnRepo(
        path = repository,
        index_content = cfg.cvsdb.index_content,
        tika_client = tika_client,
        guesser = cfg.guesser(),
        svn_ignore_mimetype = cfg.options.svn_ignore_mimetype,
        verbose = verbose,
    )

    # Record what must be recorded.
    if command == 'rebuild' or (command == 'update' and not revs):
        for rev in range(repo.rev_max+1):
            handle_revision(db, command, repo, rev, verbose, force)
    elif command == 'update':
        # Resolve "youngest" placeholders and order the bounds on local
        # values so that the caller's list is left untouched.
        first, last = revs[0], revs[1]
        if first is None:
            first = repo.rev_max
        if last is None:
            last = repo.rev_max
        if first > last:
            first, last = last, first
        for rev in range(first, last+1):
            handle_revision(db, command, repo, rev, verbose, force)
|
|
|
|
def _rev2int(r):
|
|
if r == 'HEAD':
|
|
r = None
|
|
else:
|
|
r = int(r)
|
|
if r < 0:
|
|
raise ValueError, "invalid revision '%d'" % (r)
|
|
return r
|
|
|
|
def usage():
    """Write the command-line help to stderr and exit with status 1."""
    cmd = os.path.basename(sys.argv[0])
    # The numbered synopsis lines and the continuation lines of each
    # numbered paragraph are aligned so the help reads as a list.
    sys.stderr.write(
"""Administer the ViewVC checkins database data for the Subversion repository
located at REPOS-PATH.

Usage: 1. %s [-v] rebuild REPOS-PATH
       2. %s [-v] update REPOS-PATH [REV[:REV2]] [--force]
       3. %s [-v] purge REPOS-PATH

1. Rebuild the commit database information for the repository located
   at REPOS-PATH across all revisions, after first purging
   information specific to that repository (if any).

2. Update the commit database information for the repository located
   at REPOS-PATH across all revisions or, optionally, only for the
   specified revision REV (or revision range REV:REV2). This is just
   like rebuilding, except that, unless --force is specified, no
   commit information will be stored for commits already present in
   the database. If a range is specified, the revisions will be
   processed in ascending order, and you may specify "HEAD" to
   indicate "the youngest revision currently in the repository".

3. Purge information specific to the repository located at REPOS-PATH
   from the database.

Use the -v flag to cause this script to give progress information as it works.

""" % (cmd, cmd, cmd))
    sys.exit(1)
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: strip the option flags, validate the
    # command and the optional revision (range), then hand over to main().

    # vclib.svn is used below; a plain "import vclib" does not by itself
    # load the submodule, so import it explicitly rather than rely on some
    # other module having done so.
    import vclib.svn

    verbose = 0
    force = 0
    # Work on a copy so that removing the flags leaves sys.argv intact.
    args = sys.argv[:]
    if '-v' in args:
        verbose = 1
        args.remove('-v')
    if '--force' in args:
        force = 1
        args.remove('--force')

    if len(args) < 3:
        usage()

    command = args[1].lower()
    if command not in ('rebuild', 'update', 'purge'):
        sys.stderr.write('ERROR: unknown command %s\n' % command)
        usage()

    revs = []
    if len(args) > 3:
        if command == 'rebuild':
            sys.stderr.write('ERROR: rebuild no longer accepts a revision '
                             'number argument. Use update --force.\n')
            usage()
        elif command != 'update':
            usage()
        try:
            revs = [_rev2int(x) for x in args[3].split(':')]
            if len(revs) > 2:
                raise ValueError("too many revisions in range")
            if len(revs) == 1:
                # A single revision is the range REV:REV
                revs.append(revs[0])
        except ValueError:
            sys.stderr.write('ERROR: invalid revision specification "%s"\n' \
                             % args[3])
            usage()

    try:
        repository = vclib.svn.canonicalize_rootpath(args[2])
        repository = cvsdb.CleanRepository(os.path.abspath(repository))
        main(command, repository, revs, verbose, force)
    except KeyboardInterrupt:
        sys.stdout.write('\n** break **\n')
        sys.exit(0)
|