Extract the RCS parsing code into a separate module. It now uses a "sink"
model, sending events/info to the blame script. (This allows the RCS parser
to be used in numerous contexts by simply switching the Sink that is used.)

git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@166 8cb11bc2-c004-0410-86c3-e597b4017df7
2001-05-08 19:34:58 +00:00 · 2001-05-08 19:34:58 +00:00 · 67daa9e5e2
parent 03a5620947
commit 67daa9e5e2
2 changed files with 355 additions and 227 deletions
--- a/lib/blame.py
+++ b/lib/blame.py
@@ -40,17 +40,10 @@ import re
 import time
 import math
 import cgi
+import rcsparse

-path_sep    = os.path.normpath('/')[-1]
-
-class CVSParser:
+class CVSParser(rcsparse.Sink):
  # Precompiled regular expressions
-  nonws_token = re.compile('^([^;@][^;\\s]*)\\s*')
-  semic_token = re.compile('^;\\s*')
-  rcsen_token = re.compile('^@([^@]*)')
-  undo_escape = re.compile('@@')
-  odd_at      = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
-  rcs_tree    = re.compile('^\\d')
  trunk_rev   = re.compile('^[0-9]+\\.[0-9]+$')
  last_branch = re.compile('(.*)\\.[0-9]+')
  is_branch   = re.compile('(.*)\\.0\\.([0-9]+)')
@@ -63,9 +56,6 @@ class CVSParser:
    self.Reset()

  def Reset(self):
-    self.line_buffer = ''
-    self.rcsfile = None
-    self.debug = 0
    self.last_revision = {}
    self.prev_revision = {}
    self.revision_date = {}
@@ -73,7 +63,6 @@ class CVSParser:
    self.revision_branches = {}
    self.next_delta = {}
    self.prev_delta = {}
-    self.feof = 0
    self.tag_revision = {}
    self.revision_symbolic_name = {}
    self.timestamp = {}
@@ -85,66 +74,6 @@ class CVSParser:
    self.lines_added  = {}
    self.lines_removed = {}

-  # Get the next token from the RCS file
-  def get_token(self):
-    # Erase all-whitespace lines
-    while len(self.line_buffer) == 0:
-      self.line_buffer = self.rcsfile.readline()
-      if self.line_buffer == '':
-        raise RuntimeError, 'EOF'
-      self.line_buffer = string.lstrip(self.line_buffer)
-
-    # A string of non-whitespace characters is a token
-    match = self.nonws_token.match(self.line_buffer)
-    if match:
-      self.line_buffer = self.nonws_token.sub('', self.line_buffer)
-      return match.group(1)
-
-    # ...and so is a single semicolon
-    if self.semic_token.match(self.line_buffer):
-      self.line_buffer = self.semic_token.sub('', self.line_buffer)
-      return ';'
-
-    # ...or an RCS-encoded string that starts with an @ character
-    match = self.rcsen_token.match(self.line_buffer)
-    self.line_buffer = self.rcsen_token.sub('', self.line_buffer)
-    token = match.group(1)
-
-    # Detect odd @ character used to close RCS-encoded string
-    while string.find(self.line_buffer, '@') < 0 or not self.odd_at.search(self.line_buffer):
-      token = token + self.line_buffer
-      self.line_buffer = self.rcsfile.readline()
-      if self.line_buffer == '':
-        raise RuntimeError, 'EOF'
-
-    # Retain the remainder of the line after the terminating @ character
-    i = self.odd_at.search(self.line_buffer).end(1)
-    token = token + self.line_buffer[:i]
-    self.line_buffer = self.line_buffer[i+1:]
-
-    # Undo escape-coding of @ characters.
-    token = self.undo_escape.sub('@', token)
-
-    # Digest any extra blank lines
-    while len(self.line_buffer) == 0 or self.line_buffer == '\n':
-      self.line_buffer = self.rcsfile.readline()
-      if self.line_buffer == '':
-        self.feof = 1
-        break
-
-    return token
-
-  # Try to match the next token from the input buffer
-  def match_token(self, match):
-    token = self.get_token()
-    if token != match:
-      raise RuntimeError, ('Unexpected parsing error in RCS file.\n' +
-                           'Expected token: %s, but saw: %s' % (match, token))
-
-  # Push RCS token back into the input buffer.
-  def unget_token(self, token):
-    self.line_buffer = token + " " + self.line_buffer
-
  # Map a tag to a numerical revision number.  The tag can be a symbolic
  # branch tag, a symbolic revision tag, or an ordinary numerical
  # revision number.
@@ -239,49 +168,25 @@ class CVSParser:
      self.lines_removed[revision] = self.lines_removed[revision] + lines_removed_now
    return text

-  def parse_rcs_admin(self):
-    while 1:
-      # Read initial token at beginning of line
-      token = self.get_token()
+  def set_head_revision(self, revision):
+    self.head_revision = revision

-      # We're done once we reach the description of the RCS tree
-      if self.rcs_tree.match(token):
-        self.unget_token(token)
-        return
+  def set_principal_branch(self, branch_name):
+    self.principal_branch = branch_name

-      # print "token:", token
+  def define_tag(self, name, revision):
+    # Create an associative array that maps from tag name to
+    # revision number and vice-versa.
+    self.tag_revision[name] = revision

-      if token == "head":
-        self.head_revision = self.get_token()
-        self.get_token()         # Eat semicolon
-      elif token == "branch":
-        self.principal_branch = self.get_token()
-        self.get_token()         # Eat semicolon
-      elif token == "symbols":
-        # Create an associate array that maps from tag name to
-        # revision number and vice-versa.
-        while 1:
-          tag = self.get_token()
-          if tag == ';':
-            break
-          (tag_name, tag_rev) = string.split(tag, ':')
-          self.tag_revision[tag_name] = tag_rev
-          self.revision_symbolic_name[tag_rev] = tag_name
-      elif token == "comment":
-        self.file_description = self.get_token()
-        self.get_token()         # Eat semicolon
+    ### actually, this is a bit bogus... a rev can have multiple names
+    self.revision_symbolic_name[revision] = name

-      # Ignore all these other fields - We don't care about them.         
-      elif token in ("locks", "strict", "expand", "access"):
-        while 1:
-          tag = self.get_token()
-          if tag == ';':
-            break
-      else:
-        pass
-        # warn("Unexpected RCS token: $token\n")
+  def set_comment(self, comment):
+    self.file_description = comment

-    raise RuntimeError, "Unexpected EOF";
+  def set_description(self, description):
+    self.rcs_file_description = description

  # Construct dicts that represent the topology of the RCS tree
  # and other arrays that contain info about individual revisions.
@@ -305,91 +210,47 @@ class CVSParser:
  # Also creates self.last_revision, keyed by a branch revision number, which
  # indicates the latest revision on a given branch,
  #   e.g. self.last_revision{"1.2.8"} == 1.2.8.5
+  def define_revision(self, revision, timestamp, author, state,
+                      branches, next):
+    self.tag_revision[revision] = revision
+    branch = self.last_branch.match(revision).group(1)
+    self.last_revision[branch] = revision

-  def parse_rcs_tree(self):
-    while 1:
-      revision = self.get_token()
+    #self.revision_date[revision] = date
+    self.timestamp[revision] = timestamp

-      # End of RCS tree description ?
-      if revision == 'desc':
-        self.unget_token(revision)
-        return
+    # Pretty print the date string
+    ltime = time.localtime(self.timestamp[revision])
+    formatted_date = time.strftime("%d %b %Y %H:%M", ltime)
+    self.revision_ctime[revision] = formatted_date

+    # Save age
+    self.revision_age[revision] = ((time.time() - self.timestamp[revision])
+                                   / self.SECONDS_PER_DAY)
+
+    # save author
+    self.revision_author[revision] = author
+
+    # ignore the state
+
+    # process the branch information
+    branch_text = ''
+    for branch in branches:
+      self.prev_revision[branch] = revision
+      self.next_delta[revision] = branch
+      self.prev_delta[branch] = revision
+      branch_text = branch_text + branch + ''
+    self.revision_branches[revision] = branch_text
+
+    # process the "next revision" information
+    if next:
+      self.next_delta[revision] = next
+      self.prev_delta[next] = revision
      is_trunk_revision = self.trunk_rev.match(revision) is not None
-
-      self.tag_revision[revision] = revision
-      branch = self.last_branch.match(revision).group(1)
-      self.last_revision[branch] = revision
-
-      # Parse date
-      self.match_token('date')
-      date = self.get_token()
-      self.revision_date[revision] = date
-      self.match_token(';')
-
-      # Convert date into timestamp
-      date_fields = string.split(date, '.') + ['0', '0', '0']
-      date_fields = map(string.atoi, date_fields)
-      if date_fields[0] < 100:
-        date_fields[0] = date_fields[0] + 1900
-      self.timestamp[revision] = time.mktime(date_fields)
-
-      # Pretty print the date string
-      ltime = time.localtime(self.timestamp[revision])
-      formatted_date = time.strftime("%d %b %Y %H:%M", ltime)
-      self.revision_ctime[revision] = formatted_date
-
-      # Save age
-      self.revision_age[revision] = (
-              (time.time() - self.timestamp[revision]) / self.SECONDS_PER_DAY)
-
-      # Parse author
-      self.match_token('author')
-      author = self.get_token()
-      self.revision_author[revision] = author
-      self.match_token(';')
-
-      # Parse state
-      self.match_token('state')
-      while self.get_token() != ';':
-        pass
-
-      # Parse branches
-      self.match_token('branches')
-      branches = ''
-      while 1:
-        token = self.get_token()
-        if token == ';':
-          break
-        self.prev_revision[token] = revision
-        self.prev_delta[token] = revision
-        branches = branches + token + ' '
-      self.revision_branches[revision] = branches
-
-      # Parse revision of next delta in chain
-      self.match_token('next')
-      next = ''
-      token = self.get_token()
-      if token != ';':
-        next = token
-        self.get_token()         # Eat semicolon
-        self.next_delta[revision] = next
-        self.prev_delta[next] = revision
-        if is_trunk_revision:
-          self.prev_revision[revision] = next
-        else:
-          self.prev_revision[next] = revision
-
-      if self.debug >= 3:
-        print "<pre>revision =", revision
-        print "date     = ", date
-        print "author   = ", author
-        print "branches = ", branches
-        print "next     = ", next + "</pre>\n"
-
-  def parse_rcs_description(self):
-    self.match_token('desc')
-    self.rcs_file_description = self.get_token()
+      if is_trunk_revision:
+        self.prev_revision[revision] = next
+      else:
+        self.prev_revision[next] = revision

  # Construct associative arrays containing info about individual revisions.
  #
@@ -402,36 +263,9 @@ class CVSParser:
  #                          revision if this revision is on the trunk or
  #                          relative to its immediate predecessor if this
  #                          revision is on a branch.
-  def parse_rcs_deltatext(self):
-    while not self.feof:
-      revision = self.get_token()
-      if self.debug >= 3:
-        print "Reading delta for revision:", revision
-      self.match_token('log')
-      self.revision_log[revision] = self.get_token()
-      self.match_token('text')
-      self.revision_deltatext[revision] = self.get_token()
-
-  def parse_rcs_file(self):
-    if self.debug >= 2:
-      print "Reading RCS admin..."
-    self.parse_rcs_admin()
-    if self.debug >= 2:
-      print "Reading RCS revision tree topology..."
-    self.parse_rcs_tree()
-
-    if self.debug >= 3:
-      print "<pre>Keys:\n"
-      for i in self.tag_revision.keys():
-        print "yoyuo %s: %s" % (i, self.tag_revision[i])
-      print "</pre>"
-
-    self.parse_rcs_description()
-    if self.debug >= 2:
-      print "Reading RCS revision deltas..."
-    self.parse_rcs_deltatext()
-    if self.debug >= 2:
-      print "Done reading RCS file..."
+  def set_revision_info(self, revision, log, text):
+    self.revision_log[revision] = log
+    self.revision_deltatext[revision] = text

  def parse_cvs_file(self, rcs_pathname, opt_rev = None, opt_m_timestamp = None):
    # Args in:  opt_rev - requested revision
@@ -442,13 +276,13 @@ class CVSParser:

    # CheckHidden(rcs_pathname);
    try:
-      self.rcsfile = open(rcs_pathname, 'r')
+      rcsfile = open(rcs_pathname, 'r')
    except:
      raise RuntimeError, ('error: %s appeared to be under CVS control, ' +
              'but the RCS file is inaccessible.') % rcs_pathname

-    self.parse_rcs_file()
-    self.rcsfile.close()
+    rcsparse.Parser().parse(rcsfile, self)
+    rcsfile.close()

    if opt_rev in [None, '', 'HEAD']:
      # Explicitly specified topmost revision in tree
@@ -599,7 +433,8 @@ def link_includes(text, root, rcs_path):
    incfile = match.group(3)
    for rel_path in ('', 'Attic', '..'):
      trial_root = os.path.join(rcs_path, rel_path)
-      file = os.path.normpath('%s%s%s%s%s,v' % (root, path_sep, trial_root, path_sep, incfile))
+      file = os.path.join(root, trial_root)
+      file = os.path.normpath(os.path.join(file, incfile + ',v'))
      if os.access(file, os.F_OK):
        return '#%sinclude%s"<a href="%s">%s</a>"' % \
               (match.group(1), match.group(2),
@@ -607,7 +442,7 @@ def link_includes(text, root, rcs_path):
  return text

 def make_html(root, rcs_path, opt_rev = None, sticky = None):
-  filename = root + path_sep + rcs_path
+  filename = os.path.join(root, rcs_path)
  parser = CVSParser()
  revision = parser.parse_cvs_file(filename, opt_rev)
  count = len(parser.revision_map)
--- a/lib/rcsparse.py
+++ b/lib/rcsparse.py
@@ -0,0 +1,293 @@
+#
+# Copyright (C) 2000 The ViewCVS Group. All Rights Reserved.
+# Copyright (C) 2000 Curt Hagenlocher <curt@hagenlocher.org>
+#
+# By using this file, you agree to the terms and conditions set forth in
+# the LICENSE.html file which can be found at the top level of the ViewCVS
+# distribution or at http://www.lyra.org/viewcvs/license-1.html.
+#
+# Contact information:
+#   Greg Stein, PO Box 760, Palo Alto, CA, 94302
+#   gstein@lyra.org, http://www.lyra.org/viewcvs/
+#
+# -----------------------------------------------------------------------
+#
+# This software is being maintained as part of the ViewCVS project.
+# Information is available at:
+#    http://viewcvs.sourceforge.net/
+#
+# This file was originally based on the cvsblame.pl portion of the Bonsai
+# CVS tool, developed by Steve Lamm for Netscape Communications Corporation.
+# More information about Bonsai can be found at
+#    http://www.mozilla.org/bonsai.html
+#
+# cvsblame.pl, in turn, was based on Scott Furman's cvsblame script
+#
+# -----------------------------------------------------------------------
+
+import re
+import string
+import time
+
+
+class Parser:
+  # Precompiled regular expressions
+  nonws_token = re.compile('^([^;@][^;\\s]*)\\s*')
+  semic_token = re.compile('^;\\s*')
+  rcsen_token = re.compile('^@([^@]*)')
+  undo_escape = re.compile('@@')
+  odd_at      = re.compile('(([^@]|^)(@@)*)@([^@]|$)')
+  rcs_tree    = re.compile('^\\d')
+
+  # Get the next token from the RCS file
+  def get_token(self):
+    # Erase all-whitespace lines
+    while len(self.line_buffer) == 0:
+      self.line_buffer = self.rcsfile.readline()
+      if self.line_buffer == '':
+        raise RuntimeError, 'EOF'
+      self.line_buffer = string.lstrip(self.line_buffer)
+
+    # A string of non-whitespace characters is a token
+    match = self.nonws_token.match(self.line_buffer)
+    if match:
+      self.line_buffer = self.nonws_token.sub('', self.line_buffer)
+      return match.group(1)
+
+    # ...and so is a single semicolon
+    if self.semic_token.match(self.line_buffer):
+      self.line_buffer = self.semic_token.sub('', self.line_buffer)
+      return ';'
+
+    # ...or an RCS-encoded string that starts with an @ character
+    match = self.rcsen_token.match(self.line_buffer)
+    self.line_buffer = self.rcsen_token.sub('', self.line_buffer)
+    token = match.group(1)
+
+    # Detect odd @ character used to close RCS-encoded string
+    while string.find(self.line_buffer, '@') < 0 or not self.odd_at.search(self.line_buffer):
+      token = token + self.line_buffer
+      self.line_buffer = self.rcsfile.readline()
+      if self.line_buffer == '':
+        raise RuntimeError, 'EOF'
+
+    # Retain the remainder of the line after the terminating @ character
+    i = self.odd_at.search(self.line_buffer).end(1)
+    token = token + self.line_buffer[:i]
+    self.line_buffer = self.line_buffer[i+1:]
+
+    # Undo escape-coding of @ characters.
+    token = self.undo_escape.sub('@', token)
+
+    # Digest any extra blank lines
+    while len(self.line_buffer) == 0 or self.line_buffer == '\n':
+      self.line_buffer = self.rcsfile.readline()
+      if self.line_buffer == '':
+        self.feof = 1
+        break
+
+    return token
+
+  # Try to match the next token from the input buffer
+  def match_token(self, match):
+    token = self.get_token()
+    if token != match:
+      raise RuntimeError, ('Unexpected parsing error in RCS file.\n' +
+                           'Expected token: %s, but saw: %s' % (match, token))
+
+  # Push RCS token back into the input buffer.
+  def unget_token(self, token):
+    self.line_buffer = token + " " + self.line_buffer
+
+  def parse_rcs_admin(self):
+    while 1:
+      # Read initial token at beginning of line
+      token = self.get_token()
+
+      # We're done once we reach the description of the RCS tree
+      if self.rcs_tree.match(token):
+        self.unget_token(token)
+        return
+
+      # print "token:", token
+
+      if token == "head":
+        self.sink.set_head_revision(self.get_token())
+        self.match_token(';')
+      elif token == "branch":
+        self.sink.set_principal_branch(self.get_token())
+        self.match_token(';')
+      elif token == "symbols":
+        while 1:
+          tag = self.get_token()
+          if tag == ';':
+            break
+          (tag_name, tag_rev) = string.split(tag, ':')
+          self.sink.define_tag(tag_name, tag_rev)
+      elif token == "comment":
+        self.sink.set_comment(self.get_token())
+        self.match_token(';')
+
+      # Ignore all these other fields - We don't care about them.         
+      elif token in ("locks", "strict", "expand", "access"):
+        while 1:
+          tag = self.get_token()
+          if tag == ';':
+            break
+      else:
+        pass
+        # warn("Unexpected RCS token: $token\n")
+
+    raise RuntimeError, "Unexpected EOF";
+
+  def parse_rcs_tree(self):
+    while 1:
+      revision = self.get_token()
+
+      # End of RCS tree description ?
+      if revision == 'desc':
+        self.unget_token(revision)
+        return
+
+      # Parse date
+      self.match_token('date')
+      date = self.get_token()
+      self.match_token(';')
+
+      # Convert date into timestamp
+      date_fields = string.split(date, '.') + ['0', '0', '0']
+      date_fields = map(string.atoi, date_fields)
+      if date_fields[0] < 100:
+        date_fields[0] = date_fields[0] + 1900
+      timestamp = time.mktime(date_fields)
+
+      # Parse author
+      self.match_token('author')
+      author = self.get_token()
+      self.match_token(';')
+
+      # Parse state
+      self.match_token('state')
+      state = ''
+      while 1:
+        token = self.get_token()
+        if token == ';':
+          break
+        state = state + token + ' '
+      state = state[:-1]	# toss the trailing space
+
+      # Parse branches
+      self.match_token('branches')
+      branches = [ ]
+      while 1:
+        token = self.get_token()
+        if token == ';':
+          break
+        branches.append(token)
+
+      # Parse revision of next delta in chain
+      self.match_token('next')
+      next = self.get_token()
+      if next == ';':
+        next = None
+      else:
+        self.match_token(';')
+
+      # there are some files with extra tags in them. for example:
+      #    owner	640;
+      #    group	15;
+      #    permissions	644;
+      #    hardlinks	@configure.in@;
+      # we just want to skip over these
+      while 1:
+        token = self.get_token()
+        if token == 'desc' or self.rcs_tree.match(token):
+          self.unget_token(token)
+          break
+        # consume everything up to the semicolon
+        while self.get_token() != ';':
+          pass
+
+      self.sink.define_revision(revision, timestamp, author, state, branches,
+                                next)
+
+  def parse_rcs_description(self):
+    self.match_token('desc')
+    self.sink.set_description(self.get_token())
+
+  def parse_rcs_deltatext(self):
+    while not self.feof:
+      revision = self.get_token()
+      self.match_token('log')
+      log = self.get_token()
+      self.match_token('text')
+      text = self.get_token()
+      self.sink.set_revision_info(revision, log, text)
+
+  def parse(self, file, sink):
+    self.rcsfile = file
+    self.sink = sink
+    self.line_buffer = ''
+    self.feof = 0
+
+    self.parse_rcs_admin()
+    self.parse_rcs_tree()
+
+    # many sinks want to know when the tree has been completed so they can
+    # do some work to prep for the arrival of the deltatext
+    self.sink.tree_completed()
+
+    self.parse_rcs_description()
+    self.parse_rcs_deltatext()
+
+    self.rcsfile = self.sink = None
+
+
+class Sink:
+  def set_head_revision(self, revision):
+    pass
+  def set_principal_branch(self, branch_name):
+    pass
+  def define_tag(self, name, revision):
+    pass
+  def set_comment(self, comment):
+    pass
+  def set_description(self, description):
+    pass
+  def define_revision(self, revision, timestamp, author, state,
+                      branches, next):
+    pass
+  def set_revision_info(self, revision, log, text):
+    pass
+  def tree_completed(self):
+    pass
+
+class DebugSink:
+  def set_head_revision(self, revision):
+    print 'head:', revision
+
+  def set_principal_branch(self, branch_name):
+    print 'branch:', branch_name
+
+  def define_tag(self, name, revision):
+    print 'tag:', name, '=', revision
+
+  def set_comment(self, comment):
+    print 'comment:', comment
+
+  def set_description(self, description):
+    print 'description:', description
+
+  def define_revision(self, revision, timestamp, author, state,
+                      branches, next):
+    print 'revision:', revision
+    print '    timestamp:', timestamp
+    print '    author:', author
+    print '    state:', state
+    print '    branches:', branches
+    print '    next:', next
+
+  def set_revision_info(self, revision, log, text):
+    print 'revision:', revision
+    print '    log:', log
+    print '    text:', text[:100], '...'