Check in the mxTextTools-based parser that I implemented back in

November. The timelog.fetch_log2() timing for my test input dropped from
.53 seconds to .34 seconds. The overall token-stream parsing time dropped
from .25 seconds to .10 seconds.

This also introduces an mget() method on the token stream so that the
parser can fetch multiple tokens in one shot, rather than needing to
call get() many times.
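
For example, the pattern below (taken from the parser changes in this
commit) reads a revision and its terminating semicolon in a single call;
mget() returns the 'next' token last:

    semi, rev = self.ts.mget(2)
    self.sink.set_head_revision(rev)
    if semi != ';':
      raise RCSExpected(semi, ';')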

In timelog, I've added a new compare_many() for verifying a batch of
files, time_stream() to time an individual token stream, profile_stream()
for profiling a stream, and revised the time_fetch() function to run
multiple fetches and to trim the outlier timings off both ends before
reporting.
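
The trimming keeps the middle ~90% of the sorted timings before
averaging. A sketch of that computation, mirroring the new time_fetch()
code ('times' holds the n per-run timings):

    times.sort()
    keep = times[int(n*.05) : int(n*.95)+1]   # n=100 keeps indices 5..95
    avg = reduce(lambda x, y: x + y, keep, 0) / len(keep)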


git-svn-id: http://viewvc.tigris.org/svn/viewvc/trunk@456 8cb11bc2-c004-0410-86c3-e597b4017df7
gstein 2002-01-29 10:08:46 +00:00
parent 2c245d2bdb
commit b37103c4d7
2 changed files with 400 additions and 33 deletions


@@ -160,8 +160,317 @@ class _TokenStream:
self.get = give_it_back
def mget(self, count):
"Return multiple tokens. 'next' is at the end."
result = [ ]
for i in range(count):
result.append(self.get())
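    # the most recently fetched token is the stream's 'next' token;
    # reversing puts it at the end of the returned list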
result.reverse()
return result
try:
from mx import TextTools
except ImportError:
_mxTokenStream = None
else:
_tt = TextTools
_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.')   # leave '.' as part of the 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')
_idchar_set = _tt.set(_idchar)
_onechar_token_set = _tt.set(':;')
_not_at_set = _tt.invset('@')
_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70
_E_COMPLETE = 100 # ended on a complete token
_E_TOKEN = 110 # ended mid-token
_E_STRING_SPAN = 130 # ended within a string
_E_STRING_END = 140 # ended with string-end ('@') (could be mid-@@)
_SUCCESS = +100
_EOF = 'EOF'
_CONTINUE = 'CONTINUE'
_UNUSED = 'UNUSED'
# continuation of a token over a chunk boundary
_c_token_table = (
(_T_TOKEN, _tt.AllInSet, _idchar_set),
)
class _mxTokenStream:
# the algorithm is about the same speed for any CHUNK_SIZE chosen.
# grab a good-sized chunk, but not too large to overwhelm memory.
CHUNK_SIZE = 100000
# CHUNK_SIZE = 5 # for debugging, make the function grind...
def __init__(self, file):
self.rcsfile = file
self.tokens = [ ]
self.partial = None
self.string_end = None
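    # self.partial holds (TOKEN-TYPE, [text chunks]) whenever a token or
    # @-string straddles a chunk boundary; self.string_end is filled in
    # by the _set_end callback when a string terminator is found.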
def _parse_chunk(self, buf, start=0):
"Get the next token from the RCS file."
buflen = len(buf)
assert start < buflen
# construct a tag table which refers to the buffer we need to parse.
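    # each tag table entry is roughly (tagobj, command+flags, argument,
    # jump-if-no-match, jump-if-match); the jumps are offsets relative to
    # the current entry, and _SUCCESS jumps past the end of the table,
    # which terminates matching successfully.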
table = (
# ignore whitespace. with or without whitespace, move to the next rule.
(None, _tt.AllInSet, _tt.whitespace_set, +1),
(_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
# accumulate token text and exit, or move to the next rule.
(_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),
(_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),
# single character tokens exit immediately, or move to the next rule
(_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),
(_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),
# if this isn't an '@' symbol, then we have a syntax error (go to a
# negative index to indicate that condition). otherwise, suck it up
# and move to the next rule.
(_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),
(None, _tt.Is, '@', +4, +1),
(buf, _tt.Is, '@', +1, -1),
(_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),
(_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),
(_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),
# suck up everything that isn't an AT. go to next rule to look for EOF
(buf, _tt.AllInSet, _not_at_set, 0, +1),
# go back to look for double AT if we aren't at the end of the string
(_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
)
success, taglist, idx = _tt.tag(buf, table, start)
if not success:
### need a better way to report this error
raise RCSIllegalCharacter()
assert idx == buflen
# pop off the last item
last_which = taglist.pop()
i = 0
tlen = len(taglist)
while i < tlen:
if taglist[i] == _T_STRING_START:
j = i + 1
while j < tlen:
if taglist[j] == _T_STRING_END:
s = _tt.join(taglist, '', i+1, j)
del taglist[i:j]
tlen = len(taglist)
taglist[i] = s
break
j = j + 1
else:
assert last_which == _E_STRING_SPAN
s = _tt.join(taglist, '', i+1)
del taglist[i:]
self.partial = (_T_STRING_SPAN, [ s ])
break
i = i + 1
# figure out whether we have a partial last-token
if last_which == _E_TOKEN:
self.partial = (_T_TOKEN, [ taglist.pop() ])
elif last_which == _E_COMPLETE:
pass
elif last_which == _E_STRING_SPAN:
assert self.partial
else:
assert last_which == _E_STRING_END
self.partial = (_T_STRING_END, [ taglist.pop() ])
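    # self.tokens is kept reversed so that get() can pop() the next token
    # off the end; older, still-unconsumed tokens are extended on last,
    # so they pop first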
taglist.reverse()
taglist.extend(self.tokens)
self.tokens = taglist
def _set_end(self, taglist, text, l, r, subtags):
self.string_end = l
def _handle_partial(self, buf):
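    # Consume the continuation (at the start of 'buf') of a token or
    # string left dangling by the previous chunk. Returns the index into
    # 'buf' where normal chunk parsing should resume.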
which, chunks = self.partial
if which == _T_TOKEN:
success, taglist, idx = _tt.tag(buf, _c_token_table)
if not success:
# The start of this buffer was not a token. So the end of the
# prior buffer was a complete token.
self.tokens.insert(0, string.join(chunks, ''))
else:
assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
and taglist[0][1] == 0 and taglist[0][2] == idx
if idx == len(buf):
#
# The whole buffer was one huge token, so we may have a
# partial token again.
#
# Note: this modifies the list of chunks in self.partial
#
chunks.append(buf)
# consumed the whole buffer
return len(buf)
# got the rest of the token.
chunks.append(buf[:idx])
self.tokens.insert(0, string.join(chunks, ''))
# no more partial token
self.partial = None
return idx
if which == _T_STRING_END:
if buf[0] != '@':
self.tokens.insert(0, string.join(chunks, ''))
return 0
chunks.append('@')
start = 1
else:
start = 0
self.string_end = None
string_table = (
(None, _tt.Is, '@', +3, +1),
(_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
(self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),
(None, _tt.EOF, _tt.Here, +1, _SUCCESS),
# suck up everything that isn't an AT. move to next rule to look
# for EOF
(_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),
# go back to look for double AT if we aren't at the end of the string
(None, _tt.EOF, _tt.Here, -5, _SUCCESS),
)
success, unused, idx = _tt.tag(buf, string_table,
start, len(buf), chunks)
# must have matched at least one item
assert success
if self.string_end is None:
assert idx == len(buf)
self.partial = (_T_STRING_SPAN, chunks)
elif self.string_end < len(buf):
self.partial = None
self.tokens.insert(0, string.join(chunks, ''))
else:
self.partial = (_T_STRING_END, chunks)
return idx
def _parse_more(self):
buf = self.rcsfile.read(self.CHUNK_SIZE)
if not buf:
return _EOF
if self.partial:
idx = self._handle_partial(buf)
if idx is None:
return _CONTINUE
if idx < len(buf):
self._parse_chunk(buf, idx)
else:
self._parse_chunk(buf)
return _CONTINUE
def get(self):
try:
return self.tokens.pop()
except IndexError:
pass
while not self.tokens:
action = self._parse_more()
if action == _EOF:
return None
return self.tokens.pop()
#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token
def match(self, match):
if self.tokens:
token = self.tokens.pop()
if token != match:
raise RuntimeError, ('Unexpected parsing error in RCS file.\n'
'Expected token: %s, but saw: %s'
% (match, token))
else:
token = self.get()
if token != match:
raise RuntimeError, ('Unexpected parsing error in RCS file.\n'
'Expected token: %s, but saw: %s'
% (match, token))
def unget(self, token):
self.tokens.append(token)
def mget(self, count):
"Return multiple tokens. 'next' is at the end."
while len(self.tokens) < count:
action = self._parse_more()
if action == _EOF:
### fix this
raise RuntimeError, 'EOF hit while expecting tokens'
result = self.tokens[-count:]
del self.tokens[-count:]
return result
class RCSParseError(Exception):
pass
class RCSIllegalCharacter(RCSParseError):
pass
### need more work on this one
class RCSExpected(RCSParseError):
def __init__(self, got, wanted):
pass
class Parser:
stream_class = _mxTokenStream or _TokenStream
def parse_rcs_admin(self):
while 1:
# Read initial token at beginning of line
@@ -173,21 +482,32 @@ class Parser:
return
if token == "head":
self.sink.set_head_revision(self.ts.get())
self.ts.match(';')
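      # mget(2) returns the 'next' token last, so 'rev' appears before
      # the ';' in the file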
semi, rev = self.ts.mget(2)
self.sink.set_head_revision(rev)
if semi != ';':
raise RCSExpected(semi, ';')
elif token == "branch":
self.sink.set_principal_branch(self.ts.get())
self.ts.match(';')
semi, branch = self.ts.mget(2)
self.sink.set_principal_branch(branch)
if semi != ';':
raise RCSExpected(semi, ';')
elif token == "symbols":
while 1:
tag = self.ts.get()
if tag == ';':
break
(tag_name, tag_rev) = string.split(tag, ':')
if self.stream_class == _mxTokenStream:
self.ts.match(':')
tag_name = tag
tag_rev = self.ts.get()
else:
(tag_name, tag_rev) = string.split(tag, ':')
self.sink.define_tag(tag_name, tag_rev)
elif token == "comment":
self.sink.set_comment(self.ts.get())
self.ts.match(';')
semi, comment = self.ts.mget(2)
self.sink.set_comment(comment)
if semi != ';':
raise RCSExpected(semi, ';')
# Ignore all these other fields - We don't care about them. Also chews
# up "newphrase".
@@ -212,9 +532,11 @@ class Parser:
return
# Parse date
self.ts.match('date')
date = self.ts.get()
self.ts.match(';')
semi, date, sym = self.ts.mget(3)
if sym != 'date':
raise RCSExpected(sym, 'date')
if semi != ';':
raise RCSExpected(semi, ';')
# Convert date into timestamp
date_fields = string.split(date, '.') + ['0', '0', '0']
@@ -224,9 +546,11 @@ class Parser:
timestamp = time.mktime(tuple(date_fields))
# Parse author
self.ts.match('author')
author = self.ts.get()
self.ts.match(';')
semi, author, sym = self.ts.mget(3)
if sym != 'author':
raise RCSExpected(sym, 'author')
if semi != ';':
raise RCSExpected(semi, ';')
# Parse state
self.ts.match('state')
@@ -248,8 +572,9 @@ class Parser:
branches.append(token)
# Parse revision of next delta in chain
self.ts.match('next')
next = self.ts.get()
next, sym = self.ts.mget(2)
if sym != 'next':
raise RCSExpected(sym, 'next')
if next == ';':
next = None
else:
@@ -283,15 +608,17 @@ class Parser:
if revision is None:
# EOF
break
self.ts.match('log')
log = self.ts.get()
text, sym2, log, sym1 = self.ts.mget(4)
if sym1 != 'log':
print `text[:100], sym2[:100], log[:100], sym1[:100]`
raise RCSExpected(sym1, 'log')
if sym2 != 'text':
raise RCSExpected(sym2, 'text')
### need to add code to chew up "newphrase"
self.ts.match('text')
text = self.ts.get()
self.sink.set_revision_info(revision, log, text)
def parse(self, file, sink):
self.ts = _TokenStream(file)
self.ts = self.stream_class(file)
self.sink = sink
self.parse_rcs_admin()


@@ -84,6 +84,8 @@ def fetch_log2(full_name, which_rev=None):
return sink.head, sink.branch, sink.tags, sink.revs
def compare_fetch(full_name, which_rev=None):
# d1 and d2 are:
# ( HEAD revision, branch name, TAGS { name : revision }, [ LogEntry ] )
d1 = viewcvs.fetch_log(full_name, which_rev)
d2 = fetch_log2(full_name, which_rev)
if d1[:3] != d2[:3]:
@@ -102,21 +104,59 @@ def compare_fetch(full_name, which_rev=None):
if vars(d1[3][i]) != vars(d2[3][i]):
pprint.pprint((i, vars(d1[3][i]), vars(d2[3][i])))
def time_fetch(full_name, which_rev=None):
t = time.time()
viewcvs.fetch_log(full_name, which_rev)
t1 = time.time() - t
t = time.time()
fetch_log2(full_name, which_rev)
t2 = time.time() - t
print t1, t2
def compare_many(files):
for file in files:
print file, '...'
compare_fetch(file)
def profile_fetch(full_name, which_rev=None):
def time_stream(stream_class, filename, n=10):
d1 = d2 = d3 = d4 = 0
t = time.time()
for i in range(n):
ts = stream_class(open(filename))
while ts.get() is not None:
pass
t = time.time() - t
print t/n
def time_fetch(full_name, which_rev=None, n=1):
times1 = [ None ] * n
times2 = [ None ] * n
for i in range(n):
t = time.time()
viewcvs.fetch_log(full_name, which_rev)
times1[i] = time.time() - t
for i in range(n):
t = time.time()
fetch_log2(full_name, which_rev)
times2[i] = time.time() - t
times1.sort()
times2.sort()
i1 = int(n*.05)
i2 = int(n*.95)+1
times1 = times1[i1:i2]
times2 = times2[i1:i2]
t1 = reduce(lambda x,y: x+y, times1, 0) / len(times1)
t2 = reduce(lambda x,y: x+y, times2, 0) / len(times2)
print "t1=%.4f (%.4f .. %.4f) t2=%.4f (%.4f .. %.4f)" % \
(t1, times1[0], times1[-1], t2, times2[0], times2[-1])
def profile_stream(stream_class, filename, n=20):
p = profile.Profile()
def many_calls(*args):
for i in xrange(10):
apply(fetch_log2, args)
p.runcall(many_calls, full_name, which_rev)
def many_calls(filename, n):
for i in xrange(n):
ts = stream_class(open(filename))
while ts.get() is not None:
pass
p.runcall(many_calls, filename, n)
p.print_stats()
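# illustrative call (the ',v' path is made up; pass whichever stream
# class the parser module defines, e.g. its mxTextTools-based one):
#   profile_stream(_mxTokenStream, '/tmp/example,v')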
def profile_fetch(full_name, which_rev=None, n=10):
p = profile.Profile()
def many_calls(full_name, which_rev, n):
for i in xrange(n):
fetch_log2(full_name, which_rev)
p.runcall(many_calls, full_name, which_rev, n)
p.print_stats()
def varysize(full_name, which_rev=None):