# viewvc-4intranet/misc/PyFontify.py
#
# 171 lines
# 5.1 KiB
# Python

"""Module to analyze Python source code; for syntax coloring tools.
Interface:
tags = fontify(pytext, searchfrom, searchto)
The PYTEXT argument is a string containing Python source code. The
optional SEARCHFROM and SEARCHTO arguments are indices into PYTEXT that
delimit the slice to be scanned; by default the whole string is scanned.
The returned value is a list of tuples, formatted like this:
[('keyword', 0, 6, None),
('keyword', 11, 17, None),
('comment', 23, 53, None),
...
]
The tuple contents are always like this:
(tag, startindex, endindex, sublist)
TAG is one of 'keyword', 'string', 'comment' or 'identifier'
SUBLIST is not used, hence always None.
"""
# Based on FontText.py by Mitchell S. Chapman,
# which was modified by Zachary Roadhouse,
# then un-Tk'd by Just van Rossum.
# Many thanks for regular expression debugging & authoring are due to:
# Tim (the-incredib-ly y'rs) Peters and Cristian Tismer
# So, who owns the copyright? ;-) How about this:
# Copyright 1996-1997:
# Mitchell S. Chapman,
# Zachary Roadhouse,
# Tim Peters,
# Just van Rossum
__version__ = "0.3.1"

import re
import string  # no longer used below; kept so the module namespace is unchanged

# Reserved words to highlight.  This list of keywords is taken from
# ref/node13.html of the Python 1.3 HTML documentation.  ("access" is
# intentionally omitted.)
keywordsList = [
    "and", "assert", "break", "class", "continue", "def",
    "del", "elif", "else", "except", "exec", "finally",
    "for", "from", "global", "if", "import", "in", "is",
    "lambda", "not", "or", "pass", "print", "raise",
    "return", "try", "while",
]

# A regexp for matching Python comments: "#" up to the end of the line.
commentPat = "#.*"

# A regexp for matching simple quoted strings.  "q" is a placeholder for
# the quote character and is substituted below.  The pattern is a raw
# string on purpose: written as an ordinary literal, "\\[" reaches the
# regexp engine as an escaped literal "[" instead of "a backslash followed
# by any character", so an escaped quote would end the string too early.
# The backslash is also excluded from the plain-character classes so that
# every backslash is consumed together with the character it escapes.
pat = r"q[^\\q\n]*(\\[\s\S][^\\q\n]*)*q"
quotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# A regexp for matching multi-line tripled-quoted strings.  (Way to go, Tim!)
# The body is any run of: ordinary characters, backslash escapes, or one or
# two quote characters that are followed by something other than a quote.
pat = r"""
    qqq
    [^\\q]*
    (
        (   \\[\s\S]
        |   q
            (   \\[\s\S]
            |   [^\\q]
            |   q
                (   \\[\s\S]
                |   [^\\q]
                )
            )
        )
        [^\\q]*
    )*
    qqq
"""
pat = "".join(pat.split())  # get rid of whitespace
tripleQuotePat = pat.replace("q", "'") + "|" + pat.replace("q", '"')

# A regexp which matches all and only Python keywords.  This will let
# us skip the uninteresting identifier references.  Note that the pattern
# also consumes one delimiting character on each side of the keyword
# (or nothing, at the very start of the text); group 2 is the keyword.
nonKeyPat = "(^|[^a-zA-Z0-9_.\"'])"  # legal keyword-preceding characters
keyPat = nonKeyPat + "(" + "|".join(keywordsList) + ")" + nonKeyPat

# Our final syntax-matching regexp is the concatenation of the regexps we
# constructed above.  The keyword alternative must stay first so that its
# groups keep the numbers 1-3.
syntaxPat = "|".join([keyPat, commentPat, tripleQuotePat, quotePat])
syntaxRE = re.compile(syntaxPat)

# Finally, we construct a regexp for matching identifiers (with
# optional leading whitespace).
idKeyPat = "[ \t]*[A-Za-z_][A-Za-z_0-9.]*"
idRE = re.compile(idKeyPat)
def fontify(pytext, searchfrom=0, searchto=None):
    """Return a list of syntax-colouring tags for the Python source PYTEXT.

    Scanning starts at index SEARCHFROM.  Tokens that start at or after
    SEARCHTO (default: the end of PYTEXT) are ignored and end the scan.

    Each tag is a tuple (tag, startindex, endindex, None) where tag is
    'keyword', 'string', 'comment' or 'identifier'.  An 'identifier' tag is
    produced for the name that directly follows a "def" or "class" keyword;
    as matched by idRE it also covers the blanks/tabs in front of the name.

    This relies on the keyword alternative of syntaxRE being first, so that
    group 1 is the delimiter in front of a keyword and group 2 the keyword.
    """
    if searchto is None:
        searchto = len(pytext)

    tags = []
    commentTag = 'comment'
    stringTag = 'string'
    keywordTag = 'keyword'
    identifierTag = 'identifier'

    end = searchfrom
    while True:
        # Look for some syntax token we're interested in.  If we find
        # nothing, we're done.
        matchobj = syntaxRE.search(pytext, end)
        if not matchobj:
            break

        keyword = matchobj.group(2)
        if keyword is None:
            # A comment or a string: the match is exactly the token.
            start = matchobj.start()
            end = matchobj.end()
            if pytext[start] == '#':
                tag = commentTag
            else:
                tag = stringTag
        elif matchobj.group(1) == '#':
            # "#<keyword>" (e.g. a commented-out "#if 0"): the keyword
            # alternative wins over the comment one because it comes first
            # in syntaxRE.  It really is a comment running to end of line.
            keyword = None
            tag = commentTag
            start = matchobj.start()
            end = pytext.find('\n', start)
            if end < 0:
                end = len(pytext)
        else:
            # A keyword.  The overall match may include one delimiting
            # character on either side, so take the span of the keyword
            # group itself instead of guessing from the match position.
            tag = keywordTag
            start = matchobj.start(2)
            end = matchobj.end(2)

        # If we found something outside our search area, it doesn't
        # count (and we're done).
        if start >= searchto:
            break
        tags.append((tag, start, end, None))

        # If this was a defining keyword, look ahead to the identifier
        # that immediately follows it.
        if keyword in ("def", "class") and end < searchto:
            idmatch = idRE.match(pytext, end)
            if idmatch:
                start = end
                end = idmatch.end()
                tags.append((identifierTag, start, end, None))
    return tags
def test(path):
    """Fontify the file at PATH and print every tag with the text it covers.

    One line is written to standard output per tag, in the form
    "<tag> <repr of the tagged text>".
    """
    f = open(path)
    try:
        text = f.read()
    finally:
        # Close the file even when reading fails.
        f.close()
    for tag, start, end, sublist in fontify(text):
        # A single pre-formatted string prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print("%s %s" % (tag, repr(text[start:end])))
if __name__ == "__main__":
    # Run as a script: fontify this script's own source and dump the tags.
    import sys
    own_path = sys.argv[0]
    test(own_path)