summaryrefslogtreecommitdiff
path: root/django/utils/jslex.py
diff options
context:
space:
mode:
authorJannis Leidel <jannis@leidel.info>2011-06-07 16:11:25 +0000
committerJannis Leidel <jannis@leidel.info>2011-06-07 16:11:25 +0000
commit64e19ffb4ee32767861d25c874f0d2dfc75618b7 (patch)
tree11d0e86691bcb8917822d3d2a8abee6972642585 /django/utils/jslex.py
parentd14eb13992e782b2dfa7f2633b38a87e34efd45e (diff)
Fixed #7704, #14045 and #15495 -- Introduce a lexer for Javascript to fix multiple problems of the translation of Javascript files with xgettext. Many thanks to Ned Batchelder for his contribution of the JsLex library.
git-svn-id: http://code.djangoproject.com/svn/django/trunk@16333 bcc190cf-cafb-0310-a4f2-bffc1f526a37
Diffstat (limited to 'django/utils/jslex.py')
-rw-r--r--django/utils/jslex.py213
1 file changed, 213 insertions, 0 deletions
diff --git a/django/utils/jslex.py b/django/utils/jslex.py
new file mode 100644
index 0000000000..88a22ec67d
--- /dev/null
+++ b/django/utils/jslex.py
@@ -0,0 +1,213 @@
+"""JsLex: a lexer for Javascript"""
+# Originally from https://bitbucket.org/ned/jslex
+import re
+
class Tok(object):
    """
    A specification for one class of token.

    `name` is the token's label, `regex` is the pattern that matches it,
    and `next` (optional) is the lexer state to switch to once it matches.
    """
    # Class-wide counter; every instance grabs the current value as its
    # unique `id`, which Lexer turns into a named regex group.
    num = 0

    def __init__(self, name, regex, next=None):
        self.id, Tok.num = Tok.num, Tok.num + 1
        self.name = name
        self.regex = regex
        self.next = next
+
def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.

    """
    pieces = ["%s%s%s" % (prefix, re.escape(choice), suffix)
              for choice in choices.split()]
    return "|".join(pieces)
+
+
class Lexer(object):
    """
    A generic multi-state regex-based lexer.

    `states` maps a state name to a list of `Tok` rules; `first` is the
    state to start lexing in.  Each state's rules are compiled into a
    single alternation regex, so earlier rules take priority over later
    ones.
    """

    def __init__(self, states, first):
        self.regexes = {}
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        while text:
            eaten = 0
            for match in self.regexes[self.state].finditer(text):
                # Exactly one alternative of the state regex matched, and
                # `lastgroup` names it directly.  (The original scanned
                # match.groupdict().iteritems(), which is Python-2-only
                # and needlessly walks every group.)
                name = match.lastgroup
                tok = self.toks[name]
                toktext = match.group(name)
                eaten += len(toktext)
                yield (tok.name, toktext)
                if tok.next:
                    # The state changed, so the regex we are iterating
                    # with is no longer the right one: break out of
                    # finditer and restart it with the new state's regex.
                    # (Previously only the inner loop was broken, so the
                    # stale regex kept matching until the end of `text`,
                    # mis-lexing e.g. a division sign followed later by
                    # another '/'.)
                    self.state = tok.next
                    break
            text = text[eaten:]
            # NOTE(review): if no rule matches at all, `eaten` stays 0 and
            # this loops forever; JsLexer always supplies a catch-all
            # "other" rule, so that cannot happen there.
+
+
class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-Ascii characters in the Javascript source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                           break case catch class const continue debugger
                           default delete do else enum export extends
                           finally for function if import in instanceof
                           new return super switch this throw try typeof
                           var void while with
                           """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        # An identifier: a letter/underscore/dollar or a \uXXXX escape,
        # followed by any number of those plus digits.  Two fixes from the
        # original: the escape quartet must be *hex* digits ([0-9a-fA-F],
        # not the overbroad a-fA-Z), and the first-char class must not
        # contain a stray space -- under re.VERBOSE, whitespace inside a
        # character class is significant and would admit ' ' as an
        # identifier start.
        Tok("id", r"""
                  ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})      # first char
                  ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
                  """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
                    (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                        \.                      # dot
                        [0-9]*                  # DecimalDigits-opt
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    |
                        \.                      # dot
                        [0-9]+                  # DecimalDigits
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    |
                        (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    )
                    """, next='div'),
        Tok("punct", literals("""
                         >>>= === !== >>> <<= >>= <= >= == != << >> &&
                         || += -= *= %= &= |= ^=
                         """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    both_after = [
        Tok("other", r"."),
    ]

    states = {
        'div':  # slash will mean division
            both_before + [
            Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
            Tok("regex",
                r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                """, next='div'),
            ] + both_after,
        }

    def __init__(self):
        super(JsLexer, self).__init__(self.states, 'reg')
+
+
def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
    """
    def escape_quotes(m):
        """Used in a regex to properly escape double quotes."""
        matched = m.group(0)
        # A lone double quote becomes \" ; escape sequences and every other
        # character pass through untouched.
        return r'\"' if matched == '"' else matched

    lexer = JsLexer()
    out = []
    for name, tok in lexer.lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"'
        elif name == 'string' and tok.startswith("'"):
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.
            tok = '"%s"' % re.sub(r"\\.|.", escape_quotes, tok[1:-1])
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers. We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U")
        out.append(tok)
    return ''.join(out)