summaryrefslogtreecommitdiff
path: root/django/utils/jslex.py
diff options
context:
space:
mode:
authorJannis Leidel <jannis@leidel.info>2011-06-07 16:11:25 +0000
committerJannis Leidel <jannis@leidel.info>2011-06-07 16:11:25 +0000
commit64e19ffb4ee32767861d25c874f0d2dfc75618b7 (patch)
tree11d0e86691bcb8917822d3d2a8abee6972642585 /django/utils/jslex.py
parentd14eb13992e782b2dfa7f2633b38a87e34efd45e (diff)
Fixed #7704, #14045 and #15495 -- Introduce a lexer for Javascript to fix multiple problems of the translation of Javascript files with xgettext. Many thanks to Ned Batchelder for his contribution of the JsLex library.
git-svn-id: http://code.djangoproject.com/svn/django/trunk@16333 bcc190cf-cafb-0310-a4f2-bffc1f526a37
Diffstat (limited to 'django/utils/jslex.py')
-rw-r--r--django/utils/jslex.py213
1 file changed, 213 insertions, 0 deletions
diff --git a/django/utils/jslex.py b/django/utils/jslex.py
new file mode 100644
index 0000000000..88a22ec67d
--- /dev/null
+++ b/django/utils/jslex.py
@@ -0,0 +1,213 @@
+"""JsLex: a lexer for Javascript"""
+# Originally from https://bitbucket.org/ned/jslex
+import re
+
class Tok(object):
    """
    A specification for one class of token.

    `name` is the token's label, `regex` is the pattern that matches it,
    and `next` (optional) is the lexer state to switch to once it matches.
    """
    # Class-wide counter; every instance grabs the current value as its
    # unique `id`, which Lexer turns into a named regex group.
    num = 0

    def __init__(self, name, regex, next=None):
        self.id, Tok.num = Tok.num, Tok.num + 1
        self.name = name
        self.regex = regex
        self.next = next
+
def literals(choices, prefix="", suffix=""):
    """
    Create a regex from a space-separated list of literal `choices`.

    If provided, `prefix` and `suffix` will be attached to each choice
    individually.

    """
    pieces = ["%s%s%s" % (prefix, re.escape(choice), suffix)
              for choice in choices.split()]
    return "|".join(pieces)
+
+
class Lexer(object):
    """
    A generic multi-state regex-based lexer.

    `states` maps a state name to a list of `Tok` rules; `first` is the
    state to start lexing in.  Each state's rules are compiled into a
    single alternation regex, so earlier rules take priority over later
    ones.
    """

    def __init__(self, states, first):
        self.regexes = {}
        self.toks = {}

        for state, rules in states.items():
            parts = []
            for tok in rules:
                groupid = "t%d" % tok.id
                self.toks[groupid] = tok
                parts.append("(?P<%s>%s)" % (groupid, tok.regex))
            self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)

        self.state = first

    def lex(self, text):
        """
        Lexically analyze `text`.

        Yields pairs (`name`, `tokentext`).
        """
        while text:
            eaten = 0
            for match in self.regexes[self.state].finditer(text):
                # Exactly one alternative of the state regex matched, and
                # `lastgroup` names it directly.  (The original scanned
                # match.groupdict().iteritems(), which is Python-2-only
                # and needlessly walks every group.)
                name = match.lastgroup
                tok = self.toks[name]
                toktext = match.group(name)
                eaten += len(toktext)
                yield (tok.name, toktext)
                if tok.next:
                    # The state changed, so the regex we are iterating
                    # with is no longer the right one: break out of
                    # finditer and restart it with the new state's regex.
                    # (Previously only the inner loop was broken, so the
                    # stale regex kept matching until the end of `text`,
                    # mis-lexing e.g. a division sign followed later by
                    # another '/'.)
                    self.state = tok.next
                    break
            text = text[eaten:]
            # NOTE(review): if no rule matches at all, `eaten` stays 0 and
            # this loops forever; JsLexer always supplies a catch-all
            # "other" rule, so that cannot happen there.
+
+
class JsLexer(Lexer):
    """
    A Javascript lexer

    >>> lexer = JsLexer()
    >>> list(lexer.lex("a = 1"))
    [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]

    This doesn't properly handle non-Ascii characters in the Javascript source.
    """

    # Because these tokens are matched as alternatives in a regex, longer
    # possibilities must appear in the list before shorter ones, for example,
    # '>>' before '>'.
    #
    # Note that we don't have to detect malformed Javascript, only properly
    # lex correct Javascript, so much of this is simplified.

    # Details of Javascript lexical structure are taken from
    # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf

    # A useful explanation of automatic semicolon insertion is at
    # http://inimino.org/~inimino/blog/javascript_semicolons

    both_before = [
        Tok("comment", r"/\*(.|\n)*?\*/"),
        Tok("linecomment", r"//.*?$"),
        Tok("ws", r"\s+"),
        Tok("keyword", literals("""
                           break case catch class const continue debugger
                           default delete do else enum export extends
                           finally for function if import in instanceof
                           new return super switch this throw try typeof
                           var void while with
                           """, suffix=r"\b"), next='reg'),
        Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
        # An identifier: a letter/underscore/dollar or a \uXXXX escape,
        # followed by any number of those plus digits.  Two fixes from the
        # original: the escape quartet must be *hex* digits ([0-9a-fA-F],
        # not the overbroad a-fA-Z), and the first-char class must not
        # contain a stray space -- under re.VERBOSE, whitespace inside a
        # character class is significant and would admit ' ' as an
        # identifier start.
        Tok("id", r"""
                  ([a-zA-Z_$]|\\u[0-9a-fA-F]{4})      # first char
                  ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})*  # rest chars
                  """, next='div'),
        Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
        Tok("onum", r"0[0-7]+"),
        Tok("dnum", r"""
                    (   (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                        \.                      # dot
                        [0-9]*                  # DecimalDigits-opt
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    |
                        \.                      # dot
                        [0-9]+                  # DecimalDigits
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    |
                        (0|[1-9][0-9]*)         # DecimalIntegerLiteral
                        ([eE][-+]?[0-9]+)?      # ExponentPart-opt
                    )
                    """, next='div'),
        Tok("punct", literals("""
                         >>>= === !== >>> <<= >>= <= >= == != << >> &&
                         || += -= *= %= &= |= ^=
                         """), next="reg"),
        Tok("punct", literals("++ -- ) ]"), next='div'),
        Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
        Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
        Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
    ]

    both_after = [
        Tok("other", r"."),
    ]

    states = {
        'div':  # slash will mean division
            both_before + [
            Tok("punct", literals("/= /"), next='reg'),
            ] + both_after,

        'reg':  # slash will mean regex
            both_before + [
            Tok("regex",
                r"""
                    /                       # opening slash
                    # First character is..
                    (   [^*\\/[]            # anything but * \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )
                    # Following characters are same, except for excluding a star
                    (   [^\\/[]             # anything but \ / or [
                    |   \\.                 # or an escape sequence
                    |   \[                  # or a class, which has
                            (   [^\]\\]     #   anything but \ or ]
                            |   \\.         #   or an escape sequence
                            )*              #   many times
                        \]
                    )*                      # many times
                    /                       # closing slash
                    [a-zA-Z0-9]*            # trailing flags
                """, next='div'),
            ] + both_after,
        }

    def __init__(self):
        super(JsLexer, self).__init__(self.states, 'reg')
+
+
def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
    """
    def escape_quotes(m):
        """Used in a regex to properly escape double quotes."""
        matched = m.group(0)
        # A lone double quote becomes \" ; escape sequences and every other
        # character pass through untouched.
        return r'\"' if matched == '"' else matched

    lexer = JsLexer()
    out = []
    for name, tok in lexer.lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"'
        elif name == 'string' and tok.startswith("'"):
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.
            tok = '"%s"' % re.sub(r"\\.|.", escape_quotes, tok[1:-1])
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers. We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U")
        out.append(tok)
    return ''.join(out)