diff babel/messages/jslexer.py @ 341:672b6b8e945d

Added JavaScript extractor
author aronacher
date Thu, 12 Jun 2008 16:26:52 +0000
parents
children 603192024857
new file mode 100644
--- /dev/null
+++ b/babel/messages/jslexer.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2008 Edgewall Software
+# All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://babel.edgewall.org/wiki/License.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For the exact contribution history, see the revision
+# history and logs, available at http://babel.edgewall.org/log/.
+
+"""A simple JavaScript 1.5 lexer which is used for the JavaScript
+extractor.
+"""
+
+import re
+from operator import itemgetter
+
+
+operators = [
+    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
+    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
+    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
+    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.'
+]
+operators.sort(lambda a, b: cmp(-len(a), -len(b)))
+
+escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
+
+rules = [
+    (None, re.compile(r'\s+(?u)')),
+    (None, re.compile(r'<!--.*')),
+    ('linecomment', re.compile(r'//.*')),
+    ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
+    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
+    ('number', re.compile(r'''(?x)(
+        (?:0|[1-9]\d*)
+        (\.\d+)?
+        ([eE][-+]?\d+)? |
+        (0x[a-fA-F0-9]+)
+    )''')),
+    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
+    ('string', re.compile(r'''(?xs)(
+        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
+        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
+    )'''))
+]
+
+division_re = re.compile(r'/=?')
+regex_re = re.compile(r'/.+?/[a-zA-Z]*(?s)')
+line_re = re.compile(r'(\r\n|\n|\r)')
+line_join_re = re.compile(r'\\' + line_re.pattern)
+uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
+
+
+class TokenError(ValueError):
+    """Raised if the tokenizer stumbled upon invalid tokens."""
+
+class Token(tuple):
+    """Represents a token as returned by `tokenize`."""
+    __slots__ = ()
+
+    def __new__(cls, type, value, lineno):
+        return tuple.__new__(cls, (type, value, lineno))
+
+    type = property(itemgetter(0))
+    value = property(itemgetter(1))
+    lineno = property(itemgetter(2))
+
+def indicates_division(token):
+    """A helper function that helps the tokenizer to decide if the current
+    token may be followed by a division operator.
+    """
+    if token.type == 'operator':
+        return token.value in (')', ']', '}', '++', '--')
+    return token.type in ('name', 'number', 'string', 'regexp')
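+
+# A few example calls sketching how `indicates_division` feeds the
+# division vs. regexp decision made in `tokenize` below:
+#
+#   indicates_division(Token('operator', ')', 1))  # True  -> a following "/" divides
+#   indicates_division(Token('operator', '=', 1))  # False -> a following "/" starts a regexp
+#   indicates_division(Token('name', 'foo', 1))    # True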
+
+def unquote_string(string):
+    """Unquote a string with JavaScript rules.  The string has to start with
+    string delimiters (``'`` or ``"``.)
+
+    :return: a string
+    """
+    assert string and string[0] == string[-1] and string[0] in '"\'', \
+        'string provided is not properly delimited'
+    string = line_join_re.sub('\\1', string[1:-1])
+    result = []
+    add = result.append
+    pos = 0
+
+    while 1:
+        # scan for the next escape
+        escape_pos = string.find('\\', pos)
+        if escape_pos < 0:
+            break
+        add(string[pos:escape_pos])
+
+        # check which character is escaped
+        next_char = string[escape_pos + 1]
+        if next_char in escapes:
+            add(escapes[next_char])
+
+        # unicode escapes.  try to consume up to four hexadecimal
+        # characters and interpret them as a unicode code point.  If
+        # there is no such code point, put all the consumed
+        # characters into the string.
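+        # For example, \u00e9 decodes to u'\xe9', while an incomplete
+        # escape such as \u00 keeps the consumed characters as literal
+        # text without the backslash.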
+        elif next_char in 'uU':
+            escaped = uni_escape_re.match(string, escape_pos + 2)
+            if escaped is not None:
+                escaped_value = escaped.group()
+                if len(escaped_value) == 4:
+                    try:
+                        add(unichr(int(escaped_value, 16)))
+                    except ValueError:
+                        pass
+                    else:
+                        pos = escape_pos + 6
+                        continue
+                add(next_char + escaped_value)
+                pos = escaped.end()
+                continue
+            else:
+                add(next_char)
+
+        # bogus escape.  Just remove the backslash.
+        else:
+            add(next_char)
+        pos = escape_pos + 2
+
+    if pos < len(string):
+        add(string[pos:])
+
+    return u''.join(result)
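+
+# A rough usage sketch for `unquote_string`; the values shown are what
+# the escape handling above produces for these inputs:
+#
+#   unquote_string(r'"hello\nworld"')  # -> u'hello\nworld'
+#   unquote_string("'caf\\u00e9'")     # -> u'caf\xe9'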
+
+def tokenize(source):
+    """Tokenize a JavaScript source.
+
+    :return: generator of `Token`\s
+    """
+    may_divide = False
+    pos = 0
+    lineno = 1
+    end = len(source)
+
+    while pos < end:
+        # handle regular rules first
+        for token_type, rule in rules:
+            match = rule.match(source, pos)
+            if match is not None:
+                break
+        # if we don't have a match we don't give up yet, but check for
+        # division operators or regular expression literals, based on
+        # the status of `may_divide` which is determined by the last
+        # processed non-whitespace token using `indicates_division`.
+        else:
+            if may_divide:
+                match = division_re.match(source, pos)
+                token_type = 'operator'
+            else:
+                match = regex_re.match(source, pos)
+                token_type = 'regexp'
+            if match is None:
+                raise TokenError('invalid syntax around line %d' % lineno)
+
+        token_value = match.group()
+        if token_type is not None:
+            token = Token(token_type, token_value, lineno)
+            may_divide = indicates_division(token)
+            yield token
+        lineno += len(line_re.findall(token_value))
+        pos = match.end()
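+
+# A short tokenization sketch.  Feeding a small snippet through
+# `tokenize` yields Token (type, value, lineno) triples:
+#
+#   for tok in tokenize("msg = gettext('Hello') // greet"):
+#       print tok.type, repr(tok.value)
+#
+# which prints roughly:
+#
+#   name 'msg'
+#   operator '='
+#   name 'gettext'
+#   operator '('
+#   string "'Hello'"
+#   operator ')'
+#   linecomment '// greet'
+#
+# After the '=' token a '/' would start a regexp token rather than a
+# division operator, as decided by `indicates_division`.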