341
|
1 # -*- coding: utf-8 -*-
|
|
2 #
|
|
3 # Copyright (C) 2008 Edgewall Software
|
|
4 # All rights reserved.
|
|
5 #
|
|
6 # This software is licensed as described in the file COPYING, which
|
|
7 # you should have received as part of this distribution. The terms
|
|
8 # are also available at http://babel.edgewall.org/wiki/License.
|
|
9 #
|
|
10 # This software consists of voluntary contributions made by many
|
|
11 # individuals. For the exact contribution history, see the revision
|
|
12 # history and logs, available at http://babel.edgewall.org/log/.
|
|
13
|
|
14 """A simple JavaScript 1.5 lexer which is used for the JavaScript
|
|
15 extractor.
|
|
16 """
|
|
17
|
|
18 import re
|
|
19 from operator import itemgetter
|
|
20
|
|
21
|
|
22 operators = [
|
|
23 '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
|
|
24 '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
|
|
25 '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
|
|
26 '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.'
|
|
27 ]
|
|
28 operators.sort(lambda a, b: cmp(-len(a), -len(b)))
|
|
29
|
|
# Single-character JavaScript backslash escapes mapped to their values.
escapes = dict(b='\b', f='\f', n='\n', r='\r', t='\t')
|
|
31
|
|
# Lexer rules, tried in order at each position.  A rule whose token type
# is None matches text that is skipped entirely (whitespace and HTML
# comment openers).  Inline regex flags are written at the START of each
# pattern: placing a global flag anywhere else has been an error since
# Python 3.11 (it was merely deprecated before), and the position does
# not change the pattern's meaning on older versions.
rules = [
    (None, re.compile(r'(?u)\s+')),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'(?us)/\*.*?\*/')),
    ('name', re.compile(r'(?u)(\$+\w*|[^\W\d]\w*)')),
    ('number', re.compile(r'''(?x)(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''')),
    # `operators` is sorted longest-first, so e.g. '>>>=' wins over '>>'.
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )'''))
]
|
|
50
|
|
# '/' or '/=' interpreted as a division operator.
division_re = re.compile(r'/=?')
# A regex literal including trailing flags; DOTALL so the body may span
# lines.  The inline (?s) flag must be at the start of the pattern —
# a trailing position is an error since Python 3.11 and was equivalent
# before, so this is a pure compatibility fix.
regex_re = re.compile(r'(?s)/.+?/[a-zA-Z]*')
# A single line break in Windows, Unix or old-Mac style.
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash-escaped line break (JavaScript line continuation).
line_join_re = re.compile(r'\\' + line_re.pattern)
# Up to four hex digits of a \uXXXX unicode escape.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
|
|
56
|
|
57
|
|
class TokenError(ValueError):
    """Error raised when the tokenizer encounters input it cannot lex."""
|
|
60
|
|
class Token(tuple):
    """An immutable ``(type, value, lineno)`` triple produced by `tokenize`.

    Subclassing `tuple` keeps tokens lightweight, hashable and directly
    comparable to plain tuples, while the properties below give the three
    fields readable names.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        return tuple.__new__(cls, (type, value, lineno))

    @property
    def type(self):
        # The token category, e.g. 'name', 'string' or 'operator'.
        return self[0]

    @property
    def value(self):
        # The matched source text of the token.
        return self[1]

    @property
    def lineno(self):
        # Line number (1-based) on which the token starts.
        return self[2]
|
|
71
|
|
def indicates_division(token):
    """Tell whether *token* may legally be followed by a division operator.

    After a closing bracket, a postfix ``++``/``--`` or a value-like token
    a following ``/`` is division; in any other position the tokenizer
    treats ``/`` as the start of a regex literal.
    """
    if token.type != 'operator':
        return token.type in ('name', 'number', 'string', 'regexp')
    return token.value in (')', ']', '}', '++', '--')
|
|
79
|
|
def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start
    and end with string delimiters (``'`` or ``"``).

    :param string: the quoted source text of a JavaScript string literal
    :return: the unescaped string value
    :raises AssertionError: if the input is not properly delimited
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    # Strip the delimiters and rewrite backslash-newline continuations.
    # NOTE(review): this keeps the line break itself (JavaScript drops
    # both the backslash and the break) -- confirm before changing.
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0

    while 1:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped.  A trailing lone backslash
        # cannot occur here: the tokenizer's string rule only matches
        # literals in which every backslash escapes a following character.
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four hexadecimal
        # characters and interpret them as a unicode code point.  If
        # fewer than four digits follow, put all the consumed
        # characters into the string verbatim.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        # `unichr` is Python 2 only; `chr` covers the
                        # full unicode range on Python 3.
                        add(chr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return u''.join(result)
|
|
136
|
|
def tokenize(source):
    """Tokenize the JavaScript *source* string.

    :return: a generator of `Token` objects in source order
    :raises TokenError: if no rule matches at some position
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)

    while pos < end:
        # Try every regular lexer rule at the current position.
        token_type = match = None
        for rule_type, rule in rules:
            candidate = rule.match(source, pos)
            if candidate is not None:
                token_type, match = rule_type, candidate
                break

        if match is None:
            # No regular rule applied.  A '/' here is either a division
            # operator or the start of a regex literal, depending on the
            # last significant token (tracked via `may_divide`, which is
            # updated by `indicates_division` below).
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                raise TokenError('invalid syntax around line %d' % lineno)

        token_value = match.group()
        # Rules with a None token type (whitespace, '<!--') are skipped
        # entirely but still advance the position and line counter.
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
|