annotate babel/messages/jslexer.py @ 530:85e1beadacb0

summary:  Update the copyright line.
author:   jruigrok
date:     Sat, 05 Mar 2011 15:22:28 +0000
parents:  540cbe76f413
children:
# -*- coding: utf-8 -*-
#
# Copyright (C) 2008-2011 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""A simple JavaScript 1.5 lexer which is used for the JavaScript
extractor.
"""

from operator import itemgetter
import re

operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
operators.sort(lambda a, b: cmp(-len(a), -len(b)))
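# Added note (not part of the original file): the cmp-based sort above orders
# the operators longest first, so the operator regex built below prefers
# multi-character operators such as '>>>=' over their single-char prefixes.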

escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

rules = [
    (None, re.compile(r'\s+(?u)')),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
    ('number', re.compile(r'''(?x)(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''')),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )'''))
]
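# Added note (not part of the original file): the 'name' rule matches
# identifiers such as gettext or $msg, the 'number' rule matches decimal
# literals such as 42, 3.14 or 1e-5, and the 'string' rule matches single-
# or double-quoted literals including backslash escapes.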

division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*(?s)')
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


class Token(tuple):
    """Represents a token as returned by `tokenize`."""
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        return tuple.__new__(cls, (type, value, lineno))

    type = property(itemgetter(0))
    value = property(itemgetter(1))
    lineno = property(itemgetter(2))
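# Illustrative note (added, not part of the original file): a Token is just a
# 3-tuple with named read-only accessors, so for t = Token('name', u'msg', 1)
# both t[0] and t.type are 'name', and t.lineno is 1.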


def indicates_division(token):
70 """A helper function that helps the tokenizer to decide if the current | |
71 token may be followed by a division operator. | |
72 """ | |
73 if token.type == 'operator': | |
74 return token.value in (')', ']', '}', '++', '--') | |
75 return token.type in ('name', 'number', 'string', 'regexp') | |
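# Illustrative note (added, not part of the original file): after a closing
# bracket or a name a slash means division, e.g. the '/' in '(a + b) / 2',
# while after an operator such as '=' it starts a regular expression literal,
# e.g. 'x = /foo/g'.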


def unquote_string(string):
79 """Unquote a string with JavaScript rules. The string has to start with | |
80 string delimiters (``'`` or ``"``.) | |
81 | |
82 :return: a string | |
83 """ | |
84 assert string and string[0] == string[-1] and string[0] in '"\'', \ | |
85 'string provided is not properly delimited' | |
86 string = line_join_re.sub('\\1', string[1:-1]) | |
87 result = [] | |
88 add = result.append | |
89 pos = 0 | |
90 | |
91 while 1: | |
92 # scan for the next escape | |
93 escape_pos = string.find('\\', pos) | |
94 if escape_pos < 0: | |
95 break | |
96 add(string[pos:escape_pos]) | |
97 | |
98 # check which character is escaped | |
99 next_char = string[escape_pos + 1] | |
100 if next_char in escapes: | |
101 add(escapes[next_char]) | |
102 | |
103 # unicode escapes. trie to consume up to four characters of | |
104 # hexadecimal characters and try to interpret them as unicode | |
105 # character point. If there is no such character point, put | |
106 # all the consumed characters into the string. | |
107 elif next_char in 'uU': | |
108 escaped = uni_escape_re.match(string, escape_pos + 2) | |
109 if escaped is not None: | |
110 escaped_value = escaped.group() | |
111 if len(escaped_value) == 4: | |
112 try: | |
113 add(unichr(int(escaped_value, 16))) | |
114 except ValueError: | |
115 pass | |
116 else: | |
117 pos = escape_pos + 6 | |
118 continue | |
119 add(next_char + escaped_value) | |
120 pos = escaped.end() | |
121 continue | |
122 else: | |
123 add(next_char) | |
124 | |
125 # bogus escape. Just remove the backslash. | |
126 else: | |
127 add(next_char) | |
128 pos = escape_pos + 2 | |
129 | |
130 if pos < len(string): | |
131 add(string[pos:]) | |
132 | |
133 return u''.join(result) | |
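# Illustrative examples (added, not part of the original file; they assume
# Python 2, which the module's use of `unichr` and `u''` implies):
#
#     unquote_string("'a\\tb'")    # -> u'a\tb'  (the escaped tab is decoded)
#     unquote_string("'\\u00e9'")  # -> u'\xe9'  (a \u escape becomes one char)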


def tokenize(source):
137 """Tokenize a JavaScript source. | |
138 | |
139 :return: generator of `Token`\s | |
140 """ | |
141 may_divide = False | |
142 pos = 0 | |
143 lineno = 1 | |
144 end = len(source) | |
145 | |
146 while pos < end: | |
147 # handle regular rules first | |
148 for token_type, rule in rules: | |
149 match = rule.match(source, pos) | |
150 if match is not None: | |
151 break | |
152 # if we don't have a match we don't give up yet, but check for | |
153 # division operators or regular expression literals, based on | |
154 # the status of `may_divide` which is determined by the last | |
155 # processed non-whitespace token using `indicates_division`. | |
156 else: | |
157 if may_divide: | |
158 match = division_re.match(source, pos) | |
159 token_type = 'operator' | |
160 else: | |
161 match = regex_re.match(source, pos) | |
162 token_type = 'regexp' | |
163 if match is None: | |
                # woops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
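
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module): a
# minimal example of driving `tokenize` the way the JavaScript extractor does,
# assuming the module is importable as `babel.messages.jslexer` under Python 2.
#
#     from babel.messages.jslexer import tokenize, unquote_string
#
#     source = u"msg = gettext('Hello')"
#     for token in tokenize(source):
#         if token.type == 'string':
#             print unquote_string(token.value)   # prints: Hello
#         else:
#             print token.type, token.value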