Mercurial > babel > old > mirror
annotate babel/messages/jslexer.py @ 343:6dae6a9e1096
JavaScript lexer falls back silently now on syntax errors and tries to recover.
author | aronacher |
---|---|
date | Sat, 14 Jun 2008 22:07:41 +0000 |
parents | 603192024857 |
children | abe62ab2a889 |
rev | line source |
---|---|
341 | 1 # -*- coding: utf-8 -*- |
2 # | |
3 # Copyright (C) 2008 Edgewall Software | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://babel.edgewall.org/wiki/License. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://babel.edgewall.org/log/. | |
13 | |
14 """A simple JavaScript 1.5 lexer which is used for the JavaScript | |
15 extractor. | |
16 """ | |
17 | |
18 import re | |
19 from operator import itemgetter | |
20 | |
21 | |
# Punctuation operators recognised by the lexer.  ``/`` and ``/=`` are not
# listed here: a slash is ambiguous with the start of a regular expression
# literal, so `tokenize` resolves it separately using `division_re` and
# `regex_re`.
operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
# Longest operators first, so that the alternation built from this list
# prefers e.g. ``>>>=`` over ``>``.  A key-based sort is stable (also with
# ``reverse=True``), hence operators of equal length keep their listed order.
operators.sort(key=len, reverse=True)

# Single-character escapes that map to a control character.
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# Ordered ``(token_type, compiled_pattern)`` pairs; the first pattern that
# matches at the current position wins.  A token type of ``None`` means the
# matched text is consumed without producing a token.
# Flags are passed to ``re.compile`` instead of being appended to the pattern
# as inline groups: global inline flags are only accepted at the very start
# of a pattern by newer versions of the ``re`` module.
rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)', re.UNICODE)),
    # The hexadecimal alternative has to come first; otherwise the decimal
    # alternative matches the leading ``0`` of ``0x1F`` and the rest of the
    # literal is lexed as a name.
    ('number', re.compile(r'''(?x)(
        (0[xX][a-fA-F0-9]+) |
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''')),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )'''))
]

# ``/`` or ``/=`` used as a division operator.
division_re = re.compile(r'/=?')
# A regular expression literal with optional trailing flags.  Backslash
# escapes are skipped as a unit so that an escaped slash (``\/``) does not
# terminate the literal early.
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
# A single line terminator (used for line counting).
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash directly followed by a line terminator inside a string.
line_join_re = re.compile(r'\\' + line_re.pattern)
# One to four hexadecimal digits following ``\u`` in a string.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
56 | |
57 | |
class Token(tuple):
    """An immutable ``(type, value, lineno)`` triple produced by `tokenize`.

    The three fields are available both by index and through the read-only
    attributes `type`, `value` and `lineno`.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        fields = (type, value, lineno)
        return super(Token, cls).__new__(cls, fields)

    @property
    def type(self):
        """The token type (first item of the tuple)."""
        return self[0]

    @property
    def value(self):
        """The matched source text (second item of the tuple)."""
        return self[1]

    @property
    def lineno(self):
        """The line number stored with the token (third item of the tuple)."""
        return self[2]
68 | |
342
603192024857
added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents:
341
diff
changeset
|
69 |
def indicates_division(token):
    """Tell whether a ``/`` directly after `token` can be a division operator.

    :param token: an object with ``type`` and ``value`` attributes
    :return: `True` if the token is a closing bracket or an increment /
             decrement operator, or if it is a name, number, string or
             regexp; `False` otherwise
    """
    kind = token.type
    if kind != 'operator':
        # operands may be followed by a division
        return kind in ('name', 'number', 'string', 'regexp')
    # among the operators only these can end an operand expression
    return token.value in (')', ']', '}', '++', '--')
77 | |
342
603192024857
added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents:
341
diff
changeset
|
78 |
try:
    _unichr = unichr  # Python 2 spelling
except NameError:
    _unichr = chr  # Python 3: ``chr`` returns a text character


def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start and
    end with the same string delimiter (``'`` or ``"``).

    A backslash followed by a line terminator is replaced by the bare line
    terminator.  The escapes listed in `escapes` are translated to their
    control character, a ``u`` / ``U`` escape followed by exactly four
    hexadecimal digits becomes the corresponding character, and for any
    other escape only the backslash is removed.  A dangling backslash at the
    very end of the string is dropped.

    :param string: the string literal including its delimiters
    :return: a string
    :raise AssertionError: if the string is empty or not properly delimited
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    length = len(string)
    result = []
    add = result.append
    pos = 0

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # a backslash as the very last character escapes nothing; drop it
        # instead of indexing past the end of the string.
        if escape_pos + 1 >= length:
            pos = length
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  Try to consume up to four hexadecimal
        # characters and to interpret them as a unicode code point.  If
        # that is not possible, put all the consumed characters into the
        # string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(_unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        # backslash + ``u`` + four digits consumed
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < length:
        add(string[pos:])

    return u''.join(result)
135 | |
342
603192024857
added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents:
341
diff
changeset
|
136 |
def tokenize(source):
    """Tokenize a JavaScript source.

    Text that cannot be matched by any rule is skipped one character at a
    time, so syntax errors never abort the tokenization.

    :param source: the JavaScript source code as string
    :return: generator of `Token` objects
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed non-whitespace, non-comment token using
        # `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # woops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            # comments are insignificant for the grammar, just like
            # whitespace: a slash after ``a /* note */`` is still a
            # division, so comments must not reset `may_divide`.
            if token_type not in ('linecomment', 'multilinecomment'):
                may_divide = indicates_division(token)
            yield token
        # the token carries the line it starts on; account for the line
        # terminators it contains only afterwards.
        lineno += len(line_re.findall(token_value))
        pos = match.end()