annotate babel/messages/jslexer.py @ 343:6dae6a9e1096

JavaScript lexer falls back silently now on syntax errors and tries to recover.
author aronacher
date Sat, 14 Jun 2008 22:07:41 +0000
parents 603192024857
children abe62ab2a889
rev   line source
341
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
1 # -*- coding: utf-8 -*-
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
2 #
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
3 # Copyright (C) 2008 Edgewall Software
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
4 # All rights reserved.
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
5 #
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
8 # are also available at http://babel.edgewall.org/wiki/License.
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
9 #
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
12 # history and logs, available at http://babel.edgewall.org/log/.
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
13
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
14 """A simple JavaScript 1.5 lexer which is used for the JavaScript
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
15 extractor.
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
16 """
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
17
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
18 import re
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
19 from operator import itemgetter
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
20
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
21
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
# Fixed-text JavaScript operators and punctuators known to the lexer.
# '/' and '/=' are not listed here: `tokenize` matches them separately via
# ``division_re`` because a slash may also start a regular expression
# literal.
operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
# Longest operators first, so that the alternation built from this list
# prefers e.g. '>>>=' over '>>'.  The sort is stable, so operators of equal
# length keep the order in which they are listed above.
operators.sort(key=len, reverse=True)

# Single-character escapes (the character following a backslash) and the
# control character each one stands for.
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# Ordered ``(token_type, pattern)`` pairs; the first pattern that matches at
# the current position wins.  A token type of ``None`` means the matched
# text is consumed without producing a token.
rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)', re.UNICODE)),
    # The hexadecimal alternative has to come first: otherwise the decimal
    # alternative matches the leading "0" of "0x1F" and the hex form can
    # never be reached.
    ('number', re.compile(r'''(
        (0x[a-fA-F0-9]+) |
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''', re.VERBOSE)),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL))
]

# A slash is ambiguous in JavaScript; `tokenize` picks one of the next two
# patterns depending on the previously emitted token.
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/.+?/[a-zA-Z]*', re.DOTALL)
# Any single line break (CRLF, LF or CR); used for line counting.
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash directly followed by a line break inside a string literal.
line_join_re = re.compile(r'\\' + line_re.pattern)
# Up to four hexadecimal digits, as found after ``\u`` in a string literal.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
56
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
57
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
class Token(tuple):
    """Immutable ``(type, value, lineno)`` triple yielded by `tokenize`.

    The fields are available by index, by unpacking, or through the
    read-only attributes of the same names.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        fields = (type, value, lineno)
        return super(Token, cls).__new__(cls, fields)

    @property
    def type(self):
        """First field of the triple: the token type."""
        return self[0]

    @property
    def value(self):
        """Second field of the triple: the token value."""
        return self[1]

    @property
    def lineno(self):
        """Third field of the triple: the line number."""
        return self[2]
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
68
342
603192024857 added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents: 341
diff changeset
69
341
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
def indicates_division(token):
    """Tell whether a ``/`` right after `token` may be a division operator.

    :param token: an object with ``type`` and ``value`` attributes
    :return: for an operator token, `True` only if its value is one of
             ``)``, ``]``, ``}``, ``++`` or ``--``; for any other token,
             `True` if its type is name, number, string or regexp
    """
    kind = token.type
    if kind != 'operator':
        return kind in ('name', 'number', 'string', 'regexp')
    return token.value in (')', ']', '}', '++', '--')
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
77
342
603192024857 added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents: 341
diff changeset
78
341
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
try:
    _unichr = unichr  # Python 2: builds a unicode character
except NameError:
    _unichr = chr  # Python 3: chr() already returns text


def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start and
    end with the same string delimiter (``'`` or ``"``).

    The following is applied to the text between the delimiters:

    - a backslash directly followed by a line break is reduced to the line
      break itself;
    - the escapes listed in `escapes` are replaced by their control
      characters;
    - ``\\u`` / ``\\U`` followed by exactly four hexadecimal digits becomes
      the corresponding character; with fewer digits the backslash is
      dropped and the remaining characters are kept as they are;
    - for any other escaped character only the backslash is removed;
    - a lone backslash at the very end has nothing to escape and is kept.

    :param string: the quoted string literal, including its delimiters
    :return: a string
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    length = len(string)
    result = []
    add = result.append
    pos = 0

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # A backslash as the last character escapes nothing.  Keep it
        # instead of indexing past the end of the string.
        if escape_pos + 1 >= length:
            add('\\')
            pos = length
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # Unicode escapes.  Try to consume up to four hexadecimal
        # characters and interpret them as a unicode code point.  If that
        # is not possible, put all the consumed characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(_unichr(int(escaped_value, 16)))
                    except ValueError:
                        # not a usable code point: fall through and keep
                        # the consumed characters verbatim
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < length:
        add(string[pos:])

    return u''.join(result)
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
135
342
603192024857 added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents: 341
diff changeset
136
341
672b6b8e945d Added !JavaScript extractor
aronacher
parents:
diff changeset
def tokenize(source):
    """Split JavaScript source text into tokens.

    Text that no pattern matches is skipped one character at a time, so
    invalid input never raises.

    :param source: the JavaScript source as a string
    :return: generator of `Token` objects
    """
    division_allowed = False
    lineno = 1
    cursor = 0
    length = len(source)

    while cursor < length:
        # The ordinary rules are tried first, in order.
        kind = found = None
        for kind, pattern in rules:
            found = pattern.match(source, cursor)
            if found is not None:
                break

        # None of them matched: the only candidates left are a division
        # operator or a regular expression literal.  Which one is tried
        # depends on `division_allowed`, which reflects the last emitted
        # token (see `indicates_division`).
        if found is None:
            if division_allowed:
                kind = 'operator'
                found = division_re.match(source, cursor)
            else:
                kind = 'regexp'
                found = regex_re.match(source, cursor)

        if found is None:
            # Invalid syntax; step over one character and try again.
            cursor += 1
            continue

        text = found.group()
        # Rules with a type of None (whitespace, HTML comment openers) are
        # consumed without producing a token.
        if kind is not None:
            tok = Token(kind, text, lineno)
            division_allowed = indicates_division(tok)
            yield tok
        # A token carries the line it starts on, so line breaks inside the
        # matched text are only counted afterwards.
        lineno += len(line_re.findall(text))
        cursor = found.end()
Copyright (C) 2012-2017 Edgewall Software