annotate babel/messages/jslexer.py @ 599:33c8c68b96c7 trunk

change constructor for babel.support.Translations to __init__(fp=None, domain=None) as its super class gettext.GNUTranslations uses "fp" as well (even if "fileobj" is a better name)
author fschwarz
date Mon, 20 Aug 2012 19:34:42 +0000
parents ca203b2af83c
children
rev   line source
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
1 # -*- coding: utf-8 -*-
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
2 #
530
ca203b2af83c Update the copyright line.
jruigrok
parents: 527
diff changeset
3 # Copyright (C) 2008-2011 Edgewall Software
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
4 # All rights reserved.
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
5 #
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
8 # are also available at http://babel.edgewall.org/wiki/License.
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
9 #
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
12 # history and logs, available at http://babel.edgewall.org/log/.
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
13
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
14 """A simple JavaScript 1.5 lexer which is used for the JavaScript
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
15 extractor.
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
16 """
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
17
527
f2a516cee3f8 Get rid of the utility code for itemgetter(), we now simply import this
jruigrok
parents: 414
diff changeset
18 from operator import itemgetter
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
19 import re
414
05487ae7696e fix Python 2.3 compat: rearrange set/itemgetter/rsplit/sorted/unicode.decode
pjenvey
parents: 404
diff changeset
20
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
# Every operator token the lexer recognises.  The division operators are
# deliberately absent: a leading ``/`` is ambiguous with a regular expression
# literal and is resolved separately by the tokenizer.
operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
]
# Longest operators first, so that the alternation built from this list
# prefers e.g. ``>>>=`` over ``>``.  The sort is stable, so operators of equal
# length keep their listed order.  (A ``key`` function is used instead of a
# ``cmp`` function because the latter only exists on Python 2.)
operators.sort(key=len, reverse=True)

# Single-character escape sequences and the character each one stands for.
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# Ordered ``(token_type, compiled_pattern)`` pairs; the first pattern that
# matches at the current position wins.  A token type of ``None`` means the
# matched text is skipped without producing a token.
#
# Flags are passed to ``re.compile`` rather than embedded at the end of the
# pattern: trailing global inline flags are rejected by newer Python versions.
rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)', re.UNICODE)),
    # The hexadecimal alternative has to come first, otherwise the decimal
    # alternative matches the leading ``0`` of ``0x1F`` and stops there.
    ('number', re.compile(r'''(
        (0[xX][a-fA-F0-9]+) |
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''', re.VERBOSE)),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL))
]
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
49
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
# ``/`` or ``/=`` when the slash is a division operator.
division_re = re.compile(r'/=?')
# A regular expression literal including trailing flag letters.  DOTALL lets
# a backslash escape any character, a line break included.  The flag is passed
# as an argument because a trailing ``(?s)`` inside the pattern is rejected by
# newer Python versions.
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
# A single line break in any of the three common conventions.
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash immediately followed by a line break; group 1 is the line break.
line_join_re = re.compile(r'\\' + line_re.pattern)
# One to four hexadecimal digits, as found after a unicode escape marker.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
55
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
56
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
class Token(tuple):
    """A lexer token as returned by `tokenize`.

    Behaves like the plain tuple ``(type, value, lineno)`` and additionally
    exposes the three items as read-only attributes.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        fields = (type, value, lineno)
        return tuple.__new__(cls, fields)

    @property
    def type(self):
        return self[0]

    @property
    def value(self):
        return self[1]

    @property
    def lineno(self):
        return self[2]
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
67
340
ce83b4f77114 added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents: 339
diff changeset
68
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
def indicates_division(token):
    """Tell the tokenizer whether a ``/`` right after `token` may be a
    division operator.

    :param token: an object with ``type`` and ``value`` attributes
    :return: `True` if a division operator may follow, `False` otherwise
    """
    kind = token.type
    if kind != 'operator':
        # names, literals and regexps end an operand
        return kind in ('name', 'number', 'string', 'regexp')
    # among operators only closing brackets and postfix ++/-- qualify
    return token.value in (')', ']', '}', '++', '--')
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
76
340
ce83b4f77114 added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents: 339
diff changeset
77
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
try:
    _unichr = unichr  # Python 2: ``chr`` cannot build non-ASCII text there
except NameError:
    _unichr = chr  # Python 3: ``unichr`` is gone, ``chr`` returns text


def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start and
    end with the same string delimiter (``'`` or ``"``).

    The delimiters are stripped, a backslash directly followed by a line
    break is replaced by just the line break, and the remaining backslash
    escapes are resolved: the single-character escapes listed in `escapes`,
    four-digit unicode escapes, and unknown escapes, for which only the
    backslash is dropped.  A backslash that is the very last character has
    nothing to escape and is kept as is.

    :param string: the string literal including its delimiters
    :return: a string
    :raise AssertionError: if the string is empty or not properly delimited
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0
    end = len(string)

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # a backslash at the very end escapes nothing; keep it literally
        # instead of indexing past the end of the string.
        if escape_pos + 1 >= end:
            add('\\')
            pos = end
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four hexadecimal
        # characters and try to interpret them as unicode character point.
        # If there is no such character point, put all the consumed
        # characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(_unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        # backslash + marker + four digits were consumed
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            # no hex digits at all: keep just the marker character
            add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < end:
        add(string[pos:])

    return u''.join(result)
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
134
340
ce83b4f77114 added some newlines to extract and jslexer to stay consistent with the rest of the sourcecode.
aronacher
parents: 339
diff changeset
135
339
93a896111488 Added !JavaScript extractor
aronacher
parents:
diff changeset
def tokenize(source):
    """Tokenize a JavaScript source.

    Whitespace and ``<!--`` lines are skipped; comments are yielded as
    ``linecomment`` / ``multilinecomment`` tokens.  Text that matches no rule
    is skipped one character at a time, so the lexer never raises on invalid
    syntax.

    :param source: the JavaScript source text
    :return: generator of `Token` objects; each token carries the line
             number on which it starts
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed significant token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # woops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            # comments are as insignificant as whitespace when deciding
            # whether a following ``/`` divides, so they must not reset
            # the state left behind by the previous real token.
            if token_type not in ('linecomment', 'multilinecomment'):
                may_divide = indicates_division(token)
            yield token
        # advance the line counter past any line breaks inside the match
        lineno += len(line_re.findall(token_value))
        pos = match.end()
Copyright (C) 2012-2017 Edgewall Software