Mercurial > genshi > mirror
comparison genshi/filters/html.py @ 951:40415173f513 stable-0.6.x
Merge r1174 and r1175 from trunk (improve sanitizing of CSS in style attributes -- see #455).
author | hodgestar |
---|---|
date | Fri, 02 Sep 2011 22:10:58 +0000 |
parents | 21308bd343b8 |
children |
comparison legend: equal | deleted | inserted | replaced
948:cccbcbd33e90 | 951:40415173f513 |
---|---|
251 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', | 251 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', |
252 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', | 252 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', |
253 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', | 253 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', |
254 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) | 254 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) |
255 | 255 |
256 SAFE_CSS = frozenset([ | |
257 # CSS 3 properties <http://www.w3.org/TR/CSS/#properties> | |
258 'background', 'background-attachment', 'background-color', | |
259 'background-image', 'background-position', 'background-repeat', | |
260 'border', 'border-bottom', 'border-bottom-color', | |
261 'border-bottom-style', 'border-bottom-width', 'border-collapse', | |
262 'border-color', 'border-left', 'border-left-color', | |
263 'border-left-style', 'border-left-width', 'border-right', | |
264 'border-right-color', 'border-right-style', 'border-right-width', | |
265 'border-spacing', 'border-style', 'border-top', 'border-top-color', | |
266 'border-top-style', 'border-top-width', 'border-width', 'bottom', | |
267 'caption-side', 'clear', 'clip', 'color', 'content', | |
268 'counter-increment', 'counter-reset', 'cursor', 'direction', 'display', | |
269 'empty-cells', 'float', 'font', 'font-family', 'font-size', | |
270 'font-style', 'font-variant', 'font-weight', 'height', 'left', | |
271 'letter-spacing', 'line-height', 'list-style', 'list-style-image', | |
272 'list-style-position', 'list-style-type', 'margin', 'margin-bottom', | |
273 'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width', | |
274 'min-height', 'min-width', 'opacity', 'orphans', 'outline', | |
275 'outline-color', 'outline-style', 'outline-width', 'overflow', | |
276 'padding', 'padding-bottom', 'padding-left', 'padding-right', | |
277 'padding-top', 'page-break-after', 'page-break-before', | |
278 'page-break-inside', 'quotes', 'right', 'table-layout', | |
279 'text-align', 'text-decoration', 'text-indent', 'text-transform', | |
280 'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space', | |
281 'widows', 'width', 'word-spacing', 'z-index', | |
282 ]) | |
283 | |
256 SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None]) | 284 SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None]) |
257 | 285 |
258 URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc', | 286 URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc', |
259 'src']) | 287 'src']) |
260 | 288 |
261 def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS, | 289 def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS, |
262 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS): | 290 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS, |
291 safe_css=SAFE_CSS): | |
263 """Create the sanitizer. | 292 """Create the sanitizer. |
264 | 293 |
265 The exact set of allowed elements and attributes can be configured. | 294 The exact set of allowed elements and attributes can be configured. |
266 | 295 |
267 :param safe_tags: a set of tag names that are considered safe | 296 :param safe_tags: a set of tag names that are considered safe |
268 :param safe_attrs: a set of attribute names that are considered safe | 297 :param safe_attrs: a set of attribute names that are considered safe |
269 :param safe_schemes: a set of URI schemes that are considered safe | 298 :param safe_schemes: a set of URI schemes that are considered safe |
270 :param uri_attrs: a set of names of attributes that contain URIs | 299 :param uri_attrs: a set of names of attributes that contain URIs |
271 """ | 300 """ |
272 self.safe_tags = safe_tags | 301 self.safe_tags = safe_tags |
273 "The set of tag names that are considered safe." | 302 # The set of tag names that are considered safe. |
274 self.safe_attrs = safe_attrs | 303 self.safe_attrs = safe_attrs |
275 "The set of attribute names that are considered safe." | 304 # The set of attribute names that are considered safe. |
305 self.safe_css = safe_css | |
306 # The set of CSS properties that are considered safe. | |
276 self.uri_attrs = uri_attrs | 307 self.uri_attrs = uri_attrs |
277 "The set of names of attributes that may contain URIs." | 308 # The set of names of attributes that may contain URIs. |
278 self.safe_schemes = safe_schemes | 309 self.safe_schemes = safe_schemes |
279 "The set of URI schemes that are considered safe." | 310 # The set of URI schemes that are considered safe. |
311 | |
312 # IE6 <http://heideri.ch/jso/#80> | |
313 _EXPRESSION_SEARCH = re.compile(u""" | |
314 [eE | |
315 \uFF25 # FULLWIDTH LATIN CAPITAL LETTER E | |
316 \uFF45 # FULLWIDTH LATIN SMALL LETTER E | |
317 ] | |
318 [xX | |
319 \uFF38 # FULLWIDTH LATIN CAPITAL LETTER X | |
320 \uFF58 # FULLWIDTH LATIN SMALL LETTER X | |
321 ] | |
322 [pP | |
323 \uFF30 # FULLWIDTH LATIN CAPITAL LETTER P | |
324 \uFF50 # FULLWIDTH LATIN SMALL LETTER P | |
325 ] | |
326 [rR | |
327 \u0280 # LATIN LETTER SMALL CAPITAL R | |
328 \uFF32 # FULLWIDTH LATIN CAPITAL LETTER R | |
329 \uFF52 # FULLWIDTH LATIN SMALL LETTER R | |
330 ] | |
331 [eE | |
332 \uFF25 # FULLWIDTH LATIN CAPITAL LETTER E | |
333 \uFF45 # FULLWIDTH LATIN SMALL LETTER E | |
334 ] | |
335 [sS | |
336 \uFF33 # FULLWIDTH LATIN CAPITAL LETTER S | |
337 \uFF53 # FULLWIDTH LATIN SMALL LETTER S | |
338 ]{2} | |
339 [iI | |
340 \u026A # LATIN LETTER SMALL CAPITAL I | |
341 \uFF29 # FULLWIDTH LATIN CAPITAL LETTER I | |
342 \uFF49 # FULLWIDTH LATIN SMALL LETTER I | |
343 ] | |
344 [oO | |
345 \uFF2F # FULLWIDTH LATIN CAPITAL LETTER O | |
346 \uFF4F # FULLWIDTH LATIN SMALL LETTER O | |
347 ] | |
348 [nN | |
349 \u0274 # LATIN LETTER SMALL CAPITAL N | |
350 \uFF2E # FULLWIDTH LATIN CAPITAL LETTER N | |
351 \uFF4E # FULLWIDTH LATIN SMALL LETTER N | |
352 ] | |
353 """, re.VERBOSE).search | |
354 | |
355 # IE6 <http://openmya.hacker.jp/hasegawa/security/expression.txt> | |
356 # 7) Particular bit of Unicode characters | |
357 _URL_FINDITER = re.compile( | |
358 u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer | |
280 | 359 |
281 def __call__(self, stream): | 360 def __call__(self, stream): |
282 """Apply the filter to the given stream. | 361 """Apply the filter to the given stream. |
283 | 362 |
284 :param stream: the markup event stream to filter | 363 :param stream: the markup event stream to filter |
333 :param value: the value of the property | 412 :param value: the value of the property |
334 :return: whether the property value should be considered safe | 413 :return: whether the property value should be considered safe |
335 :rtype: bool | 414 :rtype: bool |
336 :since: version 0.6 | 415 :since: version 0.6 |
337 """ | 416 """ |
338 if propname == 'position': | 417 if propname not in self.safe_css: |
339 return False | 418 return False |
340 if propname.startswith('margin') and '-' in value: | 419 if propname.startswith('margin') and '-' in value: |
341 # Negative margins can be used for phishing | 420 # Negative margins can be used for phishing |
342 return False | 421 return False |
343 return True | 422 return True |
427 except ValueError: | 506 except ValueError: |
428 continue | 507 continue |
429 if not self.is_safe_css(propname.strip().lower(), value.strip()): | 508 if not self.is_safe_css(propname.strip().lower(), value.strip()): |
430 continue | 509 continue |
431 is_evil = False | 510 is_evil = False |
432 if 'expression' in value: | 511 if self._EXPRESSION_SEARCH(value): |
433 is_evil = True | 512 is_evil = True |
434 for match in re.finditer(r'url\s*\(([^)]+)', value): | 513 for match in self._URL_FINDITER(value): |
435 if not self.is_safe_uri(match.group(1)): | 514 if not self.is_safe_uri(match.group(1)): |
436 is_evil = True | 515 is_evil = True |
437 break | 516 break |
438 if not is_evil: | 517 if not is_evil: |
439 decls.append(decl.strip()) | 518 decls.append(decl.strip()) |
440 return decls | 519 return decls |
441 | 520 |
442 _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub | 521 _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub |
443 _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub | 522 _UNICODE_ESCAPE = re.compile( |
523 r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""", | |
524 re.UNICODE).sub | |
444 | 525 |
445 def _replace_unicode_escapes(self, text): | 526 def _replace_unicode_escapes(self, text): |
446 def _repl(match): | 527 def _repl(match): |
447 return unichr(int(match.group(1), 16)) | 528 t = match.group(1) |
529 if t: | |
530 return unichr(int(t, 16)) | |
531 t = match.group(2) | |
532 if t == '\\': | |
533 return r'\\' | |
534 else: | |
535 return t | |
448 return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text)) | 536 return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text)) |
449 | 537 |
450 _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub | 538 _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub |
451 | 539 |
452 def _strip_css_comments(self, text): | 540 def _strip_css_comments(self, text): |