comparison genshi/filters/html.py @ 951:40415173f513 stable-0.6.x

Merge r1174 and r1175 from trunk (improve sanitizing of CSS in style attributes -- see #455).
author hodgestar
date Fri, 02 Sep 2011 22:10:58 +0000
parents 21308bd343b8
children
comparison
equal deleted inserted replaced
948:cccbcbd33e90 951:40415173f513
251 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 251 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
252 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 252 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
253 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 253 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
254 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) 254 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
255 255
256 SAFE_CSS = frozenset([
257 # CSS 3 properties <http://www.w3.org/TR/CSS/#properties>
258 'background', 'background-attachment', 'background-color',
259 'background-image', 'background-position', 'background-repeat',
260 'border', 'border-bottom', 'border-bottom-color',
261 'border-bottom-style', 'border-bottom-width', 'border-collapse',
262 'border-color', 'border-left', 'border-left-color',
263 'border-left-style', 'border-left-width', 'border-right',
264 'border-right-color', 'border-right-style', 'border-right-width',
265 'border-spacing', 'border-style', 'border-top', 'border-top-color',
266 'border-top-style', 'border-top-width', 'border-width', 'bottom',
267 'caption-side', 'clear', 'clip', 'color', 'content',
268 'counter-increment', 'counter-reset', 'cursor', 'direction', 'display',
269 'empty-cells', 'float', 'font', 'font-family', 'font-size',
270 'font-style', 'font-variant', 'font-weight', 'height', 'left',
271 'letter-spacing', 'line-height', 'list-style', 'list-style-image',
272 'list-style-position', 'list-style-type', 'margin', 'margin-bottom',
273 'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width',
274 'min-height', 'min-width', 'opacity', 'orphans', 'outline',
275 'outline-color', 'outline-style', 'outline-width', 'overflow',
276 'padding', 'padding-bottom', 'padding-left', 'padding-right',
277 'padding-top', 'page-break-after', 'page-break-before',
278 'page-break-inside', 'quotes', 'right', 'table-layout',
279 'text-align', 'text-decoration', 'text-indent', 'text-transform',
280 'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space',
281 'widows', 'width', 'word-spacing', 'z-index',
282 ])
283
256 SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None]) 284 SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])
257 285
258 URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc', 286 URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
259 'src']) 287 'src'])
260 288
261 def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS, 289 def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
262 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS): 290 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS,
291 safe_css=SAFE_CSS):
263 """Create the sanitizer. 292 """Create the sanitizer.
264 293
265 The exact set of allowed elements and attributes can be configured. 294 The exact set of allowed elements and attributes can be configured.
266 295
267 :param safe_tags: a set of tag names that are considered safe 296 :param safe_tags: a set of tag names that are considered safe
268 :param safe_attrs: a set of attribute names that are considered safe 297 :param safe_attrs: a set of attribute names that are considered safe
269 :param safe_schemes: a set of URI schemes that are considered safe 298 :param safe_schemes: a set of URI schemes that are considered safe
270 :param uri_attrs: a set of names of attributes that contain URIs 299 :param uri_attrs: a set of names of attributes that contain URIs
271 """ 300 """
272 self.safe_tags = safe_tags 301 self.safe_tags = safe_tags
273 "The set of tag names that are considered safe." 302 # The set of tag names that are considered safe.
274 self.safe_attrs = safe_attrs 303 self.safe_attrs = safe_attrs
275 "The set of attribute names that are considered safe." 304 # The set of attribute names that are considered safe.
305 self.safe_css = safe_css
306 # The set of CSS properties that are considered safe.
276 self.uri_attrs = uri_attrs 307 self.uri_attrs = uri_attrs
277 "The set of names of attributes that may contain URIs." 308 # The set of names of attributes that may contain URIs.
278 self.safe_schemes = safe_schemes 309 self.safe_schemes = safe_schemes
279 "The set of URI schemes that are considered safe." 310 # The set of URI schemes that are considered safe.
311
312 # IE6 <http://heideri.ch/jso/#80>
313 _EXPRESSION_SEARCH = re.compile(u"""
314 [eE
315 \uFF25 # FULLWIDTH LATIN CAPITAL LETTER E
316 \uFF45 # FULLWIDTH LATIN SMALL LETTER E
317 ]
318 [xX
319 \uFF38 # FULLWIDTH LATIN CAPITAL LETTER X
320 \uFF58 # FULLWIDTH LATIN SMALL LETTER X
321 ]
322 [pP
323 \uFF30 # FULLWIDTH LATIN CAPITAL LETTER P
324 \uFF50 # FULLWIDTH LATIN SMALL LETTER P
325 ]
326 [rR
327 \u0280 # LATIN LETTER SMALL CAPITAL R
328 \uFF32 # FULLWIDTH LATIN CAPITAL LETTER R
329 \uFF52 # FULLWIDTH LATIN SMALL LETTER R
330 ]
331 [eE
332 \uFF25 # FULLWIDTH LATIN CAPITAL LETTER E
333 \uFF45 # FULLWIDTH LATIN SMALL LETTER E
334 ]
335 [sS
336 \uFF33 # FULLWIDTH LATIN CAPITAL LETTER S
337 \uFF53 # FULLWIDTH LATIN SMALL LETTER S
338 ]{2}
339 [iI
340 \u026A # LATIN LETTER SMALL CAPITAL I
341 \uFF29 # FULLWIDTH LATIN CAPITAL LETTER I
342 \uFF49 # FULLWIDTH LATIN SMALL LETTER I
343 ]
344 [oO
345 \uFF2F # FULLWIDTH LATIN CAPITAL LETTER O
346 \uFF4F # FULLWIDTH LATIN SMALL LETTER O
347 ]
348 [nN
349 \u0274 # LATIN LETTER SMALL CAPITAL N
350 \uFF2E # FULLWIDTH LATIN CAPITAL LETTER N
351 \uFF4E # FULLWIDTH LATIN SMALL LETTER N
352 ]
353 """, re.VERBOSE).search
354
355 # IE6 <http://openmya.hacker.jp/hasegawa/security/expression.txt>
356 # 7) Particular Unicode characters: small capital R (U+0280) and L (U+029F) are matched alongside their ASCII counterparts
357 _URL_FINDITER = re.compile(
358 u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer
280 359
281 def __call__(self, stream): 360 def __call__(self, stream):
282 """Apply the filter to the given stream. 361 """Apply the filter to the given stream.
283 362
284 :param stream: the markup event stream to filter 363 :param stream: the markup event stream to filter
333 :param value: the value of the property 412 :param value: the value of the property
334 :return: whether the property value should be considered safe 413 :return: whether the property value should be considered safe
335 :rtype: bool 414 :rtype: bool
336 :since: version 0.6 415 :since: version 0.6
337 """ 416 """
338 if propname == 'position': 417 if propname not in self.safe_css:
339 return False 418 return False
340 if propname.startswith('margin') and '-' in value: 419 if propname.startswith('margin') and '-' in value:
341 # Negative margins can be used for phishing 420 # Negative margins can be used for phishing
342 return False 421 return False
343 return True 422 return True
427 except ValueError: 506 except ValueError:
428 continue 507 continue
429 if not self.is_safe_css(propname.strip().lower(), value.strip()): 508 if not self.is_safe_css(propname.strip().lower(), value.strip()):
430 continue 509 continue
431 is_evil = False 510 is_evil = False
432 if 'expression' in value: 511 if self._EXPRESSION_SEARCH(value):
433 is_evil = True 512 is_evil = True
434 for match in re.finditer(r'url\s*\(([^)]+)', value): 513 for match in self._URL_FINDITER(value):
435 if not self.is_safe_uri(match.group(1)): 514 if not self.is_safe_uri(match.group(1)):
436 is_evil = True 515 is_evil = True
437 break 516 break
438 if not is_evil: 517 if not is_evil:
439 decls.append(decl.strip()) 518 decls.append(decl.strip())
440 return decls 519 return decls
441 520
442 _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub 521 _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
443 _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub 522 _UNICODE_ESCAPE = re.compile(
523 r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""",
524 re.UNICODE).sub
444 525
445 def _replace_unicode_escapes(self, text): 526 def _replace_unicode_escapes(self, text):
446 def _repl(match): 527 def _repl(match):
447 return unichr(int(match.group(1), 16)) 528 t = match.group(1)
529 if t:
530 return unichr(int(t, 16))
531 t = match.group(2)
532 if t == '\\':
533 return r'\\'
534 else:
535 return t
448 return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text)) 536 return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))
449 537
450 _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub 538 _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub
451 539
452 def _strip_css_comments(self, text): 540 def _strip_css_comments(self, text):
Copyright (C) 2012-2017 Edgewall Software