|
|
@ -1,75 +1,394 @@ |
|
|
|
from __future__ import absolute_import, unicode_literals |
|
|
|
# Copyright 2010-2019 Kurt McKee <contactme@kurtmckee.org> |
|
|
|
# Copyright 2002-2008 Mark Pilgrim |
|
|
|
# All rights reserved. |
|
|
|
# |
|
|
|
# This file is a part of feedparser. |
|
|
|
# |
|
|
|
# Redistribution and use in source and binary forms, with or without |
|
|
|
# modification, are permitted provided that the following conditions are met: |
|
|
|
# |
|
|
|
# * Redistributions of source code must retain the above copyright notice, |
|
|
|
# this list of conditions and the following disclaimer. |
|
|
|
# * Redistributions in binary form must reproduce the above copyright notice, |
|
|
|
# this list of conditions and the following disclaimer in the documentation |
|
|
|
# and/or other materials provided with the distribution. |
|
|
|
# |
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' |
|
|
|
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
|
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
|
|
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
|
|
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
|
|
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
|
|
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
|
|
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
|
|
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
|
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
|
|
# POSSIBILITY OF SUCH DAMAGE. |
|
|
|
|
|
|
|
from __future__ import absolute_import |
|
|
|
from __future__ import unicode_literals |
|
|
|
|
|
|
|
import re |
|
|
|
|
|
|
|
from .html import _BaseHTMLProcessor |
|
|
|
from .sgml import _SGML_AVAILABLE |
|
|
|
from .urls import _makeSafeAbsoluteURI |
|
|
|
from .urls import make_safe_absolute_uri |
|
|
|
|
|
|
|
|
|
|
|
class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area', |
|
|
|
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', |
|
|
|
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', |
|
|
|
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', |
|
|
|
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', |
|
|
|
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', |
|
|
|
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', |
|
|
|
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', |
|
|
|
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', |
|
|
|
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|
|
|
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', |
|
|
|
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', |
|
|
|
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript']) |
|
|
|
|
|
|
|
acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey', |
|
|
|
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', |
|
|
|
'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|
|
|
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|
|
|
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|
|
|
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols', |
|
|
|
'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data', |
|
|
|
'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay', |
|
|
|
'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for', |
|
|
|
'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus', |
|
|
|
'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode', |
|
|
|
'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc', |
|
|
|
'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max', |
|
|
|
'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref', |
|
|
|
'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size', |
|
|
|
'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel', |
|
|
|
'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing', |
|
|
|
'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', |
|
|
|
'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target', |
|
|
|
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap', |
|
|
|
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', |
|
|
|
'width', 'wrap', 'xml:lang']) |
|
|
|
|
|
|
|
unacceptable_elements_with_end_tag = set(['script', 'applet', 'style']) |
|
|
|
|
|
|
|
acceptable_css_properties = set(['azimuth', 'background-color', |
|
|
|
'border-bottom-color', 'border-collapse', 'border-color', |
|
|
|
'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|
|
|
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|
|
|
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|
|
|
'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|
|
|
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|
|
|
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|
|
|
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|
|
|
'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|
|
|
'white-space', 'width']) |
|
|
|
acceptable_elements = { |
|
|
|
'a', |
|
|
|
'abbr', |
|
|
|
'acronym', |
|
|
|
'address', |
|
|
|
'area', |
|
|
|
'article', |
|
|
|
'aside', |
|
|
|
'audio', |
|
|
|
'b', |
|
|
|
'big', |
|
|
|
'blockquote', |
|
|
|
'br', |
|
|
|
'button', |
|
|
|
'canvas', |
|
|
|
'caption', |
|
|
|
'center', |
|
|
|
'cite', |
|
|
|
'code', |
|
|
|
'col', |
|
|
|
'colgroup', |
|
|
|
'command', |
|
|
|
'datagrid', |
|
|
|
'datalist', |
|
|
|
'dd', |
|
|
|
'del', |
|
|
|
'details', |
|
|
|
'dfn', |
|
|
|
'dialog', |
|
|
|
'dir', |
|
|
|
'div', |
|
|
|
'dl', |
|
|
|
'dt', |
|
|
|
'em', |
|
|
|
'event-source', |
|
|
|
'fieldset', |
|
|
|
'figcaption', |
|
|
|
'figure', |
|
|
|
'font', |
|
|
|
'footer', |
|
|
|
'form', |
|
|
|
'h1', |
|
|
|
'h2', |
|
|
|
'h3', |
|
|
|
'h4', |
|
|
|
'h5', |
|
|
|
'h6', |
|
|
|
'header', |
|
|
|
'hr', |
|
|
|
'i', |
|
|
|
'img', |
|
|
|
'input', |
|
|
|
'ins', |
|
|
|
'kbd', |
|
|
|
'keygen', |
|
|
|
'label', |
|
|
|
'legend', |
|
|
|
'li', |
|
|
|
'm', |
|
|
|
'map', |
|
|
|
'menu', |
|
|
|
'meter', |
|
|
|
'multicol', |
|
|
|
'nav', |
|
|
|
'nextid', |
|
|
|
'noscript', |
|
|
|
'ol', |
|
|
|
'optgroup', |
|
|
|
'option', |
|
|
|
'output', |
|
|
|
'p', |
|
|
|
'pre', |
|
|
|
'progress', |
|
|
|
'q', |
|
|
|
's', |
|
|
|
'samp', |
|
|
|
'section', |
|
|
|
'select', |
|
|
|
'small', |
|
|
|
'sound', |
|
|
|
'source', |
|
|
|
'spacer', |
|
|
|
'span', |
|
|
|
'strike', |
|
|
|
'strong', |
|
|
|
'sub', |
|
|
|
'sup', |
|
|
|
'table', |
|
|
|
'tbody', |
|
|
|
'td', |
|
|
|
'textarea', |
|
|
|
'tfoot', |
|
|
|
'th', |
|
|
|
'thead', |
|
|
|
'time', |
|
|
|
'tr', |
|
|
|
'tt', |
|
|
|
'u', |
|
|
|
'ul', |
|
|
|
'var', |
|
|
|
'video', |
|
|
|
} |
|
|
|
|
|
|
|
acceptable_attributes = { |
|
|
|
'abbr', |
|
|
|
'accept', |
|
|
|
'accept-charset', |
|
|
|
'accesskey', |
|
|
|
'action', |
|
|
|
'align', |
|
|
|
'alt', |
|
|
|
'autocomplete', |
|
|
|
'autofocus', |
|
|
|
'axis', |
|
|
|
'background', |
|
|
|
'balance', |
|
|
|
'bgcolor', |
|
|
|
'bgproperties', |
|
|
|
'border', |
|
|
|
'bordercolor', |
|
|
|
'bordercolordark', |
|
|
|
'bordercolorlight', |
|
|
|
'bottompadding', |
|
|
|
'cellpadding', |
|
|
|
'cellspacing', |
|
|
|
'ch', |
|
|
|
'challenge', |
|
|
|
'char', |
|
|
|
'charoff', |
|
|
|
'charset', |
|
|
|
'checked', |
|
|
|
'choff', |
|
|
|
'cite', |
|
|
|
'class', |
|
|
|
'clear', |
|
|
|
'color', |
|
|
|
'cols', |
|
|
|
'colspan', |
|
|
|
'compact', |
|
|
|
'contenteditable', |
|
|
|
'controls', |
|
|
|
'coords', |
|
|
|
'data', |
|
|
|
'datafld', |
|
|
|
'datapagesize', |
|
|
|
'datasrc', |
|
|
|
'datetime', |
|
|
|
'default', |
|
|
|
'delay', |
|
|
|
'dir', |
|
|
|
'disabled', |
|
|
|
'draggable', |
|
|
|
'dynsrc', |
|
|
|
'enctype', |
|
|
|
'end', |
|
|
|
'face', |
|
|
|
'for', |
|
|
|
'form', |
|
|
|
'frame', |
|
|
|
'galleryimg', |
|
|
|
'gutter', |
|
|
|
'headers', |
|
|
|
'height', |
|
|
|
'hidden', |
|
|
|
'hidefocus', |
|
|
|
'high', |
|
|
|
'href', |
|
|
|
'hreflang', |
|
|
|
'hspace', |
|
|
|
'icon', |
|
|
|
'id', |
|
|
|
'inputmode', |
|
|
|
'ismap', |
|
|
|
'keytype', |
|
|
|
'label', |
|
|
|
'lang', |
|
|
|
'leftspacing', |
|
|
|
'list', |
|
|
|
'longdesc', |
|
|
|
'loop', |
|
|
|
'loopcount', |
|
|
|
'loopend', |
|
|
|
'loopstart', |
|
|
|
'low', |
|
|
|
'lowsrc', |
|
|
|
'max', |
|
|
|
'maxlength', |
|
|
|
'media', |
|
|
|
'method', |
|
|
|
'min', |
|
|
|
'multiple', |
|
|
|
'name', |
|
|
|
'nohref', |
|
|
|
'noshade', |
|
|
|
'nowrap', |
|
|
|
'open', |
|
|
|
'optimum', |
|
|
|
'pattern', |
|
|
|
'ping', |
|
|
|
'point-size', |
|
|
|
'poster', |
|
|
|
'pqg', |
|
|
|
'preload', |
|
|
|
'prompt', |
|
|
|
'radiogroup', |
|
|
|
'readonly', |
|
|
|
'rel', |
|
|
|
'repeat-max', |
|
|
|
'repeat-min', |
|
|
|
'replace', |
|
|
|
'required', |
|
|
|
'rev', |
|
|
|
'rightspacing', |
|
|
|
'rows', |
|
|
|
'rowspan', |
|
|
|
'rules', |
|
|
|
'scope', |
|
|
|
'selected', |
|
|
|
'shape', |
|
|
|
'size', |
|
|
|
'span', |
|
|
|
'src', |
|
|
|
'start', |
|
|
|
'step', |
|
|
|
'summary', |
|
|
|
'suppress', |
|
|
|
'tabindex', |
|
|
|
'target', |
|
|
|
'template', |
|
|
|
'title', |
|
|
|
'toppadding', |
|
|
|
'type', |
|
|
|
'unselectable', |
|
|
|
'urn', |
|
|
|
'usemap', |
|
|
|
'valign', |
|
|
|
'value', |
|
|
|
'variable', |
|
|
|
'volume', |
|
|
|
'vrml', |
|
|
|
'vspace', |
|
|
|
'width', |
|
|
|
'wrap', |
|
|
|
'xml:lang', |
|
|
|
} |
|
|
|
|
|
|
|
unacceptable_elements_with_end_tag = { |
|
|
|
'applet', |
|
|
|
'script', |
|
|
|
'style', |
|
|
|
} |
|
|
|
|
|
|
|
acceptable_css_properties = { |
|
|
|
'azimuth', |
|
|
|
'background-color', |
|
|
|
'border-bottom-color', |
|
|
|
'border-collapse', |
|
|
|
'border-color', |
|
|
|
'border-left-color', |
|
|
|
'border-right-color', |
|
|
|
'border-top-color', |
|
|
|
'clear', |
|
|
|
'color', |
|
|
|
'cursor', |
|
|
|
'direction', |
|
|
|
'display', |
|
|
|
'elevation', |
|
|
|
'float', |
|
|
|
'font', |
|
|
|
'font-family', |
|
|
|
'font-size', |
|
|
|
'font-style', |
|
|
|
'font-variant', |
|
|
|
'font-weight', |
|
|
|
'height', |
|
|
|
'letter-spacing', |
|
|
|
'line-height', |
|
|
|
'overflow', |
|
|
|
'pause', |
|
|
|
'pause-after', |
|
|
|
'pause-before', |
|
|
|
'pitch', |
|
|
|
'pitch-range', |
|
|
|
'richness', |
|
|
|
'speak', |
|
|
|
'speak-header', |
|
|
|
'speak-numeral', |
|
|
|
'speak-punctuation', |
|
|
|
'speech-rate', |
|
|
|
'stress', |
|
|
|
'text-align', |
|
|
|
'text-decoration', |
|
|
|
'text-indent', |
|
|
|
'unicode-bidi', |
|
|
|
'vertical-align', |
|
|
|
'voice-family', |
|
|
|
'volume', |
|
|
|
'white-space', |
|
|
|
'width', |
|
|
|
} |
|
|
|
|
|
|
|
# survey of common keywords found in feeds |
|
|
|
acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue', |
|
|
|
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|
|
|
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|
|
|
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|
|
|
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|
|
|
'transparent', 'underline', 'white', 'yellow']) |
|
|
|
|
|
|
|
valid_css_values = re.compile(r'^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' + |
|
|
|
r'\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$') |
|
|
|
|
|
|
|
mathml_elements = set([ |
|
|
|
acceptable_css_keywords = { |
|
|
|
'!important', |
|
|
|
'aqua', |
|
|
|
'auto', |
|
|
|
'black', |
|
|
|
'block', |
|
|
|
'blue', |
|
|
|
'bold', |
|
|
|
'both', |
|
|
|
'bottom', |
|
|
|
'brown', |
|
|
|
'center', |
|
|
|
'collapse', |
|
|
|
'dashed', |
|
|
|
'dotted', |
|
|
|
'fuchsia', |
|
|
|
'gray', |
|
|
|
'green', |
|
|
|
'italic', |
|
|
|
'left', |
|
|
|
'lime', |
|
|
|
'maroon', |
|
|
|
'medium', |
|
|
|
'navy', |
|
|
|
'none', |
|
|
|
'normal', |
|
|
|
'nowrap', |
|
|
|
'olive', |
|
|
|
'pointer', |
|
|
|
'purple', |
|
|
|
'red', |
|
|
|
'right', |
|
|
|
'silver', |
|
|
|
'solid', |
|
|
|
'teal', |
|
|
|
'top', |
|
|
|
'transparent', |
|
|
|
'underline', |
|
|
|
'white', |
|
|
|
'yellow', |
|
|
|
} |
|
|
|
|
|
|
|
valid_css_values = re.compile( |
|
|
|
r'^(' |
|
|
|
r'#[0-9a-f]+' # Hex values |
|
|
|
r'|rgb\(\d+%?,\d*%?,?\d*%?\)?' # RGB values |
|
|
|
r'|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?' # Sizes/widths |
|
|
|
r')$' |
|
|
|
) |
|
|
|
|
|
|
|
mathml_elements = { |
|
|
|
'annotation', |
|
|
|
'annotation-xml', |
|
|
|
'maction', |
|
|
@ -114,9 +433,9 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
'munderover', |
|
|
|
'none', |
|
|
|
'semantics', |
|
|
|
]) |
|
|
|
} |
|
|
|
|
|
|
|
mathml_attributes = set([ |
|
|
|
mathml_attributes = { |
|
|
|
'accent', |
|
|
|
'accentunder', |
|
|
|
'actiontype', |
|
|
@ -216,54 +535,214 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
'xlink:type', |
|
|
|
'xmlns', |
|
|
|
'xmlns:xlink', |
|
|
|
]) |
|
|
|
} |
|
|
|
|
|
|
|
# svgtiny - foreignObject + linearGradient + radialGradient + stop |
|
|
|
svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion', |
|
|
|
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject', |
|
|
|
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|
|
|
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath', |
|
|
|
'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop', |
|
|
|
'svg', 'switch', 'text', 'title', 'tspan', 'use']) |
|
|
|
svg_elements = { |
|
|
|
'a', |
|
|
|
'animate', |
|
|
|
'animateColor', |
|
|
|
'animateMotion', |
|
|
|
'animateTransform', |
|
|
|
'circle', |
|
|
|
'defs', |
|
|
|
'desc', |
|
|
|
'ellipse', |
|
|
|
'font-face', |
|
|
|
'font-face-name', |
|
|
|
'font-face-src', |
|
|
|
'foreignObject', |
|
|
|
'g', |
|
|
|
'glyph', |
|
|
|
'hkern', |
|
|
|
'line', |
|
|
|
'linearGradient', |
|
|
|
'marker', |
|
|
|
'metadata', |
|
|
|
'missing-glyph', |
|
|
|
'mpath', |
|
|
|
'path', |
|
|
|
'polygon', |
|
|
|
'polyline', |
|
|
|
'radialGradient', |
|
|
|
'rect', |
|
|
|
'set', |
|
|
|
'stop', |
|
|
|
'svg', |
|
|
|
'switch', |
|
|
|
'text', |
|
|
|
'title', |
|
|
|
'tspan', |
|
|
|
'use', |
|
|
|
} |
|
|
|
|
|
|
|
# svgtiny + class + opacity + offset + xmlns + xmlns:xlink |
|
|
|
svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic', |
|
|
|
'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|
|
|
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|
|
|
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', |
|
|
|
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity', |
|
|
|
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style', |
|
|
|
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', |
|
|
|
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', |
|
|
|
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines', |
|
|
|
'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid', |
|
|
|
'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max', |
|
|
|
'min', 'name', 'offset', 'opacity', 'orient', 'origin', |
|
|
|
'overline-position', 'overline-thickness', 'panose-1', 'path', |
|
|
|
'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY', |
|
|
|
'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures', |
|
|
|
'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', |
|
|
|
'stop-color', 'stop-opacity', 'strikethrough-position', |
|
|
|
'strikethrough-thickness', 'stroke', 'stroke-dasharray', |
|
|
|
'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin', |
|
|
|
'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage', |
|
|
|
'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2', |
|
|
|
'underline-position', 'underline-thickness', 'unicode', 'unicode-range', |
|
|
|
'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width', |
|
|
|
'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|
|
|
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|
|
|
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', |
|
|
|
'y2', 'zoomAndPan']) |
|
|
|
svg_attributes = { |
|
|
|
'accent-height', |
|
|
|
'accumulate', |
|
|
|
'additive', |
|
|
|
'alphabetic', |
|
|
|
'arabic-form', |
|
|
|
'ascent', |
|
|
|
'attributeName', |
|
|
|
'attributeType', |
|
|
|
'baseProfile', |
|
|
|
'bbox', |
|
|
|
'begin', |
|
|
|
'by', |
|
|
|
'calcMode', |
|
|
|
'cap-height', |
|
|
|
'class', |
|
|
|
'color', |
|
|
|
'color-rendering', |
|
|
|
'content', |
|
|
|
'cx', |
|
|
|
'cy', |
|
|
|
'd', |
|
|
|
'descent', |
|
|
|
'display', |
|
|
|
'dur', |
|
|
|
'dx', |
|
|
|
'dy', |
|
|
|
'end', |
|
|
|
'fill', |
|
|
|
'fill-opacity', |
|
|
|
'fill-rule', |
|
|
|
'font-family', |
|
|
|
'font-size', |
|
|
|
'font-stretch', |
|
|
|
'font-style', |
|
|
|
'font-variant', |
|
|
|
'font-weight', |
|
|
|
'from', |
|
|
|
'fx', |
|
|
|
'fy', |
|
|
|
'g1', |
|
|
|
'g2', |
|
|
|
'glyph-name', |
|
|
|
'gradientUnits', |
|
|
|
'hanging', |
|
|
|
'height', |
|
|
|
'horiz-adv-x', |
|
|
|
'horiz-origin-x', |
|
|
|
'id', |
|
|
|
'ideographic', |
|
|
|
'k', |
|
|
|
'keyPoints', |
|
|
|
'keySplines', |
|
|
|
'keyTimes', |
|
|
|
'lang', |
|
|
|
'marker-end', |
|
|
|
'marker-mid', |
|
|
|
'marker-start', |
|
|
|
'markerHeight', |
|
|
|
'markerUnits', |
|
|
|
'markerWidth', |
|
|
|
'mathematical', |
|
|
|
'max', |
|
|
|
'min', |
|
|
|
'name', |
|
|
|
'offset', |
|
|
|
'opacity', |
|
|
|
'orient', |
|
|
|
'origin', |
|
|
|
'overline-position', |
|
|
|
'overline-thickness', |
|
|
|
'panose-1', |
|
|
|
'path', |
|
|
|
'pathLength', |
|
|
|
'points', |
|
|
|
'preserveAspectRatio', |
|
|
|
'r', |
|
|
|
'refX', |
|
|
|
'refY', |
|
|
|
'repeatCount', |
|
|
|
'repeatDur', |
|
|
|
'requiredExtensions', |
|
|
|
'requiredFeatures', |
|
|
|
'restart', |
|
|
|
'rotate', |
|
|
|
'rx', |
|
|
|
'ry', |
|
|
|
'slope', |
|
|
|
'stemh', |
|
|
|
'stemv', |
|
|
|
'stop-color', |
|
|
|
'stop-opacity', |
|
|
|
'strikethrough-position', |
|
|
|
'strikethrough-thickness', |
|
|
|
'stroke', |
|
|
|
'stroke-dasharray', |
|
|
|
'stroke-dashoffset', |
|
|
|
'stroke-linecap', |
|
|
|
'stroke-linejoin', |
|
|
|
'stroke-miterlimit', |
|
|
|
'stroke-opacity', |
|
|
|
'stroke-width', |
|
|
|
'systemLanguage', |
|
|
|
'target', |
|
|
|
'text-anchor', |
|
|
|
'to', |
|
|
|
'transform', |
|
|
|
'type', |
|
|
|
'u1', |
|
|
|
'u2', |
|
|
|
'underline-position', |
|
|
|
'underline-thickness', |
|
|
|
'unicode', |
|
|
|
'unicode-range', |
|
|
|
'units-per-em', |
|
|
|
'values', |
|
|
|
'version', |
|
|
|
'viewBox', |
|
|
|
'visibility', |
|
|
|
'width', |
|
|
|
'widths', |
|
|
|
'x', |
|
|
|
'x-height', |
|
|
|
'x1', |
|
|
|
'x2', |
|
|
|
'xlink:actuate', |
|
|
|
'xlink:arcrole', |
|
|
|
'xlink:href', |
|
|
|
'xlink:role', |
|
|
|
'xlink:show', |
|
|
|
'xlink:title', |
|
|
|
'xlink:type', |
|
|
|
'xml:base', |
|
|
|
'xml:lang', |
|
|
|
'xml:space', |
|
|
|
'xmlns', |
|
|
|
'xmlns:xlink', |
|
|
|
'y', |
|
|
|
'y1', |
|
|
|
'y2', |
|
|
|
'zoomAndPan', |
|
|
|
} |
|
|
|
|
|
|
|
svg_attr_map = None |
|
|
|
svg_elem_map = None |
|
|
|
|
|
|
|
acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule', |
|
|
|
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|
|
|
'stroke-opacity']) |
|
|
|
acceptable_svg_properties = { |
|
|
|
'fill', |
|
|
|
'fill-opacity', |
|
|
|
'fill-rule', |
|
|
|
'stroke', |
|
|
|
'stroke-linecap', |
|
|
|
'stroke-linejoin', |
|
|
|
'stroke-opacity', |
|
|
|
'stroke-width', |
|
|
|
} |
|
|
|
|
|
|
|
def __init__(self, encoding=None, _type='application/xhtml+xml'):
    """Initialise the base processor and zero the sanitizer's counters.

    ``encoding`` and ``_type`` are handed to the parent initialiser
    unchanged.
    """
    super(_HTMLSanitizer, self).__init__(encoding, _type)

    # unacceptablestack: depth of open elements listed in
    #   unacceptable_elements_with_end_tag; handle_data() drops text
    #   while it is non-zero.
    # mathmlOK / svgOK: counters the tag handlers consult before
    #   applying the MathML / SVG element and attribute whitelists.
    self.unacceptablestack = self.mathmlOK = self.svgOK = 0
|
|
|
|
|
|
|
def reset(self):
    """Reset the underlying parser and clear the sanitizer's counters.

    The parent ``reset()`` is called exactly once, through ``super()``,
    before the three sanitizer-specific counters are returned to zero.
    """
    super(_HTMLSanitizer, self).reset()
    # Depth of open elements from unacceptable_elements_with_end_tag.
    self.unacceptablestack = 0
    # Counters checked by the tag handlers before applying the MathML
    # and SVG whitelists.
    self.mathmlOK = 0
    self.svgOK = 0
|
|
@ -271,7 +750,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
def unknown_starttag(self, tag, attrs): |
|
|
|
acceptable_attributes = self.acceptable_attributes |
|
|
|
keymap = {} |
|
|
|
if not tag in self.acceptable_elements or self.svgOK: |
|
|
|
if tag not in self.acceptable_elements or self.svgOK: |
|
|
|
if tag in self.unacceptable_elements_with_end_tag: |
|
|
|
self.unacceptablestack += 1 |
|
|
|
|
|
|
@ -293,22 +772,22 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
if self.mathmlOK and tag in self.mathml_elements: |
|
|
|
acceptable_attributes = self.mathml_attributes |
|
|
|
elif self.svgOK and tag in self.svg_elements: |
|
|
|
# for most vocabularies, lowercasing is a good idea. Many |
|
|
|
# svg elements, however, are camel case |
|
|
|
# For most vocabularies, lowercasing is a good idea. Many |
|
|
|
# svg elements, however, are camel case. |
|
|
|
if not self.svg_attr_map: |
|
|
|
lower = [attr.lower() for attr in self.svg_attributes] |
|
|
|
mix = [a for a in self.svg_attributes if a not in lower] |
|
|
|
self.svg_attributes = lower |
|
|
|
self.svg_attr_map = dict([(a.lower(),a) for a in mix]) |
|
|
|
self.svg_attr_map = {a.lower(): a for a in mix} |
|
|
|
|
|
|
|
lower = [attr.lower() for attr in self.svg_elements] |
|
|
|
mix = [a for a in self.svg_elements if a not in lower] |
|
|
|
self.svg_elements = lower |
|
|
|
self.svg_elem_map = dict([(a.lower(),a) for a in mix]) |
|
|
|
self.svg_elem_map = {a.lower(): a for a in mix} |
|
|
|
acceptable_attributes = self.svg_attributes |
|
|
|
tag = self.svg_elem_map.get(tag, tag) |
|
|
|
keymap = self.svg_attr_map |
|
|
|
elif not tag in self.acceptable_elements: |
|
|
|
elif tag not in self.acceptable_elements: |
|
|
|
return |
|
|
|
|
|
|
|
# declare xlink namespace, if needed |
|
|
@ -323,16 +802,16 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
key = keymap.get(key, key) |
|
|
|
# make sure the uri uses an acceptable uri scheme |
|
|
|
if key == 'href': |
|
|
|
value = _makeSafeAbsoluteURI(value) |
|
|
|
value = make_safe_absolute_uri(value) |
|
|
|
clean_attrs.append((key, value)) |
|
|
|
elif key == 'style': |
|
|
|
clean_value = self.sanitize_style(value) |
|
|
|
if clean_value: |
|
|
|
clean_attrs.append((key, clean_value)) |
|
|
|
_BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs) |
|
|
|
super(_HTMLSanitizer, self).unknown_starttag(tag, clean_attrs) |
|
|
|
|
|
|
|
def unknown_endtag(self, tag): |
|
|
|
if not tag in self.acceptable_elements: |
|
|
|
if tag not in self.acceptable_elements: |
|
|
|
if tag in self.unacceptable_elements_with_end_tag: |
|
|
|
self.unacceptablestack -= 1 |
|
|
|
if self.mathmlOK and tag in self.mathml_elements: |
|
|
@ -344,7 +823,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
self.svgOK -= 1 |
|
|
|
else: |
|
|
|
return |
|
|
|
_BaseHTMLProcessor.unknown_endtag(self, tag) |
|
|
|
super(_HTMLSanitizer, self).unknown_endtag(tag) |
|
|
|
|
|
|
|
def handle_pi(self, text):
    """Discard a processing instruction; nothing is emitted for it."""
    return None
|
|
@ -354,7 +833,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
|
|
|
|
def handle_data(self, text):
    """Pass character data to the base class unless it must be dropped.

    Text is discarded while ``self.unacceptablestack`` is non-zero, i.e.
    while an element from ``unacceptable_elements_with_end_tag`` is
    still open. Otherwise it is forwarded to the parent handler exactly
    once.
    """
    if self.unacceptablestack:
        # Inside a blocked element: swallow the text.
        return
    super(_HTMLSanitizer, self).handle_data(text)
|
|
|
|
|
|
|
def sanitize_style(self, style): |
|
|
|
# disallow urls |
|
|
@ -363,7 +842,8 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
# gauntlet |
|
|
|
if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): |
|
|
|
return '' |
|
|
|
# This replaced a regexp that used re.match and was prone to pathological back-tracking. |
|
|
|
# This replaced a regexp that used re.match and was prone to |
|
|
|
# pathological back-tracking. |
|
|
|
if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip(): |
|
|
|
return '' |
|
|
|
|
|
|
@ -375,8 +855,10 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
clean.append(prop + ': ' + value + ';') |
|
|
|
elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']: |
|
|
|
for keyword in value.split(): |
|
|
|
if not keyword in self.acceptable_css_keywords and \ |
|
|
|
not self.valid_css_values.match(keyword): |
|
|
|
if ( |
|
|
|
keyword not in self.acceptable_css_keywords |
|
|
|
and not self.valid_css_values.match(keyword) |
|
|
|
): |
|
|
|
break |
|
|
|
else: |
|
|
|
clean.append(prop + ': ' + value + ';') |
|
|
@ -386,7 +868,7 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
return ' '.join(clean) |
|
|
|
|
|
|
|
def parse_comment(self, i, report=1): |
|
|
|
ret = _BaseHTMLProcessor.parse_comment(self, i, report) |
|
|
|
ret = super(_HTMLSanitizer, self).parse_comment(i, report) |
|
|
|
if ret >= 0: |
|
|
|
return ret |
|
|
|
# if ret == -1, this may be a malicious attempt to circumvent |
|
|
@ -398,16 +880,17 @@ class _HTMLSanitizer(_BaseHTMLProcessor): |
|
|
|
return len(self.rawdata) |
|
|
|
|
|
|
|
|
|
|
|
def _sanitize_html(html_source, encoding, _type):
    """Run ``html_source`` through ``_HTMLSanitizer`` and return the result.

    :param html_source: markup to sanitize.
    :param encoding: passed to ``_HTMLSanitizer`` unchanged.
    :param _type: passed to ``_HTMLSanitizer`` unchanged.
    :returns: the sanitizer's output, stripped of leading/trailing
        whitespace and with ``\\r\\n`` normalised to ``\\n``.

    NOTE: when ``_SGML_AVAILABLE`` is false the input is returned
    untouched, i.e. *unsanitized*.
    """
    if not _SGML_AVAILABLE:
        return html_source

    sanitizer = _HTMLSanitizer(encoding, _type)
    # Escape the opening '<' of CDATA markers so they are fed to the
    # parser as literal text. Replacing the marker with itself, as this
    # line previously read, is a no-op.
    sanitizer.feed(html_source.replace('<![CDATA[', '&lt;![CDATA['))
    return sanitizer.output().strip().replace('\r\n', '\n')


# Pre-rename spelling of the same function, kept so existing callers of
# either name keep working.
_sanitizeHTML = _sanitize_html
|
|
|
|
|
|
|
|
|
|
|
# Match XML entity declarations. |
|
|
|
# Example: <!ENTITY copyright "(C)"> |
|
|
|
RE_ENTITY_PATTERN = re.compile(br'^\s*<!ENTITY([^>]*?)>', re.MULTILINE) |
|
|
@ -424,12 +907,13 @@ RE_DOCTYPE_PATTERN = re.compile(br'^\s*<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|
|
|
# Forbidden: explode1 "&explode2;&explode2;" |
|
|
|
RE_SAFE_ENTITY_PATTERN = re.compile(br'\s+(\w+)\s+"(&#\w+;|[^&"]*)"') |
|
|
|
|
|
|
|
|
|
|
|
def replace_doctype(data): |
|
|
|
'''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data) |
|
|
|
"""Strips and replaces the DOCTYPE, returns (rss_version, stripped_data) |
|
|
|
|
|
|
|
rss_version may be 'rss091n' or None |
|
|
|
stripped_data is the same XML document with a replaced DOCTYPE |
|
|
|
''' |
|
|
|
""" |
|
|
|
|
|
|
|
# Divide the document into two groups by finding the location |
|
|
|
# of the first element that doesn't begin with '<?' or '<!'. |
|
|
@ -452,8 +936,11 @@ def replace_doctype(data): |
|
|
|
# Re-insert the safe ENTITY declarations if a DOCTYPE was found. |
|
|
|
replacement = b'' |
|
|
|
if len(doctype_results) == 1 and entity_results: |
|
|
|
match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e) |
|
|
|
safe_entities = [e for e in entity_results if match_safe_entities(e)] |
|
|
|
safe_entities = [ |
|
|
|
e |
|
|
|
for e in entity_results |
|
|
|
if RE_SAFE_ENTITY_PATTERN.match(e) |
|
|
|
] |
|
|
|
if safe_entities: |
|
|
|
replacement = b'<!DOCTYPE feed [\n<!ENTITY' \ |
|
|
|
+ b'>\n<!ENTITY '.join(safe_entities) \ |
|
|
@ -461,6 +948,8 @@ def replace_doctype(data): |
|
|
|
data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data |
|
|
|
|
|
|
|
# Precompute the safe entities for the loose parser. |
|
|
|
safe_entities = dict((k.decode('utf-8'), v.decode('utf-8')) |
|
|
|
for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)) |
|
|
|
safe_entities = { |
|
|
|
k.decode('utf-8'): v.decode('utf-8') |
|
|
|
for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement) |
|
|
|
} |
|
|
|
return version, data, safe_entities |
|
|
|