33 lines
1.2 KiB
Python
33 lines
1.2 KiB
Python
|
# Strip out a set of nuisance HTML attributes that can mess up rendering in RSS feeds.
|
||
|
import re
|
||
|
from lxml.html.clean import Cleaner
|
||
|
|
||
|
# Regex fragments naming the attributes to strip. Each entry is spliced into
# an alternation inside ``htmlstrip`` below, so entries are patterns, not
# literal names. The event-handler entry is ``on[a-z]+``: the previous ``on*``
# meant "o followed by any number of n" and never matched ``onclick`` etc.
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on[a-z]+']

# Attribute value forms. The quoted forms accept an empty value (``style=""``)
# so that empty nuisance attributes are stripped as well.
single_quoted = "'[^']*'"
double_quoted = '"[^"]*"'
non_space = '[^ "\'>]+'

# Matches one whole tag that contains an undesirable attribute.
# Group 1 is everything in the tag before the attribute (without the separating
# space); group 2 is everything after the attribute's value. Matching is
# case-insensitive.
htmlstrip = re.compile(
    "<"                  # open
    "([^>]+) "           # prefix
    "(?:%s) *"           # undesirable attributes
    "= *(?:%s|%s|%s)"    # value
    "([^>]*)"            # postfix
    ">"                  # end
    % ('|'.join(bad_attrs), non_space, single_quoted, double_quoted),
    re.I,
)
|
||
|
|
||
|
def clean_attributes(html):
    """Return *html* with every attribute matched by ``htmlstrip`` removed.

    A single ``htmlstrip`` match spans one whole tag and drops one attribute
    from it, so the substitution is repeated until a pass replaces nothing.
    """
    replaced = True
    while replaced:
        # subn reports how many tags were rewritten in this pass; zero means
        # the markup is already free of matching attributes.
        html, replaced = htmlstrip.subn('<\\1\\2>', html)
    return html
|
||
|
|
||
|
def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space.

    Leading and trailing whitespace is dropped as well, because the string
    is split on whitespace and re-joined. Any falsy input (``None`` or an
    empty string) yields ``''``.
    """
    if not s:
        return ''
    return ' '.join(s.split())
|
||
|
|
||
|
# Module-level lxml Cleaner instance. Every option is spelled out explicitly:
# the first group lists the clean-ups that are switched on, the second group
# lists the ones that are deliberately switched off. See the lxml Cleaner
# documentation for the exact effect of each flag.
html_cleaner = Cleaner(
    # enabled
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    processing_instructions=True,
    # disabled
    meta=False,
    add_nofollow=False,
    page_structure=False,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_tags=None,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)
|