From 0c6cb0c7f9faad6c34fba1aeca9189b8af3d7033 Mon Sep 17 00:00:00 2001 From: Ronald Schaten Date: Sat, 6 Apr 2013 00:12:20 +0200 Subject: [PATCH] decode html entities --- atomstrom.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/atomstrom.py b/atomstrom.py index dcc0916..4f58d15 100755 --- a/atomstrom.py +++ b/atomstrom.py @@ -14,6 +14,7 @@ import codecs import urllib2 #import hn import html2text +import HTMLParser import ConfigParser from argparse import ArgumentParser from email.header import Header @@ -165,6 +166,15 @@ def process_feed_entry(session, feed, entry): thisentry.fullpage = h2t.handle(thisentry.fullpage) elif feed.contentcolumn == 'readability': thisentry.readability = h2t.handle(thisentry.readability) + hp = HTMLParser.HTMLParser() + if thisentry.summary: + thisentry.summary = hp.unescape(thisentry.summary) + if thisentry.content: + thisentry.content = hp.unescape(thisentry.content) + if thisentry.fullpage: + thisentry.fullpage = hp.unescape(thisentry.fullpage) + if thisentry.readability: + thisentry.readability = hp.unescape(thisentry.readability) feed.entry.append(thisentry) session.commit() return 1