decode html entities

This commit is contained in:
Ronald Schaten 2013-04-06 00:12:20 +02:00
parent 640442b95d
commit 0c6cb0c7f9

View File

@@ -14,6 +14,7 @@ import codecs
import urllib2 import urllib2
#import hn #import hn
import html2text import html2text
import HTMLParser
import ConfigParser import ConfigParser
from argparse import ArgumentParser from argparse import ArgumentParser
from email.header import Header from email.header import Header
@@ -165,6 +166,15 @@ def process_feed_entry(session, feed, entry):
thisentry.fullpage = h2t.handle(thisentry.fullpage) thisentry.fullpage = h2t.handle(thisentry.fullpage)
elif feed.contentcolumn == 'readability': elif feed.contentcolumn == 'readability':
thisentry.readability = h2t.handle(thisentry.readability) thisentry.readability = h2t.handle(thisentry.readability)
hp = HTMLParser.HTMLParser()
if thisentry.summary:
thisentry.summary = hp.unescape(thisentry.summary)
if thisentry.content:
thisentry.content = hp.unescape(thisentry.content)
if thisentry.fullpage:
thisentry.fullpage = hp.unescape(thisentry.fullpage)
if thisentry.readability:
thisentry.readability = hp.unescape(thisentry.readability)
feed.entry.append(thisentry) feed.entry.append(thisentry)
session.commit() session.commit()
return 1 return 1