refactored processing of entries, enabled processing of enclosures

This commit is contained in:
Ronald Schaten 2013-04-08 21:02:48 +02:00
parent 5b9cc700f8
commit 2718e6502c
2 changed files with 84 additions and 76 deletions

View File

@ -10,11 +10,6 @@ from ddate import ddate
import feedparser import feedparser
import sys import sys
import codecs import codecs
#import urllib
import urllib2
#import hn
import html2text
import HTMLParser
import ConfigParser import ConfigParser
from argparse import ArgumentParser from argparse import ArgumentParser
from email.header import Header from email.header import Header
@ -112,19 +107,6 @@ def mail_single_entries(session, sender, receiver, prefix):
else: else:
print 'no unmailed single entries found... not sending mail.' print 'no unmailed single entries found... not sending mail.'
def fetch_readability(link):
text = hn.upgradeLink(link)
text = text.decode('utf8')
return text
def fetch_full_page(link):
opener = urllib.FancyURLopener({})
response = opener.open(link)
html = response.read()
html = html.decode('utf8')
text = html2text.html2text(html)
return text.encode('latin-1', 'replace')
def process_feed_entry(session, feed, entry): def process_feed_entry(session, feed, entry):
thisentry = session.query(Entry).\ thisentry = session.query(Entry).\
filter(Entry.title == entry.title).\ filter(Entry.title == entry.title).\
@ -137,45 +119,7 @@ def process_feed_entry(session, feed, entry):
return 0 return 0
else: else:
print ' new entry <%s>' % entry.title print ' new entry <%s>' % entry.title
thisentry = Entry(entry) feed.entries.append(Entry(entry, feed))
if feed.resolveredirects:
print ' fetching final link <%s>' % entry.link
request = urllib2.Request(entry.link)
opener = urllib2.build_opener()
result = opener.open(request)
thisentry.resolvedlink = result.url
print ' final link: <%s>' % result.url
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
thisentry.fullpage = fetch_full_page(entry.link)
if feed.readability:
print ' fetching readability <%s>' % entry.link
thisentry.readability = fetch_readability(entry.link)
if feed.html2textcontent:
print ' converting summary'
h2t = html2text.HTML2Text()
h2t.body_width = 0
h2t.inline_links = False
if feed.html2textignoreimages:
h2t.ignore_images = True
if feed.contentcolumn == 'summary':
thisentry.summary = h2t.handle(thisentry.summary)
elif feed.contentcolumn == 'content':
thisentry.content = h2t.handle(thisentry.content)
elif feed.contentcolumn == 'fullpage':
thisentry.fullpage = h2t.handle(thisentry.fullpage)
elif feed.contentcolumn == 'readability':
thisentry.readability = h2t.handle(thisentry.readability)
hp = HTMLParser.HTMLParser()
if thisentry.summary:
thisentry.summary = hp.unescape(thisentry.summary)
if thisentry.content:
thisentry.content = hp.unescape(thisentry.content)
if thisentry.fullpage:
thisentry.fullpage = hp.unescape(thisentry.fullpage)
if thisentry.readability:
thisentry.readability = hp.unescape(thisentry.readability)
feed.entries.append(thisentry)
session.commit() session.commit()
return 1 return 1

View File

@ -4,10 +4,34 @@
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime from datetime import datetime
from time import mktime from time import mktime
import pprint import urllib2
#import urllib
#import hn
import html2text
import HTMLParser
from models import Base from models import Base
def fetch_readability(link):
    """Return the text produced by ``hn.upgradeLink`` for *link*.

    The value returned by ``hn.upgradeLink`` is decoded as UTF-8 and the
    resulting unicode object is handed back to the caller.

    NOTE(review): the only ``import hn`` visible in this module is
    commented out, so this call presumably fails with NameError unless
    ``hn`` is brought into scope some other way -- confirm before enabling
    the readability option on a feed.
    """
    return hn.upgradeLink(link).decode('utf8')
def fetch_full_page(link):
    """Download *link* and return the page converted to plain text.

    The response body is decoded as UTF-8, converted with
    ``html2text.html2text`` and returned as a latin-1 encoded byte string;
    characters that latin-1 cannot represent are substituted by the
    ``'replace'`` error handler.

    Raises UnicodeDecodeError when the body is not valid UTF-8, and
    whatever the opener raises when the page cannot be fetched.
    """
    # The module only imports urllib2 (the plain ``urllib`` import is
    # commented out), so pull urllib in here; without it the
    # FancyURLopener lookup below fails with NameError.
    import urllib

    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')
def size_human_readable(bytesize):
    """Format a byte count as a short string such as ``1.5KB``.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then rendered with one decimal place followed
    directly by the unit name.
    """
    remaining = bytesize
    for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
        # TB is the last unit: report whatever is left, however large.
        if unit == 'TB' or remaining < 1024.0:
            break
        remaining = remaining / 1024.0
    return "%3.1f%s" % (remaining, unit)
class Entry(Base): class Entry(Base):
__tablename__ = 'entry' __tablename__ = 'entry'
@ -28,20 +52,7 @@ class Entry(Base):
lastfetched = Column(DateTime) lastfetched = Column(DateTime)
sent = Column(DateTime) sent = Column(DateTime)
def __init__(self, entry): def __init__(self, entry, feed):
self.update(entry)
self.firstfetched = datetime.now()
def __unicode__(self):
return u'%d -> %s' % (self.id, self.title)
def __str__(self):
return unicode(self).encode('utf-8')
def __repr__(self):
return "<Entry('%d','%s')>" % (self.id, self.title)
def update(self, entry):
if entry.has_key('title'): if entry.has_key('title'):
self.title = entry.get('title') self.title = entry.get('title')
if entry.has_key('link'): if entry.has_key('link'):
@ -56,12 +67,63 @@ class Entry(Base):
updated_parsed = entry.get('updated_parsed') updated_parsed = entry.get('updated_parsed')
self.updated = datetime.fromtimestamp(mktime(updated_parsed)) self.updated = datetime.fromtimestamp(mktime(updated_parsed))
if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0: if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0:
print 'enclosures'; enclosures = ''
pp=pprint.PrettyPrinter(depth=4) for enclosure in entry.get('enclosures'):
pp.pprint(entry.get('enclosures')) url = enclosure['href']
#self.enclosures = entry.get('enclosures') length = size_human_readable(int(enclosure['length']))
if not enclosures == '':
enclosures += '\n'
enclosures += '%s (%s)' % (url, length)
self.enclosures = enclosures
if feed.resolveredirects:
print ' fetching final link <%s>' % entry.link
request = urllib2.Request(entry.link)
opener = urllib2.build_opener()
result = opener.open(request)
self.resolvedlink = result.url
print ' final link: <%s>' % result.url
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
self.fullpage = fetch_full_page(entry.link)
if feed.readability:
print ' fetching readability <%s>' % entry.link
self.readability = fetch_readability(entry.link)
if feed.html2textcontent:
print ' converting summary'
h2t = html2text.HTML2Text()
h2t.body_width = 0
h2t.inline_links = False
if feed.html2textignoreimages:
h2t.ignore_images = True
if feed.contentcolumn == 'summary':
self.summary = h2t.handle(self.summary)
elif feed.contentcolumn == 'content':
self.content = h2t.handle(self.content)
elif feed.contentcolumn == 'fullpage':
self.fullpage = h2t.handle(self.fullpage)
elif feed.contentcolumn == 'readability':
self.readability = h2t.handle(self.readability)
hp = HTMLParser.HTMLParser()
if self.summary:
self.summary = hp.unescape(self.summary)
if self.content:
self.content = hp.unescape(self.content)
if self.fullpage:
self.fullpage = hp.unescape(self.fullpage)
if self.readability:
self.readability = hp.unescape(self.readability)
self.firstfetched = datetime.now()
self.lastfetched = datetime.now() self.lastfetched = datetime.now()
def __unicode__(self):
return u'%d -> %s' % (self.id, self.title)
def __str__(self):
return unicode(self).encode('utf-8')
def __repr__(self):
return "<Entry('%d','%s')>" % (self.id, self.title)
def get_text(self): def get_text(self):
if self.feed.contentcolumn == 'summary': if self.feed.contentcolumn == 'summary':
text = self.summary text = self.summary
@ -71,5 +133,7 @@ class Entry(Base):
text = self.fullpage text = self.fullpage
elif self.feed.contentcolumn == 'readability': elif self.feed.contentcolumn == 'readability':
text = self.readability text = self.readability
if self.enclosures:
text += '\n\nEnclosures:\n%s' % self.enclosures
return text return text