refactored processing of entries, enabled processing of enclosures

This commit is contained in:
Ronald Schaten 2013-04-08 21:02:48 +02:00
parent 5b9cc700f8
commit 2718e6502c
2 changed files with 84 additions and 76 deletions

View File

@ -10,11 +10,6 @@ from ddate import ddate
import feedparser
import sys
import codecs
#import urllib
import urllib2
#import hn
import html2text
import HTMLParser
import ConfigParser
from argparse import ArgumentParser
from email.header import Header
@ -112,19 +107,6 @@ def mail_single_entries(session, sender, receiver, prefix):
else:
print 'no unmailed single entries found... not sending mail.'
def fetch_readability(link):
    """Return the text produced by ``hn.upgradeLink`` for *link*, as unicode.

    The value returned by ``hn.upgradeLink`` is decoded from UTF-8.

    NOTE(review): ``import hn`` is commented out in the visible import
    block, so ``hn`` has to come into scope some other way -- confirm.
    """
    return hn.upgradeLink(link).decode('utf8')
def fetch_full_page(link):
    """Download *link* and return its content converted to plain text.

    The page is fetched with ``urllib2``, which this file imports. The
    previous ``urllib.FancyURLopener`` call could never work because
    ``import urllib`` is commented out. The payload is decoded as UTF-8 and
    passed through ``html2text``.

    Returns:
        A latin-1 encoded byte string; characters that latin-1 cannot
        represent are replaced.

    Raises:
        urllib2.URLError: if the page cannot be retrieved (this includes
            ``urllib2.HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: if the payload is not valid UTF-8.
    """
    response = urllib2.urlopen(link)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    text = html2text.html2text(html.decode('utf8'))
    return text.encode('latin-1', 'replace')
def process_feed_entry(session, feed, entry):
thisentry = session.query(Entry).\
filter(Entry.title == entry.title).\
@ -137,45 +119,7 @@ def process_feed_entry(session, feed, entry):
return 0
else:
print ' new entry <%s>' % entry.title
thisentry = Entry(entry)
if feed.resolveredirects:
print ' fetching final link <%s>' % entry.link
request = urllib2.Request(entry.link)
opener = urllib2.build_opener()
result = opener.open(request)
thisentry.resolvedlink = result.url
print ' final link: <%s>' % result.url
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
thisentry.fullpage = fetch_full_page(entry.link)
if feed.readability:
print ' fetching readability <%s>' % entry.link
thisentry.readability = fetch_readability(entry.link)
if feed.html2textcontent:
print ' converting summary'
h2t = html2text.HTML2Text()
h2t.body_width = 0
h2t.inline_links = False
if feed.html2textignoreimages:
h2t.ignore_images = True
if feed.contentcolumn == 'summary':
thisentry.summary = h2t.handle(thisentry.summary)
elif feed.contentcolumn == 'content':
thisentry.content = h2t.handle(thisentry.content)
elif feed.contentcolumn == 'fullpage':
thisentry.fullpage = h2t.handle(thisentry.fullpage)
elif feed.contentcolumn == 'readability':
thisentry.readability = h2t.handle(thisentry.readability)
hp = HTMLParser.HTMLParser()
if thisentry.summary:
thisentry.summary = hp.unescape(thisentry.summary)
if thisentry.content:
thisentry.content = hp.unescape(thisentry.content)
if thisentry.fullpage:
thisentry.fullpage = hp.unescape(thisentry.fullpage)
if thisentry.readability:
thisentry.readability = hp.unescape(thisentry.readability)
feed.entries.append(thisentry)
feed.entries.append(Entry(entry, feed))
session.commit()
return 1

View File

@ -4,10 +4,34 @@
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime
from time import mktime
import pprint
import urllib2
#import urllib
#import hn
import html2text
import HTMLParser
from models import Base
def fetch_readability(link):
    """Return the text produced by ``hn.upgradeLink`` for *link*, as unicode.

    The value returned by ``hn.upgradeLink`` is decoded from UTF-8.

    NOTE(review): ``import hn`` is commented out in the visible import
    block, so ``hn`` has to come into scope some other way -- confirm.
    """
    return hn.upgradeLink(link).decode('utf8')
def fetch_full_page(link):
    """Download *link* and return its content converted to plain text.

    The page is fetched with ``urllib2``, which this file imports. The
    previous ``urllib.FancyURLopener`` call could never work because
    ``import urllib`` is commented out. The payload is decoded as UTF-8 and
    passed through ``html2text``.

    Returns:
        A latin-1 encoded byte string; characters that latin-1 cannot
        represent are replaced.

    Raises:
        urllib2.URLError: if the page cannot be retrieved (this includes
            ``urllib2.HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: if the payload is not valid UTF-8.
    """
    response = urllib2.urlopen(link)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    text = html2text.html2text(html.decode('utf8'))
    return text.encode('latin-1', 'replace')
def size_human_readable(bytesize):
    """Format a byte count as a short string such as ``'1.5KB'``.

    The value is divided by 1024 until it drops below 1024 or the units
    ``bytes``, ``KB``, ``MB`` and ``GB`` are exhausted; anything larger is
    reported in ``TB``. One decimal place is always shown.
    """
    units = ('bytes', 'KB', 'MB', 'GB')
    value = bytesize
    index = 0
    while index < len(units):
        if value < 1024.0:
            return "%3.1f%s" % (value, units[index])
        value = value / 1024.0
        index += 1
    # Ran out of smaller units: report whatever is left in terabytes.
    return "%3.1f%s" % (value, 'TB')
class Entry(Base):
__tablename__ = 'entry'
@ -28,20 +52,7 @@ class Entry(Base):
lastfetched = Column(DateTime)
sent = Column(DateTime)
def __init__(self, entry):
    """Create an entry from a parsed feed item.

    The item's fields are copied by ``update``; afterwards the current
    local time is recorded as the moment the entry was first fetched.
    """
    self.update(entry)
    first_seen = datetime.now()
    self.firstfetched = first_seen
def __unicode__(self):
    """Return ``u'<id> -> <title>'`` for this entry.

    The id is formatted with ``%s`` rather than ``%d`` so that an entry
    whose id is still ``None`` can be rendered instead of raising
    ``TypeError``. Integer ids produce the same text as before.
    """
    return u'%s -> %s' % (self.id, self.title)
def __str__(self):
    """Return the unicode representation encoded as a UTF-8 byte string."""
    as_unicode = unicode(self)
    return as_unicode.encode('utf-8')
def __repr__(self):
    """Return ``<Entry('<id>','<title>')>`` as a UTF-8 byte string.

    Two problems of the previous implementation are avoided:

    * ``%d`` raised ``TypeError`` when the id was ``None``; ``%s`` renders
      integer ids identically and tolerates ``None``.
    * A unicode title with non-ASCII characters made the result a unicode
      object, which Python 2's ``repr()`` cannot convert implicitly. The
      text is therefore encoded explicitly, as ``__str__`` does.
    """
    text = u"<Entry('%s','%s')>" % (self.id, self.title)
    return text.encode('utf-8')
def update(self, entry):
def __init__(self, entry, feed):
if entry.has_key('title'):
self.title = entry.get('title')
if entry.has_key('link'):
@ -56,12 +67,63 @@ class Entry(Base):
updated_parsed = entry.get('updated_parsed')
self.updated = datetime.fromtimestamp(mktime(updated_parsed))
# Render the entry's enclosures as text, one "<url> (<size>)" line each.
# Feeds do not reliably provide a numeric length, so a missing or malformed
# value only drops the size suffix instead of aborting the whole entry.
enclosure_lines = []
for enclosure in entry.get('enclosures') or []:
    url = enclosure.get('href')
    if not url:
        # Without a URL there is nothing useful to show for this enclosure.
        continue
    try:
        size = int(enclosure.get('length'))
    except (TypeError, ValueError):
        size = None
    if size is None:
        enclosure_lines.append(url)
    else:
        enclosure_lines.append('%s (%s)' % (url, size_human_readable(size)))
if enclosure_lines:
    self.enclosures = '\n'.join(enclosure_lines)
if feed.resolveredirects:
print ' fetching final link <%s>' % entry.link
request = urllib2.Request(entry.link)
opener = urllib2.build_opener()
result = opener.open(request)
self.resolvedlink = result.url
print ' final link: <%s>' % result.url
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
self.fullpage = fetch_full_page(entry.link)
if feed.readability:
print ' fetching readability <%s>' % entry.link
self.readability = fetch_readability(entry.link)
if feed.html2textcontent:
print ' converting summary'
h2t = html2text.HTML2Text()
h2t.body_width = 0
h2t.inline_links = False
if feed.html2textignoreimages:
h2t.ignore_images = True
if feed.contentcolumn == 'summary':
self.summary = h2t.handle(self.summary)
elif feed.contentcolumn == 'content':
self.content = h2t.handle(self.content)
elif feed.contentcolumn == 'fullpage':
self.fullpage = h2t.handle(self.fullpage)
elif feed.contentcolumn == 'readability':
self.readability = h2t.handle(self.readability)
hp = HTMLParser.HTMLParser()
if self.summary:
self.summary = hp.unescape(self.summary)
if self.content:
self.content = hp.unescape(self.content)
if self.fullpage:
self.fullpage = hp.unescape(self.fullpage)
if self.readability:
self.readability = hp.unescape(self.readability)
# Read the clock once so that a freshly created entry has identical
# first-fetched and last-fetched timestamps.
fetched_at = datetime.now()
self.firstfetched = fetched_at
self.lastfetched = fetched_at
def __unicode__(self):
    """Return ``u'<id> -> <title>'`` for this entry.

    The id is formatted with ``%s`` rather than ``%d`` so that an entry
    whose id is still ``None`` can be rendered instead of raising
    ``TypeError``. Integer ids produce the same text as before.
    """
    return u'%s -> %s' % (self.id, self.title)
def __str__(self):
    """Return the unicode representation encoded as a UTF-8 byte string."""
    as_unicode = unicode(self)
    return as_unicode.encode('utf-8')
def __repr__(self):
    """Return ``<Entry('<id>','<title>')>`` as a UTF-8 byte string.

    Two problems of the previous implementation are avoided:

    * ``%d`` raised ``TypeError`` when the id was ``None``; ``%s`` renders
      integer ids identically and tolerates ``None``.
    * A unicode title with non-ASCII characters made the result a unicode
      object, which Python 2's ``repr()`` cannot convert implicitly. The
      text is therefore encoded explicitly, as ``__str__`` does.
    """
    text = u"<Entry('%s','%s')>" % (self.id, self.title)
    return text.encode('utf-8')
def get_text(self):
if self.feed.contentcolumn == 'summary':
text = self.summary
@ -71,5 +133,7 @@ class Entry(Base):
text = self.fullpage
elif self.feed.contentcolumn == 'readability':
text = self.readability
if self.enclosures:
text += '\n\nEnclosures:\n%s' % self.enclosures
return text