Move feed-entry post-processing (redirect resolution, full-page and
readability fetching, html2text conversion, HTML unescaping) out of
atomstrom.py and into models.entry.Entry.__init__, and store a textual list
of enclosures on the entry so get_text() can append it.

Review changes to the added code: urllib is imported (fetch_full_page uses
it), enclosures without a usable href or length no longer abort the import,
__repr__ has a real format string, and get_text() copes with an empty
content column.

NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed. Indentation, blank context lines, spacing
inside string literals and the blob hashes are reconstructed; verify them
against the tree before applying.

diff --git a/atomstrom.py b/atomstrom.py
index 76f6e96..349d613 100755
--- a/atomstrom.py
+++ b/atomstrom.py
@@ -10,11 +10,6 @@ from ddate import ddate
 import feedparser
 import sys
 import codecs
-#import urllib
-import urllib2
-#import hn
-import html2text
-import HTMLParser
 import ConfigParser
 from argparse import ArgumentParser
 from email.header import Header
@@ -112,19 +107,6 @@ def mail_single_entries(session, sender, receiver, prefix):
     else:
         print 'no unmailed single entries found... not sending mail.'
 
-def fetch_readability(link):
-    text = hn.upgradeLink(link)
-    text = text.decode('utf8')
-    return text
-
-def fetch_full_page(link):
-    opener = urllib.FancyURLopener({})
-    response = opener.open(link)
-    html = response.read()
-    html = html.decode('utf8')
-    text = html2text.html2text(html)
-    return text.encode('latin-1', 'replace')
-
 def process_feed_entry(session, feed, entry):
     thisentry = session.query(Entry).\
         filter(Entry.title == entry.title).\
@@ -137,45 +119,7 @@ def process_feed_entry(session, feed, entry):
         return 0
     else:
         print ' new entry <%s>' % entry.title
-        thisentry = Entry(entry)
-        if feed.resolveredirects:
-            print ' fetching final link <%s>' % entry.link
-            request = urllib2.Request(entry.link)
-            opener = urllib2.build_opener()
-            result = opener.open(request)
-            thisentry.resolvedlink = result.url
-            print ' final link: <%s>' % result.url
-        if feed.fullpage:
-            print ' fetching full page <%s>' % entry.link
-            thisentry.fullpage = fetch_full_page(entry.link)
-        if feed.readability:
-            print ' fetching readability <%s>' % entry.link
-            thisentry.readability = fetch_readability(entry.link)
-        if feed.html2textcontent:
-            print ' converting summary'
-            h2t = html2text.HTML2Text()
-            h2t.body_width = 0
-            h2t.inline_links = False
-            if feed.html2textignoreimages:
-                h2t.ignore_images = True
-            if feed.contentcolumn == 'summary':
-                thisentry.summary = h2t.handle(thisentry.summary)
-            elif feed.contentcolumn == 'content':
-                thisentry.content = h2t.handle(thisentry.content)
-            elif feed.contentcolumn == 'fullpage':
-                thisentry.fullpage = h2t.handle(thisentry.fullpage)
-            elif feed.contentcolumn == 'readability':
-                thisentry.readability = h2t.handle(thisentry.readability)
-        hp = HTMLParser.HTMLParser()
-        if thisentry.summary:
-            thisentry.summary = hp.unescape(thisentry.summary)
-        if thisentry.content:
-            thisentry.content = hp.unescape(thisentry.content)
-        if thisentry.fullpage:
-            thisentry.fullpage = hp.unescape(thisentry.fullpage)
-        if thisentry.readability:
-            thisentry.readability = hp.unescape(thisentry.readability)
-        feed.entries.append(thisentry)
+        feed.entries.append(Entry(entry, feed))
         session.commit()
         return 1
 
diff --git a/models/entry.py b/models/entry.py
index ebe2d44..e328d15 100644
--- a/models/entry.py
+++ b/models/entry.py
@@ -4,10 +4,45 @@ from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
 from datetime import datetime
 from time import mktime
 
-import pprint
+import urllib
+import urllib2
+#import hn
+import html2text
+import HTMLParser
 
 from models import Base
 
+def fetch_readability(link):
+    """Return the result of hn.upgradeLink(link), decoded from UTF-8.
+
+    NOTE(review): relies on the `hn` module, whose import is still commented
+    out above; calling this raises NameError until that import is restored.
+    """
+    text = hn.upgradeLink(link)
+    text = text.decode('utf8')
+    return text
+
+def fetch_full_page(link):
+    """Download `link` and return the page converted to text by html2text.
+
+    The body is decoded as UTF-8 and the result is returned as a latin-1
+    byte string with unencodable characters replaced.
+    """
+    opener = urllib.FancyURLopener({})
+    response = opener.open(link)
+    html = response.read()
+    html = html.decode('utf8')
+    text = html2text.html2text(html)
+    return text.encode('latin-1', 'replace')
+
+def size_human_readable(bytesize):
+    """Format a byte count as a short string such as '1.5KB' or '3.0MB'."""
+    for x in ['bytes','KB','MB','GB']:
+        if bytesize < 1024.0:
+            return "%3.1f%s" % (bytesize, x)
+        bytesize /= 1024.0
+    return "%3.1f%s" % (bytesize, 'TB')
+
 class Entry(Base):
     __tablename__ = 'entry'
 
@@ -28,20 +63,13 @@ class Entry(Base):
     lastfetched = Column(DateTime)
     sent = Column(DateTime)
 
-    def __init__(self, entry):
-        self.update(entry)
-        self.firstfetched = datetime.now()
-
-    def __unicode__(self):
-        return u'%d -> %s' % (self.id, self.title)
-
-    def __str__(self):
-        return unicode(self).encode('utf-8')
-
-    def __repr__(self):
-        return "" % (self.id, self.title)
-
-    def update(self, entry):
+    def __init__(self, entry, feed):
+        """Populate this row from the feed `entry`.
+
+        Depending on the flags set on `feed`, the constructor also resolves
+        redirects, fetches the full page / readability text and converts the
+        configured content column from HTML to text.
+        """
         if entry.has_key('title'):
             self.title = entry.get('title')
         if entry.has_key('link'):
@@ -56,12 +84,67 @@ class Entry(Base):
             updated_parsed = entry.get('updated_parsed')
             self.updated = datetime.fromtimestamp(mktime(updated_parsed))
         if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0:
-            print 'enclosures';
-            pp=pprint.PrettyPrinter(depth=4)
-            pp.pprint(entry.get('enclosures'))
-            #self.enclosures = entry.get('enclosures')
+            enclosures = []
+            for enclosure in entry.get('enclosures'):
+                url = enclosure.get('href')
+                if not url:
+                    continue
+                try:
+                    length = size_human_readable(int(enclosure.get('length')))
+                except (TypeError, ValueError):
+                    # the length attribute may be missing or non-numeric
+                    length = 'unknown size'
+                enclosures.append('%s (%s)' % (url, length))
+            self.enclosures = '\n'.join(enclosures)
+        if feed.resolveredirects:
+            print ' fetching final link <%s>' % entry.link
+            request = urllib2.Request(entry.link)
+            opener = urllib2.build_opener()
+            result = opener.open(request)
+            self.resolvedlink = result.url
+            print ' final link: <%s>' % result.url
+        if feed.fullpage:
+            print ' fetching full page <%s>' % entry.link
+            self.fullpage = fetch_full_page(entry.link)
+        if feed.readability:
+            print ' fetching readability <%s>' % entry.link
+            self.readability = fetch_readability(entry.link)
+        if feed.html2textcontent:
+            print ' converting summary'
+            h2t = html2text.HTML2Text()
+            h2t.body_width = 0
+            h2t.inline_links = False
+            if feed.html2textignoreimages:
+                h2t.ignore_images = True
+            if feed.contentcolumn == 'summary':
+                self.summary = h2t.handle(self.summary)
+            elif feed.contentcolumn == 'content':
+                self.content = h2t.handle(self.content)
+            elif feed.contentcolumn == 'fullpage':
+                self.fullpage = h2t.handle(self.fullpage)
+            elif feed.contentcolumn == 'readability':
+                self.readability = h2t.handle(self.readability)
+        hp = HTMLParser.HTMLParser()
+        if self.summary:
+            self.summary = hp.unescape(self.summary)
+        if self.content:
+            self.content = hp.unescape(self.content)
+        if self.fullpage:
+            self.fullpage = hp.unescape(self.fullpage)
+        if self.readability:
+            self.readability = hp.unescape(self.readability)
+        self.firstfetched = datetime.now()
         self.lastfetched = datetime.now()
 
+    def __unicode__(self):
+        return u'%d -> %s' % (self.id, self.title)
+
+    def __str__(self):
+        return unicode(self).encode('utf-8')
+
+    def __repr__(self):
+        return "<Entry(%r, %r)>" % (self.id, self.title)
+
     def get_text(self):
         if self.feed.contentcolumn == 'summary':
             text = self.summary
@@ -71,5 +154,7 @@ class Entry(Base):
             text = self.fullpage
         elif self.feed.contentcolumn == 'readability':
             text = self.readability
+        if self.enclosures:
+            text = '%s\n\nEnclosures:\n%s' % (text or '', self.enclosures)
         return text
 