#!/usr/bin/env python
#coding: utf-8
"""Feed entry model plus helpers that fetch and convert entry content."""

import HTMLParser
import urllib
import urllib2
from datetime import datetime
from time import mktime

import html2text
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime

from models import Base

# Entry fields copied verbatim from the parsed feed entry when present.
_PLAIN_FIELDS = ('title', 'link', 'summary', 'author')

# Text columns that may hold the body of an entry; ``feed.contentcolumn``
# names one of them.
_CONTENT_COLUMNS = ('summary', 'content', 'fullpage', 'readability')


def fetch_readability(link):
    """Return the text ``hn.upgradeLink`` produces for *link*, UTF-8 decoded."""
    # The module-level ``import hn`` is commented out, so this function used
    # to fail with a NameError.  Importing here keeps the module importable
    # when ``hn`` is not installed and only fails if readability is requested.
    import hn

    return hn.upgradeLink(link).decode('utf8')


def fetch_full_page(link):
    """Download *link* and return its html2text rendering.

    The result is a byte string encoded as latin-1; characters that cannot be
    represented are replaced.
    """
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    try:
        raw = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    # 'replace' keeps pages that are not valid UTF-8 from aborting the fetch.
    html = raw.decode('utf8', 'replace')
    text = html2text.html2text(html)
    # NOTE(review): latin-1 output is kept for backward compatibility;
    # confirm whether callers would rather receive unicode.
    return text.encode('latin-1', 'replace')


def size_human_readable(bytesize):
    """Format a byte count with a unit suffix, e.g. ``1536`` -> ``'1.5KB'``."""
    for unit in ['bytes', 'KB', 'MB', 'GB']:
        if bytesize < 1024.0:
            return "%3.1f%s" % (bytesize, unit)
        bytesize /= 1024.0
    return "%3.1f%s" % (bytesize, 'TB')


def _format_enclosures(enclosures):
    """Return one line per enclosure: its href plus a readable size if known."""
    lines = []
    for enclosure in enclosures:
        line = enclosure['href']
        if 'length' in enclosure:
            # A length that is empty or not a number is simply left out
            # instead of aborting the whole entry.
            try:
                size = int(enclosure['length'])
            except (TypeError, ValueError):
                size = None
            if size is not None:
                line += ' (%s)' % size_human_readable(size)
        lines.append(line)
    return '\n'.join(lines)


def _resolve_final_link(link):
    """Open *link* and return the URL it ends up at, or None on failure."""
    print(' fetching final link <%s>' % link)
    request = urllib2.Request(link)
    opener = urllib2.build_opener()
    try:
        result = opener.open(request)
    except Exception:
        # Best effort: a link that cannot be opened must not abort the entry.
        print(' FAILED opening URL')
        return None
    try:
        final_url = result.url
    finally:
        result.close()
    print(' final link: <%s>' % final_url)
    return final_url


class Entry(Base):
    """A single feed item as stored in the ``entry`` table."""

    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    enclosures = Column(Text)
    resolvedlink = Column(String(255))
    fullpage = Column(Text)
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Populate the row from a parsed feed *entry*.

        *entry* is a dict-like parsed feed item.  *feed* supplies the
        processing options read here: ``resolveredirects``, ``fullpage``,
        ``readability``, ``html2textcontent``, ``html2textignorelinks``,
        ``html2textignoreimages`` and ``contentcolumn``.
        """
        for field in _PLAIN_FIELDS:
            if field in entry:
                setattr(self, field, entry.get(field))

        if 'content' in entry:
            content = entry.get('content')
            # An empty content list carries nothing to store.
            if content:
                self.content = content[0].value

        if 'updated_parsed' in entry:
            updated_parsed = entry.get('updated_parsed')
            # The key may be present but hold None.
            if updated_parsed:
                self.updated = datetime.fromtimestamp(mktime(updated_parsed))

        enclosures = entry.get('enclosures')
        if enclosures:
            self.enclosures = _format_enclosures(enclosures)

        # The link is optional, so the network steps run only when it exists.
        link = entry.get('link')
        if link:
            if feed.resolveredirects:
                final_url = _resolve_final_link(link)
                if final_url is not None:
                    self.resolvedlink = final_url
            if feed.fullpage:
                print(' fetching full page <%s>' % link)
                self.fullpage = fetch_full_page(link)
            if feed.readability:
                print(' fetching readability <%s>' % link)
                self.readability = fetch_readability(link)

        if feed.html2textcontent:
            print(' converting summary')
            converter = html2text.HTML2Text()
            converter.body_width = 0
            converter.inline_links = False
            if feed.html2textignorelinks:
                converter.ignore_links = True
            if feed.html2textignoreimages:
                converter.ignore_images = True
            # Only the column the feed is configured to use gets converted.
            column = feed.contentcolumn
            if column in _CONTENT_COLUMNS:
                value = getattr(self, column)
                if value:
                    setattr(self, column, converter.handle(value))

        # NOTE(review): the original layout was lost, so it is unclear whether
        # this unescape step was nested under ``feed.html2textcontent``; it is
        # applied unconditionally here - confirm.
        unescaper = HTMLParser.HTMLParser()
        for column in _CONTENT_COLUMNS:
            value = getattr(self, column)
            if value:
                setattr(self, column, unescaper.unescape(value))

        # One clock reading so both stamps of a new entry are identical.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    def __unicode__(self):
        """Return ``u'<id> -> <title>'``."""
        # %s rather than %d: id is None until the row has been flushed.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        """Return the unicode form encoded as UTF-8."""
        return unicode(self).encode('utf-8')

    def __repr__(self):
        """Return a debugging representation containing id and title."""
        # The previous format string was empty, which raised a TypeError on
        # every call.  %r keeps the result a plain ASCII byte string.
        return "<Entry(%r, %r)>" % (self.id, self.title)

    def get_text(self):
        """Return the body text selected by ``self.feed.contentcolumn``.

        The listing of enclosures, if any, is appended to the text.
        """
        text = ''
        column = self.feed.contentcolumn
        if column in _CONTENT_COLUMNS:
            text = getattr(self, column) or ''
        if self.enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text