# -*- coding: utf-8 -*-
"""Entry model: one stored feed item, plus helpers that fetch and clean its text.

This module targets Python 2 (``urllib2``, ``HTMLParser``, ``unicode``).
"""
from contextlib import closing
from datetime import datetime
from time import mktime
import HTMLParser
import urllib2

from readability.readability import Document
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
import html2text

from models import Base

# Names of the Entry columns that can hold the body text of an entry.
# ``feed.contentcolumn`` is compared against these names.
_CONTENT_COLUMNS = ('summary', 'content', 'fullpage', 'readability')


def _make_html2text(ignore_links=False, ignore_images=False):
    """Return an ``html2text.HTML2Text`` converter with this module's settings.

    ``body_width`` is set to 0 and ``inline_links`` to False. The two
    ``ignore_*`` attributes are only touched when the matching flag is truthy,
    so html2text's own defaults apply otherwise.
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0
    converter.inline_links = False
    if ignore_links:
        converter.ignore_links = True
    if ignore_images:
        converter.ignore_images = True
    return converter


def _resolve_final_url(link):
    """Open *link* and return the ``url`` attribute of the response.

    The response is closed before returning. Any exception raised while
    opening the URL propagates to the caller.
    """
    opener = urllib2.build_opener()
    with closing(opener.open(urllib2.Request(link))) as response:
        return response.url


def _format_enclosures(enclosures):
    """Render enclosure mappings as one ``href (size)`` line per enclosure.

    Enclosures without an ``href`` are skipped. The size suffix is left out
    when ``length`` is missing or cannot be converted to an integer.
    """
    lines = []
    for enclosure in enclosures:
        href = enclosure.get('href')
        if not href:
            continue
        line = href
        length = enclosure.get('length')
        if length is not None:
            try:
                line += ' (%s)' % size_human_readable(int(length))
            except (TypeError, ValueError):
                # Empty or non-numeric length: keep the link, drop the size.
                pass
        lines.append(line)
    return '\n'.join(lines)


def fetch_readability(link):
    """Download *link* and return its readability summary converted to text.

    The page is passed through ``Document(...).summary()`` and then through
    html2text with links and images ignored.
    """
    with closing(urllib2.urlopen(link)) as response:
        raw = response.read()
    summary_html = Document(raw).summary()
    converter = _make_html2text(ignore_links=True, ignore_images=True)
    return converter.handle(summary_html)


def fetch_full_page(link):
    """Download *link* and return the decoded page body.

    The charset declared in the response's Content-Type header is tried
    first; if none is declared, or it is unknown or does not fit the data,
    the body is decoded as UTF-8.

    Raises:
        UnicodeDecodeError: if the UTF-8 fallback cannot decode the body.
    """
    with closing(urllib2.urlopen(link)) as response:
        raw = response.read()
        charset = response.info().getparam('charset')
    if charset:
        try:
            return raw.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrong declared charset: fall back to UTF-8 below.
            pass
    return raw.decode('utf8')


def size_human_readable(bytesize):
    """Format a byte count with one decimal and a unit suffix.

    The value is divided by 1024 until it is below 1024 or the largest
    unit is reached.

    >>> size_human_readable(512)
    '512.0bytes'
    >>> size_human_readable(2048)
    '2.0KB'
    """
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if bytesize < 1024.0:
            return "%3.1f%s" % (bytesize, unit)
        bytesize /= 1024.0
    return "%3.1f%s" % (bytesize, 'TB')


class Entry(Base):
    """A single feed entry stored in the ``entry`` table."""

    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    # Values copied from the parsed entry.
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    # Newline-separated enclosure links, each optionally followed by a size.
    enclosures = Column(Text)
    # Values fetched over the network when the feed asks for them.
    resolvedlink = Column(String(255))
    fullpage = Column(Text)
    readability = Column(Text)
    # ``updated`` comes from the entry; the fetch stamps are set in __init__.
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # Not assigned anywhere in this module.
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Populate the row from a parsed feed entry.

        Args:
            entry: mapping-like parsed entry supporting ``.get()``; presumably
                a feedparser entry -- confirm against the caller.
            feed: object whose ``resolveredirects``, ``fullpage``,
                ``readability``, ``html2textcontent``, ``html2textignorelinks``,
                ``html2textignoreimages`` and ``contentcolumn`` attributes
                select the optional processing steps.
        """
        for field in ('title', 'link', 'summary', 'author'):
            value = entry.get(field)
            if value is not None:
                setattr(self, field, value)

        content = entry.get('content')
        if content:
            self.content = content[0].value

        updated_parsed = entry.get('updated_parsed')
        if updated_parsed:
            # mktime() reads the time struct as local time.
            # NOTE(review): if the parser yields UTC structs, calendar.timegm
            # would be the matching inverse -- confirm before changing.
            self.updated = datetime.fromtimestamp(mktime(updated_parsed))

        enclosures = entry.get('enclosures')
        if enclosures:
            self.enclosures = _format_enclosures(enclosures)

        # The network steps need a link; entries without one are stored
        # without the fetched columns instead of aborting construction.
        link = entry.get('link')
        if link:
            self._fetch_remote_content(link, feed)

        if feed.html2textcontent:
            self._convert_content_column(feed)

        # NOTE(review): the source's indentation was lost; this step and the
        # timestamps below are assumed to run unconditionally -- confirm.
        self._unescape_text_columns()

        # One timestamp so both columns agree on a fresh row.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    def _fetch_remote_content(self, link, feed):
        """Fill ``resolvedlink``, ``fullpage`` and ``readability`` as requested."""
        if feed.resolveredirects:
            print(' fetching final link <%s>' % link)
            try:
                final_url = _resolve_final_url(link)
                self.resolvedlink = final_url
                print(' final link: <%s>' % final_url)
            except Exception:
                # Best effort: a failed lookup leaves resolvedlink unset.
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                print(' FAILED opening URL')
        if feed.fullpage:
            print(' fetching full page <%s>' % link)
            self.fullpage = fetch_full_page(link)
        if feed.readability:
            print(' fetching readability <%s>' % link)
            self.readability = fetch_readability(link)

    def _convert_content_column(self, feed):
        """Run html2text over the column named by ``feed.contentcolumn``."""
        print(' converting summary')
        converter = _make_html2text(
            ignore_links=feed.html2textignorelinks,
            ignore_images=feed.html2textignoreimages,
        )
        column = feed.contentcolumn
        if column in _CONTENT_COLUMNS:
            raw = getattr(self, column)
            if raw:
                setattr(self, column, converter.handle(raw))

    def _unescape_text_columns(self):
        """Replace HTML entities in every non-empty content column."""
        unescape = HTMLParser.HTMLParser().unescape
        for column in _CONTENT_COLUMNS:
            value = getattr(self, column)
            if value:
                setattr(self, column, unescape(value))

    def __unicode__(self):
        # %s rather than %d so an instance whose id is still None can be shown.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        return unicode(self).encode('utf-8')

    def __repr__(self):
        # The previous format string was empty, so every call raised
        # TypeError. %r keeps the result ASCII-only for unicode titles.
        return "<Entry(%r, %r)>" % (self.id, self.title)

    def get_text(self):
        """Return the text of the column chosen by the feed, plus enclosures.

        Reads ``self.feed.contentcolumn``; the ``feed`` attribute is not
        defined in this module and is expected to be provided elsewhere.
        The result is an empty string when the chosen column is empty or the
        column name is not recognised. A trailing "Enclosures:" section is
        appended when enclosures are stored.
        """
        text = ''
        column = self.feed.contentcolumn
        if column in _CONTENT_COLUMNS:
            text = getattr(self, column) or ''
        if self.enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text