# -*- coding: utf-8 -*-
# This file is part of Atomstrom
# Copyright (C) 2013 Ronald Schaten
#
# Atomstrom is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

"""Database model for a single feed entry and helpers to fetch its content."""

from datetime import datetime
from time import mktime
import HTMLParser
import urllib2

from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from readability.readability import Document
import html2text

from models import Base

# Names of the Entry columns that can hold the text of an entry.  A feed's
# ``contentcolumn`` value is compared against these names.
_CONTENT_COLUMNS = ('summary', 'content', 'fullpage', 'readability')


def fetch_readability(link):
    """Download *link* and return its readability summary as text.

    The downloaded page is passed through ``Document(...).summary()`` and the
    result is handed to an ``html2text.HTML2Text`` instance configured with
    ``body_width = 0``, ``inline_links = False``, ``ignore_links = True`` and
    ``ignore_images = True``.

    Errors raised by ``urllib2.urlopen`` are propagated to the caller.
    """
    h2t = html2text.HTML2Text()
    h2t.body_width = 0
    h2t.inline_links = False
    h2t.ignore_links = True
    h2t.ignore_images = True
    response = urllib2.urlopen(link)
    try:
        raw = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    return h2t.handle(Document(raw).summary())


def fetch_full_page(link):
    """Download *link* and return the body decoded as UTF-8.

    Errors raised by ``urllib2.urlopen`` are propagated, as is the
    ``UnicodeDecodeError`` raised when the body is not valid UTF-8.
    """
    response = urllib2.urlopen(link)
    try:
        raw = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    return raw.decode('utf8')


def size_human_readable(bytesize):
    """Format a byte count with one decimal and a unit from bytes up to TB.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, e.g. ``500`` -> ``'500.0bytes'``, ``1536`` -> ``'1.5KB'``.
    """
    for unit in ['bytes', 'KB', 'MB', 'GB']:
        if bytesize < 1024.0:
            return "%3.1f%s" % (bytesize, unit)
        bytesize /= 1024.0
    return "%3.1f%s" % (bytesize, 'TB')


class Entry(Base):
    """One entry of a feed, stored in the ``entry`` table."""

    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    # newline-separated enclosure URLs, each optionally followed by " (size)"
    enclosures = Column(Text)
    # URL reported by the opener after requesting ``link``
    resolvedlink = Column(String(255))
    fullpage = Column(Text)
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Populate the row from a parsed feed entry.

        ``entry`` is a dict-like object that also offers attribute access
        (``entry.link``) -- presumably a feedparser entry; confirm against the
        caller.  ``feed`` supplies the per-feed options read here:
        ``html2textcontent``, ``html2textignorelinks``,
        ``html2textignoreimages``, ``resolveredirects``, ``fullpage``,
        ``readability`` and ``contentcolumn``.

        Depending on those options this performs network requests and prints
        progress messages.  A failure while resolving redirects is reported
        and ignored; failures while fetching the full page or the readability
        text propagate to the caller.
        """
        if 'title' in entry:
            self.title = entry.get('title')
        if 'link' in entry:
            self.link = entry.get('link')
        if 'summary' in entry:
            self.summary = entry.get('summary')
        if 'content' in entry:
            contents = entry.get('content')
            # Default to the first variant, then prefer plain text when the
            # feed converts HTML to text and HTML otherwise.
            self.content = contents[0].value
            for content in contents:
                if feed.html2textcontent and content.type == 'text/plain':
                    self.content = content.value
                if not feed.html2textcontent and content.type == 'text/html':
                    self.content = content.value
        if 'author' in entry:
            self.author = entry.get('author')
        if 'updated_parsed' in entry:
            updated_parsed = entry.get('updated_parsed')
            # The key may be present without a usable value.
            if updated_parsed:
                # NOTE(review): mktime() interprets the tuple as local time;
                # if the parser delivers UTC the stored value is shifted by
                # the local UTC offset -- confirm before changing.
                self.updated = datetime.fromtimestamp(mktime(updated_parsed))
        if 'enclosures' in entry and len(entry.get('enclosures')) > 0:
            self.enclosures = '\n'.join(
                self._describe_enclosure(enclosure)
                for enclosure in entry.get('enclosures'))

        if feed.resolveredirects:
            print(' fetching final link <%s>' % entry.link)
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            try:
                result = opener.open(request)
                try:
                    self.resolvedlink = result.url
                    print(' final link: <%s>' % result.url)
                finally:
                    result.close()
            except Exception:
                # Best effort: an unresolvable link must not abort the entry.
                print(' FAILED opening URL')
        if feed.fullpage:
            print(' fetching full page <%s>' % entry.link)
            self.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print(' fetching readability <%s>' % entry.link)
            self.readability = fetch_readability(entry.link)

        if feed.html2textcontent:
            print(' converting summary')
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignorelinks:
                h2t.ignore_links = True
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            # Only the column selected by the feed is converted.
            if feed.contentcolumn in _CONTENT_COLUMNS:
                value = getattr(self, feed.contentcolumn)
                if value:
                    setattr(self, feed.contentcolumn, h2t.handle(value))

        # Replace HTML character references in every text column.
        hp = HTMLParser.HTMLParser()
        for column in _CONTENT_COLUMNS:
            value = getattr(self, column)
            if value:
                setattr(self, column, hp.unescape(value))

        # Use a single timestamp so both columns start out identical.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    @staticmethod
    def _describe_enclosure(enclosure):
        """Return ``href`` plus `` (size)`` when a numeric length is given."""
        text = enclosure['href']
        if 'length' in enclosure:
            try:
                size = int(enclosure['length'])
            except (TypeError, ValueError):
                # Empty or non-numeric length: list the URL without a size.
                return text
            text += ' (%s)' % size_human_readable(size)
        return text

    def __unicode__(self):
        # %s rather than %d: id is None until the row has been assigned one.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        return unicode(self).encode('utf-8')

    def __repr__(self):
        # %r keeps the result a plain ASCII str even for non-ASCII titles.
        return "<Entry(%r, %r)>" % (self.id, self.title)

    def get_text(self, enclosures=True):
        """Return the text of the column selected by the feed.

        The column is chosen by ``self.feed.contentcolumn``; an unknown name
        or an empty column yields an empty text.  When ``enclosures`` is true
        and the entry has enclosures, an "Enclosures:" section is appended.

        NOTE(review): ``self.feed`` is not defined in this class; presumably
        it is a relationship configured on the feed model -- confirm.
        """
        text = ''
        column = self.feed.contentcolumn
        if column in _CONTENT_COLUMNS:
            text = getattr(self, column) or ''
        if self.enclosures and enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text