Atomstrom/models/entry.py

169 lines
6.3 KiB
Python
Raw Normal View History

# This file is part of Atomstrom
# Copyright (C) 2013 Ronald Schaten <ronald@schatenseite.de>
#
# Atomstrom is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime
from time import mktime
import urllib2
from readability.readability import Document
import html2text
import HTMLParser
from models import Base
def fetch_readability(link):
    """Download *link* and return its main content as plain text.

    The raw response body is passed to readability's ``Document`` to
    extract a summary, and that summary is converted with html2text.
    The converter is configured with ``body_width`` 0, ``inline_links``
    off, and both links and images ignored.

    Errors raised while opening or reading the URL propagate to the caller.

    :param link: URL of the page to fetch.
    :returns: the text produced by ``html2text`` for the extracted summary.
    """
    response = urllib2.urlopen(link)
    try:
        raw_html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    converter = html2text.HTML2Text()
    converter.body_width = 0
    converter.inline_links = False
    converter.ignore_links = True
    converter.ignore_images = True

    summary_html = Document(raw_html).summary()
    return converter.handle(summary_html)
def fetch_full_page(link):
    """Download *link* and return the response body as a unicode string.

    The body is decoded with the charset declared in the response headers
    when one is given, otherwise as UTF-8. If that decoding fails (unknown
    codec name or undecodable bytes) the body is decoded as UTF-8 with
    undecodable bytes replaced, so one badly encoded page does not raise
    ``UnicodeDecodeError``.

    Errors raised while opening or reading the URL propagate to the caller.

    :param link: URL of the page to fetch.
    :returns: the decoded page content.
    """
    response = urllib2.urlopen(link)
    try:
        raw = response.read()
        # Charset parameter of the Content-Type header, or None if absent.
        charset = response.info().getparam('charset')
    finally:
        # Release the connection even when reading fails.
        response.close()

    try:
        return raw.decode(charset or 'utf8')
    except (LookupError, UnicodeDecodeError):
        # Unknown or wrong charset: fall back to a lossy UTF-8 decode.
        return raw.decode('utf8', 'replace')
def size_human_readable(bytesize):
    """Format a byte count as a short string with a unit suffix.

    The value is divided by 1024 until it is below 1024 or the units
    'bytes', 'KB', 'MB' and 'GB' are exhausted; whatever remains after
    'GB' is reported in 'TB'. The number is always printed with one
    decimal place, e.g. ``1536`` becomes ``"1.5KB"``.

    :param bytesize: size in bytes.
    :returns: the formatted size string.
    """
    units = ('bytes', 'KB', 'MB', 'GB')
    position = 0
    while position < len(units):
        if bytesize < 1024.0:
            return "%3.1f%s" % (bytesize, units[position])
        bytesize /= 1024.0
        position += 1
    # Anything that is still >= 1024 GB is expressed in terabytes.
    return "%3.1f%s" % (bytesize, 'TB')
class Entry(Base):
    """One feed entry, stored in the ``entry`` table.

    An instance is built from a parsed feed entry and the feed it belongs
    to. Depending on the feed's settings, construction may also resolve
    redirects, download the linked page, extract a readability version and
    convert HTML to text.
    """
    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    # Newline-separated enclosure URLs, each optionally followed by its size.
    enclosures = Column(Text)
    # URL reported by the response after opening ``link``.
    resolvedlink = Column(String(255))
    # Result of fetch_full_page() for the entry's link.
    fullpage = Column(Text)
    # Result of fetch_readability() for the entry's link.
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Populate the entry from a parsed feed item.

        :param entry: mapping-like parsed entry; must support ``in`` and
            ``.get()``. The keys read are 'title', 'link', 'summary',
            'content', 'author', 'updated_parsed' and 'enclosures'.
        :param feed: the owning feed; its ``html2textcontent``,
            ``html2textignorelinks``, ``html2textignoreimages``,
            ``resolveredirects``, ``fullpage``, ``readability`` and
            ``contentcolumn`` attributes control the optional processing.

        Errors raised by fetch_full_page() / fetch_readability() propagate;
        only the redirect resolution is best-effort.
        """
        if 'title' in entry:
            self.title = entry.get('title')
        if 'link' in entry:
            self.link = entry.get('link')
        if 'summary' in entry:
            self.summary = entry.get('summary')

        contents = entry.get('content') if 'content' in entry else None
        if contents:
            # Default to the first content element, then prefer the type
            # matching the feed setting (the last matching element wins).
            self.content = contents[0].value
            if feed.html2textcontent:
                wanted_type = 'text/plain'
            else:
                wanted_type = 'text/html'
            for item in contents:
                if item.type == wanted_type:
                    self.content = item.value

        if 'author' in entry:
            self.author = entry.get('author')

        if 'updated_parsed' in entry:
            updated_parsed = entry.get('updated_parsed')
            # Skip a missing value instead of crashing in mktime().
            if updated_parsed is not None:
                self.updated = datetime.fromtimestamp(mktime(updated_parsed))

        if 'enclosures' in entry:
            enclosure_lines = []
            for enclosure in entry.get('enclosures') or []:
                href = enclosure.get('href')
                if href is None:
                    # Nothing to list for an enclosure without a URL.
                    continue
                line = href
                if 'length' in enclosure:
                    try:
                        size = size_human_readable(int(enclosure['length']))
                    except (TypeError, ValueError):
                        # Non-numeric length: list the URL without a size.
                        pass
                    else:
                        line += ' (%s)' % size
                enclosure_lines.append(line)
            if enclosure_lines:
                self.enclosures = '\n'.join(enclosure_lines)

        # All network steps need a link; skip them when the entry has none.
        link = entry.get('link')

        if feed.resolveredirects and link:
            print(' fetching final link <%s>' % link)
            try:
                result = urllib2.build_opener().open(urllib2.Request(link))
            except Exception:
                # Best effort: a failed lookup leaves resolvedlink unset.
                print(' FAILED opening URL')
            else:
                try:
                    self.resolvedlink = result.url
                    print(' final link: <%s>' % result.url)
                finally:
                    result.close()

        if feed.fullpage and link:
            print(' fetching full page <%s>' % link)
            self.fullpage = fetch_full_page(link)

        if feed.readability and link:
            print(' fetching readability <%s>' % link)
            self.readability = fetch_readability(link)

        text_columns = ('summary', 'content', 'fullpage', 'readability')

        if feed.html2textcontent:
            print(' converting summary')
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignorelinks:
                h2t.ignore_links = True
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            # Only the column selected by the feed is converted.
            column = feed.contentcolumn
            if column in text_columns:
                value = getattr(self, column)
                if value:
                    setattr(self, column, h2t.handle(value))

        # Resolve HTML character references in every non-empty text column.
        hp = HTMLParser.HTMLParser()
        for column in text_columns:
            value = getattr(self, column)
            if value:
                setattr(self, column, hp.unescape(value))

        # Use one timestamp so both fields agree on a new entry.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    def __unicode__(self):
        """Return ``u'<id> -> <title>'``; works when the id is None."""
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        """Return the unicode representation encoded as UTF-8."""
        return unicode(self).encode('utf-8')

    def __repr__(self):
        """Return ``<Entry('<id>','<title>')>``; works when the id is None."""
        return "<Entry('%s','%s')>" % (self.id, self.title)

    def get_text(self, enclosures=True):
        """Return the text of the column selected by the feed.

        The column name is read from ``self.feed.contentcolumn``; the
        ``feed`` attribute is not defined in this class and is presumably
        supplied by a relationship declared elsewhere.

        :param enclosures: when true and the entry has enclosures, an
            "Enclosures:" section listing them is appended.
        :returns: the selected text, or an empty string if the column is
            unknown or empty (plus the enclosure section, if any).
        """
        text = ''
        column = self.feed.contentcolumn
        if column in ('summary', 'content', 'fullpage', 'readability'):
            text = getattr(self, column) or ''
        if self.enclosures and enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text
2013-04-09 21:39:02 +00:00
# -*- coding: utf-8 -*-