Atomstrom/models/entry.py

148 lines
5.3 KiB
Python

from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime
from time import mktime
import urllib2
from readability.readability import Document
import html2text
import HTMLParser
from models import Base
def fetch_readability(link):
    """Download *link* and return its main content as plain text.

    The response body is passed through readability's
    ``Document(...).summary()`` and the resulting HTML is converted with
    html2text, configured with ``body_width = 0``, reference-style links
    disabled and both links and images ignored.

    Exceptions raised by ``urllib2.urlopen`` or by the conversion libraries
    are not caught here and propagate to the caller.
    """
    response = urllib2.urlopen(link)
    try:
        raw_html = response.read()
    finally:
        # Always release the connection, even when reading the body fails.
        response.close()

    converter = html2text.HTML2Text()
    converter.body_width = 0
    converter.inline_links = False
    converter.ignore_links = True
    converter.ignore_images = True

    article_html = Document(raw_html).summary()
    return converter.handle(article_html)
def fetch_full_page(link):
    """Download *link* and return the response body decoded as UTF-8.

    Exceptions raised by ``urllib2.urlopen`` are not caught here, and a body
    that is not valid UTF-8 raises ``UnicodeDecodeError``.
    """
    response = urllib2.urlopen(link)
    try:
        raw_html = response.read()
    finally:
        # Always release the connection, even when reading the body fails.
        response.close()
    # NOTE(review): the charset is hard-coded; the charset declared by the
    # server (if any) is ignored. Kept as-is so callers see the same result.
    return raw_html.decode('utf8')
def size_human_readable(bytesize):
    """Format a byte count as a short string such as ``"1.5KB"``.

    The value is divided by 1024 until it drops below 1024 or the units
    ``bytes``, ``KB``, ``MB`` and ``GB`` are exhausted; whatever is left
    after that is reported in ``TB``. One decimal place is always shown and
    there is no space between the number and the unit.
    """
    remaining = bytesize
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        if not remaining < 1024.0:
            # Too large for this unit: scale down and try the next one.
            remaining = remaining / 1024.0
            continue
        return "%3.1f%s" % (remaining, unit)
    return "%3.1f%s" % (remaining, 'TB')
class Entry(Base):
    """One feed entry, stored in the ``entry`` table.

    An instance is built from a parsed entry and the feed it belongs to.
    Depending on the feed's settings the constructor also resolves the
    entry's link, downloads the linked page and converts HTML to text.
    """

    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    # Values copied from the parsed entry.
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    # One line per enclosure: "<href>" or "<href> (<human readable size>)".
    enclosures = Column(Text)
    # URL reported by the opener after opening ``link``.
    resolvedlink = Column(String(255))
    # Body of the linked page / its readability extract as plain text.
    fullpage = Column(Text)
    readability = Column(Text)
    updated = Column(DateTime)
    # Both are set to the construction time in __init__.
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # Never written in this class; presumably the time the entry was
    # delivered -- TODO confirm against the code that sets it.
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Populate the row from a parsed entry and its feed's settings.

        ``entry`` must support ``in`` and ``.get()``; items of its
        ``content`` list need a ``.value`` attribute and items of its
        ``enclosures`` list must be mappings (presumably a feedparser
        entry -- confirm with the caller).

        ``feed`` supplies the flags ``resolveredirects``, ``fullpage``,
        ``readability``, ``html2textcontent``, ``html2textignorelinks`` and
        ``html2textignoreimages`` plus the column name ``contentcolumn``.

        A failure to open the link while resolving redirects is reported on
        stdout and otherwise ignored. Errors from downloading the full page
        or the readability extract are not caught and propagate.
        """
        if 'title' in entry:
            self.title = entry.get('title')
        if 'link' in entry:
            self.link = entry.get('link')
        if 'summary' in entry:
            self.summary = entry.get('summary')

        # Guard against an empty content list before taking the first item.
        content = entry.get('content')
        if content:
            self.content = content[0].value

        if 'author' in entry:
            self.author = entry.get('author')

        # The key may be present with a None value, which mktime() rejects.
        updated_parsed = entry.get('updated_parsed')
        if updated_parsed:
            self.updated = datetime.fromtimestamp(mktime(updated_parsed))

        enclosures = entry.get('enclosures')
        if enclosures:
            self.enclosures = self._format_enclosures(enclosures)

        # Nothing can be fetched for an entry that carries no link.
        link = self.link
        if link:
            if feed.resolveredirects:
                print(' fetching final link <%s>' % link)
                request = urllib2.Request(link)
                opener = urllib2.build_opener()
                try:
                    result = opener.open(request)
                    try:
                        self.resolvedlink = result.url
                        print(' final link: <%s>' % result.url)
                    finally:
                        result.close()
                except Exception:
                    # Best effort: keep building the entry without it.
                    print(' FAILED opening URL')
            if feed.fullpage:
                print(' fetching full page <%s>' % link)
                self.fullpage = fetch_full_page(link)
            if feed.readability:
                print(' fetching readability <%s>' % link)
                self.readability = fetch_readability(link)

        if feed.html2textcontent:
            print(' converting summary')
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignorelinks:
                h2t.ignore_links = True
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            # Only the column the feed uses as its content is converted.
            if feed.contentcolumn == 'summary' and self.summary:
                self.summary = h2t.handle(self.summary)
            elif feed.contentcolumn == 'content' and self.content:
                self.content = h2t.handle(self.content)
            elif feed.contentcolumn == 'fullpage' and self.fullpage:
                self.fullpage = h2t.handle(self.fullpage)
            elif feed.contentcolumn == 'readability' and self.readability:
                self.readability = h2t.handle(self.readability)

        # Replace HTML entities in every text column that has a value.
        unescaper = HTMLParser.HTMLParser()
        for column in ('summary', 'content', 'fullpage', 'readability'):
            value = getattr(self, column)
            if value:
                setattr(self, column, unescaper.unescape(value))

        # Use one clock reading so both timestamps are exactly equal.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    @staticmethod
    def _format_enclosures(enclosures):
        """Return one line per enclosure: its href plus an optional size.

        Enclosures without an ``href`` are skipped. The size is appended as
        `` (<size>)`` only when ``length`` converts to an integer.
        """
        lines = []
        for enclosure in enclosures:
            href = enclosure.get('href')
            if not href:
                continue
            line = href
            try:
                size = int(enclosure.get('length'))
            except (TypeError, ValueError):
                # Missing, empty or non-numeric length: list the URL only.
                size = None
            if size is not None:
                line += ' (%s)' % size_human_readable(size)
            lines.append(line)
        return '\n'.join(lines)

    def __unicode__(self):
        """Return ``u'<id> -> <title>'``."""
        # %s instead of %d: id is None until the row has been saved.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        """Return the unicode representation encoded as UTF-8."""
        return unicode(self).encode('utf-8')

    def __repr__(self):
        """Return ``<Entry('<id>','<title>')>`` for debugging."""
        # %s instead of %d: id is None until the row has been saved.
        return "<Entry('%s','%s')>" % (self.id, self.title)

    def get_text(self):
        """Return the entry's text followed by its enclosure list, if any.

        The text is the value of the column named by
        ``self.feed.contentcolumn``; it is empty when that column has no
        value or the name is not one of the four text columns.
        """
        # self.feed is not defined in this class; presumably a relationship
        # or backref configured elsewhere -- TODO confirm.
        text = ''
        if self.feed.contentcolumn == 'summary' and self.summary:
            text = self.summary
        elif self.feed.contentcolumn == 'content' and self.content:
            text = self.content
        elif self.feed.contentcolumn == 'fullpage' and self.fullpage:
            text = self.fullpage
        elif self.feed.contentcolumn == 'readability' and self.readability:
            text = self.readability
        if self.enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text
# -*- coding: utf-8 -*-