Atomstrom/models/entry.py
2013-04-09 23:39:02 +02:00

144 lines
5.2 KiB
Python

from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime
from time import mktime
import urllib2
#import urllib
#import hn
import html2text
import HTMLParser
from models import Base
def fetch_readability(link):
    """Return the result of ``hn.upgradeLink(link)`` decoded from UTF-8.

    NOTE(review): the ``import hn`` line at the top of this file is
    commented out, so unless ``hn`` is made available some other way this
    call raises NameError -- confirm before enabling a feed's readability
    option.
    """
    upgraded = hn.upgradeLink(link)
    return upgraded.decode('utf8')
def fetch_full_page(link):
    """Download *link* and return the page converted to text by html2text.

    The response body is decoded as UTF-8, passed through
    ``html2text.html2text`` and returned as a Latin-1 encoded byte string;
    characters that Latin-1 cannot represent are replaced.

    Raises:
        urllib2.URLError: the URL could not be opened (this includes
            ``urllib2.HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: the response body is not valid UTF-8.
    """
    # The plain ``urllib`` import is commented out in this module, so the
    # former ``urllib.FancyURLopener`` call failed with NameError.  Use
    # urllib2, which is imported and already used for redirect resolution.
    response = urllib2.build_opener().open(link)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    text = html2text.html2text(html.decode('utf8'))
    return text.encode('latin-1', 'replace')
def size_human_readable(bytesize):
    """Format a byte count as a short string such as ``1.5KB``.

    The value is divided by 1024 until it drops below 1024 or the unit
    list (bytes, KB, MB, GB) is exhausted; anything larger is reported in
    TB.  The number is always printed with one decimal place and no space
    before the unit.
    """
    units = ('bytes', 'KB', 'MB', 'GB')
    remaining = bytesize
    index = 0
    while index < len(units):
        if remaining < 1024.0:
            return "%3.1f%s" % (remaining, units[index])
        remaining = remaining / 1024.0
        index += 1
    # Still >= 1024 GB after the last division: report in terabytes.
    return "%3.1f%s" % (remaining, 'TB')
class Entry(Base):
    """ORM model for a single feed entry, stored in the ``entry`` table.

    An instance is built from a parsed feed entry plus the settings of the
    feed it came from; see ``__init__`` for the processing that happens at
    construction time.
    """

    __tablename__ = 'entry'

    # Names of the text columns a feed's ``contentcolumn`` setting may
    # select, and which get HTML-unescaped on construction.
    _TEXT_COLUMNS = ('summary', 'content', 'fullpage', 'readability')

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    # One line per enclosure: the href, optionally followed by " (<size>)".
    enclosures = Column(Text)
    # URL reported by the opener after following redirects.
    resolvedlink = Column(String(255))
    # Output of fetch_full_page() for the entry link.
    fullpage = Column(Text)
    # Output of fetch_readability() for the entry link.
    readability = Column(Text)
    updated = Column(DateTime)
    # Both set to the construction time in __init__.
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # Not written anywhere in this class.
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Populate the row from a parsed feed entry.

        Args:
            entry: dict-like parsed entry that also exposes ``link`` as an
                attribute (presumably a feedparser entry -- confirm).
            feed: object carrying the feed settings read here:
                ``resolveredirects``, ``fullpage``, ``readability``,
                ``html2textcontent``, ``html2textignorelinks``,
                ``html2textignoreimages`` and ``contentcolumn``.

        Depending on the feed settings this performs network requests
        (redirect resolution, full-page and readability fetches).
        """
        for key in ('title', 'link', 'summary', 'author'):
            if key in entry:
                setattr(self, key, entry.get(key))

        # Guard against an empty content list, which used to raise
        # IndexError.
        content = entry.get('content')
        if content:
            self.content = content[0].value

        updated_parsed = entry.get('updated_parsed')
        if updated_parsed:
            # Build the datetime straight from the time tuple's fields.
            # The former mktime()/fromtimestamp() round-trip read the tuple
            # as local time and could shift the value by an hour under DST,
            # and it raised TypeError when the key was present but None.
            # (Assumes a feedparser-style UTC struct_time -- confirm.)
            self.updated = datetime(*updated_parsed[:6])

        enclosures = entry.get('enclosures')
        if enclosures:
            self.enclosures = self._format_enclosures(enclosures)

        if feed.resolveredirects:
            self._resolve_link(entry.link)
        if feed.fullpage:
            print(' fetching full page <%s>' % entry.link)
            self.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print(' fetching readability <%s>' % entry.link)
            self.readability = fetch_readability(entry.link)
        if feed.html2textcontent:
            self._convert_content_column(feed)
        self._unescape_text_columns()

        # One timestamp so both columns agree exactly on first fetch.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    @staticmethod
    def _format_enclosures(enclosures):
        """Return one line per enclosure: href plus optional readable size."""
        lines = []
        for enclosure in enclosures:
            line = enclosure['href']
            if 'length' in enclosure:
                try:
                    size = size_human_readable(int(enclosure['length']))
                except (TypeError, ValueError):
                    # Empty or non-numeric length: list the link without
                    # a size instead of aborting the whole entry.
                    pass
                else:
                    line += ' (%s)' % size
            lines.append(line)
        return '\n'.join(lines)

    def _resolve_link(self, link):
        """Follow redirects for *link* and store the final URL (best effort)."""
        print(' fetching final link <%s>' % link)
        request = urllib2.Request(link)
        opener = urllib2.build_opener()
        try:
            result = opener.open(request)
        except Exception:
            # Best effort: any failure leaves resolvedlink unset.  Unlike
            # a bare ``except:`` this lets KeyboardInterrupt/SystemExit
            # propagate.
            print(' FAILED opening URL')
        else:
            try:
                self.resolvedlink = result.url
                print(' final link: <%s>' % result.url)
            finally:
                # Only the final URL is needed; release the connection.
                result.close()

    def _convert_content_column(self, feed):
        """Run html2text over the column named by ``feed.contentcolumn``."""
        print(' converting summary')
        h2t = html2text.HTML2Text()
        h2t.body_width = 0
        h2t.inline_links = False
        if feed.html2textignorelinks:
            h2t.ignore_links = True
        if feed.html2textignoreimages:
            h2t.ignore_images = True
        column = feed.contentcolumn
        if column in self._TEXT_COLUMNS:
            value = getattr(self, column)
            if value:
                setattr(self, column, h2t.handle(value))

    def _unescape_text_columns(self):
        """Replace HTML entities in every non-empty text column."""
        hp = HTMLParser.HTMLParser()
        for column in self._TEXT_COLUMNS:
            value = getattr(self, column)
            if value:
                setattr(self, column, hp.unescape(value))

    def __unicode__(self):
        # %s rather than %d: id is None until the row has been flushed,
        # and %d raised TypeError in that state.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        return unicode(self).encode('utf-8')

    def __repr__(self):
        # %s rather than %d so unsaved instances (id is None) can be shown.
        return "<Entry('%s','%s')>" % (self.id, self.title)

    def get_text(self):
        """Return the text of the feed's content column plus enclosures.

        The column is chosen by ``self.feed.contentcolumn``; an unknown
        column name or an empty column yields an empty text.  When the
        entry has enclosures they are appended under an "Enclosures:"
        heading.

        NOTE(review): ``self.feed`` is not defined in this class --
        presumably a relationship/backref declared on the feed model;
        confirm.
        """
        text = ''
        column = self.feed.contentcolumn
        if column in self._TEXT_COLUMNS:
            text = getattr(self, column) or ''
        if self.enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text
# -*- coding: utf-8 -*-