148 lines
5.3 KiB
Python
148 lines
5.3 KiB
Python
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
|
|
from datetime import datetime
|
|
from time import mktime
|
|
import urllib2
|
|
from readability.readability import Document
|
|
import html2text
|
|
import HTMLParser
|
|
|
|
from models import Base
|
|
|
|
def fetch_readability(link):
    """Download *link* and return its main article content as text.

    The raw page is reduced to its main content with readability's
    ``Document.summary()`` and that HTML is then run through html2text.

    Args:
        link: URL passed straight to ``urllib2.urlopen``.

    Returns:
        The html2text rendering of the extracted article HTML.

    Raises:
        Any exception raised by ``urllib2.urlopen``/``read`` or by the
        readability/html2text conversion is propagated unchanged.
    """
    h2t = html2text.HTML2Text()
    h2t.body_width = 0  # html2text: 0 means "do not wrap lines"
    h2t.inline_links = False
    h2t.ignore_links = True
    h2t.ignore_images = True

    response = urllib2.urlopen(link)
    try:
        page = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()

    article_html = Document(page).summary()
    return h2t.handle(article_html)
|
|
|
|
def fetch_full_page(link):
    """Download *link* and return the page body decoded as UTF-8.

    Args:
        link: URL passed straight to ``urllib2.urlopen``.

    Returns:
        The response body decoded with the ``utf8`` codec.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8 (decoding is
            strict; the charset announced by the server is not consulted).
        Any exception raised by ``urllib2.urlopen``/``read`` is propagated.
    """
    response = urllib2.urlopen(link)
    try:
        raw = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()
    return raw.decode('utf8')
|
|
|
|
def size_human_readable(bytesize):
    """Format a byte count as a short string such as ``1.5KB``.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached; anything of 1024 GB or more is reported in ``TB``.

    Args:
        bytesize: size in bytes (int or float).

    Returns:
        The scaled value with one decimal place, immediately followed by
        the unit label (``bytes``, ``KB``, ``MB``, ``GB`` or ``TB``).
    """
    value = bytesize
    unit = 'TB'  # used when none of the smaller units fits
    for candidate in ('bytes', 'KB', 'MB', 'GB'):
        if value < 1024.0:
            unit = candidate
            break
        value = value / 1024.0
    return "%3.1f%s" % (value, unit)
|
|
|
|
class Entry(Base):
    """A single feed item stored in the ``entry`` table.

    The constructor copies the fields of a parsed feed entry and, depending
    on the flags of the owning feed, resolves redirects, downloads the full
    page, extracts a readability version and converts HTML to text.
    """

    __tablename__ = 'entry'

    # Text columns that ``feed.contentcolumn`` may select, in the order in
    # which they are HTML-unescaped by ``__init__``.
    _CONTENT_COLUMNS = ('summary', 'content', 'fullpage', 'readability')

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    enclosures = Column(Text)  # one enclosure per line, see __init__

    resolvedlink = Column(String(255))
    fullpage = Column(Text)
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Build the row from a parsed feed entry and the feed's options.

        Args:
            entry: mapping-like parsed feed item. It is read by key and via
                the ``entry.link`` attribute (presumably a feedparser
                entry -- confirm against the caller).
            feed: object providing the flags ``resolveredirects``,
                ``fullpage``, ``readability``, ``html2textcontent``,
                ``html2textignorelinks``, ``html2textignoreimages`` and the
                column selector ``contentcolumn``.

        Raises:
            Errors from ``fetch_full_page``/``fetch_readability`` propagate.
            A failure while resolving redirects is only reported on stdout.
        """
        if 'title' in entry:
            self.title = entry.get('title')
        if 'link' in entry:
            self.link = entry.get('link')
        if 'summary' in entry:
            self.summary = entry.get('summary')
        if 'content' in entry:
            # Only the first content element is stored.
            self.content = entry.get('content')[0].value
        if 'author' in entry:
            self.author = entry.get('author')
        if 'updated_parsed' in entry:
            updated_parsed = entry.get('updated_parsed')
            # The key can be present with a None value; mktime(None) raises.
            if updated_parsed is not None:
                self.updated = datetime.fromtimestamp(mktime(updated_parsed))

        if 'enclosures' in entry:
            enclosure_items = entry.get('enclosures')
            if enclosure_items:
                self.enclosures = '\n'.join(
                    self._format_enclosure(item) for item in enclosure_items
                )

        if feed.resolveredirects:
            print(' fetching final link <%s>' % entry.link)
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            result = None
            try:
                result = opener.open(request)
                self.resolvedlink = result.url
                print(' final link: <%s>' % result.url)
            except Exception:
                # Best effort: an unreachable link must not abort the entry.
                # Exception (not a bare except) lets Ctrl-C / SystemExit pass.
                print(' FAILED opening URL')
            finally:
                if result is not None:
                    result.close()

        if feed.fullpage:
            print(' fetching full page <%s>' % entry.link)
            self.fullpage = fetch_full_page(entry.link)

        if feed.readability:
            print(' fetching readability <%s>' % entry.link)
            self.readability = fetch_readability(entry.link)

        if feed.html2textcontent:
            print(' converting summary')
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignorelinks:
                h2t.ignore_links = True
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            # Only the column selected by the feed is converted.
            column = feed.contentcolumn
            if column in self._CONTENT_COLUMNS:
                html = getattr(self, column)
                if html:
                    setattr(self, column, h2t.handle(html))

        # HTML entities are unescaped in every text column that has a value.
        hp = HTMLParser.HTMLParser()
        for column in self._CONTENT_COLUMNS:
            value = getattr(self, column)
            if value:
                setattr(self, column, hp.unescape(value))

        # One timestamp so that both columns are exactly equal on creation.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    @staticmethod
    def _format_enclosure(enclosure):
        """Return the enclosure's ``href``, plus its size when it is usable."""
        line = enclosure['href']
        if 'length' in enclosure:
            try:
                size = size_human_readable(int(enclosure['length']))
            except (TypeError, ValueError):
                # Empty or non-numeric length: the size is optional, omit it
                # rather than failing the whole entry.
                pass
            else:
                line += ' (%s)' % size
        return line

    def __unicode__(self):
        """Return ``<id> -> <title>`` as a unicode string."""
        # %s instead of %d: id is None until the row has been flushed.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        """Return the unicode representation encoded as UTF-8."""
        return unicode(self).encode('utf-8')

    def __repr__(self):
        """Return a debugging representation with id and title."""
        # %s instead of %d: id is None until the row has been flushed.
        return "<Entry('%s','%s')>" % (self.id, self.title)

    def get_text(self):
        """Return the text of the column selected by the feed, plus enclosures.

        Reads ``self.feed.contentcolumn``; ``feed`` is not declared in this
        class (presumably a relationship/backref defined on the feed model
        -- confirm). An empty or unknown column yields an empty text part.

        Returns:
            The selected column's text (or ``''``), followed by an
            ``Enclosures:`` section when the entry has enclosures.
        """
        column = self.feed.contentcolumn
        text = ''
        if column in self._CONTENT_COLUMNS:
            text = getattr(self, column) or ''
        if self.enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text
|
|
|
|
# -*- coding: utf-8 -*-
|