2013-04-03 00:06:06 +02:00
|
|
|
from sqlalchemy import Column, Integer, String, Boolean, Enum
|
2013-04-07 13:34:36 +02:00
|
|
|
from sqlalchemy.orm import relationship, backref
|
2013-04-15 21:44:49 +02:00
|
|
|
from datetime import datetime, timedelta
|
2013-04-09 23:18:50 +02:00
|
|
|
import feedparser
|
|
|
|
import sys
|
2013-03-26 20:20:51 +01:00
|
|
|
|
|
|
|
from models import Base
|
2013-04-09 23:18:50 +02:00
|
|
|
from models.feedinfo import Feedinfo
|
|
|
|
from models.entry import Entry
|
2013-03-26 20:20:51 +01:00
|
|
|
|
2013-04-07 13:34:36 +02:00
|
|
|
|
2013-03-26 20:20:51 +01:00
|
|
|
class Feed(Base):
|
|
|
|
__tablename__ = 'feed'
|
|
|
|
|
|
|
|
id = Column(Integer, primary_key=True)
|
|
|
|
url = Column(String(255))
|
|
|
|
frequency = Column(Integer)
|
2013-04-15 21:44:49 +02:00
|
|
|
keepdaysafterlastfetch = Column(Integer, default=30)
|
2013-03-26 20:20:51 +01:00
|
|
|
daily = Column(Boolean)
|
|
|
|
resolveredirects = Column(Boolean)
|
|
|
|
readability = Column(Boolean)
|
|
|
|
fullpage = Column(Boolean)
|
2013-04-03 00:06:06 +02:00
|
|
|
contentcolumn = Column(Enum('summary', 'content', 'fullpage', 'readability'))
|
|
|
|
html2textcontent = Column(Boolean)
|
2013-04-09 12:23:05 +02:00
|
|
|
html2textignorelinks = Column(Boolean)
|
2013-03-26 20:20:51 +01:00
|
|
|
html2textignoreimages = Column(Boolean)
|
|
|
|
enabled = Column(Boolean)
|
2013-04-10 20:31:34 +02:00
|
|
|
entries = relationship("Entry", backref=backref('feed'), cascade='all, delete, delete-orphan')
|
2013-04-07 13:34:36 +02:00
|
|
|
feedinfo = relationship("Feedinfo", backref=backref('feed'), cascade='all, delete, delete-orphan', uselist=False)
|
2013-03-26 20:20:51 +01:00
|
|
|
|
2013-04-03 00:06:06 +02:00
|
|
|
def __init__(self, url, daily, readability, fullpage, enabled, html2textcontent):
|
2013-03-26 20:20:51 +01:00
|
|
|
self.url = url
|
|
|
|
self.daily = daily
|
|
|
|
self.readability = readability
|
|
|
|
self.fullpage = fullpage
|
2013-04-03 00:06:06 +02:00
|
|
|
self.html2textcontent = html2textcontent
|
2013-03-26 20:20:51 +01:00
|
|
|
self.enabled = enabled
|
|
|
|
|
2013-04-04 20:40:19 +02:00
|
|
|
def __unicode__(self):
|
|
|
|
id = self.id
|
|
|
|
if self.feedinfo:
|
|
|
|
title = self.feedinfo.title
|
|
|
|
last = self.feedinfo.lastsuccessful
|
|
|
|
else:
|
|
|
|
title = '<unknown>'
|
|
|
|
last = '<never>'
|
2013-04-07 13:43:34 +02:00
|
|
|
if self.enabled:
|
|
|
|
enabled = 'enabled'
|
|
|
|
else:
|
|
|
|
enabled = 'DISABLED'
|
2013-04-07 13:34:36 +02:00
|
|
|
entries = len(self.entries)
|
2013-04-04 20:40:19 +02:00
|
|
|
url = self.url
|
2013-04-07 13:43:34 +02:00
|
|
|
return u'%3d %s (%d entries, last fetched %s, %s)\n %s' % (id, title, entries, last, enabled, url)
|
2013-04-04 20:40:19 +02:00
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
return unicode(self).encode('utf-8')
|
|
|
|
|
2013-03-26 20:20:51 +01:00
|
|
|
def __repr__(self):
|
2013-04-04 20:40:19 +02:00
|
|
|
return "<Feed('%d','%s')>" % (self.id, self.url)
|
2013-04-07 13:34:36 +02:00
|
|
|
|
2013-04-10 00:17:15 +02:00
|
|
|
def fetch(self, session):
|
2013-04-09 23:18:50 +02:00
|
|
|
print 'processing %d: %s' % (self.id, self.url)
|
|
|
|
fetched = False
|
|
|
|
if self.feedinfo:
|
|
|
|
if (not self.feedinfo.nextfetch) or (self.feedinfo.nextfetch < datetime.now()):
|
|
|
|
print 'feed known, fetching...'
|
|
|
|
try:
|
|
|
|
parser = feedparser.parse(self.url)
|
|
|
|
fetched = True
|
|
|
|
self.feedinfo.update(parser)
|
|
|
|
except:
|
|
|
|
print 'ERROR parsing feed'
|
|
|
|
print sys.exc_info()
|
|
|
|
else:
|
|
|
|
print 'not fetching before: %s' % self.feedinfo.nextfetch
|
|
|
|
else:
|
|
|
|
print 'feed seems to be new, fetching...'
|
|
|
|
try:
|
|
|
|
parser = feedparser.parse(self.url)
|
|
|
|
fetched = True
|
|
|
|
self.feedinfo = Feedinfo(parser)
|
|
|
|
except:
|
|
|
|
print 'ERROR parsing feed'
|
|
|
|
print sys.exc_info()
|
|
|
|
|
|
|
|
if fetched:
|
|
|
|
print 'processing feed entries:'
|
|
|
|
entries_new = 0
|
|
|
|
entries_total = 0
|
|
|
|
for entry in parser.entries:
|
|
|
|
entries_total += 1
|
2013-04-10 00:17:15 +02:00
|
|
|
thisentry = session.query(Entry).\
|
2013-04-09 23:18:50 +02:00
|
|
|
filter(Entry.title == entry.title).\
|
|
|
|
filter(Entry.link == entry.link).\
|
|
|
|
first()
|
|
|
|
if thisentry:
|
|
|
|
print ' entry already known <%s>' % entry.title
|
|
|
|
thisentry.lastfetched = datetime.now()
|
|
|
|
else:
|
|
|
|
print ' new entry <%s>' % entry.title
|
|
|
|
self.entries.append(Entry(entry, self))
|
|
|
|
entries_new += 1
|
|
|
|
print 'updated %d of %d entries' % (entries_new, entries_total)
|
2013-04-15 21:44:49 +02:00
|
|
|
self.housekeeper(session)
|
2013-04-15 22:37:27 +02:00
|
|
|
session.commit()
|
2013-04-15 21:44:49 +02:00
|
|
|
|
|
|
|
def housekeeper(self, session):
|
|
|
|
count = 0
|
|
|
|
for entry in self.entries:
|
|
|
|
if entry.lastfetched < (datetime.now() - timedelta(days=self.keepdaysafterlastfetch)):
|
|
|
|
session.delete(entry)
|
|
|
|
count += 1
|
|
|
|
if count > 0:
|
|
|
|
print 'housekeeper deleted %d entries older than %d days.' % (count, self.keepdaysafterlastfetch)
|
2013-04-09 23:18:50 +02:00
|
|
|
|
2013-04-07 13:34:36 +02:00
|
|
|
def reset(self):
|
|
|
|
self.entries[:] = []
|
|
|
|
self.feedinfo = None
|
2013-04-09 23:39:02 +02:00
|
|
|
|
|
|
|
# -*- coding: utf-8 -*-  # NOTE(review): a coding declaration only takes effect on line 1 or 2 of the file (PEP 263); move this to the top, where it is currently missing.
|