# -*- coding: utf-8 -*-
"""SQLAlchemy model for a single feed: fetching via feedparser and housekeeping of old entries."""
from sqlalchemy import Column, Integer, String, Boolean, Enum
from sqlalchemy.orm import relationship, backref
from datetime import datetime, timedelta
import feedparser
import sys

from models import Base
from models.feedinfo import Feedinfo
from models.entry import Entry


class Feed(Base):
    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True)
    url = Column(String(255))
    frequency = Column(Integer)
    keepdaysafterlastfetch = Column(Integer, default=30)
    daily = Column(Boolean)
    resolveredirects = Column(Boolean)
    readability = Column(Boolean)
    fullpage = Column(Boolean)
    contentcolumn = Column(Enum('summary', 'content', 'fullpage', 'readability'))
    html2textcontent = Column(Boolean)
    html2textignorelinks = Column(Boolean)
    html2textignoreimages = Column(Boolean)
    enabled = Column(Boolean)

    # child rows are deleted together with the feed
    entries = relationship("Entry", backref=backref('feed'),
                           cascade='all, delete, delete-orphan')
    feedinfo = relationship("Feedinfo", backref=backref('feed'),
                            cascade='all, delete, delete-orphan', uselist=False)

    def __init__(self, url, daily, readability, fullpage, enabled, html2textcontent):
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.html2textcontent = html2textcontent
        self.enabled = enabled

    def __unicode__(self):
        id = self.id
        if self.feedinfo:
            title = self.feedinfo.title
            last = self.feedinfo.lastsuccessful
        else:
            title = ''
            last = ''
        if self.enabled:
            enabled = 'enabled'
        else:
            enabled = 'DISABLED'
        entries = len(self.entries)
        url = self.url
        return u'%3d %s (%d entries, last fetched %s, %s)\n    %s' % \
            (id, title, entries, last, enabled, url)

    def __str__(self):
        return unicode(self).encode('utf-8')

    def __repr__(self):
        return "<Feed(%s, '%s')>" % (self.id, self.url)

    def fetch(self, session):
        """Fetch the feed with feedparser and merge its entries into the database."""
        print 'processing %d: %s' % (self.id, self.url)
        fetched = False
        if self.feedinfo:
            # feed is already known; respect its nextfetch timestamp
            if (not self.feedinfo.nextfetch) or (self.feedinfo.nextfetch < datetime.now()):
                print 'feed known, fetching...'
                try:
                    parser = feedparser.parse(self.url)
                    fetched = True
                    self.feedinfo.update(parser)
                except:
                    print 'ERROR parsing feed'
                    print sys.exc_info()
            else:
                print 'not fetching before: %s' % self.feedinfo.nextfetch
        else:
            print 'feed seems to be new, fetching...'
            try:
                parser = feedparser.parse(self.url)
                fetched = True
                self.feedinfo = Feedinfo(parser)
            except:
                print 'ERROR parsing feed'
                print sys.exc_info()

        if fetched:
            print 'processing feed entries:'
            entries_new = 0
            entries_total = 0
            for entry in parser.entries:
                entries_total += 1
                # an entry is considered known if title and link both match
                thisentry = session.query(Entry).\
                    filter(Entry.title == entry.title).\
                    filter(Entry.link == entry.link).\
                    first()
                if thisentry:
                    print '  entry already known <%s>' % entry.title
                    thisentry.lastfetched = datetime.now()
                else:
                    print '  new entry <%s>' % entry.title
                    self.entries.append(Entry(entry, self))
                    entries_new += 1
            print 'updated %d of %d entries' % (entries_new, entries_total)

        self.housekeeper(session)
        session.commit()

    def housekeeper(self, session):
        """Delete entries that have not appeared in a fetch for keepdaysafterlastfetch days."""
        count = 0
        for entry in self.entries:
            if entry.lastfetched < (datetime.now() - timedelta(days=self.keepdaysafterlastfetch)):
                session.delete(entry)
                count += 1
        if count > 0:
            print 'housekeeper deleted %d entries older than %d days.' % \
                (count, self.keepdaysafterlastfetch)

    def reset(self):
        """Drop all entries and the feed metadata so the feed is refetched from scratch."""
        self.entries[:] = []
        self.feedinfo = None
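

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the model itself).
# It assumes a throwaway SQLite database and that Base.metadata covers the
# Feed, Feedinfo and Entry tables; the database file name is a placeholder.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine('sqlite:///feeds.db')  # hypothetical database file
    Base.metadata.create_all(engine)              # create tables if missing
    session = sessionmaker(bind=engine)()

    # fetch every enabled feed; Feed.fetch() commits its own changes
    for feed in session.query(Feed).filter(Feed.enabled == True):
        feed.fetch(session)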