# -*- coding: utf-8 -*-
# This file is part of Atomstrom
# Copyright (C) 2013 Ronald Schaten
#
# Atomstrom is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

from datetime import datetime, timedelta
import sys

import feedparser
from sqlalchemy import Column, Integer, String, Boolean, Enum
from sqlalchemy.orm import relationship, backref

from models import Base
from models.feedinfo import Feedinfo
from models.entry import Entry


class Feed(Base):
    """A subscribed feed together with its per-feed processing options.

    Each row owns a list of ``Entry`` objects (``entries``) and at most one
    ``Feedinfo`` object (``feedinfo``).  Both relationships cascade with
    ``delete-orphan``, so removing a child from the feed deletes it.
    """

    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True)
    # Address handed to feedparser.parse() in fetch().
    url = Column(String(255))
    frequency = Column(Integer)
    # Retention period in days used by housekeeper(), measured from an
    # entry's `lastfetched` timestamp.
    keepdaysafterlastfetch = Column(Integer, default=30)
    # The option columns below are only stored here; this module does not
    # interpret them.
    daily = Column(Boolean)
    resolveredirects = Column(Boolean)
    readability = Column(Boolean)
    fullpage = Column(Boolean)
    contentcolumn = Column(Enum('summary', 'content', 'fullpage',
                                'readability'))
    html2textcontent = Column(Boolean)
    html2textignorelinks = Column(Boolean)
    html2textignoreimages = Column(Boolean)
    # Shown as 'enabled' / 'DISABLED' in the textual representation.
    enabled = Column(Boolean)

    entries = relationship("Entry", backref=backref('feed'),
                           cascade='all, delete, delete-orphan')
    feedinfo = relationship("Feedinfo", backref=backref('feed'),
                            cascade='all, delete, delete-orphan',
                            uselist=False)

    def __init__(self, url, daily, readability, fullpage, enabled,
                 html2textcontent):
        """Create a feed for `url` with the given option flags.

        Columns that are not passed here keep their column defaults.
        """
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.html2textcontent = html2textcontent
        self.enabled = enabled

    def __unicode__(self):
        """Return a two-line summary: id, title, counts, status, then URL."""
        if self.feedinfo:
            title = self.feedinfo.title
            last = self.feedinfo.lastsuccessful
        else:
            # Feed has never been fetched, so there is nothing to show.
            title = ''
            last = ''
        status = 'enabled' if self.enabled else 'DISABLED'
        return u'%3d %s (%d entries, last fetched %s, %s)\n %s' % (
            self.id, title, len(self.entries), last, status, self.url)

    def __str__(self):
        """Return the summary from __unicode__ encoded as UTF-8 bytes."""
        return unicode(self).encode('utf-8')

    def __repr__(self):
        """Return a short debugging representation with id and URL."""
        # '%s' rather than '%d' for the id so that an object that has not
        # been flushed yet (id is None) can still be represented.
        return "<Feed('%s','%s')>" % (self.id, self.url)

    def _parse_feed(self):
        """Download and parse the feed, then refresh or create `feedinfo`.

        Returns the feedparser result, or None when parsing itself raised.
        A failure while updating/creating the Feedinfo is reported but the
        parsed result is still returned, so the caller goes on to process
        the entries exactly as before.
        """
        parser = None
        try:
            parser = feedparser.parse(self.url)
            if self.feedinfo:
                self.feedinfo.update(parser)
            else:
                self.feedinfo = Feedinfo(parser)
        except Exception:
            # Best effort: report and carry on with the next feed.  Catching
            # Exception (not a bare except) keeps Ctrl-C and SystemExit
            # working.
            print('ERROR parsing feed')
            print(sys.exc_info())
        return parser

    def fetch(self, session):
        """Fetch the feed if it is due and store new entries via `session`.

        A feed without `feedinfo` is treated as new and fetched right away.
        A known feed is fetched only when `feedinfo.nextfetch` is unset or
        lies in the past.  After a fetch, known entries get their
        `lastfetched` timestamp refreshed, unknown ones are appended, the
        session is committed and housekeeper() is run.
        """
        print('processing %d: %s' % (self.id, self.url))
        parser = None
        if not self.feedinfo:
            print('feed seems to be new, fetching...')
            parser = self._parse_feed()
        elif (not self.feedinfo.nextfetch) or \
                (self.feedinfo.nextfetch < datetime.now()):
            print('feed known, fetching...')
            parser = self._parse_feed()
        else:
            print('not fetching before: %s' % self.feedinfo.nextfetch)

        if parser is None:
            # Not due yet, or the download failed: nothing to process.
            return

        print('processing feed entries:')
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total += 1
            # NOTE(review): the lookup matches on title and link only and is
            # not restricted to this feed, so an identical entry stored for
            # another feed also counts as known -- confirm this is intended.
            thisentry = session.query(Entry).\
                filter(Entry.title == entry.title).\
                filter(Entry.link == entry.link).\
                first()
            if thisentry:
                print(' entry already known <%s>' % entry.title)
                thisentry.lastfetched = datetime.now()
            else:
                print(' new entry <%s>' % entry.title)
                self.entries.append(Entry(entry, self))
                entries_new += 1
        print('updated %d of %d entries' % (entries_new, entries_total))
        session.commit()
        self.housekeeper(session)
        session.commit()

    def housekeeper(self, session):
        """Delete entries whose last fetch is older than the retention period.

        The retention period is `keepdaysafterlastfetch` days.  Deletions are
        only registered on `session`; committing is left to the caller.
        """
        if self.keepdaysafterlastfetch is None:
            # No retention period stored: there is no cutoff to compare
            # against, so keep everything instead of raising a TypeError.
            return
        # Compute the cutoff once so every entry is judged against the same
        # point in time.
        cutoff = datetime.now() - timedelta(days=self.keepdaysafterlastfetch)
        # Entries without a timestamp cannot be aged and are left alone.
        expired = [entry for entry in self.entries
                   if entry.lastfetched is not None
                   and entry.lastfetched < cutoff]
        for entry in expired:
            session.delete(entry)
        if expired:
            print('housekeeper deleted %d entries older than %d days.' %
                  (len(expired), self.keepdaysafterlastfetch))

    def reset(self):
        """Drop all stored entries and the feed info of this feed."""
        # Emptying the collection in place orphans the entries, which the
        # delete-orphan cascade on the relationship then removes.
        self.entries[:] = []
        self.feedinfo = None