#!/usr/bin/env python
"""Fetch Atom/RSS feeds into a database and print them as mail-like output.

The script stores feeds, per-feed metadata and feed entries through
SQLAlchemy, fetches them with feedparser, and can emit either one message
per entry or a daily digest.  Text is stored as latin-1 encoded byte
strings (unencodable characters are replaced).

NOTE(review): the module relies on Python 2 names (``ConfigParser``,
``urllib.FancyURLopener``, byte-string ``decode``); confirm the target
interpreter before porting.
"""

from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey, desc
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.ext.declarative import declarative_base
import datetime
import feedparser
import re
import sys
import urllib
import hn
import html2text
import ConfigParser
from optparse import OptionParser

Base = declarative_base()


def _latin1(text):
    """Encode *text* as latin-1, replacing characters that cannot be encoded."""
    return text.encode('latin-1', 'replace')


def _joined_latin1(value, key):
    """Return latin-1 text for a feedparser field that is a string or a list.

    feedparser exposes ``content`` and ``enclosures`` as lists of dicts; for
    those, the *key* member of every item is collected and the non-empty
    values are joined with newlines.  A plain string is encoded directly.
    """
    if hasattr(value, 'encode'):
        return _latin1(value)
    parts = [item.get(key) for item in (value or [])]
    return _latin1(u'\n'.join(part for part in parts if part))


class Feed(Base):
    """A subscribed feed and the flags that control how it is processed."""

    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True)
    url = Column(Text)
    # daily: entries go into the daily digest instead of single mails.
    daily = Column(Boolean)
    # readability / fullpage: fetch extra text for every new entry.
    readability = Column(Boolean)
    fullpage = Column(Boolean)
    # html2textsummary: run the entry summary through html2text.
    html2textsummary = Column(Boolean)
    enabled = Column(Boolean)

    def __init__(self, url, daily, readability, fullpage, enabled, html2textsummary):
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.html2textsummary = html2textsummary
        self.enabled = enabled

    def __repr__(self):
        return "<Feed('%s','%s','%s')>" % (self.url, self.daily, self.readability)


class Feedinfo(Base):
    """Metadata about a feed as reported by the last feedparser run."""

    __tablename__ = 'feedinfo'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('feedinfo', uselist=False))
    title = Column(Text)
    link = Column(Text)
    subtitle = Column(Text)
    author = Column(Text)
    publisher = Column(Text)
    status = Column(Integer)
    version = Column(Text)
    encoding = Column(Text)
    bozo = Column(Integer)
    lastfetched = Column(DateTime)
    lastsuccessful = Column(DateTime)

    def __init__(self, parser):
        self.update(parser)

    def __repr__(self):
        return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle, self.author)

    def update(self, parser):
        """Copy feed-level fields from a feedparser result onto this row.

        Fields missing from ``parser.feed`` keep their previous value.
        ``lastsuccessful`` is only touched when the HTTP status was 200.
        """
        info = parser.feed
        # 'publisher' is stored in its own column; it must not clobber author.
        for name in ('title', 'subtitle', 'author', 'publisher'):
            if name in info:
                setattr(self, name, _latin1(info.get(name)))
        if 'link' in info:
            self.link = info.get('link')

        self.status = parser.get('status')
        self.version = parser.get('version')
        self.encoding = parser.get('encoding')
        self.bozo = parser.get('bozo')

        now = datetime.datetime.now()
        self.lastfetched = now
        if parser.get('status') == 200:
            self.lastsuccessful = now


class Entry(Base):
    """A single feed entry, plus optionally fetched full text."""

    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('entry'))
    title = Column(Text)
    link = Column(Text)
    summary = Column(Text)
    content = Column(Text)
    author = Column(Text)
    enclosures = Column(Text)
    fullpage = Column(Text)
    readability = Column(Text)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # NOTE(review): nothing in the visible source assigns `sent`, so the mail
    # functions below do not skip entries that were already mailed.
    sent = Column(DateTime)

    def __init__(self, entry):
        self.update(entry)
        self.firstfetched = datetime.datetime.now()

    def __repr__(self):
        return "<Entry('%s')>" % (self.title,)

    def update(self, entry):
        """Copy entry fields from a feedparser entry onto this row.

        Fields missing from *entry* keep their previous value.
        """
        for name in ('title', 'link', 'summary', 'author'):
            if name in entry:
                setattr(self, name, _latin1(entry.get(name)))
        # content and enclosures are lists of dicts in feedparser, not
        # strings; flatten them to the text / URLs they carry.
        if 'content' in entry:
            self.content = _joined_latin1(entry.get('content'), 'value')
        if 'enclosures' in entry:
            self.enclosures = _joined_latin1(entry.get('enclosures'), 'href')
        self.lastfetched = datetime.datetime.now()


def send_mail(sender, subject, body):
    """Print the message to stdout; arguments are latin-1 byte strings."""
    print('Sender: %s' % sender.decode('latin-1'))
    print('Subject: %s' % subject.decode('latin-1'))
    print('Body: %s' % body.decode('latin-1'))


def get_entry_text(entry):
    """Return the best available text: readability, full page, then summary."""
    for candidate in (entry.readability, entry.fullpage, entry.summary):
        if candidate:
            return candidate
    return 'no text, sorry'


def mail_daily_digest(session):
    """Send one digest covering all entries of enabled daily feeds, newest first."""
    print('mailing daily digest...')
    sender = 'atomstrom'
    rows = (session.query(Feed, Feedinfo, Entry)
            .filter(Feed.id == Feedinfo.feed_id)
            .filter(Feed.id == Entry.feed_id)
            .filter(Feed.enabled == 1)
            .filter(Feed.daily == 1)
            .order_by(desc(Entry.firstfetched))
            .all())

    parts = []
    for feed, feedinfo, entry in rows:
        parts.append('=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title))
        parts.append(' %s\n' % entry.title)
        # Only a 100-character teaser of each entry goes into the digest.
        parts.append(get_entry_text(entry)[0:100])
        parts.append('\n')
        parts.append('link: [%s]\n\n' % entry.link)
    body = ''.join(parts)

    today = datetime.datetime.now()
    subject = '[atomstrom] %s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), len(rows))
    send_mail(sender, subject, body)


def mail_single_entry(feed, feedinfo, entry):
    """Send one message for *entry*, using the feed title as sender."""
    body = ''.join([
        get_entry_text(entry),
        '\n\n',
        'site: [%s]\n' % feedinfo.link,
        'link: [%s]\n' % entry.link,
    ])
    send_mail(feedinfo.title, entry.title, body)


def mail_single_entries(session):
    """Send a separate message for every entry of enabled non-daily feeds."""
    print('mailing single entries...')
    rows = (session.query(Feed, Feedinfo, Entry)
            .filter(Feed.id == Feedinfo.feed_id)
            .filter(Feed.id == Entry.feed_id)
            .filter(Feed.enabled == 1)
            .filter(Feed.daily == 0)
            .all())
    for feed, feedinfo, entry in rows:
        mail_single_entry(feed, feedinfo, entry)


def fetch_readability(link):
    """Return the text ``hn.upgradeLink`` produces for *link*, decoded as UTF-8."""
    text = hn.upgradeLink(link)
    return text.decode('utf8')


def fetch_full_page(link):
    """Download *link* and return its html2text rendering as latin-1 bytes."""
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # Pages that are not valid UTF-8 are decoded lossily rather than
    # aborting the whole fetch run.
    html = html.decode('utf8', 'replace')
    text = html2text.html2text(html)
    return _latin1(text)


def process_feed_entry(session, feed, entry):
    """Store or refresh one feedparser *entry* of *feed*.

    An entry counts as known when a row with the same feed, title and link
    exists.  Returns 1 when a new row was added and 0 when an existing row
    was updated.
    """
    title = _latin1(entry.title)
    link = _latin1(entry.link)
    query = (session.query(Entry)
             .filter(Entry.feed_id == feed.id)
             .filter(Entry.title == title)
             .filter(Entry.link == link))

    # first() instead of one()-inside-try: a lookup miss is not an error,
    # and real errors must not be mistaken for "entry is new".
    thisentry = query.first()
    if thisentry is not None:
        thisentry.update(entry)
        print(' entry already known <%s>' % entry.title)
        return 0

    print(' new entry <%s>' % entry.title)
    thisentry = Entry(entry)
    if feed.fullpage:
        print(' fetching full page <%s>' % entry.link)
        thisentry.fullpage = fetch_full_page(entry.link)
    if feed.readability:
        print(' fetching readability <%s>' % entry.link)
        thisentry.readability = fetch_readability(entry.link)
    # The summary is absent when the feed entry carried none.
    if feed.html2textsummary and thisentry.summary is not None:
        print(' converting summary')
        summary = thisentry.summary.decode('latin-1')
        summary = html2text.html2text(summary)
        thisentry.summary = _latin1(summary)
    feed.entry.append(thisentry)
    return 1


def fetch_single_feed(session, feed):
    """Fetch *feed*, refresh its Feedinfo and entries, then commit."""
    print('fetching %s' % feed.url)
    parser = feedparser.parse(feed.url)

    print('processing feed info...')
    feedinfo = session.query(Feedinfo).filter(Feedinfo.feed_id == feed.id).first()
    if feedinfo is None:
        print('this feed seems to be new')
        feed.feedinfo = Feedinfo(parser)
    else:
        feedinfo.update(parser)
        feed.feedinfo = feedinfo

    print('processing feed entries:')
    entries_new = 0
    entries_total = 0
    for entry in parser.entries:
        entries_total += 1
        entries_new += process_feed_entry(session, feed, entry)
    session.commit()
    print('updated %d of %d entries' % (entries_new, entries_total))


def fetch_all_feeds(session):
    """Fetch every enabled feed in id order."""
    print('fetching all feeds...')
    for feed in session.query(Feed).filter_by(enabled=1).order_by(Feed.id):
        fetch_single_feed(session, feed)
        print('')


def main():
    """Parse the command line, connect to the database and run the actions."""
    parser = OptionParser()
    parser.add_option("-f", "--fetch", action="store_true", dest="fetch", default=False, help="fetch all feeds")
    parser.add_option("-s", "--single", action="store_true", dest="single", default=False, help="send single mails")
    parser.add_option("-d", "--daily", action="store_true", dest="daily", default=False, help="send daily digest")
    # Parse first so that --help and usage errors need no database.
    (options, args) = parser.parse_args()

    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
    dbconnectstring = '%s://%s:%s@%s/%s' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)

    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # Example of registering feeds:
        #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
        #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

        if options.fetch:
            fetch_all_feeds(session)
        if options.single:
            mail_single_entries(session)
        if options.daily:
            mail_daily_digest(session)

        session.commit()
    finally:
        session.close()


if __name__ == '__main__':
    main()