#!/usr/bin/env python
"""Fetch feeds into a database and mail the entries that were not sent yet.

Command line switches (see ``main``):

* ``--fetch``  poll every enabled feed and store new entries,
* ``--single`` mail each unsent entry of a non-daily feed on its own,
* ``--daily``  mail one digest with the unsent entries of daily feeds.

Database and e-mail settings are read from ``atomstrom.conf`` in the current
working directory.

Text taken from feeds is stored as latin-1 encoded byte strings (characters
that cannot be encoded are replaced) and decoded again when mails are built.
"""

from __future__ import print_function

import calendar
import ConfigParser
import pprint
import re
import smtplib
import sys
import urllib
import urllib2
from datetime import datetime, timedelta
from email.mime.text import MIMEText
from optparse import OptionParser
from time import mktime

import feedparser
import html2text
from sqlalchemy import (create_engine, Table, Column, Integer, Text, Boolean,
                        DateTime, MetaData, ForeignKey, desc)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relation, backref

import hn

Base = declarative_base()


def _encode_latin1(value):
    """Return ``value`` encoded to latin-1, replacing unencodable characters."""
    return value.encode('latin-1', 'replace')


def _to_unicode(value):
    """Decode latin-1 byte strings; return every other value unchanged.

    Values may already be unicode (for example when they were just produced
    in this process or when the database driver returns unicode), and calling
    ``.decode`` on those would fail for non-ASCII text.
    """
    if isinstance(value, bytes):
        return value.decode('latin-1')
    return value


class Feed(Base):
    """A subscribed feed together with its processing options."""

    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True)
    url = Column(Text)
    # Minimum number of minutes between two fetches (see fetch_single_feed).
    frequency = Column(Integer)
    # True: entries go into the daily digest; False: one mail per entry.
    daily = Column(Boolean)
    resolveredirects = Column(Boolean)
    readability = Column(Boolean)
    fullpage = Column(Boolean)
    html2textsummary = Column(Boolean)
    enabled = Column(Boolean)

    def __init__(self, url, daily, readability, fullpage, enabled,
                 html2textsummary, frequency=None, resolveredirects=None):
        """Create a feed; ``frequency`` and ``resolveredirects`` are optional."""
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.html2textsummary = html2textsummary
        self.enabled = enabled
        self.frequency = frequency
        self.resolveredirects = resolveredirects

    def __repr__(self):
        # The format string was empty before, which made repr() raise
        # "TypeError: not all arguments converted".
        return "<Feed('%s','%s','%s')>" % (self.url, self.daily,
                                           self.readability)


class Feedinfo(Base):
    """Metadata about a feed as reported by the last feedparser run."""

    __tablename__ = 'feedinfo'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('feedinfo', uselist=False))
    title = Column(Text)
    link = Column(Text)
    subtitle = Column(Text)
    author = Column(Text)
    publisher = Column(Text)
    status = Column(Integer)
    version = Column(Text)
    encoding = Column(Text)
    bozo = Column(Integer)
    lastfetched = Column(DateTime)
    lastsuccessful = Column(DateTime)

    def __init__(self, parser):
        """Initialise from a ``feedparser.parse`` result."""
        self.update(parser)

    def __repr__(self):
        return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle,
                                               self.author)

    def update(self, parser):
        """Copy feed level metadata and fetch status from ``parser``.

        Fields missing from the parsed feed keep their previous value.
        ``lastsuccessful`` is only advanced for HTTP status 200 or 302.
        """
        # 'publisher' used to be written into self.author by mistake.
        for attr in ('title', 'subtitle', 'author', 'publisher'):
            value = parser.feed.get(attr)
            if value is not None:
                setattr(self, attr, _encode_latin1(value))
        link = parser.feed.get('link')
        if link is not None:
            self.link = link

        self.status = parser.get('status')
        self.version = parser.get('version')
        self.encoding = parser.get('encoding')
        self.bozo = parser.get('bozo')
        self.lastfetched = datetime.now()
        if self.status in (200, 302):
            self.lastsuccessful = datetime.now()


class Entry(Base):
    """A single feed entry plus the text fetched for it and its mail state."""

    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('entry'))
    title = Column(Text)
    link = Column(Text)
    summary = Column(Text)
    content = Column(Text)
    author = Column(Text)
    enclosures = Column(Text)
    resolvedlink = Column(Text)
    fullpage = Column(Text)
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # None until the entry has been mailed.
    sent = Column(DateTime)

    def __init__(self, entry):
        """Initialise from a feedparser entry and stamp ``firstfetched``."""
        self.update(entry)
        self.firstfetched = datetime.now()

    def __repr__(self):
        return "<Entry('%s','%s','%s')>" % (self.title, "", "")

    def update(self, entry):
        """Refresh the stored fields from a feedparser entry.

        Fields missing from ``entry`` keep their previous value.
        """
        for attr in ('title', 'link', 'summary', 'author'):
            value = entry.get(attr)
            if value is not None:
                setattr(self, attr, _encode_latin1(value))

        content = entry.get('content')
        if content:  # guards against a missing key and an empty list
            self.content = _encode_latin1(content[0].value)

        updated_parsed = entry.get('updated_parsed')
        if updated_parsed is not None:
            # feedparser documents *_parsed dates as UTC struct_time values.
            # The previous mktime()/fromtimestamp() round trip treated them
            # as local time, shifting the result by an hour whenever DST is
            # in effect; timegm() keeps the value in UTC.
            self.updated = datetime.utcfromtimestamp(
                calendar.timegm(updated_parsed))

        enclosures = entry.get('enclosures')
        if enclosures:
            # Enclosures are only dumped for inspection, not stored yet.
            print('enclosures')
            pprint.PrettyPrinter(depth=4).pprint(enclosures)
            #self.enclosures = entry.get('enclosures').encode('latin-1', 'replace')

        self.lastfetched = datetime.now()


def send_mail(sender, receiver, subject, body):
    """Send a UTF-8 plain text mail through the SMTP server on localhost.

    ``subject`` may be a latin-1 byte string or unicode.
    """
    subject = _to_unicode(subject)
    print('sending to %s: %s' % (_to_unicode(receiver), subject))

    mail = MIMEText(body, _charset='utf-8')
    mail['From'] = sender
    mail['To'] = receiver
    mail['Subject'] = subject

    mailserver = smtplib.SMTP('localhost')
    try:
        mailserver.sendmail(sender, [receiver], mail.as_string())
    finally:
        # Close the connection even when sending fails.
        mailserver.quit()


def get_entry_text(entry):
    """Return the best available text of ``entry`` as unicode.

    Preference order: readability text, full page, summary, then a fixed
    placeholder.
    """
    text = (entry.readability or entry.fullpage or entry.summary
            or 'no text, sorry')
    return _to_unicode(text)


def mail_daily_digest(session, sender, receiver, prefix):
    """Mail one digest with all unsent entries of enabled daily feeds.

    Entries are marked as sent only after the mail went out; the caller is
    responsible for committing the session.
    """
    print('mailing daily digest...')

    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()

    if not entries:
        print('no unmailed digest-entries found... not sending mail.')
        return

    # Collect the pieces and join once instead of growing a string in a
    # loop.  Stored values are decoded first so that latin-1 byte strings
    # are never mixed with the unicode entry text.
    parts = []
    for feed, feedinfo, entry in entries:
        link = entry.resolvedlink or entry.link
        parts.append('=> %s - %s\n' % (
            entry.firstfetched.strftime('%y%m%d-%H%M'),
            _to_unicode(feedinfo.title)))
        parts.append(' %s\n' % _to_unicode(entry.title))
        parts.append('%s\n' % get_entry_text(entry)[0:100])
        parts.append('%s\n\n' % _to_unicode(link))
    body = ''.join(parts)

    today = datetime.now()
    subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'),
                                        today.strftime('%A'),
                                        len(entries))
    if prefix != '':
        subject = '%s %s' % (prefix, subject)

    send_mail(sender, receiver, subject, body)

    for feed, feedinfo, entry in entries:
        entry.sent = datetime.now()


def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    """Mail one entry on its own and mark it as sent."""
    subject = '%s' % (_to_unicode(entry.title))
    if prefix != '':
        subject = '%s %s' % (prefix, subject)

    link = entry.resolvedlink or entry.link

    body = '%s\n\n' % get_entry_text(entry)
    body = body + '%s\n' % _to_unicode(feedinfo.link)
    body = body + '%s\n' % _to_unicode(link)

    send_mail(sender, receiver, subject, body)
    entry.sent = datetime.now()


def mail_single_entries(session, sender, receiver, prefix):
    """Mail every unsent entry of enabled non-daily feeds separately."""
    print('mailing single entries...')

    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 0).\
        filter(Entry.sent == None).\
        all()

    count = 0
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1

    if count > 0:
        print('sent %d mails' % count)
    else:
        print('no unmailed single entries found... not sending mail.')


def fetch_readability(link):
    """Return the text ``hn.upgradeLink`` produces for ``link`` as unicode.

    NOTE(review): unlike fetch_full_page() the result is not re-encoded to
    latin-1 before it is stored -- confirm which form the database expects.
    """
    text = hn.upgradeLink(link)
    return text.decode('utf8')


def fetch_full_page(link):
    """Download ``link`` and return its html2text rendering as latin-1."""
    response = urllib.FancyURLopener({}).open(link)
    try:
        html = response.read()
    finally:
        response.close()
    # Not every page is valid UTF-8; replace undecodable bytes instead of
    # aborting the whole fetch run.
    text = html2text.html2text(html.decode('utf8', 'replace'))
    return _encode_latin1(text)


def process_feed_entry(session, feed, entry):
    """Store or refresh one parsed feed entry.

    An entry counts as known when a row with the same feed, title and link
    exists.  Returns 1 when a new entry was added and 0 when an existing one
    was updated.
    """
    title = entry.get('title')
    link = entry.get('link')
    stored_title = _encode_latin1(title) if title is not None else None
    stored_link = _encode_latin1(link) if link is not None else None

    # first() instead of one() inside a catch-all "except": a genuine error
    # is no longer mistaken for "entry not found", and already duplicated
    # rows do not cause yet another duplicate to be inserted.
    thisentry = session.query(Entry).\
        filter(Entry.feed_id == feed.id).\
        filter(Entry.title == stored_title).\
        filter(Entry.link == stored_link).\
        first()

    if thisentry is not None:
        thisentry.update(entry)
        print(' entry already known <%s>' % title)
        return 0

    print(' new entry <%s>' % title)
    thisentry = Entry(entry)

    # The optional fetch steps need a link to work with.
    if link is not None:
        if feed.resolveredirects:
            print(' fetching final link <%s>' % link)
            result = urllib2.build_opener().open(urllib2.Request(link))
            try:
                thisentry.resolvedlink = result.url
                print(' final link: <%s>' % result.url)
            finally:
                result.close()
        if feed.fullpage:
            print(' fetching full page <%s>' % link)
            thisentry.fullpage = fetch_full_page(link)
        if feed.readability:
            print(' fetching readability <%s>' % link)
            thisentry.readability = fetch_readability(link)

    if feed.html2textsummary and thisentry.summary is not None:
        print(' converting summary')
        h2t = html2text.HTML2Text()
        h2t.body_width = 0  # 0 turns off html2text's line wrapping
        summary = h2t.handle(_to_unicode(thisentry.summary))
        thisentry.summary = _encode_latin1(summary)

    feed.entry.append(thisentry)
    return 1


def fetch_single_feed(session, feed):
    """Fetch ``feed`` if it is new or due and process its entries.

    A feed is due when ``feed.frequency`` minutes have passed since the last
    fetch; a feed without frequency or without a recorded fetch time is
    always due.  The session is committed after the entries were processed.
    """
    print('processing %s' % feed.url)

    # first() instead of one() inside a catch-all "except": previously any
    # error (for instance a frequency of None) made the feed look new and
    # replaced its Feedinfo row on every run.
    feedinfo = session.query(Feedinfo).\
        filter(Feedinfo.feed_id == feed.id).\
        first()

    parser = None
    if feedinfo is None:
        print('this feed seems to be new')
        print('fetching...')
        parser = feedparser.parse(feed.url)
        feed.feedinfo = Feedinfo(parser)
    else:
        feed.feedinfo = feedinfo
        nextfetch = None
        if feedinfo.lastfetched is not None and feed.frequency is not None:
            nextfetch = (feedinfo.lastfetched
                         + timedelta(minutes=feed.frequency))
        if nextfetch is None or datetime.now() > nextfetch:
            print('fetching...')
            parser = feedparser.parse(feed.url)
            feedinfo.update(parser)
        else:
            print('not fetching before: %s' % nextfetch)

    if parser is None:
        return

    print('processing feed entries:')
    entries_new = 0
    entries_total = 0
    for entry in parser.entries:
        entries_total = entries_total + 1
        entries_new = entries_new + process_feed_entry(session, feed, entry)
    session.commit()
    print('updated %d of %d entries' % (entries_new, entries_total))


def fetch_all_feeds(session):
    """Fetch every enabled feed, ordered by id."""
    print('fetching all feeds...')
    for feed in session.query(Feed).filter_by(enabled=1).order_by(Feed.id):
        fetch_single_feed(session, feed)
        print()


def main():
    """Read the configuration, parse the command line and run the actions."""
    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')

    dbconnectstring = '%s://%s:%s@%s/%s' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)

    session = sessionmaker(bind=engine)()

    # Example for subscribing to feeds:
    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

    parser = OptionParser()
    parser.add_option("-f", "--fetch", action="store_true", dest="fetch",
                      default=False, help="fetch all feeds")
    parser.add_option("-s", "--single", action="store_true", dest="single",
                      default=False, help="send single mails")
    parser.add_option("-d", "--daily", action="store_true", dest="daily",
                      default=False, help="send daily digest")
    (options, args) = parser.parse_args()

    try:
        if options.fetch:
            fetch_all_feeds(session)
        if options.single:
            sender = config.get('email', 'sender')
            receiver = config.get('email', 'receiver')
            prefix = config.get('email', 'prefix_single')
            mail_single_entries(session, sender, receiver, prefix)
        if options.daily:
            sender = config.get('email', 'sender')
            receiver = config.get('email', 'receiver')
            prefix = config.get('email', 'prefix_digest')
            mail_daily_digest(session, sender, receiver, prefix)

        if not (options.fetch or options.single or options.daily):
            parser.print_help()

        session.commit()
    finally:
        session.close()


if __name__ == '__main__':
    main()