#!/usr/bin/env python
# coding: utf-8

from models import Base, Feed, Feedinfo, Entry
from sqlalchemy import create_engine, desc, func, or_
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from ddate import ddate
import feedparser
import sys
import codecs
import urllib    # needed by fetch_full_page()
import urllib2
import hn        # external readability helper, needed by fetch_readability()
import html2text
import ConfigParser
from argparse import ArgumentParser
from cStringIO import StringIO
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.header import Header
from email import Charset
from email.generator import Generator
import smtplib

def send_mail(sender, receiver, subject, body):
    """Send a single UTF-8 plain-text mail via the local MTA."""
    print 'sending to %s: %s' % (receiver[0], subject)
    # use quoted-printable instead of base64 so the raw mail stays readable
    Charset.add_charset('utf-8', Charset.QP, Charset.QP, 'utf-8')
    mail = MIMEMultipart('alternative')
    mail['Subject'] = "%s" % Header(subject, 'utf-8')
    mail['From'] = "\"%s\" <%s>" % (Header(sender[0], 'utf-8'), sender[1])
    mail['To'] = "\"%s\" <%s>" % (Header(receiver[0], 'utf-8'), receiver[1])
    textpart = MIMEText(body, 'plain', 'utf-8')
    mail.attach(textpart)
    # flatten the message manually so the encoded headers survive intact
    str_io = StringIO()
    gen = Generator(str_io, False)
    gen.flatten(mail)
    s = smtplib.SMTP('localhost')
    # empty envelope sender; the From: header carries the display name
    s.sendmail("", receiver[1], str_io.getvalue())
    s.quit()
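# A possible variant for setups where localhost runs no MTA: deliver through
# an authenticated smarthost instead. This is only a sketch; the 'smtp_*'
# config keys are hypothetical and do not exist in atomstrom.conf as shipped.
def send_mail_via_smarthost(config, receiver_address, message):
    s = smtplib.SMTP(config.get('email', 'smtp_host'), 587)  # hypothetical key
    s.starttls()  # encrypt the connection before sending credentials
    s.login(config.get('email', 'smtp_user'),      # hypothetical key
            config.get('email', 'smtp_password'))  # hypothetical key
    s.sendmail("", receiver_address, message)
    s.quit()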
def truncate_text(content, length=100, suffix='...'):
    """Collapse whitespace and truncate at a word boundary."""
    content = " ".join(content.split())
    if len(content) <= length:
        return content
    else:
        return content[:length].rsplit(' ', 1)[0] + suffix

def mail_daily_digest(session, sender, receiver, prefix):
    """Collect all unsent entries of daily feeds into one digest mail."""
    print 'mailing daily digest...'
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    body = ''
    count = 0
    for feed, feedinfo, entry in entries:
        count = count + 1
        link = entry.link
        if entry.resolvedlink:
            link = entry.resolvedlink
        try:
            body = body + '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
            body = body + '>> %s\n' % entry.title
            body = body + '%s\n' % truncate_text(entry.get_text(), 250)
            body = body + '%s\n\n' % link
        except:
            print 'ERROR processing entry %s' % entry.id
            print sys.exc_info()
            print 'not sending mail'
            return
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
        body = '%s\n\n%s\n\n%s' % (subject, ddate(), body)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        # only mark entries as sent after the mail actually went out
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print 'no unmailed digest-entries found... not sending mail.'

def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    """Send one entry as an individual mail, using the feed title as sender name."""
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    link = entry.link
    if entry.resolvedlink:
        link = entry.resolvedlink
    body = '%s\n\n' % entry.get_text()
    body = body + '%s\n' % feedinfo.link
    body = body + '%s\n' % link
    sender[0] = feedinfo.title
    send_mail(sender, receiver, subject, body)
    entry.sent = datetime.now()

def mail_single_entries(session, sender, receiver, prefix):
    """Send every unsent entry of all non-daily feeds as individual mails."""
    print 'mailing single entries...'
    count = 0
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(or_(Feed.daily == 0, Feed.daily == None)).\
        filter(Entry.sent == None).\
        all()
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1
    if count > 0:
        print 'sent %d mails' % count
    else:
        print 'no unmailed single entries found... not sending mail.'

def fetch_readability(link):
    """Fetch a readability-cleaned version of the linked page."""
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text

def fetch_full_page(link):
    """Fetch the linked page and convert its HTML to plain text."""
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    html = response.read()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')

def process_feed_entry(session, feed, entry):
    """Store a parsed entry; return 1 for a new entry, 0 for a known one."""
    thisentry = session.query(Entry).\
        filter(Entry.title == entry.title).\
        filter(Entry.link == entry.link).\
        first()
    if thisentry:
        print ' entry already known <%s>' % entry.title
        thisentry.lastfetched = datetime.now()
        session.commit()
        return 0
    else:
        print ' new entry <%s>' % entry.title
        thisentry = Entry(entry)
        if feed.resolveredirects:
            # follow HTTP redirects so the stored link is the final URL
            print ' fetching final link <%s>' % entry.link
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            result = opener.open(request)
            thisentry.resolvedlink = result.url
            print ' final link: <%s>' % result.url
        if feed.fullpage:
            print ' fetching full page <%s>' % entry.link
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print ' fetching readability <%s>' % entry.link
            thisentry.readability = fetch_readability(entry.link)
        if feed.html2textcontent:
            # convert the configured content column from HTML to plain text
            print ' converting summary'
            h2t = html2text.HTML2Text()
            h2t.body_width = 0        # no hard line wrapping
            h2t.inline_links = False
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            if feed.contentcolumn == 'summary':
                thisentry.summary = h2t.handle(thisentry.summary)
            elif feed.contentcolumn == 'content':
                thisentry.content = h2t.handle(thisentry.content)
            elif feed.contentcolumn == 'fullpage':
                thisentry.fullpage = h2t.handle(thisentry.fullpage)
            elif feed.contentcolumn == 'readability':
                thisentry.readability = h2t.handle(thisentry.readability)
        feed.entry.append(thisentry)
        session.commit()
        return 1

def fetch_single_feed(session, feed):
    """Fetch one feed unless its nextfetch timestamp says to wait."""
    print 'processing %s' % feed.url
    thisfeedinfo = session.query(Feedinfo).\
        filter(Feedinfo.feed_id == feed.id).\
        first()
    fetched = False
    if thisfeedinfo:
        feed.feedinfo = thisfeedinfo
        if (not feed.feedinfo.nextfetch) or (feed.feedinfo.nextfetch < datetime.now()):
            print 'feed known, fetching...'
            try:
                parser = feedparser.parse(feed.url)
                fetched = True
                feed.feedinfo.update(parser)
            except:
                print 'ERROR parsing feed'
                print sys.exc_info()
        else:
            print 'not fetching before: %s' % feed.feedinfo.nextfetch
    else:
        print 'feed seems to be new, fetching...'
        try:
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo = Feedinfo(parser)
        except:
            print 'ERROR parsing feed'
            print sys.exc_info()
    if fetched:
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)

def list_all_feeds(session):
    """Print all feeds and a total count of feeds and entries."""
    allfeeds = session.query(Feed).\
        order_by(Feed.id)
    totalfeeds = 0
    totalentries = 0
    for feed in allfeeds:
        print feed
        totalfeeds += 1
        totalentries += len(feed.entry)
    print 'TOTAL: %d entries in %d feeds.' % (totalentries, totalfeeds)

def fetch_all_feeds(session):
    print 'fetching all feeds...'
    allfeeds = session.query(Feed).\
        filter_by(enabled=1).\
        order_by(Feed.id)
    for feed in allfeeds:
        fetch_single_feed(session, feed)
        print

def delete_feed(session, feed_id):
    print 'deleting feed %d...' % feed_id
    # TODO implement delete

def reset_feed(session, feed_id):
    print 'resetting feed %d...' % feed_id
    # TODO implement reset
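# Sketches for the two TODOs above. They assume the schema defines no
# DB-level cascades, so dependent Entry and Feedinfo rows are removed
# explicitly. Untested against the real models -- a starting point only.
def _delete_feed_sketch(session, feed_id):
    feed = session.query(Feed).filter(Feed.id == feed_id).first()
    if feed:
        for entry in feed.entry:
            session.delete(entry)
        if feed.feedinfo:
            session.delete(feed.feedinfo)
        session.delete(feed)
        session.commit()

def _reset_feed_sketch(session, feed_id):
    # dropping all fetched entries makes the next fetch start from scratch
    session.query(Entry).filter(Entry.feed_id == feed_id).delete()
    session.commit()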
if __name__ == '__main__':
    # make print work even when stdout is piped (encoding is None then)
    if sys.stdout.encoding is None:
        streamWriter = codecs.lookup('utf-8')[-1]
        sys.stdout = streamWriter(sys.stdout)

    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
    dbconnectstring = '%s://%s:%s@%s/%s?charset=utf8' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

    parser = ArgumentParser(description='Fetch RSS- and Atom-feeds and send mails.')
    parser.add_argument('-f', '--fetch', action='store_true', help='fetch all feeds')
    parser.add_argument('-s', '--single', action='store_true', help='send single mails')
    parser.add_argument('-d', '--daily', action='store_true', help='send daily digest')
    parser.add_argument('-l', '--list', action='store_true', help='list all configured feeds')
    parser.add_argument('-e', '--delete', action='store', type=int, metavar='ID', help='delete feed from configuration')
    parser.add_argument('-r', '--reset', action='store', type=int, metavar='ID', help='reset data for feed')
    args = parser.parse_args()

    if args.fetch:
        fetch_all_feeds(session)
    if args.single:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if args.daily:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    if args.list:
        list_all_feeds(session)
    if args.delete:
        delete_feed(session, args.delete)
    if args.reset:
        reset_feed(session, args.reset)
    if not (args.fetch or args.single or args.daily or args.list or args.delete or args.reset):
        parser.print_help()

    session.commit()
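# Typical usage, e.g. from a cron job (assuming the script is saved as
# atomstrom.py): fetch all feeds, then send single mails and the digest:
#   python atomstrom.py --fetch --single --daily
# List the configured feeds:
#   python atomstrom.py --list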