#!/usr/bin/env python
# coding: utf-8

from models import Base, Feed, Feedinfo, Entry
from sqlalchemy import create_engine, desc, func, or_
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from ddate import ddate
import feedparser
import sys
import codecs
import urllib
import urllib2
import hn
import html2text
import HTMLParser
import ConfigParser
from argparse import ArgumentParser
from email.header import Header
import smtplib
import textwrap

def send_mail(sender, receiver, subject, body):
    print 'sending to %s: %s' % (receiver[0], subject)
    message = 'From: "%s" <%s>\n' % (Header(sender[0], 'utf-8'), sender[1])
    message += 'To: "%s" <%s>\n' % (Header(receiver[0], 'utf-8'), receiver[1])
    message += 'Subject: %s\n' % Header(subject, 'utf-8')
    message += 'Content-Type: text/plain; charset="utf-8"\n\n'
    message += body.encode('utf-8')
    server = smtplib.SMTP('localhost')
    server.sendmail(sender[1], [receiver[1]], message)
    server.close()

def truncate_text(content, length=100, suffix='...'):
    # Collapse all whitespace, then cut at the last word boundary below 'length'.
    content = " ".join(content.split())
    if len(content) <= length:
        return content
    else:
        return content[:length].rsplit(' ', 1)[0] + suffix

def mail_daily_digest(session, sender, receiver, prefix):
    print 'mailing daily digest...'
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    body = ''
    count = 0
    for feed, feedinfo, entry in entries:
        count = count + 1
        link = entry.link
        if entry.resolvedlink:
            link = entry.resolvedlink
        text = truncate_text(entry.get_text(), 250)
        text = textwrap.fill(text, width=78)
        try:
            body += '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
            body += '>> %s\n' % entry.title
            body += '%s\n' % text
            body += '%s\n\n' % link
        except:
            print 'ERROR processing entry %s' % entry.id
            print sys.exc_info()
            print 'not sending mail'
            return
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
        body = '%s\n\n%s\n\n%s' % (subject, ddate(), body)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        # Only mark entries as sent once the mail has actually gone out.
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print 'no unmailed digest-entries found... not sending mail.'

def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    link = entry.link
    if entry.resolvedlink:
        link = entry.resolvedlink
    body = entry.get_text()
    #body = '\n'.join(textwrap.wrap(body, width=78, break_long_words=False, replace_whitespace=False, break_on_hyphens=False))
    body += '\n\n%s\n%s\n' % (feedinfo.link, link)
    sender[0] = feedinfo.title
    send_mail(sender, receiver, subject, body)
    entry.sent = datetime.now()

def mail_single_entries(session, sender, receiver, prefix):
    print 'mailing single entries...'
    count = 0
    # Feeds with daily == 0 or NULL are not part of the digest and get single mails.
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(or_(Feed.daily == 0, Feed.daily == None)).\
        filter(Entry.sent == None).\
        all()
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1
    if count > 0:
        print 'sent %d mails' % count
    else:
        print 'no unmailed single entries found... not sending mail.'
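
# Illustrative only: one entry block as assembled by mail_daily_digest above,
# with a made-up timestamp, feed title, entry title and link:
#
#   => 240101-0815 - Example Feed
#   >> Example entry title
#   Entry text, truncated to 250 characters and wrapped to 78 columns...
#   http://example.com/entry
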
def fetch_readability(link):
    # Let the external hn module fetch a readability-style version of the page.
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text

def fetch_full_page(link):
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    html = response.read()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')

def process_feed_entry(session, feed, entry):
    thisentry = session.query(Entry).\
        filter(Entry.title == entry.title).\
        filter(Entry.link == entry.link).\
        first()
    if thisentry:
        print '  entry already known <%s>' % entry.title
        thisentry.lastfetched = datetime.now()
        session.commit()
        return 0
    else:
        print '  new entry <%s>' % entry.title
        thisentry = Entry(entry)
        if feed.resolveredirects:
            print '  fetching final link <%s>' % entry.link
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            result = opener.open(request)
            thisentry.resolvedlink = result.url
            print '  final link: <%s>' % result.url
        if feed.fullpage:
            print '  fetching full page <%s>' % entry.link
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print '  fetching readability <%s>' % entry.link
            thisentry.readability = fetch_readability(entry.link)
        if feed.html2textcontent:
            print '  converting summary'
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            if feed.contentcolumn == 'summary':
                thisentry.summary = h2t.handle(thisentry.summary)
            elif feed.contentcolumn == 'content':
                thisentry.content = h2t.handle(thisentry.content)
            elif feed.contentcolumn == 'fullpage':
                thisentry.fullpage = h2t.handle(thisentry.fullpage)
            elif feed.contentcolumn == 'readability':
                thisentry.readability = h2t.handle(thisentry.readability)
        # Unescape HTML entities in whatever content columns were stored.
        hp = HTMLParser.HTMLParser()
        if thisentry.summary:
            thisentry.summary = hp.unescape(thisentry.summary)
        if thisentry.content:
            thisentry.content = hp.unescape(thisentry.content)
        if thisentry.fullpage:
            thisentry.fullpage = hp.unescape(thisentry.fullpage)
        if thisentry.readability:
            thisentry.readability = hp.unescape(thisentry.readability)
        feed.entry.append(thisentry)
        session.commit()
        return 1

def fetch_single_feed(session, feed):
    print 'processing %s' % feed.url
    thisfeedinfo = session.query(Feedinfo).\
        filter(Feedinfo.feed_id == feed.id).\
        first()
    fetched = False
    if thisfeedinfo:
        feed.feedinfo = thisfeedinfo
        if (not feed.feedinfo.nextfetch) or (feed.feedinfo.nextfetch < datetime.now()):
            print 'feed known, fetching...'
            try:
                parser = feedparser.parse(feed.url)
                fetched = True
                feed.feedinfo.update(parser)
            except:
                print 'ERROR parsing feed'
                print sys.exc_info()
        else:
            print 'not fetching before: %s' % feed.feedinfo.nextfetch
    else:
        print 'feed seems to be new, fetching...'
        try:
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo = Feedinfo(parser)
        except:
            print 'ERROR parsing feed'
            print sys.exc_info()
    if fetched:
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)

def list_all_feeds(session):
    allfeeds = session.query(Feed).\
        order_by(Feed.id)
    totalfeeds = 0
    totalentries = 0
    for feed in allfeeds:
        print feed
        totalfeeds += 1
        totalentries += len(feed.entry)
    print 'TOTAL: %d entries in %d feeds.' % (totalentries, totalfeeds)
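
# For reference: fetch_single_feed and process_feed_entry above consume the
# result object returned by feedparser. A minimal sketch of the attributes
# involved (standard feedparser API; the URL is a placeholder):
#
#   parsed = feedparser.parse('http://example.com/feed.xml')
#   parsed.feed.title        # feed-level metadata, wrapped by Feedinfo()
#   parsed.entries           # list of entries, each wrapped by Entry()
#   parsed.entries[0].title
#   parsed.entries[0].link
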
def fetch_all_feeds(session):
    print 'fetching all feeds...'
    allfeeds = session.query(Feed).\
        filter_by(enabled=1).\
        order_by(Feed.id)
    for feed in allfeeds:
        fetch_single_feed(session, feed)
        print

def ask_ok(prompt, retries=4, complaint='Yes or no, please!'):
    while True:
        ok = raw_input(prompt)
        if ok in ('y', 'ye', 'yes'):
            return True
        if ok in ('n', 'no', 'nop', 'nope'):
            return False
        retries = retries - 1
        if retries < 0:
            return False
        print complaint

def delete_feed(session, feed_id):
    feed = session.query(Feed).\
        filter(Feed.id == feed_id).\
        first()
    print feed
    if ask_ok('> Do you really want to delete feed %d? ' % feed_id):
        print 'deleting...'
        entries = session.query(Entry).\
            filter(Entry.feed_id == feed_id).\
            all()
        for entry in entries:
            session.delete(entry)
        feedinfo = session.query(Feedinfo).\
            filter(Feedinfo.feed_id == feed_id).\
            first()
        session.delete(feedinfo)
        session.delete(feed)
        print '... done.'

def reset_feed(session, feed_id):
    # Like delete_feed, but keeps the Feed row itself so it is fetched fresh.
    feed = session.query(Feed).\
        filter(Feed.id == feed_id).\
        first()
    print feed
    if ask_ok('> Do you really want to reset feed %d? ' % feed_id):
        print 'resetting...'
        entries = session.query(Entry).\
            filter(Entry.feed_id == feed_id).\
            all()
        for entry in entries:
            session.delete(entry)
        feedinfo = session.query(Feedinfo).\
            filter(Feedinfo.feed_id == feed_id).\
            first()
        session.delete(feedinfo)
        print '... done.'
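
# The main block below reads its settings from 'atomstrom.conf'. A minimal
# example matching the keys accessed via config.get() -- all values are
# placeholders, assuming a MySQL database (the connect string appends
# '?charset=utf8'):
#
#   [database]
#   engine = mysql
#   user = atomstrom
#   password = secret
#   hostname = localhost
#   database = atomstrom
#
#   [email]
#   sender_name = Atomstrom
#   sender_address = atomstrom@example.com
#   receiver_name = Jane Doe
#   receiver_address = jane@example.com
#   prefix_single = [single]
#   prefix_digest = [digest]
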
if __name__ == '__main__':
    # Make sure we can print UTF-8 even when stdout is not a terminal.
    if sys.stdout.encoding is None:
        streamWriter = codecs.lookup('utf-8')[-1]
        sys.stdout = streamWriter(sys.stdout)

    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
    dbconnectstring = '%s://%s:%s@%s/%s?charset=utf8' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

    parser = ArgumentParser(description='Fetch RSS- and Atom-feeds and send mails.')
    parser.add_argument('-f', '--fetch', action='store_true', help='fetch all feeds')
    parser.add_argument('-s', '--single', action='store_true', help='send single mails')
    parser.add_argument('-d', '--daily', action='store_true', help='send daily digest')
    parser.add_argument('-l', '--list', action='store_true', help='list all configured feeds')
    parser.add_argument('-e', '--delete', action='store', type=int, metavar='ID', help='delete feed from configuration')
    parser.add_argument('-r', '--reset', action='store', type=int, metavar='ID', help='reset data for feed ID')
    args = parser.parse_args()

    if args.fetch:
        fetch_all_feeds(session)
    if args.single:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if args.daily:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    if args.list:
        list_all_feeds(session)
    if args.delete:
        delete_feed(session, args.delete)
    if args.reset:
        reset_feed(session, args.reset)
    if not (args.fetch or args.single or args.daily or args.list or args.delete or args.reset):
        parser.print_help()

    session.commit()
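
# Typical invocations (illustrative):
#   python atomstrom.py --fetch         fetch all enabled feeds
#   python atomstrom.py -f -s           fetch, then mail new entries one by one
#   python atomstrom.py --daily         mail a digest of unsent daily-feed entries
#   python atomstrom.py --list          list all configured feeds
#   python atomstrom.py --delete 3      interactively delete the feed with id 3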