Atomstrom/atomstrom.py

#!/usr/bin/env python
#coding: utf-8
from models import Base, Feed, Feedinfo, Entry
from sqlalchemy import create_engine, desc, func, or_
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from ddate import ddate
import feedparser
import sys
import codecs
import urllib        # needed by fetch_full_page (FancyURLopener)
import urllib2
import hn            # needed by fetch_readability (hn.upgradeLink)
import html2text
import HTMLParser
import ConfigParser
from argparse import ArgumentParser
from email.header import Header
import smtplib
import textwrap

def send_mail(sender, receiver, subject, body):
    """Send a plain-text UTF-8 mail through the local SMTP server."""
    print 'sending to %s: %s' % (receiver[0], subject)
    message = 'From: "%s" <%s>\n' % (Header(sender[0], 'utf-8'), sender[1])
    message += 'To: "%s" <%s>\n' % (Header(receiver[0], 'utf-8'), receiver[1])
    message += 'Subject: %s\n' % Header(subject, 'utf-8')
    message += 'Content-Type: text/plain; charset="utf-8"\n\n'
    message += body.encode('utf-8')
    server = smtplib.SMTP('localhost')
    server.sendmail(sender[1], [receiver[1]], message)
    server.close()

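# Illustrative call (names and addresses here are made-up placeholders):
#   send_mail(['Atomstrom', 'bot@example.com'],
#             ['Jane Doe', 'jane@example.com'],
#             'hello', u'mail body')
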
def truncate_text(content, length=100, suffix='...'):
    """Collapse runs of whitespace, then shorten to roughly `length`
    characters at a word boundary, appending `suffix` when shortened."""
    content = " ".join(content.split())
    if len(content) <= length:
        return content
    else:
        return content[:length].rsplit(' ', 1)[0] + suffix

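# Example of the word-boundary cut (the last, possibly partial, word of
# the slice is dropped before the suffix is appended):
#   truncate_text('the quick brown fox jumps', 15)  ->  'the quick...'
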
def mail_daily_digest(session, sender, receiver, prefix):
    """Collect all unsent entries of daily-digest feeds and mail them
    as a single summary message."""
    print 'mailing daily digest...'
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    body = ''
    count = 0
    for feed, feedinfo, entry in entries:
        count = count + 1
        link = entry.link
        if entry.resolvedlink:
            link = entry.resolvedlink
        text = truncate_text(entry.get_text(), 250)
        text = textwrap.fill(text, width=78)
        try:
            body += '=> %s - %s\n' % (entry.firstfetched.strftime('%Y-%m-%dT%H:%M'), feedinfo.title)
            body += '>> %s\n' % entry.title
            body += '%s\n' % text
            body += '%s\n\n' % link
        except Exception:
            print 'ERROR processing entry %s' % entry.id
            print sys.exc_info()
            print 'not sending mail'
            return
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%Y-%m-%d'), today.strftime('%A'), count)
        body = '%s\n\n%s\n\n%s' % (subject, ddate(), body)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print 'no unmailed digest-entries found... not sending mail.'

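# Each digest entry is rendered in this layout (values are illustrative):
#   => 2013-05-01T08:30 - Some Feed Title
#   >> Some entry title
#   truncated entry text, wrapped to 78 columns ...
#   http://example.com/link-to-entry
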
def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    """Mail one entry as an individual message, sent in the feed's name."""
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    link = entry.link
    if entry.resolvedlink:
        link = entry.resolvedlink
    body = entry.get_text()
    #body = '\n'.join(textwrap.wrap(body, width=78, break_long_words=False, replace_whitespace=False, break_on_hyphens=False))
    body += '\n\n%s\n%s\n' % (feedinfo.link, link)
    sender[0] = feedinfo.title
    send_mail(sender, receiver, subject, body)
    entry.sent = datetime.now()

def mail_single_entries(session, sender, receiver, prefix):
    """Mail every unsent entry of all enabled non-digest feeds."""
    print 'mailing single entries...'
    count = 0
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(or_(Feed.daily == 0, Feed.daily == None)).\
        filter(Entry.sent == None).\
        all()
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1
    if count > 0:
        print 'sent %d mails' % count
    else:
        print 'no unmailed single entries found... not sending mail.'

def fetch_readability(link):
    """Fetch a readability-extracted version of the page via the hn module."""
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text

def fetch_full_page(link):
    """Download a page and convert its HTML to plain text."""
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    html = response.read()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')

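# html2text renders markup as Markdown-flavoured plain text, roughly:
#   '<h1>Title</h1><p>Text</p>'  ->  '# Title\n\nText\n\n'
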
def process_feed_entry(session, feed, entry):
    """Store a parsed entry if it is new (keyed on title and link);
    return 1 for a new entry, 0 for an already known one."""
    thisentry = session.query(Entry).\
        filter(Entry.title == entry.title).\
        filter(Entry.link == entry.link).\
        first()
    if thisentry:
        print ' entry already known <%s>' % entry.title
        thisentry.lastfetched = datetime.now()
        session.commit()
        return 0
    else:
        print ' new entry <%s>' % entry.title
        thisentry = Entry(entry)
        if feed.resolveredirects:
            print ' fetching final link <%s>' % entry.link
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            result = opener.open(request)
            thisentry.resolvedlink = result.url
            print ' final link: <%s>' % result.url
        if feed.fullpage:
            print ' fetching full page <%s>' % entry.link
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print ' fetching readability <%s>' % entry.link
            thisentry.readability = fetch_readability(entry.link)
        if feed.html2textcontent:
            print ' converting summary'
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            if feed.contentcolumn == 'summary':
                thisentry.summary = h2t.handle(thisentry.summary)
            elif feed.contentcolumn == 'content':
                thisentry.content = h2t.handle(thisentry.content)
            elif feed.contentcolumn == 'fullpage':
                thisentry.fullpage = h2t.handle(thisentry.fullpage)
            elif feed.contentcolumn == 'readability':
                thisentry.readability = h2t.handle(thisentry.readability)
        # Unescape HTML entities in whichever text columns are present.
        hp = HTMLParser.HTMLParser()
        if thisentry.summary:
            thisentry.summary = hp.unescape(thisentry.summary)
        if thisentry.content:
            thisentry.content = hp.unescape(thisentry.content)
        if thisentry.fullpage:
            thisentry.fullpage = hp.unescape(thisentry.fullpage)
        if thisentry.readability:
            thisentry.readability = hp.unescape(thisentry.readability)
        feed.entries.append(thisentry)
        session.commit()
        return 1

def fetch_single_feed(session, feed):
    """Fetch one feed, unless its nextfetch time lies in the future,
    and process all entries it currently carries."""
    print 'processing %s' % feed.url
    thisfeedinfo = session.query(Feedinfo).\
        filter(Feedinfo.feed_id==feed.id).\
        first()
    fetched = False
    if thisfeedinfo:
        feed.feedinfo = thisfeedinfo
        if (not feed.feedinfo.nextfetch) or (feed.feedinfo.nextfetch < datetime.now()):
            print 'feed known, fetching...'
            try:
                parser = feedparser.parse(feed.url)
                fetched = True
                feed.feedinfo.update(parser)
            except Exception:
                print 'ERROR parsing feed'
                print sys.exc_info()
        else:
            print 'not fetching before: %s' % feed.feedinfo.nextfetch
    else:
        print 'feed seems to be new, fetching...'
        try:
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo = Feedinfo(parser)
        except Exception:
            print 'ERROR parsing feed'
            print sys.exc_info()
    if fetched:
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)

def list_all_feeds(session):
    """Print every configured feed and a total count."""
    allfeeds = session.query(Feed).\
        order_by(Feed.id)
    totalfeeds = 0
    totalentries = 0
    for feed in allfeeds:
        print feed
        totalfeeds += 1
        totalentries += len(feed.entries)
    print 'TOTAL: %d entries in %d feeds.' % (totalentries, totalfeeds)

def fetch_all_feeds(session):
    """Fetch every enabled feed."""
    print 'fetching all feeds...'
    allfeeds = session.query(Feed).\
        filter_by(enabled=1).\
        order_by(Feed.id)
    for feed in allfeeds:
        fetch_single_feed(session, feed)
        print

def ask_ok(prompt, retries=4, complaint='Yes or no, please!'):
    """Ask a yes/no question on stdin."""
    while True:
        ok = raw_input(prompt)
        if ok in ('y', 'ye', 'yes'):
            return True
        if ok in ('n', 'no'):
            return False
        retries = retries - 1
        if retries < 0:
            return False
        print complaint

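# Accepted answers: 'y'/'ye'/'yes' -> True, 'n'/'no' -> False; anything
# else prints the complaint and asks again, giving up (False) after
# `retries` invalid answers.
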
def feed_ask_delete(session, feed_id):
    """Delete a feed after interactive confirmation."""
    feed = session.query(Feed).\
        filter(Feed.id == feed_id).\
        first()
    print feed
    if ask_ok('> Do you really want to delete feed %d? ' % feed.id):
        print 'deleting...'
        session.delete(feed)
        print '... done.'

def feed_ask_reset(session, feed_id):
    """Reset a feed's fetched data after interactive confirmation."""
    feed = session.query(Feed).\
        filter(Feed.id == feed_id).\
        first()
    print feed
    if ask_ok('> Do you really want to reset feed %d? ' % feed.id):
        print 'resetting...'
        feed.reset()
        print '... done.'

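# main() expects an atomstrom.conf next to the script. A minimal example,
# derived from the config.get() calls below (all values are placeholders):
#
#   [database]
#   engine = mysql
#   user = atomstrom
#   password = secret
#   hostname = localhost
#   database = atomstrom
#
#   [email]
#   sender_name = Atomstrom
#   sender_address = atomstrom@example.com
#   receiver_name = Jane Doe
#   receiver_address = jane@example.com
#   prefix_single = [feed]
#   prefix_digest = [digest]
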
def main():
    """Parse command-line arguments and dispatch the requested actions."""
    # Force UTF-8 output when stdout is piped and has no encoding set.
    if (sys.stdout.encoding is None):
        streamWriter = codecs.lookup('utf-8')[-1]
        sys.stdout = streamWriter(sys.stdout)
    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
    dbconnectstring = '%s://%s:%s@%s/%s?charset=utf8' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))
    parser = ArgumentParser(description='Fetch RSS- and Atom-feeds and send mails.')
    parser.add_argument('-f', '--fetch', action='store_true', help='fetch all feeds')
    parser.add_argument('-s', '--single', action='store_true', help='send single mails')
    parser.add_argument('-d', '--daily', action='store_true', help='send daily digest')
    parser.add_argument('-l', '--list', action='store_true', help='list all configured feeds')
    parser.add_argument('-e', '--delete', action='store', type=int, metavar='ID', help='delete feed <ID> from configuration')
    parser.add_argument('-r', '--reset', action='store', type=int, metavar='ID', help='reset data for feed <ID>')
    args = parser.parse_args()
    if args.fetch:
        fetch_all_feeds(session)
    if args.single:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if args.daily:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    if args.list:
        list_all_feeds(session)
    if args.delete:
        feed_ask_delete(session, args.delete)
    if args.reset:
        feed_ask_reset(session, args.reset)
    if not (args.fetch or args.single or args.daily or args.list or args.delete or args.reset):
        parser.print_help()
    session.commit()

if __name__ == '__main__':
    main()
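
# Example invocations (option names as defined above; the cron line is a
# suggestion only, not part of the program):
#   ./atomstrom.py --fetch --single   # fetch feeds, then mail new entries
#   ./atomstrom.py -d                 # mail the daily digest
#   ./atomstrom.py --list             # show all configured feeds
#
#   # e.g. from cron, fetching and mailing every 30 minutes:
#   # */30 * * * * cd /path/to/Atomstrom && ./atomstrom.py -f -s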