#!/usr/bin/env python
#coding: utf-8
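"""Atomstrom - fetch configured feeds into a database and deliver new
entries by mail, either as one mail per entry or as a daily digest."""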
from models import Base, Feed, Feedinfo, Entry
from sqlalchemy import create_engine, desc, or_
from sqlalchemy.orm import sessionmaker
from datetime import datetime
from ddate import ddate
import feedparser
import sys
import codecs
import urllib   # needed by fetch_full_page()
import urllib2
import hn       # needed by fetch_readability()
import html2text
import ConfigParser
from optparse import OptionParser
from cStringIO import StringIO
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.header import Header
from email import Charset
from email.generator import Generator
import smtplib


def send_mail(sender, receiver, subject, body):
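    """Send `body` as a UTF-8 plain-text mail through the local SMTP server.

    `sender` and `receiver` are [name, address] pairs.
    """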
    print 'sending to %s: %s' % (receiver[0], subject)
    Charset.add_charset('utf-8', Charset.QP, Charset.QP, 'utf-8')
    mail = MIMEMultipart('alternative')
    mail['Subject'] = "%s" % Header(subject, 'utf-8')
    mail['From'] = "\"%s\" <%s>" % (Header(sender[0], 'utf-8'), sender[1])
    mail['To'] = "\"%s\" <%s>" % (Header(receiver[0], 'utf-8'), receiver[1])
    textpart = MIMEText(body, 'plain', 'utf-8')
    mail.attach(textpart)
    str_io = StringIO()
    gen = Generator(str_io, False)
    gen.flatten(mail)
    s = smtplib.SMTP('localhost')
    s.sendmail("", receiver[1], str_io.getvalue())
    s.quit()  # close the SMTP connection


def truncate_text(content, length=100, suffix='...'):
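    """Collapse whitespace and truncate `content` to at most `length`
    characters, cutting at a word boundary and appending `suffix`."""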
content = " ".join(content.split())
if len(content) <= length:
return content
else:
return content[:length].rsplit(' ', 1)[0]+suffix
def mail_daily_digest(session, sender, receiver, prefix):
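    """Collect all unsent entries of the daily-digest feeds into one mail.

    Entries are marked as sent only after the mail went out; if any entry
    fails to render, no mail is sent at all.
    """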
    print 'mailing daily digest...'
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(Feed.daily == 1).\
        filter(Entry.sent == None).\
        order_by(desc(Entry.firstfetched), Feedinfo.title, Entry.title).\
        all()
    body = ''
    count = 0
    for feed, feedinfo, entry in entries:
        count = count + 1
        link = entry.link
        if entry.resolvedlink:
            link = entry.resolvedlink
        try:
            body = body + '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
            body = body + '>> %s\n' % entry.title
            body = body + '%s\n' % truncate_text(entry.get_text(), 250)
            body = body + '%s\n\n' % link
        except:
            print 'ERROR processing entry %s' % entry.id
            print sys.exc_info()
            print 'not sending mail'
            return
    if count > 0:
        today = datetime.now()
        subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
        body = '%s\n\n%s\n\n%s' % (subject, ddate(), body)
        if prefix != '':
            subject = '%s %s' % (prefix, subject)
        send_mail(sender, receiver, subject, body)
        for feed, feedinfo, entry in entries:
            entry.sent = datetime.now()
    else:
        print 'no unmailed digest-entries found... not sending mail.'


def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
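    """Send one entry as an individual mail and mark it as sent."""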
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    link = entry.link
    if entry.resolvedlink:
        link = entry.resolvedlink
    body = '%s\n\n' % entry.get_text()
    body = body + '%s\n' % feedinfo.link
    body = body + '%s\n' % link
    sender[0] = feedinfo.title
    send_mail(sender, receiver, subject, body)
    entry.sent = datetime.now()


def mail_single_entries(session, sender, receiver, prefix):
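    """Send an individual mail for every unsent entry of all enabled
    non-digest feeds."""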
    print 'mailing single entries...'
    count = 0
    entries = session.query(Feed, Feedinfo, Entry).\
        filter(Feed.id == Feedinfo.feed_id).\
        filter(Feed.id == Entry.feed_id).\
        filter(Feed.enabled == 1).\
        filter(or_(Feed.daily == 0, Feed.daily == None)).\
        filter(Entry.sent == None).\
        all()
    for feed, feedinfo, entry in entries:
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)
        count = count + 1
    if count > 0:
        print 'sent %d mails' % count
    else:
        print 'no unmailed single entries found... not sending mail.'


def fetch_readability(link):
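    """Fetch a readability-style text extract of `link` via the hn module."""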
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text


def fetch_full_page(link):
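    """Download `link` and convert the full HTML page to plain text."""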
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    html = response.read()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')


def process_feed_entry(session, feed, entry):
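    """Store a parsed feed entry if it is new; return 1 for a new entry,
    0 for an already known one.

    Depending on the feed's flags this also resolves redirects, fetches
    the full page or a readability extract, and converts the configured
    content column to text with html2text.
    """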
    thisentry = session.query(Entry).\
        filter(Entry.title == entry.title).\
        filter(Entry.link == entry.link).\
        first()
    if thisentry:
        print ' entry already known <%s>' % entry.title
        thisentry.lastfetched = datetime.now()
        session.commit()
        return 0
    else:
        print ' new entry <%s>' % entry.title
        thisentry = Entry(entry)
        if feed.resolveredirects:
            print ' fetching final link <%s>' % entry.link
            request = urllib2.Request(entry.link)
            opener = urllib2.build_opener()
            result = opener.open(request)
            thisentry.resolvedlink = result.url
            print ' final link: <%s>' % result.url
        if feed.fullpage:
            print ' fetching full page <%s>' % entry.link
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            print ' fetching readability <%s>' % entry.link
            thisentry.readability = fetch_readability(entry.link)
        if feed.html2textcontent:
            print ' converting summary'
            h2t = html2text.HTML2Text()
            h2t.body_width = 0
            h2t.inline_links = False
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            if feed.contentcolumn == 'summary':
                thisentry.summary = h2t.handle(thisentry.summary)
            elif feed.contentcolumn == 'content':
                thisentry.content = h2t.handle(thisentry.content)
            elif feed.contentcolumn == 'fullpage':
                thisentry.fullpage = h2t.handle(thisentry.fullpage)
            elif feed.contentcolumn == 'readability':
                thisentry.readability = h2t.handle(thisentry.readability)
        feed.entry.append(thisentry)
        session.commit()
        return 1


def fetch_single_feed(session, feed):
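    """Fetch one feed (respecting its nextfetch time) and process every
    entry the parser returns."""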
    print 'processing %s' % feed.url
    thisfeedinfo = session.query(Feedinfo).\
        filter(Feedinfo.feed_id==feed.id).\
        first()
    fetched = False
    if thisfeedinfo:
        feed.feedinfo = thisfeedinfo
        if (not feed.feedinfo.nextfetch) or (feed.feedinfo.nextfetch < datetime.now()):
            print 'feed known, fetching...'
            try:
                parser = feedparser.parse(feed.url)
                fetched = True
                feed.feedinfo.update(parser)
            except:
                print 'ERROR parsing feed'
                print sys.exc_info()
        else:
            print 'not fetching before: %s' % feed.feedinfo.nextfetch
    else:
        print 'feed seems to be new, fetching...'
        try:
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo = Feedinfo(parser)
        except:
            print 'ERROR parsing feed'
            print sys.exc_info()
    if fetched:
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)


def fetch_all_feeds(session):
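    """Fetch every enabled feed, ordered by id."""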
    print 'fetching all feeds...'
    for feed in session.query(Feed).filter_by(enabled=1).order_by(Feed.id):
        fetch_single_feed(session, feed)
        print


if __name__ == '__main__':
    streamWriter = codecs.lookup('utf-8')[-1]
    sys.stdout = streamWriter(sys.stdout)
    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
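    # Example atomstrom.conf with the sections and keys read below and in
    # the mailing code (all values are placeholders, not from the repo):
    #
    #   [database]
    #   engine = mysql
    #   user = atomstrom
    #   password = secret
    #   hostname = localhost
    #   database = atomstrom
    #
    #   [email]
    #   sender_name = Atomstrom
    #   sender_address = atomstrom@example.com
    #   receiver_name = Jane Doe
    #   receiver_address = jane@example.com
    #   prefix_single = [single]
    #   prefix_digest = [digest]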
    dbconnectstring = '%s://%s:%s@%s/%s?charset=utf8' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()
    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))
    parser = OptionParser()
    parser.add_option("-f", "--fetch", action="store_true", dest="fetch", default=False, help="fetch all feeds")
    parser.add_option("-s", "--single", action="store_true", dest="single", default=False, help="send single mails")
    parser.add_option("-d", "--daily", action="store_true", dest="daily", default=False, help="send daily digest")
    (options, args) = parser.parse_args()
    if options.fetch:
        fetch_all_feeds(session)
    if options.single:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if options.daily:
        sender = [config.get('email', 'sender_name'), config.get('email', 'sender_address')]
        receiver = [config.get('email', 'receiver_name'), config.get('email', 'receiver_address')]
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    if not (options.fetch or options.single or options.daily):
        parser.print_help()
    session.commit()